[spark] branch master updated: [SPARK-44357][PYTHON] Add pyspark_testing module for GHA tests

2023-07-10 Thread gurwls223
This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
 new c3bfc345191 [SPARK-44357][PYTHON] Add pyspark_testing module for GHA 
tests
c3bfc345191 is described below

commit c3bfc345191d645769d943127eccae91b23390a5
Author: Amanda Liu 
AuthorDate: Tue Jul 11 12:51:16 2023 +0900

[SPARK-44357][PYTHON] Add pyspark_testing module for GHA tests

### What changes were proposed in this pull request?
This PR adds a new module and modifies GHA for `pyspark.testing.utils` 
doctests.

### Why are the changes needed?
This change ensures that the doctests are run by GHA, since `pyspark.testing.utils` now contains user-facing APIs with docstrings.

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
Existing unit tests
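
For context, here is a minimal sketch (not taken from this patch) of the kind of user-facing API in `pyspark.testing.utils` whose doctests the new `pyspark-testing` module now exercises; it assumes an active SparkSession bound to the name `spark`:
```
# Passes silently when the DataFrames match; raises PySparkAssertionError otherwise.
from pyspark.testing.utils import assertDataFrameEqual

df_actual = spark.createDataFrame([("1", 1000.0), ("2", 3000.0)], ["id", "amount"])
df_expected = spark.createDataFrame([("1", 1000.0), ("2", 3000.0)], ["id", "amount"])

assertDataFrameEqual(df_actual, df_expected)
```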

Closes #41896 from asl3/add-pyspark-testing-module.

Authored-by: Amanda Liu 
Signed-off-by: Hyukjin Kwon 
---
 .github/workflows/build_and_test.yml |  2 +-
 dev/sparktestsupport/modules.py  | 11 ++-
 dev/sparktestsupport/utils.py|  8 
 3 files changed, 15 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/build_and_test.yml 
b/.github/workflows/build_and_test.yml
index 23d5a94c320..0b184c6c248 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -345,7 +345,7 @@ jobs:
   - ${{ inputs.java }}
 modules:
   - >-
-pyspark-sql, pyspark-mllib, pyspark-resource
+pyspark-sql, pyspark-mllib, pyspark-resource, pyspark-testing
   - >-
 pyspark-core, pyspark-streaming, pyspark-ml
   - >-
diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py
index 2090546512f..72a5a6f6394 100644
--- a/dev/sparktestsupport/modules.py
+++ b/dev/sparktestsupport/modules.py
@@ -468,7 +468,6 @@ pyspark_sql = Module(
 "pyspark.sql.pandas.typehints",
 "pyspark.sql.pandas.utils",
 "pyspark.sql.observation",
-"pyspark.testing.utils",
 # unittests
 "pyspark.sql.tests.test_arrow",
 "pyspark.sql.tests.test_arrow_python_udf",
@@ -508,6 +507,16 @@ pyspark_sql = Module(
 ],
 )
 
+pyspark_testing = Module(
+name="pyspark-testing",
+dependencies=[pyspark_core, pyspark_sql],
+source_file_regexes=["python/pyspark/testing"],
+python_test_goals=[
+# doctests
+"pyspark.testing.utils",
+],
+)
+
 pyspark_resource = Module(
 name="pyspark-resource",
 dependencies=[pyspark_core],
diff --git a/dev/sparktestsupport/utils.py b/dev/sparktestsupport/utils.py
index d07fc936f8f..339534bec25 100755
--- a/dev/sparktestsupport/utils.py
+++ b/dev/sparktestsupport/utils.py
@@ -114,14 +114,14 @@ def determine_modules_to_test(changed_modules, 
deduplicated=True):
 ['avro', 'connect', 'docker-integration-tests', 'examples', 'hive', 
'hive-thriftserver',
  'mllib', 'protobuf', 'pyspark-connect', 'pyspark-ml', 'pyspark-mllib', 
'pyspark-pandas',
  'pyspark-pandas-connect', 'pyspark-pandas-slow', 
'pyspark-pandas-slow-connect', 'pyspark-sql',
- 'repl', 'sparkr', 'sql', 'sql-kafka-0-10']
+ 'pyspark-testing', 'repl', 'sparkr', 'sql', 'sql-kafka-0-10']
 >>> sorted([x.name for x in determine_modules_to_test(
 ... [modules.sparkr, modules.sql], deduplicated=False)])
 ... # doctest: +NORMALIZE_WHITESPACE
 ['avro', 'connect', 'docker-integration-tests', 'examples', 'hive', 
'hive-thriftserver',
  'mllib', 'protobuf', 'pyspark-connect', 'pyspark-ml', 'pyspark-mllib', 
'pyspark-pandas',
  'pyspark-pandas-connect', 'pyspark-pandas-slow', 
'pyspark-pandas-slow-connect', 'pyspark-sql',
- 'repl', 'sparkr', 'sql', 'sql-kafka-0-10']
+ 'pyspark-testing', 'repl', 'sparkr', 'sql', 'sql-kafka-0-10']
 >>> sorted([x.name for x in determine_modules_to_test(
 ... [modules.sql, modules.core], deduplicated=False)])
 ... # doctest: +NORMALIZE_WHITESPACE
@@ -129,8 +129,8 @@ def determine_modules_to_test(changed_modules, 
deduplicated=True):
  'hive', 'hive-thriftserver', 'mllib', 'mllib-local', 'protobuf', 
'pyspark-connect',
  'pyspark-core', 'pyspark-ml', 'pyspark-mllib', 'pyspark-pandas', 
'pyspark-pandas-connect',
  'pyspark-pandas-slow', 'pyspark-pandas-slow-connect', 'pyspark-resource', 
'pyspark-sql',
- 'pyspark-streaming', 'repl', 'root', 'sparkr', 'sql', 'sql-kafka-0-10', 
'streaming',
- 'streaming-kafka-0-10', 'streaming-kinesis-asl']
+ 'pyspark-streaming', 'pyspark-testing', 'repl', 'root', 'sparkr', 'sql', 
'sql-kafka-0-10',
+ 'streaming', 'streaming-kafka-0-10', 'streaming-kinesis-asl']
 """
 modules_to_test = set()
 for module in changed_modules:



[spark] branch master updated: [SPARK-44363][PYTHON] Display percent of unequal rows in DataFrame comparison

2023-07-10 Thread gurwls223
This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
 new 9accb5c539f [SPARK-44363][PYTHON] Display percent of unequal rows in 
DataFrame comparison
9accb5c539f is described below

commit 9accb5c539f6783c3e9e0147f2199ea370af26c4
Author: Amanda Liu 
AuthorDate: Tue Jul 11 12:49:10 2023 +0900

[SPARK-44363][PYTHON] Display percent of unequal rows in DataFrame 
comparison

### What changes were proposed in this pull request?
This PR fixes the error message display of the percentage of unequal rows for unequal DataFrames in the `assertDataFrameEqual` util function.

### Why are the changes needed?
The correction is needed to provide accurate error message output.

### Does this PR introduce _any_ user-facing change?
Yes, the PR modifies the user-facing error message of the `assertDataFrameEqual` util function.

### How was this patch tested?
Modified existing tests in `runtime/python/pyspark/sql/tests/test_utils.py` 
and `runtime/python/pyspark/sql/tests/connect/test_utils.py`
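
A standalone sketch of the corrected computation (`total_rows` is an illustrative stand-in for `len(zipped)` in the patch): the ratio of differing rows is now scaled to a percentage before being formatted into the error message:
```
diff_rows_cnt = 2
total_rows = 3

# Previously the raw ratio (0.66667) was formatted; it is now scaled to a percentage.
percent_diff = (diff_rows_cnt / total_rows) * 100
error_msg = "Results do not match: " + "( %.5f %% )" % percent_diff
print(error_msg)   # Results do not match: ( 66.66667 % )
```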

Closes #41926 from asl3/fix-percent-diff.

Authored-by: Amanda Liu 
Signed-off-by: Hyukjin Kwon 
---
 python/pyspark/sql/tests/test_utils.py | 8 
 python/pyspark/testing/utils.py| 6 +++---
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/python/pyspark/sql/tests/test_utils.py 
b/python/pyspark/sql/tests/test_utils.py
index fa64858..1757d8dd2e1 100644
--- a/python/pyspark/sql/tests/test_utils.py
+++ b/python/pyspark/sql/tests/test_utils.py
@@ -142,7 +142,7 @@ class UtilsTestsMixin:
 )
 
 expected_error_message = "Results do not match: "
-percent_diff = 1 / 2
+percent_diff = (1 / 2) * 100
 expected_error_message += "( %.5f %% )" % percent_diff
 diff_msg = (
 "[df]"
@@ -457,7 +457,7 @@ class UtilsTestsMixin:
 )
 
 expected_error_message = "Results do not match: "
-percent_diff = 1 / 2
+percent_diff = (1 / 2) * 100
 expected_error_message += "( %.5f %% )" % percent_diff
 diff_msg = (
 "[df]"
@@ -553,7 +553,7 @@ class UtilsTestsMixin:
 )
 
 expected_error_message = "Results do not match: "
-percent_diff = 2 / 2
+percent_diff = (2 / 2) * 100
 expected_error_message += "( %.5f %% )" % percent_diff
 diff_msg = (
 "[df]"
@@ -641,7 +641,7 @@ class UtilsTestsMixin:
 )
 
 expected_error_message = "Results do not match: "
-percent_diff = 2 / 3
+percent_diff = (2 / 3) * 100
 expected_error_message += "( %.5f %% )" % percent_diff
 diff_msg = (
 "[df]"
diff --git a/python/pyspark/testing/utils.py b/python/pyspark/testing/utils.py
index c6ec6adc8af..651d57bb11d 100644
--- a/python/pyspark/testing/utils.py
+++ b/python/pyspark/testing/utils.py
@@ -239,7 +239,7 @@ def assertDataFrameEqual(df: DataFrame, expected: 
DataFrame, check_row_order: bo
 The expected result of the operation, for comparison with the actual 
result.
 
 check_row_order : bool, optional
-A flag indicates whether the order of rows should be considered in the 
comparison.
+A flag indicating whether the order of rows should be considered in 
the comparison.
 If set to `False` (default), the row order is not taken into account.
 If set to `True`, the order of rows is important and will be checked 
during comparison.
 
@@ -258,7 +258,7 @@ def assertDataFrameEqual(df: DataFrame, expected: 
DataFrame, check_row_order: bo
 >>> assertDataFrameEqual(df1, df2) # fail  # doctest: 
+IGNORE_EXCEPTION_DETAIL
 Traceback (most recent call last):
 ...
-PySparkAssertionError: [DIFFERENT_ROWS] Results do not match: ( 0.7 % )
+PySparkAssertionError: [DIFFERENT_ROWS] Results do not match: ( 66.667 % )
 [df]
 Row(id='1', amount=1000.0)
 
@@ -370,7 +370,7 @@ def assertDataFrameEqual(df: DataFrame, expected: 
DataFrame, check_row_order: bo
 diff_msg += "" + "\n\n"
 
 if not rows_equal:
-percent_diff = diff_rows_cnt / len(zipped)
+percent_diff = (diff_rows_cnt / len(zipped)) * 100
 error_msg += "( %.5f %% )" % percent_diff
 error_msg += "\n" + diff_msg
 raise PySparkAssertionError(





[spark] branch branch-3.3 updated: [SPARK-44251][SQL] Set nullable correctly on coalesced join key in full outer USING join

2023-07-10 Thread yumwang
This is an automated email from the ASF dual-hosted git repository.

yumwang pushed a commit to branch branch-3.3
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.3 by this push:
 new 073d0b60d31 [SPARK-44251][SQL] Set nullable correctly on coalesced 
join key in full outer USING join
073d0b60d31 is described below

commit 073d0b60d31bf68ebacdc005f59b928a5902670f
Author: Bruce Robbins 
AuthorDate: Tue Jul 11 11:18:44 2023 +0800

[SPARK-44251][SQL] Set nullable correctly on coalesced join key in full 
outer USING join

### What changes were proposed in this pull request?

For full outer joins employing USING, set the nullability of the coalesced 
join columns to true.

### Why are the changes needed?

The following query produces incorrect results:
```
create or replace temp view v1 as values (1, 2), (null, 7) as (c1, c2);
create or replace temp view v2 as values (2, 3) as (c1, c2);

select explode(array(c1)) as x
from v1
full outer join v2
using (c1);

-1   <== should be null
1
2
```
The following query fails with a `NullPointerException`:
```
create or replace temp view v1 as values ('1', 2), (null, 7) as (c1, c2);
create or replace temp view v2 as values ('2', 3) as (c1, c2);

select explode(array(c1)) as x
from v1
full outer join v2
using (c1);

23/06/25 17:06:39 ERROR Executor: Exception in task 0.0 in stage 14.0 (TID 
11)
java.lang.NullPointerException
at 
org.apache.spark.sql.catalyst.expressions.codegen.UnsafeWriter.write(UnsafeWriter.java:110)
at 
org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage3.generate_doConsume_0$(Unknown
 Source)
at 
org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage3.smj_consumeFullOuterJoinRow_0$(Unknown
 Source)
at 
org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage3.wholestagecodegen_findNextJoinRows_0$(Unknown
 Source)
at 
org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage3.processNext(Unknown
 Source)
at 
org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
at 
org.apache.spark.sql.execution.WholeStageCodegenEvaluatorFactory$WholeStageCodegenPartitionEvaluator$$anon$1.hasNext(WholeStageCodegenEvaluatorFactory.scala:43)
...
```
The above full outer joins implicitly add an aliased coalesce to the parent 
projection of the join: `coalesce(v1.c1, v2.c1) as c1`. In the case where only 
one side's key is nullable, the coalesce's nullability is false. As a result, 
the generator's output has nullable set as false. But this is incorrect: If one 
side has a row with explicit null key values, the other side's row will also 
have null key values (because the other side's row will be "made up"), and both 
the `coalesce` and  [...]

While `UpdateNullability` actually repairs the nullability of the 
`coalesce` before execution, it doesn't recreate the generator output, so the 
nullability remains incorrect in `Generate#output`.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

New unit test.

Closes #41809 from bersprockets/using_oddity2.

Authored-by: Bruce Robbins 
Signed-off-by: Yuming Wang 
(cherry picked from commit 7a27bc68c849041837e521285e33227c3d1f9853)
Signed-off-by: Yuming Wang 
---
 .../apache/spark/sql/catalyst/analysis/Analyzer.scala  | 10 --
 .../test/scala/org/apache/spark/sql/JoinSuite.scala| 18 ++
 2 files changed, 26 insertions(+), 2 deletions(-)

diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
index b5242f631bd..d7bba23cf68 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
@@ -3465,8 +3465,14 @@ class Analyzer(override val catalogManager: 
CatalogManager)
 (rightKeys ++ lUniqueOutput.map(_.withNullability(true)) ++ 
rUniqueOutput,
   leftKeys.map(_.withNullability(true)))
   case FullOuter =>
-// in full outer join, joinCols should be non-null if there is.
-val joinedCols = joinPairs.map { case (l, r) => Alias(Coalesce(Seq(l, 
r)), l.name)() }
+// In full outer join, we should return non-null values for the join 
columns
+// if either side has non-null values for those columns. Therefore, 
for each
+// join column pair, add a coalesce to return the non-null value, if 
it exists.
+val joinedCols = 

[spark] branch branch-3.4 updated: [SPARK-44251][SQL] Set nullable correctly on coalesced join key in full outer USING join

2023-07-10 Thread yumwang
This is an automated email from the ASF dual-hosted git repository.

yumwang pushed a commit to branch branch-3.4
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.4 by this push:
 new 8c690c412dd [SPARK-44251][SQL] Set nullable correctly on coalesced 
join key in full outer USING join
8c690c412dd is described below

commit 8c690c412dd30a03ca932547ded01104f37f9965
Author: Bruce Robbins 
AuthorDate: Tue Jul 11 11:18:44 2023 +0800

[SPARK-44251][SQL] Set nullable correctly on coalesced join key in full 
outer USING join

### What changes were proposed in this pull request?

For full outer joins employing USING, set the nullability of the coalesced 
join columns to true.

### Why are the changes needed?

The following query produces incorrect results:
```
create or replace temp view v1 as values (1, 2), (null, 7) as (c1, c2);
create or replace temp view v2 as values (2, 3) as (c1, c2);

select explode(array(c1)) as x
from v1
full outer join v2
using (c1);

-1   <== should be null
1
2
```
The following query fails with a `NullPointerException`:
```
create or replace temp view v1 as values ('1', 2), (null, 7) as (c1, c2);
create or replace temp view v2 as values ('2', 3) as (c1, c2);

select explode(array(c1)) as x
from v1
full outer join v2
using (c1);

23/06/25 17:06:39 ERROR Executor: Exception in task 0.0 in stage 14.0 (TID 
11)
java.lang.NullPointerException
at 
org.apache.spark.sql.catalyst.expressions.codegen.UnsafeWriter.write(UnsafeWriter.java:110)
at 
org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage3.generate_doConsume_0$(Unknown
 Source)
at 
org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage3.smj_consumeFullOuterJoinRow_0$(Unknown
 Source)
at 
org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage3.wholestagecodegen_findNextJoinRows_0$(Unknown
 Source)
at 
org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage3.processNext(Unknown
 Source)
at 
org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
at 
org.apache.spark.sql.execution.WholeStageCodegenEvaluatorFactory$WholeStageCodegenPartitionEvaluator$$anon$1.hasNext(WholeStageCodegenEvaluatorFactory.scala:43)
...
```
The above full outer joins implicitly add an aliased coalesce to the parent 
projection of the join: `coalesce(v1.c1, v2.c1) as c1`. In the case where only 
one side's key is nullable, the coalesce's nullability is false. As a result, 
the generator's output has nullable set as false. But this is incorrect: If one 
side has a row with explicit null key values, the other side's row will also 
have null key values (because the other side's row will be "made up"), and both 
the `coalesce` and  [...]

While `UpdateNullability` actually repairs the nullability of the 
`coalesce` before execution, it doesn't recreate the generator output, so the 
nullability remains incorrect in `Generate#output`.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

New unit test.

Closes #41809 from bersprockets/using_oddity2.

Authored-by: Bruce Robbins 
Signed-off-by: Yuming Wang 
(cherry picked from commit 7a27bc68c849041837e521285e33227c3d1f9853)
Signed-off-by: Yuming Wang 
---
 .../apache/spark/sql/catalyst/analysis/Analyzer.scala  | 10 --
 .../test/scala/org/apache/spark/sql/JoinSuite.scala| 18 ++
 2 files changed, 26 insertions(+), 2 deletions(-)

diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
index c7455a4a8f2..b7d174089bc 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
@@ -3469,8 +3469,14 @@ class Analyzer(override val catalogManager: 
CatalogManager) extends RuleExecutor
 (rightKeys ++ lUniqueOutput.map(_.withNullability(true)) ++ 
rUniqueOutput,
   leftKeys.map(_.withNullability(true)))
   case FullOuter =>
-// in full outer join, joinCols should be non-null if there is.
-val joinedCols = joinPairs.map { case (l, r) => Alias(Coalesce(Seq(l, 
r)), l.name)() }
+// In full outer join, we should return non-null values for the join 
columns
+// if either side has non-null values for those columns. Therefore, 
for each
+// join column pair, add a coalesce to return the non-null value, if 
it exists.
+val 

[spark] branch master updated: [SPARK-44251][SQL] Set nullable correctly on coalesced join key in full outer USING join

2023-07-10 Thread yumwang
This is an automated email from the ASF dual-hosted git repository.

yumwang pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
 new 7a27bc68c84 [SPARK-44251][SQL] Set nullable correctly on coalesced 
join key in full outer USING join
7a27bc68c84 is described below

commit 7a27bc68c849041837e521285e33227c3d1f9853
Author: Bruce Robbins 
AuthorDate: Tue Jul 11 11:18:44 2023 +0800

[SPARK-44251][SQL] Set nullable correctly on coalesced join key in full 
outer USING join

### What changes were proposed in this pull request?

For full outer joins employing USING, set the nullability of the coalesced 
join columns to true.

### Why are the changes needed?

The following query produces incorrect results:
```
create or replace temp view v1 as values (1, 2), (null, 7) as (c1, c2);
create or replace temp view v2 as values (2, 3) as (c1, c2);

select explode(array(c1)) as x
from v1
full outer join v2
using (c1);

-1   <== should be null
1
2
```
The following query fails with a `NullPointerException`:
```
create or replace temp view v1 as values ('1', 2), (null, 7) as (c1, c2);
create or replace temp view v2 as values ('2', 3) as (c1, c2);

select explode(array(c1)) as x
from v1
full outer join v2
using (c1);

23/06/25 17:06:39 ERROR Executor: Exception in task 0.0 in stage 14.0 (TID 
11)
java.lang.NullPointerException
at 
org.apache.spark.sql.catalyst.expressions.codegen.UnsafeWriter.write(UnsafeWriter.java:110)
at 
org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage3.generate_doConsume_0$(Unknown
 Source)
at 
org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage3.smj_consumeFullOuterJoinRow_0$(Unknown
 Source)
at 
org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage3.wholestagecodegen_findNextJoinRows_0$(Unknown
 Source)
at 
org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage3.processNext(Unknown
 Source)
at 
org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
at 
org.apache.spark.sql.execution.WholeStageCodegenEvaluatorFactory$WholeStageCodegenPartitionEvaluator$$anon$1.hasNext(WholeStageCodegenEvaluatorFactory.scala:43)
...
```
The above full outer joins implicitly add an aliased coalesce to the parent 
projection of the join: `coalesce(v1.c1, v2.c1) as c1`. In the case where only 
one side's key is nullable, the coalesce's nullability is false. As a result, 
the generator's output has nullable set as false. But this is incorrect: If one 
side has a row with explicit null key values, the other side's row will also 
have null key values (because the other side's row will be "made up"), and both 
the `coalesce` and  [...]

While `UpdateNullability` actually repairs the nullability of the 
`coalesce` before execution, it doesn't recreate the generator output, so the 
nullability remains incorrect in `Generate#output`.
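
A hedged PySpark sketch of the SQL reproduction above, useful for checking the coalesced USING column's nullability after the fix; it assumes an active SparkSession named `spark`, and the expected values follow the commit description rather than a verified run:
```
spark.sql("create or replace temp view v1 as values (1, 2), (null, 7) as (c1, c2)")
spark.sql("create or replace temp view v2 as values (2, 3) as (c1, c2)")

joined = spark.sql(
    "select explode(array(c1)) as x from v1 full outer join v2 using (c1)"
)
joined.show()                        # the "made up" row should now yield NULL, not -1
print(joined.schema["x"].nullable)   # expected: True, since the coalesced key is nullable
```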

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

New unit test.

Closes #41809 from bersprockets/using_oddity2.

Authored-by: Bruce Robbins 
Signed-off-by: Yuming Wang 
---
 .../apache/spark/sql/catalyst/analysis/Analyzer.scala  | 10 --
 .../test/scala/org/apache/spark/sql/JoinSuite.scala| 18 ++
 2 files changed, 26 insertions(+), 2 deletions(-)

diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
index bea7fe46c7c..7c91c2ee451 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
@@ -3391,8 +3391,14 @@ class Analyzer(override val catalogManager: 
CatalogManager) extends RuleExecutor
 (rightKeys ++ lUniqueOutput.map(_.withNullability(true)) ++ 
rUniqueOutput,
   leftKeys.map(_.withNullability(true)))
   case FullOuter =>
-// in full outer join, joinCols should be non-null if there is.
-val joinedCols = joinPairs.map { case (l, r) => Alias(Coalesce(Seq(l, 
r)), l.name)() }
+// In full outer join, we should return non-null values for the join 
columns
+// if either side has non-null values for those columns. Therefore, 
for each
+// join column pair, add a coalesce to return the non-null value, if 
it exists.
+val joinedCols = joinPairs.map { case (l, r) =>
+  // Since this is a full outer join, either side could be 

[spark] branch master updated: [SPARK-43974][CONNECT][BUILD] Upgrade buf to v1.23.1

2023-07-10 Thread ruifengz
This is an automated email from the ASF dual-hosted git repository.

ruifengz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
 new b0b12cf3028 [SPARK-43974][CONNECT][BUILD] Upgrade buf to v1.23.1
b0b12cf3028 is described below

commit b0b12cf3028331c12097f48cc857f4aad0e00d35
Author: panbingkun 
AuthorDate: Tue Jul 11 09:46:37 2023 +0800

[SPARK-43974][CONNECT][BUILD] Upgrade buf to v1.23.1

### What changes were proposed in this pull request?
The PR aims to upgrade buf from 1.20.0 to 1.23.1.

### Why are the changes needed?
1. Release Notes:
- https://github.com/bufbuild/buf/releases/tag/v1.23.1
- https://github.com/bufbuild/buf/releases/tag/v1.23.0
- https://github.com/bufbuild/buf/releases/tag/v1.22.0
- https://github.com/bufbuild/buf/releases/tag/v1.21.0

2. The new version brings some bug fixes and improvements, as follows:
- Fix issue where buf beta graph would not print modules within a workspace 
that
had no dependencies or dependents.
- Fix issue where buf beta graph would print warnings for missing 
dependencies
that were actually present.
- Fix issue where locally-produced images did not have module information 
if the corresponding
module was stored in the new cache.
- Remove buf beta registry template.
- Remove buf beta registry plugin 
{create,deprecate,list,undeprecate,version} and replace with
buf beta registry plugin {push,delete}.
- Update buf beta price with the latest pricing information.

3. Manually tested with dev/connect-gen-protos.sh; this upgrade does not change the generated files.

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
- Manually test
- Pass GA

Closes #41469 from panbingkun/SPARK-43974.

Authored-by: panbingkun 
Signed-off-by: Ruifeng Zheng 
---
 .github/workflows/build_and_test.yml| 2 +-
 python/docs/source/development/contributing.rst | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/build_and_test.yml 
b/.github/workflows/build_and_test.yml
index 85e477efd4e..23d5a94c320 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -631,7 +631,7 @@ jobs:
 - name: Install dependencies for Python code generation check
   run: |
 # See more in "Installation" 
https://docs.buf.build/installation#tarball
-curl -LO 
https://github.com/bufbuild/buf/releases/download/v1.20.0/buf-Linux-x86_64.tar.gz
+curl -LO 
https://github.com/bufbuild/buf/releases/download/v1.23.1/buf-Linux-x86_64.tar.gz
 mkdir -p $HOME/buf
 tar -xvzf buf-Linux-x86_64.tar.gz -C $HOME/buf --strip-components 1
 python3.9 -m pip install 'protobuf==3.19.5' 'mypy-protobuf==3.3.0'
diff --git a/python/docs/source/development/contributing.rst 
b/python/docs/source/development/contributing.rst
index 32ae440711b..fa7bf11f6da 100644
--- a/python/docs/source/development/contributing.rst
+++ b/python/docs/source/development/contributing.rst
@@ -120,7 +120,7 @@ Prerequisite
 
 PySpark development requires to build Spark that needs a proper JDK installed, 
etc. See `Building Spark 
`_ for more details.
 
-Note that if you intend to contribute to Spark Connect in Python, ``buf`` 
version ``1.20.0`` is required, see `Buf Installation 
`_ for more details.
+Note that if you intend to contribute to Spark Connect in Python, ``buf`` 
version ``1.23.1`` is required, see `Buf Installation 
`_ for more details.
 
 Conda
 ~





[spark] branch master updated: [SPARK-44332][CORE][WEBUI] Fix the sorting error of Executor ID Column on Executors UI Page

2023-07-10 Thread srowen
This is an automated email from the ASF dual-hosted git repository.

srowen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
 new 9717d74d072 [SPARK-44332][CORE][WEBUI] Fix the sorting error of 
Executor ID Column on Executors UI Page
9717d74d072 is described below

commit 9717d74d0726bd177b8d0f0cc2c9b0404f82dafc
Author: panbingkun 
AuthorDate: Mon Jul 10 19:15:56 2023 -0500

[SPARK-44332][CORE][WEBUI] Fix the sorting error of Executor ID Column on 
Executors UI Page

### What changes were proposed in this pull request?
The PR aims to fix the sorting error of the `Executor ID` column on the `Executors Page`.

### Why are the changes needed?
Fix a UI sorting bug.
PS: Can be reproduced using: sh bin/spark-shell --master "local-cluster[12,1,1024]"

- Before patch
Before - asc:
https://github.com/apache/spark/assets/15246973/83648087-804a-4a62-8f3e-c748f46b95d7

Before - desc:
https://github.com/apache/spark/assets/15246973/b68547f3-af36-4e97-b922-7c3ffa3cbb30

- After patch
After - asc:
https://github.com/apache/spark/assets/15246973/9fd40fc7-9b72-4a08-8e16-a89d9625a1a0

After - desc:
https://github.com/apache/spark/assets/15246973/11921083-30cc-46e9-a9f6-1fe9aecde1a7

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
- Pass GA.
- Manually test.
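
As a reading aid, here is a hedged Python rendering (not part of the patch) of the ordering that the new `executor-id` sorter in the diff below implements for ascending sorts: numeric IDs compare numerically, non-numeric IDs (such as `driver`) compare as strings and sort before the numeric ones:
```
def executor_id_sort_key(executor_id: str):
    try:
        return (1, float(executor_id), "")   # numeric executor IDs sort by value
    except ValueError:
        return (0, 0.0, executor_id)         # non-numeric IDs (e.g. "driver") sort first

ids = ["driver", "10", "2", "1"]
print(sorted(ids, key=executor_id_sort_key))   # ['driver', '1', '2', '10']
```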

Closes #41887 from panbingkun/align_executor_id.

Authored-by: panbingkun 
Signed-off-by: Sean Owen 
---
 .../org/apache/spark/ui/static/executorspage.js| 31 +++---
 1 file changed, 28 insertions(+), 3 deletions(-)

diff --git 
a/core/src/main/resources/org/apache/spark/ui/static/executorspage.js 
b/core/src/main/resources/org/apache/spark/ui/static/executorspage.js
index 520efbd6def..38dc446eaac 100644
--- a/core/src/main/resources/org/apache/spark/ui/static/executorspage.js
+++ b/core/src/main/resources/org/apache/spark/ui/static/executorspage.js
@@ -96,6 +96,32 @@ jQuery.extend(jQuery.fn.dataTableExt.oSort, {
   }
 });
 
+jQuery.extend( jQuery.fn.dataTableExt.oSort, {
+  "executor-id-asc": function ( a, b ) {
+if ($.isNumeric(a) && $.isNumeric(b)) {
+  return parseFloat(a) - parseFloat(b);
+} else if (!$.isNumeric(a) && $.isNumeric(b)) {
+  return -1;
+} else if ($.isNumeric(a) && !$.isNumeric(b)) {
+  return 1;
+} else {
+  return a.localeCompare(b);
+}
+  },
+
+  "executor-id-desc": function ( a, b ) {
+if ($.isNumeric(a) && $.isNumeric(b)) {
+  return parseFloat(b) - parseFloat(a);
+} else if (!$.isNumeric(a) && $.isNumeric(b)) {
+  return 1;
+} else if ($.isNumeric(a) && !$.isNumeric(b)) {
+  return -1;
+} else {
+  return b.localeCompare(a);
+}
+  }
+});
+
 $(document).ajaxStop($.unblockUI);
 $(document).ajaxStart(function () {
   $.blockUI({message: 'Loading Executors Page...'});
@@ -403,9 +429,8 @@ $(document).ready(function () {
   "data": response,
   "columns": [
 {
-  data: function (row, type) {
-return type !== 'display' ? (isNaN(row.id) ? 0 : row.id ) : 
row.id;
-  }
+  data: "id",
+  type: "executor-id"
 },
 {data: 'hostPort'},
 {





[spark] branch master updated: [SPARK-43983][PYTHON][ML][CONNECT] Implement cross validator estimator

2023-07-10 Thread weichenxu123
This is an automated email from the ASF dual-hosted git repository.

weichenxu123 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
 new 01918bb9017 [SPARK-43983][PYTHON][ML][CONNECT] Implement cross 
validator estimator
01918bb9017 is described below

commit 01918bb90170c13abd6c0f0f5c47f5d9bcc02adc
Author: Weichen Xu 
AuthorDate: Tue Jul 11 07:00:19 2023 +0800

[SPARK-43983][PYTHON][ML][CONNECT] Implement cross validator estimator

### What changes were proposed in this pull request?

Implement cross validator estimator for spark connect.

### Why are the changes needed?

This is part of the distributed ML on Spark Connect project.

### Does this PR introduce _any_ user-facing change?

Yes.

New class `pyspark.ml.connect.tuning.CrossValidator` and 
`pyspark.ml.connect.tuning.CrossValidatorModel` are added.

### How was this patch tested?

Unit tests.
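
A hedged usage sketch of the new classes, assuming their constructor mirrors the classic pyspark.ml.tuning API (estimator / estimatorParamMaps / evaluator / numFolds); `my_estimator`, `my_evaluator`, `param_grid`, and `train_df` are placeholders the caller must supply, not names defined by this commit:
```
from pyspark.ml.connect.tuning import CrossValidator, CrossValidatorModel

cv = CrossValidator(
    estimator=my_estimator,          # a pyspark.ml.connect estimator (placeholder)
    estimatorParamMaps=param_grid,   # list of param maps to search over (placeholder)
    evaluator=my_evaluator,          # a pyspark.ml.connect evaluator (placeholder)
    numFolds=3,
)
cv_model: CrossValidatorModel = cv.fit(train_df)   # Spark or pandas DataFrame (placeholder)
```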

Closes #41881 from WeichenXu123/SPARK-43983-cross-val.

Authored-by: Weichen Xu 
Signed-off-by: Weichen Xu 
---
 dev/sparktestsupport/modules.py|   2 +
 python/pyspark/ml/connect/__init__.py  |   2 +
 python/pyspark/ml/connect/base.py  |  18 +-
 python/pyspark/ml/connect/evaluation.py|  83 ++-
 python/pyspark/ml/connect/io_utils.py  |  76 ++-
 python/pyspark/ml/connect/pipeline.py  |  47 +-
 python/pyspark/ml/connect/tuning.py| 566 +
 .../ml/tests/connect/test_connect_tuning.py|  45 ++
 .../tests/connect/test_legacy_mode_evaluation.py   |  31 ++
 .../ml/tests/connect/test_legacy_mode_tuning.py| 267 ++
 10 files changed, 1080 insertions(+), 57 deletions(-)

diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py
index 439ae40a0f8..2090546512f 100644
--- a/dev/sparktestsupport/modules.py
+++ b/dev/sparktestsupport/modules.py
@@ -620,6 +620,7 @@ pyspark_ml = Module(
 "pyspark.ml.tests.connect.test_legacy_mode_feature",
 "pyspark.ml.tests.connect.test_legacy_mode_classification",
 "pyspark.ml.tests.connect.test_legacy_mode_pipeline",
+"pyspark.ml.tests.connect.test_legacy_mode_tuning",
 ],
 excluded_python_implementations=[
 "PyPy"  # Skip these tests under PyPy since they require numpy and it 
isn't available there
@@ -866,6 +867,7 @@ pyspark_connect = Module(
 "pyspark.ml.tests.connect.test_connect_feature",
 "pyspark.ml.tests.connect.test_connect_classification",
 "pyspark.ml.tests.connect.test_connect_pipeline",
+"pyspark.ml.tests.connect.test_connect_tuning",
 ],
 excluded_python_implementations=[
 "PyPy"  # Skip these tests under PyPy since they require numpy, 
pandas, and pyarrow and
diff --git a/python/pyspark/ml/connect/__init__.py 
b/python/pyspark/ml/connect/__init__.py
index 2e048355d74..2ee152f6a38 100644
--- a/python/pyspark/ml/connect/__init__.py
+++ b/python/pyspark/ml/connect/__init__.py
@@ -26,6 +26,7 @@ from pyspark.ml.connect.base import (
 from pyspark.ml.connect import (
 feature,
 evaluation,
+tuning,
 )
 
 from pyspark.ml.connect.pipeline import Pipeline, PipelineModel
@@ -39,4 +40,5 @@ __all__ = [
 "evaluation",
 "Pipeline",
 "PipelineModel",
+"tuning",
 ]
diff --git a/python/pyspark/ml/connect/base.py 
b/python/pyspark/ml/connect/base.py
index f86b1e928c2..f8ce0cb6962 100644
--- a/python/pyspark/ml/connect/base.py
+++ b/python/pyspark/ml/connect/base.py
@@ -146,7 +146,9 @@ class Transformer(Params, metaclass=ABCMeta):
 """
 raise NotImplementedError()
 
-def transform(self, dataset: Union[DataFrame, pd.DataFrame]) -> 
Union[DataFrame, pd.DataFrame]:
+def transform(
+self, dataset: Union[DataFrame, pd.DataFrame], params: 
Optional["ParamMap"] = None
+) -> Union[DataFrame, pd.DataFrame]:
 """
 Transforms the input dataset.
 The dataset can be either pandas dataframe or spark dataframe,
@@ -163,12 +165,24 @@ class Transformer(Params, metaclass=ABCMeta):
 dataset : :py:class:`pyspark.sql.DataFrame` or 
py:class:`pandas.DataFrame`
 input dataset.
 
+params : dict, optional
+an optional param map that overrides embedded params.
+
 Returns
 ---
 :py:class:`pyspark.sql.DataFrame` or py:class:`pandas.DataFrame`
 transformed dataset, the type of output dataframe is consistent 
with
 input dataframe.
 """
+if params is None:
+params = dict()
+if isinstance(params, dict):
+if params:
+return self.copy(params)._transform(dataset)
+else:
+return self._transform(dataset)
+
+def _transform(self, dataset: Union[DataFrame, pd.DataFrame]) 

[spark] branch master updated (f6866d1c1ab -> 453300b418b)

2023-07-10 Thread hvanhovell
This is an automated email from the ASF dual-hosted git repository.

hvanhovell pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


from f6866d1c1ab [SPARK-44343][CONNECT] Prepare ScalaReflection to the move 
to SQL/API
 add 453300b418b [SPARK-44352][CONNECT] Put back sameType and friends in 
DataType

No new revisions were added by this update.

Summary of changes:
 .../spark/sql/catalyst/types/DataTypeUtils.scala   | 40 ++-
 .../org/apache/spark/sql/types/DataType.scala  | 59 --
 .../org/apache/spark/sql/types/StructType.scala|  2 +-
 3 files changed, 60 insertions(+), 41 deletions(-)





[spark] branch master updated: [SPARK-44343][CONNECT] Prepare ScalaReflection to the move to SQL/API

2023-07-10 Thread hvanhovell
This is an automated email from the ASF dual-hosted git repository.

hvanhovell pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
 new f6866d1c1ab [SPARK-44343][CONNECT] Prepare ScalaReflection to the move 
to SQL/API
f6866d1c1ab is described below

commit f6866d1c1ab986545f2b3c1fb254d6ca0d56c056
Author: Herman van Hovell 
AuthorDate: Mon Jul 10 14:45:04 2023 -0400

[SPARK-44343][CONNECT] Prepare ScalaReflection to the move to SQL/API

### What changes were proposed in this pull request?
This PR moves all catalyst-specific internals out of ScalaReflection into other catalyst classes:
- Serializer expression generation is moved to `SerializerBuildHelper`.
- Deserializer expression generation is moved to `DeserializerBuildHelper`.
- Common utils are moved to `EncoderUtils`.

### Why are the changes needed?
We want to use ScalaReflection-based encoder inference for both SQL/Core and Connect.

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
Existing tests.

Closes #41920 from hvanhovell/SPARK-44343.

Authored-by: Herman van Hovell 
Signed-off-by: Herman van Hovell 
---
 .../sql/catalyst/DeserializerBuildHelper.scala | 254 -
 .../spark/sql/catalyst/ScalaReflection.scala   | 573 +
 .../spark/sql/catalyst/SerializerBuildHelper.scala | 199 ++-
 .../spark/sql/catalyst/encoders/EncoderUtils.scala | 139 +
 .../sql/catalyst/encoders/ExpressionEncoder.scala  |   6 +-
 .../catalyst/expressions/V2ExpressionUtils.scala   |   5 +-
 .../sql/catalyst/expressions/objects/objects.scala |  20 +-
 .../spark/sql/catalyst/ScalaReflectionSuite.scala  |   4 +-
 .../expressions/ObjectExpressionsSuite.scala   |   6 +-
 .../datasources/parquet/ParquetIOSuite.scala   |   4 +-
 .../datasources/parquet/ParquetSchemaSuite.scala   |  18 +-
 .../datasources/parquet/ParquetTest.scala  |   3 +
 .../apache/spark/sql/internal/CatalogSuite.scala   |  16 +-
 13 files changed, 630 insertions(+), 617 deletions(-)

diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/DeserializerBuildHelper.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/DeserializerBuildHelper.scala
index 41fd5bb239d..bdf996424ad 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/DeserializerBuildHelper.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/DeserializerBuildHelper.scala
@@ -17,10 +17,14 @@
 
 package org.apache.spark.sql.catalyst
 
-import org.apache.spark.sql.catalyst.analysis.UnresolvedExtractValue
-import org.apache.spark.sql.catalyst.expressions.{Expression, GetStructField, 
UpCast}
-import org.apache.spark.sql.catalyst.expressions.objects.{AssertNotNull, 
Invoke, StaticInvoke}
-import org.apache.spark.sql.catalyst.util.{DateTimeUtils, IntervalUtils}
+import org.apache.spark.sql.catalyst.{expressions => exprs}
+import org.apache.spark.sql.catalyst.analysis.{GetColumnByOrdinal, 
UnresolvedExtractValue}
+import org.apache.spark.sql.catalyst.encoders.{AgnosticEncoder, 
AgnosticEncoders}
+import org.apache.spark.sql.catalyst.encoders.AgnosticEncoders.{ArrayEncoder, 
BoxedLeafEncoder, DateEncoder, DayTimeIntervalEncoder, InstantEncoder, 
IterableEncoder, JavaBeanEncoder, JavaBigIntEncoder, JavaDecimalEncoder, 
JavaEnumEncoder, LocalDateEncoder, LocalDateTimeEncoder, MapEncoder, 
OptionEncoder, PrimitiveBooleanEncoder, PrimitiveByteEncoder, 
PrimitiveDoubleEncoder, PrimitiveFloatEncoder, PrimitiveIntEncoder, 
PrimitiveLongEncoder, PrimitiveShortEncoder, ProductEncoder, ScalaBigIn [...]
+import 
org.apache.spark.sql.catalyst.encoders.EncoderUtils.{externalDataTypeFor, 
isNativeEncoder}
+import org.apache.spark.sql.catalyst.expressions.{Expression, GetStructField, 
IsNull, MapKeys, MapValues, UpCast}
+import org.apache.spark.sql.catalyst.expressions.objects.{AssertNotNull, 
CreateExternalRow, InitializeJavaBean, Invoke, NewInstance, StaticInvoke, 
UnresolvedCatalystToExternalMap, UnresolvedMapObjects, WrapOption}
+import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, DateTimeUtils, 
IntervalUtils}
 import org.apache.spark.sql.types._
 
 object DeserializerBuildHelper {
@@ -193,4 +197,246 @@ object DeserializerBuildHelper {
   UpCast(expr, DecimalType, walkedTypePath.getPaths)
 case _ => UpCast(expr, expected, walkedTypePath.getPaths)
   }
+
+  /**
+   * Returns an expression for deserializing the Spark SQL representation of 
an object into its
+   * external form. The mapping between the internal and external 
representations is
+   * described by encoder `enc`. The Spark SQL representation is located at 
ordinal 0 of
+   * a row, i.e., `GetColumnByOrdinal(0, _)`. Nested classes will have their 
fields accessed using
+   * `UnresolvedExtractValue`.
+   *
+   * The returned 

[spark] branch master updated: [SPARK-44350][BUILD] Upgrade sbt to 1.9.2

2023-07-10 Thread srowen
This is an automated email from the ASF dual-hosted git repository.

srowen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
 new f1ec99b10ca [SPARK-44350][BUILD] Upgrade sbt to 1.9.2
f1ec99b10ca is described below

commit f1ec99b10caf85e95aec2ed4f1e0b55cc0bd6f11
Author: panbingkun 
AuthorDate: Mon Jul 10 13:29:33 2023 -0500

[SPARK-44350][BUILD] Upgrade sbt to 1.9.2

### What changes were proposed in this pull request?
The PR aims to upgrade sbt from 1.9.1 to 1.9.2.

### Why are the changes needed?
1. The new version brings a bug fix:
- Let ++ fall back to a bincompat Scala version by eed3si9n in 
https://github.com/sbt/sbt/pull/7328

2. v1.9.1 vs. v1.9.2:
https://github.com/sbt/sbt/compare/v1.9.1...v1.9.2

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
Pass GA.

Closes #41916 from panbingkun/upgrade_sbt_192.

Authored-by: panbingkun 
Signed-off-by: Sean Owen 
---
 dev/appveyor-install-dependencies.ps1 | 2 +-
 project/build.properties  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/dev/appveyor-install-dependencies.ps1 
b/dev/appveyor-install-dependencies.ps1
index 6848d3af43d..3737382eb86 100644
--- a/dev/appveyor-install-dependencies.ps1
+++ b/dev/appveyor-install-dependencies.ps1
@@ -97,7 +97,7 @@ if (!(Test-Path $tools)) {
 # == SBT
 Push-Location $tools
 
-$sbtVer = "1.9.1"
+$sbtVer = "1.9.2"
 Start-FileDownload "https://github.com/sbt/sbt/releases/download/v$sbtVer/sbt-$sbtVer.zip" "sbt.zip"
 
 # extract
diff --git a/project/build.properties b/project/build.properties
index f27c9c4c8cc..3eb34b94744 100644
--- a/project/build.properties
+++ b/project/build.properties
@@ -15,4 +15,4 @@
 # limitations under the License.
 #
 # Please update the version in appveyor-install-dependencies.ps1 together.
-sbt.version=1.9.1
+sbt.version=1.9.2





[spark] branch master updated: [SPARK-44351][SQL] Make some syntactic simplification

2023-07-10 Thread yangjie01
This is an automated email from the ASF dual-hosted git repository.

yangjie01 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
 new 6835b30eb21 [SPARK-44351][SQL] Make some syntactic simplification
6835b30eb21 is described below

commit 6835b30eb21de74de4aed042f2a32f3c86d6029b
Author: yangjie01 
AuthorDate: Mon Jul 10 23:05:00 2023 +0800

[SPARK-44351][SQL] Make some syntactic simplification

### What changes were proposed in this pull request?
This PR aims to make some syntactic simplifications:

- Use `exists` instead of `find` and `emptiness check`
- Use `orNull` instead of `getOrElse(null)`
- Use `getOrElse(key, value)` instead of `get(key).getOrElse(value)` on map
- Use `find` instead of `filter` + `headOption`

### Why are the changes needed?
Code simplification.

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
- Pass GitHub Actions

Closes #41915 from LuciferYang/syntactic-simplification.

Lead-authored-by: yangjie01 
Co-authored-by: YangJie 
Signed-off-by: yangjie01 
---
 .../scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala | 2 +-
 .../scala/org/apache/spark/sql/catalyst/analysis/ResolveUnion.scala | 2 +-
 .../scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala | 2 +-
 .../org/apache/spark/sql/execution/datasources/FileFormat.scala | 6 +++---
 .../org/apache/spark/sql/execution/datasources/jdbc/JdbcUtils.scala | 3 +--
 .../execution/datasources/jdbc/connection/ConnectionProvider.scala  | 2 +-
 6 files changed, 8 insertions(+), 9 deletions(-)

diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
index 94d341ed1d7..bea7fe46c7c 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
@@ -1366,7 +1366,7 @@ class Analyzer(override val catalogManager: 
CatalogManager) extends RuleExecutor
   // column names. We need to make sure the static partition 
column name doesn't appear
   // there to catch the following ambiguous query:
   // INSERT OVERWRITE t PARTITION (c='1') (c) VALUES ('2')
-  if (query.output.find(col => conf.resolver(col.name, 
staticName)).nonEmpty) {
+  if (query.output.exists(col => conf.resolver(col.name, 
staticName))) {
 throw 
QueryCompilationErrors.staticPartitionInUserSpecifiedColumnsError(staticName)
   }
 }
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveUnion.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveUnion.scala
index bdffce0b9af..4aa5c3ebf5a 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveUnion.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveUnion.scala
@@ -84,7 +84,7 @@ object ResolveUnion extends Rule[LogicalPlan] {
 }
 
 colType.fields
-  .filter(f => targetType.fields.find(tf => resolver(f.name, 
tf.name)).isEmpty)
+  .filter(f => !targetType.fields.exists(tf => resolver(f.name, tf.name)))
   .foreach { f =>
 newStructFields ++= Literal(f.name) :: ExtractValue(col, 
Literal(f.name), resolver) :: Nil
   }
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala
index 99fa0bf9809..0eeef48b071 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala
@@ -3258,7 +3258,7 @@ class AstBuilder extends DataTypeAstBuilder with 
SQLConfHelper with Logging {
   ctx: ExpressionPropertyListContext): OptionList = {
 val options = ctx.expressionProperty.asScala.map { property =>
   val key: String = visitPropertyKey(property.key)
-  val value: Expression = 
Option(property.value).map(expression).getOrElse(null)
+  val value: Expression = Option(property.value).map(expression).orNull
   key -> value
 }.toSeq
 OptionList(options)
diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormat.scala
 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormat.scala
index f76ea30e04c..2e71c829115 100644
--- 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormat.scala
+++ 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormat.scala
@@ -292,9 +292,9 @@ object FileFormat {
   name: String,
  

[spark] branch master updated: [SPARK-44271][SQL] Move default values functions from StructType to ResolveDefaultColumns

2023-07-10 Thread hvanhovell
This is an automated email from the ASF dual-hosted git repository.

hvanhovell pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
 new c37d7dec8aa [SPARK-44271][SQL] Move default values functions from 
StructType to ResolveDefaultColumns
c37d7dec8aa is described below

commit c37d7dec8aa4d703b6dac9b9d60ff25d9d5dc665
Author: Rui Wang 
AuthorDate: Mon Jul 10 06:23:03 2023 -0400

[SPARK-44271][SQL] Move default values functions from StructType to 
ResolveDefaultColumns

### What changes were proposed in this pull request?

Move default values functions from StructType to ResolveDefaultColumns.

### Why are the changes needed?

To simplify the DataType interface.

### Does this PR introduce _any_ user-facing change?

No

### How was this patch tested?

Existing test

Closes #41820 from amaliujia/clean_up_left_errors.

Authored-by: Rui Wang 
Signed-off-by: Herman van Hovell 
---
 .../spark/sql/catalyst/csv/UnivocityParser.scala   |  2 +-
 .../spark/sql/catalyst/json/JacksonParser.scala|  7 +++--
 .../catalyst/util/ResolveDefaultColumnsUtil.scala  | 32 --
 .../org/apache/spark/sql/types/StructType.scala|  8 --
 .../apache/spark/sql/types/StructTypeSuite.scala   | 16 +--
 .../datasources/orc/OrcColumnarBatchReader.java|  3 +-
 .../parquet/VectorizedParquetRecordReader.java |  3 +-
 .../datasources/orc/OrcDeserializer.scala  | 15 ++
 .../datasources/parquet/ParquetRowConverter.scala  | 15 ++
 9 files changed, 58 insertions(+), 43 deletions(-)

diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/UnivocityParser.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/UnivocityParser.scala
index b58649da61c..a02d57c0bc7 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/UnivocityParser.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/UnivocityParser.scala
@@ -353,7 +353,7 @@ class UnivocityParser(
 case NonFatal(e) =>
   badRecordException = badRecordException.orElse(Some(e))
   // Use the corresponding DEFAULT value associated with the column, 
if any.
-  row.update(i, requiredSchema.existenceDefaultValues(i))
+  row.update(i, 
ResolveDefaultColumns.existenceDefaultValues(requiredSchema)(i))
   }
   i += 1
 }
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala
index 5286e16b088..03dce431837 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala
@@ -448,14 +448,15 @@ class JacksonParser(
 var skipRow = false
 
 structFilters.reset()
-resetExistenceDefaultsBitmask(schema)
+lazy val bitmask = ResolveDefaultColumns.existenceDefaultsBitmask(schema)
+resetExistenceDefaultsBitmask(schema, bitmask)
 while (!skipRow && nextUntil(parser, JsonToken.END_OBJECT)) {
   schema.getFieldIndex(parser.getCurrentName) match {
 case Some(index) =>
   try {
 row.update(index, fieldConverters(index).apply(parser))
 skipRow = structFilters.skipRow(row, index)
-schema.existenceDefaultsBitmask(index) = false
+bitmask(index) = false
   } catch {
 case e: SparkUpgradeException => throw e
 case NonFatal(e) if isRoot || enablePartialResults =>
@@ -469,7 +470,7 @@ class JacksonParser(
 if (skipRow) {
   None
 } else if (badRecordException.isEmpty) {
-  applyExistenceDefaultValuesToRow(schema, row)
+  applyExistenceDefaultValuesToRow(schema, row, bitmask)
   Some(row)
 } else {
   throw PartialResultException(row, badRecordException.get)
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/ResolveDefaultColumnsUtil.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/ResolveDefaultColumnsUtil.scala
index 26efa8c8df2..6489fb9aaaf 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/ResolveDefaultColumnsUtil.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/ResolveDefaultColumnsUtil.scala
@@ -396,27 +396,30 @@ object ResolveDefaultColumns extends QueryErrorsBase {
* above, for convenience.
*/
   def getExistenceDefaultsBitmask(schema: StructType): Array[Boolean] = {
-Array.fill[Boolean](schema.existenceDefaultValues.size)(true)
+Array.fill[Boolean](existenceDefaultValues(schema).size)(true)
   }
 
   /**
* Resets the elements of the array initially returned from 
[[getExistenceDefaultsBitmask]] above.
* 

[spark] branch master updated: [SPARK-44131][SQL] Add call_function and deprecate call_udf for Scala API

2023-07-10 Thread ruifengz
This is an automated email from the ASF dual-hosted git repository.

ruifengz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
 new 46dd3aa9425 [SPARK-44131][SQL] Add call_function and deprecate 
call_udf for Scala API
46dd3aa9425 is described below

commit 46dd3aa94250343b38d963d74ae10aba255a6a24
Author: Jiaan Geng 
AuthorDate: Mon Jul 10 18:11:14 2023 +0800

[SPARK-44131][SQL] Add call_function and deprecate call_udf for Scala API

### What changes were proposed in this pull request?
The Scala API has a method `call_udf` used to call user-defined functions.
In fact, `call_udf` can also call builtin functions, which is confusing for users.

This PR adds `call_function` to replace `call_udf` and deprecates `call_udf` in the Scala API.

### Why are the changes needed?
Fix the confusing behavior of `call_udf`.

### Does this PR introduce _any_ user-facing change?
'No'.
New feature.

### How was this patch tested?
Existing test cases.
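
A hedged sketch of the matching Python-side API touched by this patch (python/pyspark/sql/functions.py): `call_function` invokes a builtin or temporary function by name. It assumes an active SparkSession named `spark`:
```
from pyspark.sql import functions as F

df = spark.createDataFrame([("Hello",)], ["g"])
df.select(F.call_function("lower", F.col("g"))).show()
# +--------+
# |lower(g)|
# +--------+
# |   hello|
# +--------+
```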

Closes #41687 from beliefer/SPARK-44131.

Authored-by: Jiaan Geng 
Signed-off-by: Ruifeng Zheng 
---
 .../scala/org/apache/spark/sql/functions.scala |  12 +++
 .../apache/spark/sql/PlanGenerationTestSuite.scala |   4 +
 .../explain-results/function_call_function.explain |   2 +
 .../queries/function_call_function.json|  25 ++
 .../queries/function_call_function.proto.bin   | Bin 0 -> 174 bytes
 .../source/reference/pyspark.sql/functions.rst |   5 +-
 python/pyspark/sql/connect/functions.py|   9 ++-
 python/pyspark/sql/functions.py|  53 
 .../scala/org/apache/spark/sql/functions.scala |  90 +
 .../apache/spark/sql/DataFrameFunctionsSuite.scala |   7 +-
 10 files changed, 150 insertions(+), 57 deletions(-)

diff --git 
a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala
 
b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala
index 5240cdecb01..b0ae4c9752a 100644
--- 
a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala
+++ 
b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala
@@ -7905,4 +7905,16 @@ object functions {
   }
   // scalastyle:off line.size.limit
 
+  /**
+   * Call a builtin or temp function.
+   *
+   * @param funcName
+   *   function name
+   * @param cols
+   *   the expression parameters of function
+   * @since 3.5.0
+   */
+  @scala.annotation.varargs
+  def call_function(funcName: String, cols: Column*): Column = 
Column.fn(funcName, cols: _*)
+
 }
diff --git 
a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala
 
b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala
index 1d679653166..7e4e0f24f4f 100644
--- 
a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala
+++ 
b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala
@@ -2873,6 +2873,10 @@ class PlanGenerationTestSuite
 fn.random(lit(1))
   }
 
+  functionTest("call_function") {
+fn.call_function("lower", fn.col("g"))
+  }
+
   test("hll_sketch_agg with column lgConfigK") {
 binary.select(fn.hll_sketch_agg(fn.col("bytes"), lit(0)))
   }
diff --git 
a/connector/connect/common/src/test/resources/query-tests/explain-results/function_call_function.explain
 
b/connector/connect/common/src/test/resources/query-tests/explain-results/function_call_function.explain
new file mode 100644
index 000..d905689c35d
--- /dev/null
+++ 
b/connector/connect/common/src/test/resources/query-tests/explain-results/function_call_function.explain
@@ -0,0 +1,2 @@
+Project [lower(g#0) AS lower(g)#0]
++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0]
diff --git 
a/connector/connect/common/src/test/resources/query-tests/queries/function_call_function.json
 
b/connector/connect/common/src/test/resources/query-tests/queries/function_call_function.json
new file mode 100644
index 000..f7fe5beba2c
--- /dev/null
+++ 
b/connector/connect/common/src/test/resources/query-tests/queries/function_call_function.json
@@ -0,0 +1,25 @@
+{
+  "common": {
+"planId": "1"
+  },
+  "project": {
+"input": {
+  "common": {
+"planId": "0"
+  },
+  "localRelation": {
+"schema": 
"struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e"
+  }
+},
+"expressions": [{
+  "unresolvedFunction": {
+"functionName": "lower",
+"arguments": [{
+  "unresolvedAttribute": {
+"unparsedIdentifier": "g"
+  }
+}]
+

[spark] branch master updated: [SPARK-44194][DOCS][FOLLOWUP] Add missing `versionadded` annotations for JobTag APIs

2023-07-10 Thread ruifengz
This is an automated email from the ASF dual-hosted git repository.

ruifengz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
 new b1af89de13f [SPARK-44194][DOCS][FOLLOWUP] Add missing `versionadded` 
annotations for JobTag APIs
b1af89de13f is described below

commit b1af89de13f34190686fcb321a0092ab1ceb2908
Author: Ruifeng Zheng 
AuthorDate: Mon Jul 10 17:20:13 2023 +0800

[SPARK-44194][DOCS][FOLLOWUP] Add missing `versionadded` annotations for 
JobTag APIs

### What changes were proposed in this pull request?
Add missing `versionadded` annotations

### Why are the changes needed?
To improve the docs.

### Does this PR introduce _any_ user-facing change?
Yes.

### How was this patch tested?
Existing tests.

Closes #41917 from zhengruifeng/job_tag_since.

Authored-by: Ruifeng Zheng 
Signed-off-by: Ruifeng Zheng 
---
 python/pyspark/context.py | 10 ++
 1 file changed, 10 insertions(+)

diff --git a/python/pyspark/context.py b/python/pyspark/context.py
index 4867ce2ae29..81adb6ced33 100644
--- a/python/pyspark/context.py
+++ b/python/pyspark/context.py
@@ -2193,6 +2193,8 @@ class SparkContext:
 """
 Add a tag to be assigned to all the jobs started by this thread.
 
+.. versionadded:: 3.5.0
+
 Parameters
 --
 tag : str
@@ -2246,6 +2248,8 @@ class SparkContext:
 Remove a tag previously added to be assigned to all the jobs started 
by this thread.
 Noop if such a tag was not added earlier.
 
+.. versionadded:: 3.5.0
+
 Parameters
 --
 tag : str
@@ -2276,6 +2280,8 @@ class SparkContext:
 """
 Get the tags that are currently set to be assigned to all the jobs 
started by this thread.
 
+.. versionadded:: 3.5.0
+
 Returns
 ---
 set of str
@@ -2302,6 +2308,8 @@ class SparkContext:
 """
 Clear the current thread's job tags.
 
+.. versionadded:: 3.5.0
+
 See Also
 
 :meth:`SparkContext.addJobTag`
@@ -2406,6 +2414,8 @@ class SparkContext:
 Cancel active jobs that have the specified tag. See
 :meth:`SparkContext.addJobTag`.
 
+.. versionadded:: 3.5.0
+
 Parameters
 --
 tag : str
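
For context, a minimal sketch of how these job-tag APIs fit together
(PySpark 3.5+; the tag name and the workload are arbitrary):

    from pyspark import SparkContext

    sc = SparkContext.getOrCreate()

    sc.addJobTag("nightly-etl")           # tag every job started by this thread
    try:
        sc.parallelize(range(100)).count()
        print(sc.getJobTags())            # {'nightly-etl'}
    finally:
        sc.removeJobTag("nightly-etl")    # or sc.clearJobTags()

    # From another thread, sc.cancelJobsWithTag("nightly-etl") cancels only
    # the jobs carrying that tag.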





[spark] branch master updated: [SPARK-44267][PS][INFRA] Upgrade `pandas` to 2.0.3

2023-07-10 Thread gurwls223
This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
 new 5dbd6ff6aa7 [SPARK-44267][PS][INFRA] Upgrade `pandas` to 2.0.3
5dbd6ff6aa7 is described below

commit 5dbd6ff6aa714f0e2e065f41dcb68b7f793caa86
Author: panbingkun 
AuthorDate: Mon Jul 10 15:43:26 2023 +0900

[SPARK-44267][PS][INFRA] Upgrade `pandas` to 2.0.3

### What changes were proposed in this pull request?
The PR aims to upgrade `pandas` from 2.0.2 to 2.0.3.

### Why are the changes needed?
1. The new version brings some bug fixes, e.g.:
- Bug in DataFrame.convert_dtype() and Series.convert_dtype() when trying 
to convert 
[ArrowDtype](https://pandas.pydata.org/docs/reference/api/pandas.ArrowDtype.html#pandas.ArrowDtype)
 with dtype_backend="nullable_numpy" 
([GH53648](https://github.com/pandas-dev/pandas/issues/53648))

- Bug in 
[read_csv()](https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html#pandas.read_csv)
 when defining dtype with bool[pyarrow] for the "c" and "python" engines 
([GH53390](https://github.com/pandas-dev/pandas/issues/53390))

2. Release notes:
https://pandas.pydata.org/docs/whatsnew/v2.0.3.html

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
Passed GA.

Closes #41812 from panbingkun/SPARK-44267.

Authored-by: panbingkun 
Signed-off-by: Hyukjin Kwon 
---
 dev/infra/Dockerfile  | 4 ++--
 python/pyspark/pandas/supported_api_gen.py| 2 +-
 python/pyspark/pandas/tests/groupby/test_aggregate.py | 5 +
 3 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/dev/infra/Dockerfile b/dev/infra/Dockerfile
index 3b95467389a..af8e1a980f9 100644
--- a/dev/infra/Dockerfile
+++ b/dev/infra/Dockerfile
@@ -64,8 +64,8 @@ RUN Rscript -e "devtools::install_version('roxygen2', 
version='7.2.0', repos='ht
 # See more in SPARK-39735
 ENV R_LIBS_SITE 
"/usr/local/lib/R/site-library:${R_LIBS_SITE}:/usr/lib/R/library"
 
-RUN pypy3 -m pip install numpy 'pandas<=2.0.2' scipy coverage matplotlib
-RUN python3.9 -m pip install numpy pyarrow 'pandas<=2.0.2' scipy 
unittest-xml-reporting plotly>=4.8 'mlflow>=2.3.1' coverage matplotlib openpyxl 
'memory-profiler==0.60.0' 'scikit-learn==1.1.*'
+RUN pypy3 -m pip install numpy 'pandas<=2.0.3' scipy coverage matplotlib
+RUN python3.9 -m pip install numpy pyarrow 'pandas<=2.0.3' scipy 
unittest-xml-reporting plotly>=4.8 'mlflow>=2.3.1' coverage matplotlib openpyxl 
'memory-profiler==0.60.0' 'scikit-learn==1.1.*'
 
 # Add Python deps for Spark Connect.
 RUN python3.9 -m pip install grpcio protobuf googleapis-common-protos 
grpcio-status
diff --git a/python/pyspark/pandas/supported_api_gen.py 
b/python/pyspark/pandas/supported_api_gen.py
index d259171ecb9..06591c5b26a 100644
--- a/python/pyspark/pandas/supported_api_gen.py
+++ b/python/pyspark/pandas/supported_api_gen.py
@@ -98,7 +98,7 @@ def generate_supported_api(output_rst_file_path: str) -> None:
 
 Write supported APIs documentation.
 """
-pandas_latest_version = "2.0.2"
+pandas_latest_version = "2.0.3"
 if LooseVersion(pd.__version__) != LooseVersion(pandas_latest_version):
 msg = (
 "Warning: Latest version of pandas (%s) is required to generate 
the documentation; "
diff --git a/python/pyspark/pandas/tests/groupby/test_aggregate.py 
b/python/pyspark/pandas/tests/groupby/test_aggregate.py
index bb5b165306d..6ceae82caa8 100644
--- a/python/pyspark/pandas/tests/groupby/test_aggregate.py
+++ b/python/pyspark/pandas/tests/groupby/test_aggregate.py
@@ -15,6 +15,7 @@
 # limitations under the License.
 #
 import unittest
+from distutils.version import LooseVersion
 
 import pandas as pd
 
@@ -39,6 +40,10 @@ class GroupbyAggregateMixin:
 def psdf(self):
 return ps.from_pandas(self.pdf)
 
+@unittest.skipIf(
+LooseVersion(pd.__version__) >= LooseVersion("2.0.0"),
+"TODO(SPARK-44289): Enable GroupbyAggregateTests.test_aggregate for 
pandas 2.0.0.",
+)
 def test_aggregate(self):
 pdf = pd.DataFrame(
 {"A": [1, 1, 2, 2], "B": [1, 2, 3, 4], "C": [0.362, 0.227, 1.267, 
-0.562]}
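
For reference, a self-contained sketch of the version-gating pattern used in
the test change above (the threshold mirrors the diff; the test body is just a
placeholder):

    import unittest
    from distutils.version import LooseVersion

    import pandas as pd

    class VersionGatedTest(unittest.TestCase):
        # Skip on pandas >= 2.0.0 until the tracked behavior change is handled,
        # mirroring the gate added to GroupbyAggregateMixin above.
        @unittest.skipIf(
            LooseVersion(pd.__version__) >= LooseVersion("2.0.0"),
            "placeholder: skipped on pandas 2.x",
        )
        def test_legacy_behavior(self):
            self.assertEqual(pd.DataFrame({"A": [1, 2]}).shape, (2, 1))

    if __name__ == "__main__":
        unittest.main()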





[spark] branch master updated (b7c6c846c08 -> 77a4aa05720)

2023-07-10 Thread gurwls223
This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


from b7c6c846c08 [SPARK-44328][SQL] Assign names to the error class 
_LEGACY_ERROR_TEMP_[2325-2328]
 add 77a4aa05720 [SPARK-44337][PROTOBUF] Any fields set to 
'Any.getDefaultInstance' cause parse errors

No new revisions were added by this update.

Summary of changes:
 .../org/apache/spark/sql/protobuf/utils/ProtobufUtils.scala   | 11 ++-
 .../apache/spark/sql/protobuf/ProtobufFunctionsSuite.scala| 10 --
 2 files changed, 18 insertions(+), 3 deletions(-)





[spark] branch master updated (990affdd503 -> b7c6c846c08)

2023-07-10 Thread maxgekk
This is an automated email from the ASF dual-hosted git repository.

maxgekk pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


from 990affdd503 [SPARK-44290][CONNECT][FOLLOW-UP] Skip flaky tests, and 
fix a typo in session UUID together
 add b7c6c846c08 [SPARK-44328][SQL] Assign names to the error class 
_LEGACY_ERROR_TEMP_[2325-2328]

No new revisions were added by this update.

Summary of changes:
 .../src/main/resources/error/error-classes.json| 57 +
 ...-conditions-cannot-update-field-error-class.md} | 26 
 docs/sql-error-conditions.md   | 14 ++--
 .../sql/catalyst/analysis/CheckAnalysis.scala  | 18 --
 .../spark/sql/connector/AlterTableTests.scala  | 74 --
 5 files changed, 120 insertions(+), 69 deletions(-)
 copy docs/{sql-error-conditions-invalid-limit-like-expression-error-class.md 
=> sql-error-conditions-cannot-update-field-error-class.md} (65%)





[spark] branch master updated: [SPARK-44290][CONNECT][FOLLOW-UP] Skip flaky tests, and fix a typo in session UUID together

2023-07-10 Thread gurwls223
This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
 new 990affdd503 [SPARK-44290][CONNECT][FOLLOW-UP] Skip flaky tests, and 
fix a typo in session UUID together
990affdd503 is described below

commit 990affdd503fa792f6ae839c87cded10d90df54d
Author: Hyukjin Kwon 
AuthorDate: Mon Jul 10 15:01:36 2023 +0900

[SPARK-44290][CONNECT][FOLLOW-UP] Skip flaky tests, and fix a typo in 
session UUID together

### What changes were proposed in this pull request?

This PR is a follow-up of https://github.com/apache/spark/pull/41495 that
skips a couple of flaky tests. In addition, it fixes a typo in the session UUID default value.

### Why are the changes needed?

To keep the tests green. Re-enabling them requires other fixes that may
refactor the whole set of test cases, which will take a while; I will follow
up and fix them in SPARK-44348.

### Does this PR introduce _any_ user-facing change?

No, the feature is not released to end users yet.

### How was this patch tested?

Unit tests are skipped for now.

Closes #41913 from HyukjinKwon/SPARK-44290-followup.

Lead-authored-by: Hyukjin Kwon 
Co-authored-by: Kent Yao 
Signed-off-by: Hyukjin Kwon 
---
 .../main/scala/org/apache/spark/api/python/PythonWorkerFactory.scala  | 4 ++--
 python/pyspark/sql/tests/connect/client/test_artifact.py  | 4 
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git 
a/core/src/main/scala/org/apache/spark/api/python/PythonWorkerFactory.scala 
b/core/src/main/scala/org/apache/spark/api/python/PythonWorkerFactory.scala
index d6dcd906d92..1f5c079f999 100644
--- a/core/src/main/scala/org/apache/spark/api/python/PythonWorkerFactory.scala
+++ b/core/src/main/scala/org/apache/spark/api/python/PythonWorkerFactory.scala
@@ -157,7 +157,7 @@ private[spark] class PythonWorkerFactory(pythonExec: 
String, envVars: Map[String
 
   // Create and start the worker
   val pb = new ProcessBuilder(Arrays.asList(pythonExec, "-m", 
workerModule))
-  val sessionId = envVars.getOrElse("SPARK_CONNECT_SESSION_UUID", 
"deafult")
+  val sessionId = envVars.getOrElse("SPARK_CONNECT_SESSION_UUID", 
"default")
   if (sessionId != "default") {
 pb.directory(new File(SparkFiles.getRootDirectory(), sessionId))
   }
@@ -214,7 +214,7 @@ private[spark] class PythonWorkerFactory(pythonExec: 
String, envVars: Map[String
 // Create and start the daemon
 val command = Arrays.asList(pythonExec, "-m", daemonModule)
 val pb = new ProcessBuilder(command)
-val sessionId = envVars.getOrElse("SPARK_CONNECT_SESSION_UUID", 
"deafult")
+val sessionId = envVars.getOrElse("SPARK_CONNECT_SESSION_UUID", 
"default")
 if (sessionId != "default") {
   pb.directory(new File(SparkFiles.getRootDirectory(), sessionId))
 }
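
As an aside, a small Python re-expression of the logic touched here (a
hypothetical helper; the real code is the Scala above). It illustrates why the
misspelled fallback mattered: "deafult" never equals "default", so the
per-session directory branch was taken even when no session UUID was set:

    import os

    def python_worker_dir(env: dict, root: str) -> str:
        # Fallback is "default"; with the old typo ("deafult") this comparison
        # never matched and plain workers also got a subdirectory.
        session_id = env.get("SPARK_CONNECT_SESSION_UUID", "default")
        if session_id != "default":
            return os.path.join(root, session_id)  # per-session working dir
        return root                                 # shared root directory

    print(python_worker_dir({}, "/tmp/spark-files"))
    # -> /tmp/spark-files
    print(python_worker_dir({"SPARK_CONNECT_SESSION_UUID": "abc-123"}, "/tmp/spark-files"))
    # -> /tmp/spark-files/abc-123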
diff --git a/python/pyspark/sql/tests/connect/client/test_artifact.py 
b/python/pyspark/sql/tests/connect/client/test_artifact.py
index cbd00acf829..c685000b5ea 100644
--- a/python/pyspark/sql/tests/connect/client/test_artifact.py
+++ b/python/pyspark/sql/tests/connect/client/test_artifact.py
@@ -245,6 +245,7 @@ class ArtifactTests(ReusedConnectTestCase):
 spark_session.addArtifacts(pyfile_path, pyfile=True)
 
self.assertEqual(spark_session.range(1).select(func("id")).first()[0], 10)
 
+@unittest.skip("SPARK-44348: Reenable Session-based artifact test cases")
 def test_add_pyfile(self):
 self.check_add_pyfile(self.spark)
 
@@ -272,6 +273,7 @@ class ArtifactTests(ReusedConnectTestCase):
 spark_session.addArtifacts(f"{package_path}.zip", pyfile=True)
 
self.assertEqual(spark_session.range(1).select(func("id")).first()[0], 5)
 
+@unittest.skip("SPARK-44348: Reenable Session-based artifact test cases")
 def test_add_zipped_package(self):
 self.check_add_zipped_package(self.spark)
 
@@ -303,6 +305,7 @@ class ArtifactTests(ReusedConnectTestCase):
 spark_session.addArtifacts(f"{archive_path}.zip#my_files", 
archive=True)
 
self.assertEqual(spark_session.range(1).select(func("id")).first()[0], "hello 
world!")
 
+@unittest.skip("SPARK-44348: Reenable Session-based artifact test cases")
 def test_add_archive(self):
 self.check_add_archive(self.spark)
 
@@ -328,6 +331,7 @@ class ArtifactTests(ReusedConnectTestCase):
 spark_session.addArtifacts(file_path, file=True)
 
self.assertEqual(spark_session.range(1).select(func("id")).first()[0], "Hello 
world!!")
 
+@unittest.skip("SPARK-44348: Reenable Session-based artifact test cases")
 def test_add_file(self):
 self.check_add_file(self.spark)