[spark] branch master updated (e54e3ab8654 -> 9d28909a090)

2022-10-13 Thread maxgekk
This is an automated email from the ASF dual-hosted git repository.

maxgekk pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


from e54e3ab8654 [SPARK-40780][SQL][CONNECT][TESTS][FOLLOW-UP] Update 
Scaladoc at SimpleAnalyzer
 add 9d28909a090 [SPARK-40788][SQL][TESTS] Check error classes in 
CreateNamespaceParserSuite

No new revisions were added by this update.

Summary of changes:
 .../command/CreateNamespaceParserSuite.scala   | 94 --
 1 file changed, 69 insertions(+), 25 deletions(-)


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[spark] branch master updated (39e08b10c24 -> e54e3ab8654)

2022-10-13 Thread gurwls223
This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


from 39e08b10c24 [SPARK-40557][CONNECT][FOLLOW-UP] Sync python generated 
proto files
 add e54e3ab8654 [SPARK-40780][SQL][CONNECT][TESTS][FOLLOW-UP] Update 
Scaladoc at SimpleAnalyzer

No new revisions were added by this update.

Summary of changes:
 .../main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[spark] branch master updated: [SPARK-40557][CONNECT][FOLLOW-UP] Sync python generated proto files

2022-10-13 Thread ruifengz
This is an automated email from the ASF dual-hosted git repository.

ruifengz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
 new 39e08b10c24 [SPARK-40557][CONNECT][FOLLOW-UP] Sync python generated 
proto files
39e08b10c24 is described below

commit 39e08b10c246ca1d47ed6adb8992802bd1113657
Author: Rui Wang 
AuthorDate: Fri Oct 14 12:03:11 2022 +0800

[SPARK-40557][CONNECT][FOLLOW-UP] Sync python generated proto files

### What changes were proposed in this pull request?

This PR syncs the Python generated proto files. The proto changes here were generated by 
https://github.com/apache/spark/blob/master/connector/connect/dev/generate_protos.sh.

### Why are the changes needed?

The Python client-side proto files are out of sync. Other Python-related PRs 
need to re-generate the proto files, which has caused trouble during code review.

We are looking for ways to automatically keep the Python proto files in 
sync. Until that is done, we need to update the proto files manually.

### Does this PR introduce _any_ user-facing change?

No

### How was this patch tested?

UT

Closes #38244 from amaliujia/sync_python_proto.

Authored-by: Rui Wang 
Signed-off-by: Ruifeng Zheng 
---
 python/pyspark/sql/connect/plan.py |   4 +-
 python/pyspark/sql/connect/proto/commands_pb2.py   |   8 +-
 python/pyspark/sql/connect/proto/commands_pb2.pyi  |   8 +-
 .../pyspark/sql/connect/proto/expressions_pb2.py   |  62 +++---
 .../pyspark/sql/connect/proto/expressions_pb2.pyi  |  90 +++-
 python/pyspark/sql/connect/proto/relations_pb2.py  |  82 
 python/pyspark/sql/connect/proto/relations_pb2.pyi | 164 ---
 python/pyspark/sql/connect/proto/types_pb2.py  | 102 -
 python/pyspark/sql/connect/proto/types_pb2.pyi | 232 +++--
 9 files changed, 424 insertions(+), 328 deletions(-)

diff --git a/python/pyspark/sql/connect/plan.py 
b/python/pyspark/sql/connect/plan.py
index 09f6680a416..67ed6b964fa 100644
--- a/python/pyspark/sql/connect/plan.py
+++ b/python/pyspark/sql/connect/plan.py
@@ -319,9 +319,9 @@ class Aggregate(LogicalPlan):
 
 def _convert_measure(
 self, m: MeasureType, session: Optional["RemoteSparkSession"]
-) -> proto.Aggregate.Measure:
+) -> proto.Aggregate.AggregateFunction:
 exp, fun = m
-measure = proto.Aggregate.Measure()
+measure = proto.Aggregate.AggregateFunction()
 measure.function.name = fun
 if type(exp) is str:
 measure.function.arguments.append(self.unresolved_attr(exp))
diff --git a/python/pyspark/sql/connect/proto/commands_pb2.py 
b/python/pyspark/sql/connect/proto/commands_pb2.py
index 46d405dd008..875f5d02db2 100644
--- a/python/pyspark/sql/connect/proto/commands_pb2.py
+++ b/python/pyspark/sql/connect/proto/commands_pb2.py
@@ -32,7 +32,7 @@ from pyspark.sql.connect.proto import types_pb2 as 
spark_dot_connect_dot_types__
 
 
 DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(
-
b'\n\x1cspark/connect/commands.proto\x12\rspark.connect\x1a\x19spark/connect/types.proto"i\n\x07\x43ommand\x12N\n\x0f\x63reate_function\x18\x01
 
\x01(\x0b\x32#.spark.connect.CreateScalarFunctionH\x00R\x0e\x63reateFunctionB\x0e\n\x0c\x63ommand_type"\x8f\x04\n\x14\x43reateScalarFunction\x12\x14\n\x05parts\x18\x01
 \x03(\tR\x05parts\x12P\n\x08language\x18\x02 
\x01(\x0e\x32\x34.spark.connect.CreateScalarFunction.FunctionLanguageR\x08language\x12\x1c\n\ttemporary\x18\x03
 \x01(\x08R\ttempora [...]
+
b'\n\x1cspark/connect/commands.proto\x12\rspark.connect\x1a\x19spark/connect/types.proto"i\n\x07\x43ommand\x12N\n\x0f\x63reate_function\x18\x01
 
\x01(\x0b\x32#.spark.connect.CreateScalarFunctionH\x00R\x0e\x63reateFunctionB\x0e\n\x0c\x63ommand_type"\x97\x04\n\x14\x43reateScalarFunction\x12\x14\n\x05parts\x18\x01
 \x03(\tR\x05parts\x12P\n\x08language\x18\x02 
\x01(\x0e\x32\x34.spark.connect.CreateScalarFunction.FunctionLanguageR\x08language\x12\x1c\n\ttemporary\x18\x03
 \x01(\x08R\ttempora [...]
 )
 
 _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, globals())
@@ -44,7 +44,7 @@ if _descriptor._USE_C_DESCRIPTORS == False:
 _COMMAND._serialized_start = 74
 _COMMAND._serialized_end = 179
 _CREATESCALARFUNCTION._serialized_start = 182
-_CREATESCALARFUNCTION._serialized_end = 709
-_CREATESCALARFUNCTION_FUNCTIONLANGUAGE._serialized_start = 547
-_CREATESCALARFUNCTION_FUNCTIONLANGUAGE._serialized_end = 686
+_CREATESCALARFUNCTION._serialized_end = 717
+_CREATESCALARFUNCTION_FUNCTIONLANGUAGE._serialized_start = 555
+_CREATESCALARFUNCTION_FUNCTIONLANGUAGE._serialized_end = 694
 # @@protoc_insertion_point(module_scope)
diff --git a/python/pyspark/sql/connect/proto/commands_pb2.pyi 
b/python/pyspark/sql/connect/proto/commands_pb2.pyi
index 

[spark-docker] branch master updated: [SPARK-40783][INFRA] Enable Spark on K8s integration test

2022-10-13 Thread yikun
This is an automated email from the ASF dual-hosted git repository.

yikun pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark-docker.git


The following commit(s) were added to refs/heads/master by this push:
 new 3037f75  [SPARK-40783][INFRA] Enable Spark on K8s integration test
3037f75 is described below

commit 3037f75a88ca7ea57746c7d1bf49c125a828f56e
Author: Yikun Jiang 
AuthorDate: Fri Oct 14 11:57:01 2022 +0800

[SPARK-40783][INFRA] Enable Spark on K8s integration test

### What changes were proposed in this pull request?
This patch enables the Spark on K8s integration tests:

- **scala2.12-java11-python3-ubuntu**: Run the Scala / PySpark basic tests
- **scala2.12-java11-ubuntu**: Run the Scala basic test
- **scala2.12-java11-r-ubuntu**: Run the Scala / SparkR basic tests
- **scala2.12-java11-python3-r-ubuntu**: Run all K8s integration tests

Currently, we use the local registry as a bridge between build and test:
https://user-images.githubusercontent.com/1736354/195758243-abfbea7f-05e9-4678-a3a5-cfd38cc1b8f5.png

- Build: generate the image and push to local registry
- Test: load to minikube docker, run K8s test using specific image

Because multi-platform images cannot be exported with the `docker` 
export type, the local registry (push) is used here rather than a local build 
(load). Compared to `ghcr`, it reduces the network transmission and the 
permissions required.

Also:
- Upgrade `setup-qemu-action` to v2
- Upgrade `setup-buildx-action` to v2
- Remove unused `Image digest` step

### Why are the changes needed?
To ensure the quality of the official Dockerfiles.

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
CI passed

Closes #9 from Yikun/enable-k8s-it.

Authored-by: Yikun Jiang 
Signed-off-by: Yikun Jiang 
---
 .github/workflows/main.yml | 142 -
 1 file changed, 129 insertions(+), 13 deletions(-)

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 7972703..b47245b 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -41,6 +41,15 @@ on:
 jobs:
   main:
 runs-on: ubuntu-latest
+# Due to the multi-platform images cannot be exported with the `docker` 
export type,
+# https://github.com/docker/buildx/issues/59
+# So, the local registry (push) is used here rather than local build 
(load):
+# 
https://github.com/docker/build-push-action/blob/master/docs/advanced/local-registry.md
+services:
+  registry:
+image: registry:2
+ports:
+  - 5000:5000
 strategy:
   matrix:
 spark_version:
@@ -55,29 +64,26 @@ jobs:
 uses: actions/checkout@v2
 
   - name: Set up QEMU
-uses: docker/setup-qemu-action@v1
+uses: docker/setup-qemu-action@v2
 
   - name: Set up Docker Buildx
-uses: docker/setup-buildx-action@v1
-
-  - name: Login to GHCR
-uses: docker/login-action@v2
+uses: docker/setup-buildx-action@v2
 with:
-  registry: ghcr.io
-  username: ${{ github.actor }}
-  password: ${{ secrets.GITHUB_TOKEN }}
+  # This required by local registry
+  driver-opts: network=host
 
   - name: Generate tags
 run: |
   TAG=scala${{ matrix.scala_version }}-java${{ matrix.java_version 
}}-${{ matrix.image_suffix }}
 
   REPO_OWNER=$(echo "${{ github.repository_owner }}" | tr '[:upper:]' 
'[:lower:]')
-  TEST_REPO=ghcr.io/$REPO_OWNER/spark-docker
+  TEST_REPO=localhost:5000/$REPO_OWNER/spark-docker
   IMAGE_NAME=spark
   IMAGE_PATH=${{ matrix.spark_version }}/$TAG
   UNIQUE_IMAGE_TAG=${{ matrix.spark_version }}-$TAG
+  IMAGE_URL=$TEST_REPO/$IMAGE_NAME:$UNIQUE_IMAGE_TAG
 
-  # Unique image tag in each version: scala2.12-java11-python3-ubuntu
+  # Unique image tag in each version: 
3.3.0-scala2.12-java11-python3-ubuntu
   echo "UNIQUE_IMAGE_TAG=${UNIQUE_IMAGE_TAG}" >> $GITHUB_ENV
   # Test repo: ghcr.io/apache/spark-docker
   echo "TEST_REPO=${TEST_REPO}" >> $GITHUB_ENV
@@ -85,6 +91,8 @@ jobs:
   echo "IMAGE_NAME=${IMAGE_NAME}" >> $GITHUB_ENV
   # Image dockerfile path: 3.3.0/scala2.12-java11-python3-ubuntu
   echo "IMAGE_PATH=${IMAGE_PATH}" >> $GITHUB_ENV
+  # Image URL: 
ghcr.io/apache/spark-docker/spark:3.3.0-scala2.12-java11-python3-ubuntu
+  echo "IMAGE_URL=${IMAGE_URL}" >> $GITHUB_ENV
 
   - name: Print Image tags
 run: |
@@ -92,13 +100,121 @@ jobs:
   echo "TEST_REPO: "${TEST_REPO}
   echo "IMAGE_NAME: "${IMAGE_NAME}
   echo "IMAGE_PATH: "${IMAGE_PATH}
+  echo "IMAGE_URL: "${IMAGE_URL}
 
   - name: Build and push test image
 uses: 

[spark] branch master updated: [SPARK-40789][PYTHON][TESTS] Separate tests under pyspark.sql.tests

2022-10-13 Thread gurwls223
This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
 new 4f2630e308c [SPARK-40789][PYTHON][TESTS] Separate tests under 
pyspark.sql.tests
4f2630e308c is described below

commit 4f2630e308caa107d1addedbf94a919a18ab3ed4
Author: Hyukjin Kwon 
AuthorDate: Fri Oct 14 11:13:06 2022 +0900

[SPARK-40789][PYTHON][TESTS] Separate tests under pyspark.sql.tests

### What changes were proposed in this pull request?

This PR proposes to split the tests into the sub-packages:

**Before**

```
tests
├── __init__.py
├── test_arrow.py
├── test_arrow_map.py
├── test_catalog.py
├── test_column.py
├── test_conf.py
├── test_connect_basic.py
├── test_connect_column_expressions.py
├── test_connect_plan_only.py
├── test_connect_select_ops.py
├── test_context.py
├── test_dataframe.py
├── test_datasources.py
├── test_functions.py
├── test_group.py
├── test_pandas_cogrouped_map.py
├── test_pandas_grouped_map.py
├── test_pandas_grouped_map_with_state.py
├── test_pandas_map.py
├── test_pandas_udf.py
├── test_pandas_udf_grouped_agg.py
├── test_pandas_udf_scalar.py
├── test_pandas_udf_typehints.py
├── test_pandas_udf_typehints_with_future_annotations.py
├── test_pandas_udf_window.py
├── test_readwriter.py
├── test_serde.py
├── test_session.py
├── test_streaming.py
├── test_streaming_listener.py
├── test_types.py
├── test_udf.py
├── test_udf_profiler.py
├── test_utils.py
└── typing
├── ...
```

**After**

```
tests
├── __init__.py
├── connect
│   ├── __init__.py
│   ├── test_connect_basic.py
│   ├── test_connect_column_expressions.py
│   ├── test_connect_plan_only.py
│   └── test_connect_select_ops.py
├── pandas
│   ├── __init__.py
│   ├── test_pandas_cogrouped_map.py
│   ├── test_pandas_grouped_map.py
│   ├── test_pandas_grouped_map_with_state.py
│   ├── test_pandas_map.py
│   ├── test_pandas_udf.py
│   ├── test_pandas_udf_grouped_agg.py
│   ├── test_pandas_udf_scalar.py
│   ├── test_pandas_udf_typehints.py
│   ├── test_pandas_udf_typehints_with_future_annotations.py
│   └── test_pandas_udf_window.py
├── streaming
│   ├── __init__.py
│   ├── test_streaming.py
│   └── test_streaming_listener.py
├── test_arrow.py
├── test_arrow_map.py
├── test_catalog.py
├── test_column.py
├── test_conf.py
├── test_context.py
├── test_dataframe.py
├── test_datasources.py
├── test_functions.py
├── test_group.py
├── test_readwriter.py
├── test_serde.py
├── test_session.py
├── test_types.py
├── test_udf.py
├── test_udf_profiler.py
├── test_utils.py
└── typing
├── ...
```

This way is consistent with `pyspark.pandas.tests`.

### Why are the changes needed?

To make it easier to maintain, track, and add tests.

### Does this PR introduce _any_ user-facing change?

No, dev-only.

### How was this patch tested?

CI in this PR should test it out.

Closes #38239 from HyukjinKwon/SPARK-40789.

Lead-authored-by: Hyukjin Kwon 
Co-authored-by: Hyukjin Kwon 
Signed-off-by: Hyukjin Kwon 
---
 .github/labeler.yml| 10 +++
 dev/sparktestsupport/modules.py| 32 +++---
 python/pyspark/sql/connect/README.md   |  2 +-
 python/pyspark/sql/tests/connect/__init__.py   | 16 +++
 .../sql/tests/{ => connect}/test_connect_basic.py  |  2 +-
 .../test_connect_column_expressions.py |  2 +-
 .../tests/{ => connect}/test_connect_plan_only.py  |  2 +-
 .../tests/{ => connect}/test_connect_select_ops.py |  2 +-
 python/pyspark/sql/tests/pandas/__init__.py| 16 +++
 .../{ => pandas}/test_pandas_cogrouped_map.py  |  2 +-
 .../tests/{ => pandas}/test_pandas_grouped_map.py  |  2 +-
 .../test_pandas_grouped_map_with_state.py  |  2 +-
 .../sql/tests/{ => pandas}/test_pandas_map.py  |  2 +-
 .../sql/tests/{ => pandas}/test_pandas_udf.py  |  2 +-
 .../{ => pandas}/test_pandas_udf_grouped_agg.py|  2 +-
 .../tests/{ => pandas}/test_pandas_udf_scalar.py   |  2 +-
 .../{ => pandas}/test_pandas_udf_typehints.py  |  2 +-
 ...pandas_udf_typehints_with_future_annotations.py |  2 +-
 .../tests/{ => pandas}/test_pandas_udf_window.py   |  2 +-
 python/pyspark/sql/tests/streaming/__init__.py | 16 +++
 .../sql/tests/{ => streaming}/test_streaming.py|  2 +-
 .../{ => streaming}/test_streaming_listener.py |  2 +-
 22 files changed, 86 insertions(+), 38 deletions(-)

diff --git 

[spark] branch master updated: [SPARK-40654][PROTOBUF][FOLLOW-UP] Clean up SBT build in Proto component

2022-10-13 Thread gurwls223
This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
 new c4ef9aec652 [SPARK-40654][PROTOBUF][FOLLOW-UP] Clean up SBT build in 
Proto component
c4ef9aec652 is described below

commit c4ef9aec6525fc54ea5e77eb45e6fe502daf90dd
Author: Hyukjin Kwon 
AuthorDate: Fri Oct 14 11:09:39 2022 +0900

[SPARK-40654][PROTOBUF][FOLLOW-UP] Clean up SBT build in Proto component

### What changes were proposed in this pull request?

This PR cleans up the syntax and properly copies the protobuf assembly jar 
(currently it copies the connect assembly jar by mistake).

### Why are the changes needed?

For consistent code style and a correct SBT build.

### Does this PR introduce _any_ user-facing change?
No, this isn't released yet.

### How was this patch tested?

CI in this PR should test it out.

Closes #38240 from HyukjinKwon/SPARK-40654.

Authored-by: Hyukjin Kwon 
Signed-off-by: Hyukjin Kwon 
---
 project/SparkBuild.scala | 25 +
 1 file changed, 9 insertions(+), 16 deletions(-)

diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala
index 03970bb862c..0ae35f25e91 100644
--- a/project/SparkBuild.scala
+++ b/project/SparkBuild.scala
@@ -59,7 +59,7 @@ object BuildCommons {
   ) = Seq(
 "core", "graphx", "mllib", "mllib-local", "repl", "network-common", 
"network-shuffle", "launcher", "unsafe",
 "tags", "sketch", "kvstore"
-  ).map(ProjectRef(buildLocation, _)) ++ sqlProjects ++ streamingProjects ++ 
Seq(connect) ++ Seq(protobuf)
+  ).map(ProjectRef(buildLocation, _)) ++ sqlProjects ++ streamingProjects ++ 
Seq(connect)
 
   val optionallyEnabledProjects@Seq(kubernetes, mesos, yarn,
 sparkGangliaLgpl, streamingKinesisAsl,
@@ -433,7 +433,7 @@ object SparkBuild extends PomBuild {
 
   enable(SparkConnect.settings)(connect)
 
-  /* Connector/proto settings */
+  /* Protobuf settings */
   enable(SparkProtobuf.settings)(protobuf)
 
   // SPARK-14738 - Remove docker tests from main Spark build
@@ -703,12 +703,8 @@ object SparkConnect {
 }
 
 object SparkProtobuf {
-
   import BuildCommons.protoVersion
 
-  private val shadePrefix = "org.sparkproject.spark-protobuf"
-  val shadeJar = taskKey[Unit]("Shade the Jars")
-
   lazy val settings = Seq(
 // Setting version for the protobuf compiler. This has to be propagated to 
every sub-project
 // even if the project is not using it.
@@ -716,19 +712,15 @@ object SparkProtobuf {
 
 // For some reason the resolution from the imported Maven build does not 
work for some
 // of these dependendencies that we need to shade later on.
-libraryDependencies ++= Seq(
-  "com.google.protobuf" % "protobuf-java"% protoVersion % 
"protobuf"
-),
+libraryDependencies += "com.google.protobuf" % "protobuf-java" % 
protoVersion % "protobuf",
 
-dependencyOverrides ++= Seq(
-  "com.google.protobuf" % "protobuf-java"% protoVersion
-),
+dependencyOverrides += "com.google.protobuf" % "protobuf-java" % 
protoVersion,
 
 (Compile / PB.targets) := Seq(
-  PB.gens.java-> (Compile / sourceManaged).value,
+  PB.gens.java -> (Compile / sourceManaged).value,
 ),
 
-(assembly / test) := false,
+(assembly / test) := { },
 
 (assembly / logLevel) := Level.Info,
 
@@ -744,6 +736,7 @@ object SparkProtobuf {
 },
   )
 }
+
 object Unsafe {
   lazy val settings = Seq(
 // This option is needed to suppress warnings from sun.misc.Unsafe usage
@@ -1277,7 +1270,7 @@ object CopyDependencies {
   // produce the shaded Jar which happens automatically in the case of 
Maven.
   // Later, when the dependencies are copied, we manually copy the shaded 
Jar only.
   val fid = (LocalProject("connect") / assembly).value
-  val fidProtobuf = (LocalProject("protobuf")/assembly).value
+  val fidProtobuf = (LocalProject("protobuf") / assembly).value
 
   (Compile / dependencyClasspath).value.map(_.data)
 .filter { jar => jar.isFile() }
@@ -1292,7 +1285,7 @@ object CopyDependencies {
 Files.copy(fid.toPath, destJar.toPath)
   } else if (jar.getName.contains("spark-protobuf") &&
 !SbtPomKeys.profiles.value.contains("noshade-protobuf")) {
-Files.copy(fid.toPath, destJar.toPath)
+Files.copy(fidProtobuf.toPath, destJar.toPath)
   } else {
 Files.copy(jar.toPath(), destJar.toPath())
   }


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[GitHub] [spark-website] dongjoon-hyun commented on pull request #420: Add GraalSystems in Powered By solutions and companies

2022-10-13 Thread GitBox


dongjoon-hyun commented on PR #420:
URL: https://github.com/apache/spark-website/pull/420#issuecomment-1278318499

   Thanks!


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[spark-website] branch asf-site updated: Add GraalSystems in Powered By solutions and companies

2022-10-13 Thread srowen
This is an automated email from the ASF dual-hosted git repository.

srowen pushed a commit to branch asf-site
in repository https://gitbox.apache.org/repos/asf/spark-website.git


The following commit(s) were added to refs/heads/asf-site by this push:
 new b70a0c524  Add GraalSystems in Powered By solutions and companies
b70a0c524 is described below

commit b70a0c524cfab55f0307b3d9cfd652b18756d81a
Author: Vincent Devillers 
AuthorDate: Thu Oct 13 17:27:45 2022 -0500

 Add GraalSystems in Powered By solutions and companies

Author: Vincent Devillers 
Author: Vincent Devillers <>

Closes #420 from Treydone/patch-1.
---
 powered-by.md| 2 +-
 site/powered-by.html | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/powered-by.md b/powered-by.md
index 9f14fda66..048108882 100644
--- a/powered-by.md
+++ b/powered-by.md
@@ -118,7 +118,7 @@ and external data sources, driving holistic and actionable 
insights.
   activity in real time
 - http://www.fundacionctic.org;>Fundacion CTIC
 - https://graal.systems;>GraalSystems
-  - GraalSystems is a cloud-native data platform that can be used erverywhere, 
on cloud 
+  - GraalSystems is a cloud-native data platform that can be used everywhere, 
on cloud 
   environments or on bare-metal infrastructures.
 - https://www.groupon.com;>Groupon
 - https://www.godatadriven.com;>GoDataDriven
diff --git a/site/powered-by.html b/site/powered-by.html
index 5da8af5b3..42892e605 100644
--- a/site/powered-by.html
+++ b/site/powered-by.html
@@ -295,7 +295,7 @@ activity in real time
   http://www.fundacionctic.org;>Fundacion CTIC
   https://graal.systems;>GraalSystems
 
-  GraalSystems is a cloud-native data platform that can be used 
erverywhere, on cloud 
+  GraalSystems is a cloud-native data platform that can be used 
everywhere, on cloud 
 environments or on bare-metal infrastructures.
 
   


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[GitHub] [spark-website] srowen closed pull request #420: Add GraalSystems in Powered By solutions and companies

2022-10-13 Thread GitBox


srowen closed pull request #420:  Add GraalSystems in Powered By solutions and 
companies
URL: https://github.com/apache/spark-website/pull/420


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[GitHub] [spark-website] Treydone commented on a diff in pull request #419: Add GraalSystems in Powered By solutions and companies

2022-10-13 Thread GitBox


Treydone commented on code in PR #419:
URL: https://github.com/apache/spark-website/pull/419#discussion_r995168914


##
powered-by.md:
##
@@ -117,6 +117,9 @@ and external data sources, driving holistic and actionable 
insights.
   - We are using Spark for analyzing and visualizing patterns in large-scale 
recordings of brain 
   activity in real time
 - http://www.fundacionctic.org;>Fundacion CTIC
+- https://graal.systems;>GraalSystems
+  - GraalSystems is a cloud-native data platform that can be used erverywhere, 
on cloud 

Review Comment:
   Oops, sorry for the typo...



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[spark] branch master updated: [SPARK-40640][CORE] SparkHadoopUtil to set origin of hadoop/hive config options

2022-10-13 Thread dongjoon
This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
 new 4a9350739fa [SPARK-40640][CORE] SparkHadoopUtil to set origin of 
hadoop/hive config options
4a9350739fa is described below

commit 4a9350739fa3aee75932b3b0cf2a8d867db801a4
Author: Steve Loughran 
AuthorDate: Thu Oct 13 12:28:35 2022 -0700

[SPARK-40640][CORE] SparkHadoopUtil to set origin of hadoop/hive config 
options

### What changes were proposed in this pull request?

Options passed in from the Spark conf, hive-site.xml, and the AWS environment 
variables now record their origin in the source attribute of the Hadoop 
Configuration entries.

The Configuration Writable methods do not propagate this, so it is not as 
useful cluster-wide as it could be. It does help with some of the basic 
troubleshooting.
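
Below is a minimal sketch (not part of this patch) of the underlying Hadoop 
`Configuration` mechanism: `set` accepts an optional source string, and 
`getPropertySources` returns it later for diagnostics. The key and value used 
here are placeholders.

```
import org.apache.hadoop.conf.Configuration

val hadoopConf = new Configuration(false)
// Record where the option came from, e.g. one of the SOURCE_* constants added below.
hadoopConf.set("fs.s3a.connection.maximum", "200",
  "Set by Spark from keys starting with 'spark.hadoop'")

// Read the origin back while troubleshooting; returns null for unknown keys.
Option(hadoopConf.getPropertySources("fs.s3a.connection.maximum"))
  .foreach(sources => println(sources.mkString(", ")))
```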

### Why are the changes needed?

Helps when troubleshooting where options make their way down. These can be 
examined
and logged later.

For example, my cloudstore diagnostics JAR can do this in its storediag 
command and in an s3a AWS credential provider. I may add some of that logging 
at debug level to the ASF Hadoop implementations.

https://github.com/steveloughran/cloudstore

### Does this PR introduce _any_ user-facing change?

Not *really*. It's a very low level diagnostics feature in the Hadoop 
configuration classes.

### How was this patch tested?

New tests added; existing tests enhanced.

Closes #38084 from steveloughran/SPARK-40640-spark-conf-propagation.

Lead-authored-by: Steve Loughran 
Co-authored-by: Steve Loughran 
Co-authored-by: Dongjoon Hyun 
Signed-off-by: Dongjoon Hyun 
---
 .../org/apache/spark/deploy/SparkHadoopUtil.scala  | 151 +
 .../apache/spark/deploy/SparkHadoopUtilSuite.scala |  95 +++--
 .../spark/sql/hive/client/HiveClientImpl.scala |   5 +-
 3 files changed, 215 insertions(+), 36 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala 
b/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala
index ad456fb0ee9..8532246dc9e 100644
--- a/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala
@@ -18,6 +18,7 @@
 package org.apache.spark.deploy
 
 import java.io.{ByteArrayInputStream, ByteArrayOutputStream, DataInputStream, 
DataOutputStream, File, IOException}
+import java.net.InetAddress
 import java.security.PrivilegedExceptionAction
 import java.text.DateFormat
 import java.util.{Arrays, Date, Locale}
@@ -415,6 +416,58 @@ private[spark] object SparkHadoopUtil extends Logging {
*/
   private[spark] val SPARK_HADOOP_CONF_FILE = "__spark_hadoop_conf__.xml"
 
+  /**
+   * Source for hive-site.xml configuration options.
+   */
+  private[deploy] val SOURCE_HIVE_SITE = "Set by Spark from hive-site.xml"
+
+  /**
+   * Source for configuration options set by spark when another source is
+   * not explicitly declared.
+   */
+  private[spark] val SOURCE_SPARK = "Set by Spark"
+
+  /**
+   * Source for configuration options with `spark.hadoop.` prefix copied
+   * from spark-defaults.
+   */
+  private[deploy] val SOURCE_SPARK_HADOOP =
+"Set by Spark from keys starting with 'spark.hadoop'"
+
+  /*
+   * The AWS Authentication environment variables documented in
+   * 
https://docs.aws.amazon.com/sdkref/latest/guide/environment-variables.html.
+   * There are alternative names defined in 
`com.amazonaws.SDKGlobalConfiguration`
+   * and which are picked up by the authentication provider
+   * `EnvironmentVariableCredentialsProvider`; those are not propagated.
+   */
+
+  /**
+   * AWS Access key.
+   */
+  private[deploy] val ENV_VAR_AWS_ACCESS_KEY = "AWS_ACCESS_KEY_ID"
+
+  /**
+   * AWS Secret Key.
+   */
+  private[deploy] val ENV_VAR_AWS_SECRET_KEY = "AWS_SECRET_ACCESS_KEY"
+
+  /**
+   * AWS Session token.
+   */
+  private[deploy] val ENV_VAR_AWS_SESSION_TOKEN = "AWS_SESSION_TOKEN"
+
+  /**
+   * Source for configuration options with `spark.hive.` prefix copied
+   * from spark-defaults.
+   */
+  private[deploy] val SOURCE_SPARK_HIVE = "Set by Spark from keys starting 
with 'spark.hive'"
+
+  /**
+   * Hadoop configuration options set to their default values.
+   */
+  private[deploy] val SET_TO_DEFAULT_VALUES = "Set by Spark to default values"
+
   def get: SparkHadoopUtil = instance
 
   /**
@@ -437,27 +490,52 @@ private[spark] object SparkHadoopUtil extends Logging {
 // Note: this null check is around more than just access to the "conf" 
object to maintain
 // the behavior of the old implementation of this code, for backwards 
compatibility.
 if (conf != null) {
-  // Explicitly check for S3 environment variables
-  val keyId = 

[GitHub] [spark-website] srowen commented on a diff in pull request #419: Add GraalSystems in Powered By solutions and companies

2022-10-13 Thread GitBox


srowen commented on code in PR #419:
URL: https://github.com/apache/spark-website/pull/419#discussion_r995038594


##
powered-by.md:
##
@@ -117,6 +117,9 @@ and external data sources, driving holistic and actionable 
insights.
   - We are using Spark for analyzing and visualizing patterns in large-scale 
recordings of brain 
   activity in real time
 - http://www.fundacionctic.org;>Fundacion CTIC
+- https://graal.systems;>GraalSystems
+  - GraalSystems is a cloud-native data platform that can be used erverywhere, 
on cloud 

Review Comment:
   Oops, missed that - if the OP wants to make another PR I'll merge



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[GitHub] [spark-website] dongjoon-hyun commented on a diff in pull request #419: Add GraalSystems in Powered By solutions and companies

2022-10-13 Thread GitBox


dongjoon-hyun commented on code in PR #419:
URL: https://github.com/apache/spark-website/pull/419#discussion_r995036542


##
powered-by.md:
##
@@ -117,6 +117,9 @@ and external data sources, driving holistic and actionable 
insights.
   - We are using Spark for analyzing and visualizing patterns in large-scale 
recordings of brain 
   activity in real time
 - http://www.fundacionctic.org;>Fundacion CTIC
+- https://graal.systems;>GraalSystems
+  - GraalSystems is a cloud-native data platform that can be used erverywhere, 
on cloud 

Review Comment:
   `erverywhere` -> `everywhere`? Could you fix the typo, @Treydone ?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[GitHub] [spark-website] dongjoon-hyun commented on a diff in pull request #419: Add GraalSystems in Powered By solutions and companies

2022-10-13 Thread GitBox


dongjoon-hyun commented on code in PR #419:
URL: https://github.com/apache/spark-website/pull/419#discussion_r995036542


##
powered-by.md:
##
@@ -117,6 +117,9 @@ and external data sources, driving holistic and actionable 
insights.
   - We are using Spark for analyzing and visualizing patterns in large-scale 
recordings of brain 
   activity in real time
 - http://www.fundacionctic.org;>Fundacion CTIC
+- https://graal.systems;>GraalSystems
+  - GraalSystems is a cloud-native data platform that can be used erverywhere, 
on cloud 

Review Comment:
   `erverywhere` -> `everywhere`?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[spark] branch master updated: [SPARK-40551][SQL] DataSource V2: Add APIs for delta-based row-level operations

2022-10-13 Thread dongjoon
This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
 new b87d5f7750a [SPARK-40551][SQL] DataSource V2: Add APIs for delta-based 
row-level operations
b87d5f7750a is described below

commit b87d5f7750a533acb45b2b75474cdde5dc7d92a0
Author: Anton Okolnychyi 
AuthorDate: Thu Oct 13 11:50:20 2022 -0700

[SPARK-40551][SQL] DataSource V2: Add APIs for delta-based row-level 
operations

### What changes were proposed in this pull request?

This PR adds DS v2 APIs for handling row-level operations for data sources 
that support deltas of rows.
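
As an illustrative sketch only (not code from this PR), a sink-side writer for 
delta-based writes might look roughly like the following. It assumes 
`DeltaWriter` exposes `insert`/`update`/`delete` callbacks as outlined in the 
SPIP; the exact method signatures should be taken from the merged Java 
interfaces listed below.

```
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.connector.write.{DeltaWriter, WriterCommitMessage}

// Hypothetical writer that only logs the delta of rows it receives.
class LoggingDeltaWriter extends DeltaWriter[InternalRow] {
  override def insert(row: InternalRow): Unit = println(s"insert $row")
  override def update(meta: InternalRow, id: InternalRow, row: InternalRow): Unit =
    println(s"update $id")
  override def delete(meta: InternalRow, id: InternalRow): Unit = println(s"delete $id")
  override def write(row: InternalRow): Unit = insert(row) // plain writes become inserts
  override def commit(): WriterCommitMessage = new WriterCommitMessage {}
  override def abort(): Unit = ()
  override def close(): Unit = ()
}
```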

### Why are the changes needed?

These changes are part of the approved SPIP in SPARK-35801.

### Does this PR introduce _any_ user-facing change?

Yes, this PR adds new DS v2 APIs per [design 
doc](https://docs.google.com/document/d/12Ywmc47j3l2WF4anG5vL4qlrhT2OKigb7_EbIKhxg60).

### How was this patch tested?

Tests will be part of the implementation PR.

Closes #38004 from aokolnychyi/spark-40551.

Lead-authored-by: Anton Okolnychyi 
Co-authored-by: aokolnychyi 
Signed-off-by: Dongjoon Hyun 
---
 .../sql/connector/write/DeltaBatchWrite.java}  | 19 ---
 .../spark/sql/connector/write/DeltaWrite.java} | 21 +---
 .../sql/connector/write/DeltaWriteBuilder.java}| 21 +---
 .../spark/sql/connector/write/DeltaWriter.java | 63 ++
 .../sql/connector/write/DeltaWriterFactory.java}   | 22 +---
 .../sql/connector/write/LogicalWriteInfo.java  | 18 +++
 .../spark/sql/connector/write/SupportsDelta.java}  | 26 ++---
 .../sql/connector/write/LogicalWriteInfoImpl.scala |  7 ++-
 8 files changed, 161 insertions(+), 36 deletions(-)

diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/connector/write/LogicalWriteInfoImpl.scala
 
b/sql/catalyst/src/main/java/org/apache/spark/sql/connector/write/DeltaBatchWrite.java
similarity index 69%
copy from 
sql/catalyst/src/main/scala/org/apache/spark/sql/connector/write/LogicalWriteInfoImpl.scala
copy to 
sql/catalyst/src/main/java/org/apache/spark/sql/connector/write/DeltaBatchWrite.java
index b1492e42981..86c48b85dcd 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/connector/write/LogicalWriteInfoImpl.scala
+++ 
b/sql/catalyst/src/main/java/org/apache/spark/sql/connector/write/DeltaBatchWrite.java
@@ -15,12 +15,17 @@
  * limitations under the License.
  */
 
-package org.apache.spark.sql.connector.write
+package org.apache.spark.sql.connector.write;
 
-import org.apache.spark.sql.types.StructType
-import org.apache.spark.sql.util.CaseInsensitiveStringMap
+import org.apache.spark.annotation.Experimental;
 
-private[sql] case class LogicalWriteInfoImpl(
-queryId: String,
-schema: StructType,
-options: CaseInsensitiveStringMap) extends LogicalWriteInfo
+/**
+ * An interface that defines how to write a delta of rows during batch 
processing.
+ *
+ * @since 3.4.0
+ */
+@Experimental
+public interface DeltaBatchWrite extends BatchWrite {
+  @Override
+  DeltaWriterFactory createBatchWriterFactory(PhysicalWriteInfo info);
+}
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/connector/write/LogicalWriteInfoImpl.scala
 
b/sql/catalyst/src/main/java/org/apache/spark/sql/connector/write/DeltaWrite.java
similarity index 65%
copy from 
sql/catalyst/src/main/scala/org/apache/spark/sql/connector/write/LogicalWriteInfoImpl.scala
copy to 
sql/catalyst/src/main/java/org/apache/spark/sql/connector/write/DeltaWrite.java
index b1492e42981..eb230598ef4 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/connector/write/LogicalWriteInfoImpl.scala
+++ 
b/sql/catalyst/src/main/java/org/apache/spark/sql/connector/write/DeltaWrite.java
@@ -15,12 +15,19 @@
  * limitations under the License.
  */
 
-package org.apache.spark.sql.connector.write
+package org.apache.spark.sql.connector.write;
 
-import org.apache.spark.sql.types.StructType
-import org.apache.spark.sql.util.CaseInsensitiveStringMap
+import org.apache.spark.annotation.Experimental;
 
-private[sql] case class LogicalWriteInfoImpl(
-queryId: String,
-schema: StructType,
-options: CaseInsensitiveStringMap) extends LogicalWriteInfo
+/**
+ * A logical representation of a data source write that handles a delta of 
rows.
+ *
+ * @since 3.4.0
+ */
+@Experimental
+public interface DeltaWrite extends Write {
+  @Override
+  default DeltaBatchWrite toBatch() {
+throw new UnsupportedOperationException(description() + ": Delta batch 
write is not supported");
+  }
+}
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/connector/write/LogicalWriteInfoImpl.scala
 
b/sql/catalyst/src/main/java/org/apache/spark/sql/connector/write/DeltaWriteBuilder.java
similarity index 67%
copy from 

[spark-website] branch asf-site updated: Add GraalSystems in Powered By solutions and companies

2022-10-13 Thread srowen
This is an automated email from the ASF dual-hosted git repository.

srowen pushed a commit to branch asf-site
in repository https://gitbox.apache.org/repos/asf/spark-website.git


The following commit(s) were added to refs/heads/asf-site by this push:
 new 0ead62b00 Add GraalSystems in Powered By solutions and companies
0ead62b00 is described below

commit 0ead62b00504d43666dd5105a517093969684a33
Author: Vincent Devillers 
AuthorDate: Thu Oct 13 13:42:05 2022 -0500

Add GraalSystems in Powered By solutions and companies

Author: Vincent Devillers 
Author: Vincent Devillers <>

Closes #419 from Treydone/patch-1.
---
 powered-by.md| 3 +++
 site/powered-by.html | 6 ++
 2 files changed, 9 insertions(+)

diff --git a/powered-by.md b/powered-by.md
index 07d109d7e..9f14fda66 100644
--- a/powered-by.md
+++ b/powered-by.md
@@ -117,6 +117,9 @@ and external data sources, driving holistic and actionable 
insights.
   - We are using Spark for analyzing and visualizing patterns in large-scale 
recordings of brain 
   activity in real time
 - http://www.fundacionctic.org;>Fundacion CTIC
+- https://graal.systems;>GraalSystems
+  - GraalSystems is a cloud-native data platform that can be used erverywhere, 
on cloud 
+  environments or on bare-metal infrastructures.
 - https://www.groupon.com;>Groupon
 - https://www.godatadriven.com;>GoDataDriven
   - Amsterdam based consultancy company helping companies to be successful 
with Spark
diff --git a/site/powered-by.html b/site/powered-by.html
index 6b0466b3a..5da8af5b3 100644
--- a/site/powered-by.html
+++ b/site/powered-by.html
@@ -293,6 +293,12 @@ activity in real time
 
   
   http://www.fundacionctic.org;>Fundacion CTIC
+  https://graal.systems;>GraalSystems
+
+  GraalSystems is a cloud-native data platform that can be used 
erverywhere, on cloud 
+environments or on bare-metal infrastructures.
+
+  
   https://www.groupon.com;>Groupon
   https://www.godatadriven.com;>GoDataDriven
 


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[GitHub] [spark-website] srowen closed pull request #419: Add GraalSystems in Powered By solutions and companies

2022-10-13 Thread GitBox


srowen closed pull request #419: Add GraalSystems in Powered By solutions and 
companies
URL: https://github.com/apache/spark-website/pull/419


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[spark] branch master updated (8d6683c8fb0 -> e0c2d6410ff)

2022-10-13 Thread dongjoon
This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


from 8d6683c8fb0 [SPARK-40663][SQL] Migrate execution errors onto error 
classes: _LEGACY_ERROR_TEMP_2176-2220
 add e0c2d6410ff [SPARK-40733][SQL] Make the contents of `SERDEPROPERTIES` 
in the result of `ShowCreateTableAsSerdeCommand` have a fixed order

No new revisions were added by this update.

Summary of changes:
 .../scala/org/apache/spark/sql/execution/command/tables.scala | 2 +-
 .../spark/sql/hive/execution/command/ShowCreateTableSuite.scala   | 8 
 2 files changed, 5 insertions(+), 5 deletions(-)


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[spark] branch master updated: [SPARK-40663][SQL] Migrate execution errors onto error classes: _LEGACY_ERROR_TEMP_2176-2220

2022-10-13 Thread maxgekk
This is an automated email from the ASF dual-hosted git repository.

maxgekk pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
 new 8d6683c8fb0 [SPARK-40663][SQL] Migrate execution errors onto error 
classes: _LEGACY_ERROR_TEMP_2176-2220
8d6683c8fb0 is described below

commit 8d6683c8fb0844b349ea59035d8bebd33f56b77c
Author: itholic 
AuthorDate: Thu Oct 13 18:49:38 2022 +0300

[SPARK-40663][SQL] Migrate execution errors onto error classes: 
_LEGACY_ERROR_TEMP_2176-2220

### What changes were proposed in this pull request?

This PR proposes to migrate 25 execution errors onto temporary error 
classes with the prefix `_LEGACY_ERROR_TEMP_2176` to `_LEGACY_ERROR_TEMP_2200`.

The `_LEGACY_ERROR_TEMP_` prefix on these error classes indicates dev-facing 
error messages that won't be exposed to end users.

### Why are the changes needed?

To speed up the error class migration.

Migrating onto temporary error classes allows us to analyze the errors, so 
we can detect the most popular error classes.
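
A hedged sketch (not from this PR) of how callers can inspect these classes, 
assuming the migrated exceptions mix in `org.apache.spark.SparkThrowable` like 
the rest of the error-class work:

```
import org.apache.spark.SparkThrowable

// Returns the error class carried by a Spark error, if any.
def errorClassOf(t: Throwable): Option[String] = t match {
  case st: SparkThrowable => Option(st.getErrorClass)
  case _ => None
}
// e.g. errorClassOf(e) == Some("_LEGACY_ERROR_TEMP_2181") for the hive inferSchema error.
```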

### Does this PR introduce _any_ user-facing change?

No

### How was this patch tested?

```
$ build/sbt "sql/testOnly org.apache.spark.sql.SQLQueryTestSuite"
$ build/sbt "test:testOnly *SQLQuerySuite"
$ build/sbt -Phive-thriftserver "hive-thriftserver/testOnly 
org.apache.spark.sql.hive.thriftserver.ThriftServerQueryTestSuite"
```

Closes #38169 from itholic/SPARK-40540-2176-2200.

Authored-by: itholic 
Signed-off-by: Max Gekk 
---
 core/src/main/resources/error/error-classes.json   | 140 ++
 .../spark/sql/errors/QueryExecutionErrors.scala| 215 +
 2 files changed, 271 insertions(+), 84 deletions(-)

diff --git a/core/src/main/resources/error/error-classes.json 
b/core/src/main/resources/error/error-classes.json
index 2834dee231a..e535901706e 100644
--- a/core/src/main/resources/error/error-classes.json
+++ b/core/src/main/resources/error/error-classes.json
@@ -3936,5 +3936,145 @@
 "message" : [
   "Rule id not found for "
 ]
+  },
+  "_LEGACY_ERROR_TEMP_2176" : {
+"message" : [
+  "Cannot create array with  elements of data due to 
exceeding the limit  elements for ArrayData. 
"
+]
+  },
+  "_LEGACY_ERROR_TEMP_2177" : {
+"message" : [
+  "Malformed records are detected in record parsing. Parse Mode: 
. To process malformed records as null result, try setting the 
option 'mode' as 'PERMISSIVE'."
+]
+  },
+  "_LEGACY_ERROR_TEMP_2178" : {
+"message" : [
+  "Remote operations not supported"
+]
+  },
+  "_LEGACY_ERROR_TEMP_2179" : {
+"message" : [
+  "HiveServer2 Kerberos principal or keytab is not correctly configured"
+]
+  },
+  "_LEGACY_ERROR_TEMP_2180" : {
+"message" : [
+  "Parent SparkUI to attach this tab to not found!"
+]
+  },
+  "_LEGACY_ERROR_TEMP_2181" : {
+"message" : [
+  "inferSchema is not supported for hive data source."
+]
+  },
+  "_LEGACY_ERROR_TEMP_2182" : {
+"message" : [
+  "Requested partitioning does not match the  table:",
+  "Requested partitions: ",
+  "Table partitions: "
+]
+  },
+  "_LEGACY_ERROR_TEMP_2183" : {
+"message" : [
+  "Dynamic partition key  is not among written partition paths."
+]
+  },
+  "_LEGACY_ERROR_TEMP_2184" : {
+"message" : [
+  "Cannot remove partition directory ''"
+]
+  },
+  "_LEGACY_ERROR_TEMP_2185" : {
+"message" : [
+  "Cannot create staging directory: "
+]
+  },
+  "_LEGACY_ERROR_TEMP_2186" : {
+"message" : [
+  "The SerDe interface removed since Hive 2.3(HIVE-15167). Please migrate 
your custom SerDes to Hive 2.3. See HIVE-15167 for more details."
+]
+  },
+  "_LEGACY_ERROR_TEMP_2187" : {
+"message" : [
+  ", db: , table: "
+]
+  },
+  "_LEGACY_ERROR_TEMP_2188" : {
+"message" : [
+  "Cannot recognize hive type string: , column: "
+]
+  },
+  "_LEGACY_ERROR_TEMP_2189" : {
+"message" : [
+  "Hive 2.2 and lower versions don't support getTablesByType. Please use 
Hive 2.3 or higher version."
+]
+  },
+  "_LEGACY_ERROR_TEMP_2190" : {
+"message" : [
+  "DROP TABLE ... PURGE"
+]
+  },
+  "_LEGACY_ERROR_TEMP_2191" : {
+"message" : [
+  "ALTER TABLE ... DROP PARTITION ... PURGE"
+]
+  },
+  "_LEGACY_ERROR_TEMP_2192" : {
+"message" : [
+  "Partition filter cannot have both `\"` and `'` characters"
+]
+  },
+  "_LEGACY_ERROR_TEMP_2193" : {
+"message" : [
+  "Caught Hive MetaException attempting to get partition metadata by 
filter from Hive. You can set the Spark configuration setting 
 to true to work around this 
problem, however this will result in degraded performance. Please report a bug: 
https://issues.apache.org/jira/browse/SPARK;
+]
+  },
+  

[spark] branch branch-3.3 updated: [SPARK-40782][BUILD] Upgrade `jackson-databind` to 2.13.4.1

2022-10-13 Thread srowen
This is an automated email from the ASF dual-hosted git repository.

srowen pushed a commit to branch branch-3.3
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.3 by this push:
 new 27ca30aaad4 [SPARK-40782][BUILD] Upgrade `jackson-databind` to 2.13.4.1
27ca30aaad4 is described below

commit 27ca30aaad41e4dd50834d255720fb46a36d9e6d
Author: yangjie01 
AuthorDate: Thu Oct 13 10:29:59 2022 -0500

[SPARK-40782][BUILD] Upgrade `jackson-databind` to 2.13.4.1

### What changes were proposed in this pull request?
This PR aims to upgrade `jackson-databind` to 2.13.4.1.

### Why are the changes needed?
This is a bug-fix version that addresses [CVE-2022-42003]:

- https://github.com/FasterXML/jackson-databind/pull/3621

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
Pass GitHub Actions

Closes #38235 from LuciferYang/SPARK-40782.

Authored-by: yangjie01 
Signed-off-by: Sean Owen 
(cherry picked from commit 2a8b2a136d5a705526bb76697596f5ad01ce391d)
Signed-off-by: Sean Owen 
---
 dev/deps/spark-deps-hadoop-2-hive-2.3 | 2 +-
 dev/deps/spark-deps-hadoop-3-hive-2.3 | 2 +-
 pom.xml   | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/dev/deps/spark-deps-hadoop-2-hive-2.3 
b/dev/deps/spark-deps-hadoop-2-hive-2.3
index fb9c36a26a1..55515614ab8 100644
--- a/dev/deps/spark-deps-hadoop-2-hive-2.3
+++ b/dev/deps/spark-deps-hadoop-2-hive-2.3
@@ -115,7 +115,7 @@ ivy/2.5.0//ivy-2.5.0.jar
 jackson-annotations/2.13.4//jackson-annotations-2.13.4.jar
 jackson-core-asl/1.9.13//jackson-core-asl-1.9.13.jar
 jackson-core/2.13.4//jackson-core-2.13.4.jar
-jackson-databind/2.13.4//jackson-databind-2.13.4.jar
+jackson-databind/2.13.4.1//jackson-databind-2.13.4.1.jar
 jackson-dataformat-cbor/2.13.4//jackson-dataformat-cbor-2.13.4.jar
 jackson-dataformat-yaml/2.13.4//jackson-dataformat-yaml-2.13.4.jar
 jackson-datatype-jsr310/2.13.4//jackson-datatype-jsr310-2.13.4.jar
diff --git a/dev/deps/spark-deps-hadoop-3-hive-2.3 
b/dev/deps/spark-deps-hadoop-3-hive-2.3
index f6e09eff50a..9fc9dca09b0 100644
--- a/dev/deps/spark-deps-hadoop-3-hive-2.3
+++ b/dev/deps/spark-deps-hadoop-3-hive-2.3
@@ -105,7 +105,7 @@ ivy/2.5.0//ivy-2.5.0.jar
 jackson-annotations/2.13.4//jackson-annotations-2.13.4.jar
 jackson-core-asl/1.9.13//jackson-core-asl-1.9.13.jar
 jackson-core/2.13.4//jackson-core-2.13.4.jar
-jackson-databind/2.13.4//jackson-databind-2.13.4.jar
+jackson-databind/2.13.4.1//jackson-databind-2.13.4.1.jar
 jackson-dataformat-cbor/2.13.4//jackson-dataformat-cbor-2.13.4.jar
 jackson-dataformat-yaml/2.13.4//jackson-dataformat-yaml-2.13.4.jar
 jackson-datatype-jsr310/2.13.4//jackson-datatype-jsr310-2.13.4.jar
diff --git a/pom.xml b/pom.xml
index d7ed56329fd..43f9c30422f 100644
--- a/pom.xml
+++ b/pom.xml
@@ -172,7 +172,7 @@
 true
 1.9.13
 2.13.4
-
2.13.4
+
2.13.4.1
 1.1.8.4
 1.1.2
 2.2.1


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[spark] branch master updated (ac07cea234f -> 2a8b2a136d5)

2022-10-13 Thread srowen
This is an automated email from the ASF dual-hosted git repository.

srowen pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


from ac07cea234f [SPARK-40611][SQL] Improve the performance of 
`setInterval` & `getInterval` for `UnsafeRow`
 add 2a8b2a136d5 [SPARK-40782][BUILD] Upgrade `jackson-databind` to 2.13.4.1

No new revisions were added by this update.

Summary of changes:
 dev/deps/spark-deps-hadoop-2-hive-2.3 | 2 +-
 dev/deps/spark-deps-hadoop-3-hive-2.3 | 2 +-
 pom.xml   | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[spark] branch master updated (6e0ef86821b -> ac07cea234f)

2022-10-13 Thread wenchen
This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


from 6e0ef86821b [SPARK-40382][SQL] Group distinct aggregate expressions by 
semantically equivalent children in `RewriteDistinctAggregates`
 add ac07cea234f [SPARK-40611][SQL] Improve the performance of 
`setInterval` & `getInterval` for `UnsafeRow`

No new revisions were added by this update.

Summary of changes:
 .../CalendarIntervalBenchmark-jdk11-results.txt}   | 10 +--
 .../CalendarIntervalBenchmark-jdk17-results.txt}   | 10 +--
 .../CalendarIntervalBenchmark-results.txt} | 10 +--
 .../spark/sql/catalyst/expressions/UnsafeRow.java  | 10 +--
 .../spark/sql/CalendarIntervalBenchmark.scala  | 75 ++
 5 files changed, 96 insertions(+), 19 deletions(-)
 copy sql/{core/benchmarks/HashedRelationMetricsBenchmark-jdk17-results.txt => 
catalyst/benchmarks/CalendarIntervalBenchmark-jdk11-results.txt} (53%)
 copy sql/{core/benchmarks/HashedRelationMetricsBenchmark-jdk17-results.txt => 
catalyst/benchmarks/CalendarIntervalBenchmark-jdk17-results.txt} (53%)
 copy sql/{core/benchmarks/HashedRelationMetricsBenchmark-jdk17-results.txt => 
catalyst/benchmarks/CalendarIntervalBenchmark-results.txt} (53%)
 create mode 100644 
sql/catalyst/src/test/scala/org/apache/spark/sql/CalendarIntervalBenchmark.scala


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[spark] branch master updated: [SPARK-40382][SQL] Group distinct aggregate expressions by semantically equivalent children in `RewriteDistinctAggregates`

2022-10-13 Thread wenchen
This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
 new 6e0ef86821b [SPARK-40382][SQL] Group distinct aggregate expressions by 
semantically equivalent children in `RewriteDistinctAggregates`
6e0ef86821b is described below

commit 6e0ef86821bc25395474fc8f7f41e0ed7bab3f14
Author: Bruce Robbins 
AuthorDate: Thu Oct 13 22:09:22 2022 +0800

[SPARK-40382][SQL] Group distinct aggregate expressions by semantically 
equivalent children in `RewriteDistinctAggregates`

### What changes were proposed in this pull request?

In `RewriteDistinctAggregates`, when grouping aggregate expressions by 
function children, treat children that are semantically equivalent as the same.

### Why are the changes needed?

This PR will reduce the number of projections in the Expand operator when 
there are multiple distinct aggregations with superficially different children. 
In some cases, it will eliminate the need for an Expand operator.

Example: In the following query, the Expand operator creates 3\*n rows 
(where n is the number of incoming rows) because it has a projection for each 
of the function children `b + 1`, `1 + b`, and `c`.

```
create or replace temp view v1 as
select * from values
(1, 2, 3.0),
(1, 3, 4.0),
(2, 4, 2.5),
(2, 3, 1.0)
v1(a, b, c);

select
  a,
  count(distinct b + 1),
  avg(distinct 1 + b) filter (where c > 0),
  sum(c)
from
  v1
group by a;
```
The Expand operator has three projections (each producing a row for each 
incoming row):
```
[a#87, null, null, 0, null, UnscaledValue(c#89)], <== projection #1 (for 
regular aggregation)
[a#87, (b#88 + 1), null, 1, null, null],  <== projection #2 (for 
distinct aggregation of b + 1)
[a#87, null, (1 + b#88), 2, (c#89 > 0.0), null]], <== projection #3 (for 
distinct aggregation of 1 + b)
```
In reality, the Expand only needs one projection for `1 + b` and `b + 1`, 
because they are semantically equivalent.

With the proposed change, the Expand operator's projections look like this:
```
[a#67, null, 0, null, UnscaledValue(c#69)],  <== projection #1 (for regular 
aggregations)
[a#67, (b#68 + 1), 1, (c#69 > 0.0), null]],  <== projection #2 (for 
distinct aggregation on b + 1 and 1 + b)
```
With one less projection, Expand produces 2\*n rows instead of 3\*n rows, 
but still produces the correct result.

In the case where all distinct aggregates have semantically equivalent 
children, the Expand operator is not needed at all.
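
A minimal sketch (not from this PR) of the Catalyst behavior the rewrite relies 
on: commutative children such as `b + 1` and `1 + b` canonicalize to the same 
form, so `semanticEquals` treats them as equal and they can share one Expand 
projection.

```
import org.apache.spark.sql.catalyst.expressions.{Add, AttributeReference, Literal}
import org.apache.spark.sql.types.IntegerType

val b  = AttributeReference("b", IntegerType)()
val e1 = Add(b, Literal(1))  // b + 1
val e2 = Add(Literal(1), b)  // 1 + b

// Canonicalization reorders commutative operands, so the two compare equal.
assert(e1.semanticEquals(e2))
```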

Benchmark code in the JIRA (SPARK-40382).

Before the PR:
```
distinct aggregates:  Best Time(ms)   Avg Time(ms)   
Stdev(ms)Rate(M/s)   Per Row(ns)   Relative


all semantically equivalent   14721  14859  
   195  5.7 175.5   1.0X
some semantically equivalent  14569  14572  
 5  5.8 173.7   1.0X
none semantically equivalent  14408  14488  
   113  5.8 171.8   1.0X
```
After the PR:
```
distinct aggregates:  Best Time(ms)   Avg Time(ms)   
Stdev(ms)Rate(M/s)   Per Row(ns)   Relative


all semantically equivalent3658   3692  
49 22.9  43.6   1.0X
some semantically equivalent   9124   9214  
   127  9.2 108.8   0.4X
none semantically equivalent  14601  14777  
   250  5.7 174.1   0.3X
```

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

New unit tests.

Closes #37825 from bersprockets/rewritedistinct_issue.

Authored-by: Bruce Robbins 
Signed-off-by: Wenchen Fan 
---
 .../optimizer/RewriteDistinctAggregates.scala  | 10 ---
 .../optimizer/RewriteDistinctAggregatesSuite.scala | 33 +
 .../spark/sql/execution/SparkStrategies.scala  |  6 ++--
 .../spark/sql/execution/aggregate/AggUtils.scala   |  9 --
 .../apache/spark/sql/DataFrameAggregateSuite.scala | 34 ++
 .../apache/spark/sql/execution/PlannerSuite.scala  |  4 +++
 6 files changed, 87 insertions(+), 9 deletions(-)

diff --git 

[spark] branch master updated: [SPARK-39876][FOLLOW-UP][SQL] Add parser and Dataset tests for SQL UNPIVOT

2022-10-13 Thread wenchen
This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
 new 9bc8c06bc45 [SPARK-39876][FOLLOW-UP][SQL] Add parser and Dataset tests 
for SQL UNPIVOT
9bc8c06bc45 is described below

commit 9bc8c06bc45959a4c4067a1181d074f2325eb4d6
Author: Enrico Minack 
AuthorDate: Thu Oct 13 22:07:27 2022 +0800

[SPARK-39876][FOLLOW-UP][SQL] Add parser and Dataset tests for SQL UNPIVOT

### What changes were proposed in this pull request?
Adds more tests for the SQL `UNPIVOT` clause. 
https://github.com/apache/spark/pull/37407#discussion_r988768918

### Why are the changes needed?
Better test coverage.

### Does this PR introduce _any_ user-facing change?
No, this only adds tests and fixes one issue. SQL `UNPIVOT` has not been 
released yet.
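
The one fix referenced above is the change in `AstBuilder` (see the diff below) 
that collapses an alias list to `None` when every element is `None`; a tiny 
stand-alone sketch of that Option idiom, using a hypothetical helper name:

```scala
// Hypothetical helper; the real change inlines this expression in AstBuilder.
def normalizeAliases(aliases: Seq[Option[String]]): Option[Seq[Option[String]]] =
  Some(aliases).filter(_.exists(_.isDefined))

normalizeAliases(Seq(None, None))       // None: no alias was actually given
normalizeAliases(Seq(Some("a"), None))  // Some(List(Some(a), None))
```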

### How was this patch tested?
In `UnpivotParserSuite` and `DatasetUnpivotSuite`.

Closes #38153 from EnricoMi/branch-sql-unpivot-tests.

Authored-by: Enrico Minack 
Signed-off-by: Wenchen Fan 
---
 .../spark/sql/catalyst/parser/AstBuilder.scala |   6 +-
 .../plans/logical/basicLogicalOperators.scala  |   5 +-
 .../sql/catalyst/parser/UnpivotParserSuite.scala   | 195 +
 .../org/apache/spark/sql/DatasetUnpivotSuite.scala | 124 +
 4 files changed, 327 insertions(+), 3 deletions(-)

diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala
index d17c839be11..01ba83d3f84 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala
@@ -1137,7 +1137,8 @@ class AstBuilder extends SqlBaseParserBaseVisitor[AnyRef] 
with SQLConfHelper wit
   Unpivot(
 None,
 Some(unpivotColumns.map(Seq(_))),
-Some(unpivotAliases),
+// None when all elements are None
+Some(unpivotAliases).filter(_.exists(_.isDefined)),
 variableColumnName,
 valueColumnNames,
 query
@@ -1151,7 +1152,8 @@ class AstBuilder extends SqlBaseParserBaseVisitor[AnyRef] 
with SQLConfHelper wit
   Unpivot(
 None,
 Some(unpivotColumns),
-Some(unpivotAliases),
+// None when all elements are None
+Some(unpivotAliases).filter(_.exists(_.isDefined)),
 variableColumnName,
 valueColumnNames,
 query
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala
index 4ba869b7320..793fecd5a5b 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala
@@ -1471,7 +1471,10 @@ case class Unpivot(
 copy(child = newChild)
 
   def canBeCoercioned: Boolean = values.exists(_.nonEmpty) &&
-values.exists(_.forall(_.forall(_.resolved)))
+values.exists(_.forall(_.forall(_.resolved))) &&
+// when no ids are given, values must be Attributes (column names) to 
allow detecting ids
+// coercion will add aliases, would disallow detecting ids, so defer 
coercion after id detection
+ids.exists(_.forall(_.resolved))
 
   def valuesTypeCoercioned: Boolean = canBeCoercioned &&
 // all inner values at position idx must have the same data type
diff --git 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/UnpivotParserSuite.scala
 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/UnpivotParserSuite.scala
new file mode 100644
index 000..dd7e4ec4916
--- /dev/null
+++ 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/UnpivotParserSuite.scala
@@ -0,0 +1,195 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.catalyst.parser
+

[spark] branch master updated: [SPARK-40618][SQL] Fix bug in MergeScalarSubqueries rule with nested subqueries using reference tracking

2022-10-13 Thread wenchen
This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
 new 9287fc9dd73 [SPARK-40618][SQL] Fix bug in MergeScalarSubqueries rule 
with nested subqueries using reference tracking
9287fc9dd73 is described below

commit 9287fc9dd73a0909d5705308532b24528b3f1090
Author: Peter Toth 
AuthorDate: Thu Oct 13 22:06:15 2022 +0800

[SPARK-40618][SQL] Fix bug in MergeScalarSubqueries rule with nested 
subqueries using reference tracking

### What changes were proposed in this pull request?
This PR reverts the previous fix https://github.com/apache/spark/pull/38052 
and adds subquery reference tracking to `MergeScalarSubqueries` to restore 
previous functionality of merging independent nested subqueries.
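
Reduced to a toy model (illustrative names only, not the rule's actual API), the 
tracking works like this: every cached subquery header records which other cache 
entries it uses, transitively, and a candidate plan is never merged into an entry 
that it references:

```scala
// Toy model of the reference tracking; not MergeScalarSubqueries' real types.
final case class Header(plan: String, references: Set[Int])

// Direct references plus everything those referenced entries use themselves.
def transitiveRefs(direct: Set[Int], cache: IndexedSeq[Header]): Set[Int] =
  direct.flatMap(i => cache(i).references + i)

val cache = IndexedSeq(
  Header("sq0", Set.empty), // an independent subquery
  Header("sq1", Set(0))     // sq1 nests a reference to sq0
)

// A plan referencing sq1 transitively references sq0 as well, so neither cache
// entry is a legal merge target for it; independent subqueries stay mergeable.
assert(transitiveRefs(Set(1), cache) == Set(0, 1))
```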

### Why are the changes needed?
Restore previous functionality but fix the bug discovered in 
https://issues.apache.org/jira/browse/SPARK-40618.

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
Existing and new UTs.

Closes #38093 from peter-toth/SPARK-40618-fix-mergescalarsubqueries.

Authored-by: Peter Toth 
Signed-off-by: Wenchen Fan 
---
 .../catalyst/optimizer/MergeScalarSubqueries.scala | 62 +-
 .../scala/org/apache/spark/sql/SubquerySuite.scala | 35 +---
 2 files changed, 67 insertions(+), 30 deletions(-)

diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/MergeScalarSubqueries.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/MergeScalarSubqueries.scala
index 69f77e8f3f4..1cb3f3f157c 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/MergeScalarSubqueries.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/MergeScalarSubqueries.scala
@@ -17,6 +17,7 @@
 
 package org.apache.spark.sql.catalyst.optimizer
 
+import scala.collection.mutable
 import scala.collection.mutable.ArrayBuffer
 
 import org.apache.spark.sql.catalyst.expressions._
@@ -126,8 +127,14 @@ object MergeScalarSubqueries extends Rule[LogicalPlan] {
*   merged as there can be subqueries that are different 
([[checkIdenticalPlans]] is
*   false) due to an extra [[Project]] node in one of them. In 
that case
*   `attributes.size` remains 1 after merging, but the merged 
flag becomes true.
+   * @param references A set of subquery indexes in the cache to track all 
(including transitive)
+   *   nested subqueries.
*/
-  case class Header(attributes: Seq[Attribute], plan: LogicalPlan, merged: 
Boolean)
+  case class Header(
+  attributes: Seq[Attribute],
+  plan: LogicalPlan,
+  merged: Boolean,
+  references: Set[Int])
 
   private def extractCommonScalarSubqueries(plan: LogicalPlan) = {
 val cache = ArrayBuffer.empty[Header]
@@ -166,26 +173,39 @@ object MergeScalarSubqueries extends Rule[LogicalPlan] {
   // "Header".
   private def cacheSubquery(plan: LogicalPlan, cache: ArrayBuffer[Header]): 
(Int, Int) = {
 val output = plan.output.head
-cache.zipWithIndex.collectFirst(Function.unlift { case (header, 
subqueryIndex) =>
-  checkIdenticalPlans(plan, header.plan).map { outputMap =>
-val mappedOutput = mapAttributes(output, outputMap)
-val headerIndex = header.attributes.indexWhere(_.exprId == 
mappedOutput.exprId)
-subqueryIndex -> headerIndex
-  }.orElse(tryMergePlans(plan, header.plan).map {
-case (mergedPlan, outputMap) =>
+val references = mutable.HashSet.empty[Int]
+
plan.transformAllExpressionsWithPruning(_.containsAnyPattern(SCALAR_SUBQUERY_REFERENCE))
 {
+  case ssr: ScalarSubqueryReference =>
+references += ssr.subqueryIndex
+references ++= cache(ssr.subqueryIndex).references
+ssr
+}
+
+cache.zipWithIndex.collectFirst(Function.unlift {
+  case (header, subqueryIndex) if !references.contains(subqueryIndex) =>
+checkIdenticalPlans(plan, header.plan).map { outputMap =>
   val mappedOutput = mapAttributes(output, outputMap)
-  var headerIndex = header.attributes.indexWhere(_.exprId == 
mappedOutput.exprId)
-  val newHeaderAttributes = if (headerIndex == -1) {
-headerIndex = header.attributes.size
-header.attributes :+ mappedOutput
-  } else {
-header.attributes
-  }
-  cache(subqueryIndex) = Header(newHeaderAttributes, mergedPlan, true)
+  val headerIndex = header.attributes.indexWhere(_.exprId == 
mappedOutput.exprId)
   subqueryIndex -> headerIndex
-  })
+}.orElse{
+  tryMergePlans(plan, header.plan).map {
+case (mergedPlan, outputMap) =>
+  val mappedOutput = 

[spark] branch master updated (d9c90887cb9 -> 069967ba42c)

2022-10-13 Thread wenchen
This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


from d9c90887cb9 [SPARK-40780][CONNECT] Add WHERE to Connect proto and DSL
 add 069967ba42c [SPARK-40773][SQL] Refactor checkCorrelationsInSubquery

No new revisions were added by this update.

Summary of changes:
 .../sql/catalyst/analysis/CheckAnalysis.scala  | 217 +++--
 .../sql-tests/results/udf/udf-except.sql.out   |   2 +-
 .../scala/org/apache/spark/sql/SubquerySuite.scala |  20 +-
 3 files changed, 129 insertions(+), 110 deletions(-)


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[spark-docker] branch master updated: [SPARK-40754][DOCS] Add LICENSE and NOTICE

2022-10-13 Thread yikun
This is an automated email from the ASF dual-hosted git repository.

yikun pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark-docker.git


The following commit(s) were added to refs/heads/master by this push:
 new fc07aed  [SPARK-40754][DOCS] Add LICENSE and NOTICE
fc07aed is described below

commit fc07aeda1f48eb2aae9a441dfe94ae95f697e222
Author: Yikun Jiang 
AuthorDate: Thu Oct 13 21:47:15 2022 +0800

[SPARK-40754][DOCS] Add LICENSE and NOTICE

### What changes were proposed in this pull request?
This patch adds the LICENSE and NOTICE files:
- LICENSE: https://www.apache.org/licenses/LICENSE-2.0.txt
- NOTICE: https://github.com/apache/spark/blob/master/NOTICE

### Why are the changes needed?
https://www.apache.org/licenses/LICENSE-2.0#apply

See also: 
https://github.com/apache/spark-docker/pull/2#issuecomment-1274807917

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
No need

Closes #6 from Yikun/SPARK-40754.

Authored-by: Yikun Jiang 
Signed-off-by: Yikun Jiang 
---
 LICENSE | 202 
 NOTICE  |   6 ++
 2 files changed, 208 insertions(+)

diff --git a/LICENSE b/LICENSE
new file mode 100644
index 000..d645695
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,202 @@
+
+ Apache License
+   Version 2.0, January 2004
+http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+  "License" shall mean the terms and conditions for use, reproduction,
+  and distribution as defined by Sections 1 through 9 of this document.
+
+  "Licensor" shall mean the copyright owner or entity authorized by
+  the copyright owner that is granting the License.
+
+  "Legal Entity" shall mean the union of the acting entity and all
+  other entities that control, are controlled by, or are under common
+  control with that entity. For the purposes of this definition,
+  "control" means (i) the power, direct or indirect, to cause the
+  direction or management of such entity, whether by contract or
+  otherwise, or (ii) ownership of fifty percent (50%) or more of the
+  outstanding shares, or (iii) beneficial ownership of such entity.
+
+  "You" (or "Your") shall mean an individual or Legal Entity
+  exercising permissions granted by this License.
+
+  "Source" form shall mean the preferred form for making modifications,
+  including but not limited to software source code, documentation
+  source, and configuration files.
+
+  "Object" form shall mean any form resulting from mechanical
+  transformation or translation of a Source form, including but
+  not limited to compiled object code, generated documentation,
+  and conversions to other media types.
+
+  "Work" shall mean the work of authorship, whether in Source or
+  Object form, made available under the License, as indicated by a
+  copyright notice that is included in or attached to the work
+  (an example is provided in the Appendix below).
+
+  "Derivative Works" shall mean any work, whether in Source or Object
+  form, that is based on (or derived from) the Work and for which the
+  editorial revisions, annotations, elaborations, or other modifications
+  represent, as a whole, an original work of authorship. For the purposes
+  of this License, Derivative Works shall not include works that remain
+  separable from, or merely link (or bind by name) to the interfaces of,
+  the Work and Derivative Works thereof.
+
+  "Contribution" shall mean any work of authorship, including
+  the original version of the Work and any modifications or additions
+  to that Work or Derivative Works thereof, that is intentionally
+  submitted to Licensor for inclusion in the Work by the copyright owner
+  or by an individual or Legal Entity authorized to submit on behalf of
+  the copyright owner. For the purposes of this definition, "submitted"
+  means any form of electronic, verbal, or written communication sent
+  to the Licensor or its representatives, including but not limited to
+  communication on electronic mailing lists, source code control systems,
+  and issue tracking systems that are managed by, or on behalf of, the
+  Licensor for the purpose of discussing and improving the Work, but
+  excluding communication that is conspicuously marked or otherwise
+  designated in writing by the copyright owner as "Not a Contribution."
+
+  "Contributor" shall mean Licensor and any individual or Legal Entity
+  on behalf of whom a Contribution has been received by Licensor and
+  subsequently incorporated within the Work.
+
+   2. Grant of Copyright 

[spark] branch master updated: [SPARK-40780][CONNECT] Add WHERE to Connect proto and DSL

2022-10-13 Thread wenchen
This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
 new d9c90887cb9 [SPARK-40780][CONNECT] Add WHERE to Connect proto and DSL
d9c90887cb9 is described below

commit d9c90887cb9ef32d54b3e0edcfffb43ba3d70fa6
Author: Rui Wang 
AuthorDate: Thu Oct 13 21:22:21 2022 +0800

[SPARK-40780][CONNECT] Add WHERE to Connect proto and DSL

### What changes were proposed in this pull request?

Add WHERE to Connect proto and DSL.
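
With the DSL additions in the diff below, a filtered Connect plan reads much like 
the Dataset API; a rough sketch (assuming an existing `proto.Relation` such as the 
`connectTestRelation` fixture from the test suite; the empty builder here is only 
a placeholder):

```scala
import org.apache.spark.connect.proto
import org.apache.spark.sql.connect.dsl.expressions._
import org.apache.spark.sql.connect.dsl.plans._

// Placeholder input; in SparkConnectProtoSuite this is the connectTestRelation fixture.
val input: proto.Relation = proto.Relation.newBuilder().build()

// Roughly Dataset's `df.where($"id" < 0)`, expressed as a Connect proto plan.
val filtered: proto.Relation = input.where("id".protoAttr < 0)
```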

### Why are the changes needed?

Improve Connect proto testing coverage.

### Does this PR introduce _any_ user-facing change?

No

### How was this patch tested?

UT

Closes #38232 from amaliujia/add_filter_to_dsl.

Authored-by: Rui Wang 
Signed-off-by: Wenchen Fan 
---
 .../org/apache/spark/sql/connect/dsl/package.scala | 22 ++
 .../connect/planner/SparkConnectProtoSuite.scala   | 11 +++
 .../spark/sql/catalyst/analysis/Analyzer.scala |  2 +-
 3 files changed, 34 insertions(+), 1 deletion(-)

diff --git 
a/connector/connect/src/main/scala/org/apache/spark/sql/connect/dsl/package.scala
 
b/connector/connect/src/main/scala/org/apache/spark/sql/connect/dsl/package.scala
index 80d6e77c9fc..0db8ab96610 100644
--- 
a/connector/connect/src/main/scala/org/apache/spark/sql/connect/dsl/package.scala
+++ 
b/connector/connect/src/main/scala/org/apache/spark/sql/connect/dsl/package.scala
@@ -17,6 +17,7 @@
 package org.apache.spark.sql.connect
 
 import scala.collection.JavaConverters._
+import scala.language.implicitConversions
 
 import org.apache.spark.connect.proto
 import org.apache.spark.connect.proto.Join.JoinType
@@ -44,7 +45,20 @@ package object dsl {
 implicit class DslExpression(val expr: proto.Expression) {
   def as(alias: String): proto.Expression = 
proto.Expression.newBuilder().setAlias(
 
proto.Expression.Alias.newBuilder().setName(alias).setExpr(expr)).build()
+
+  def < (other: proto.Expression): proto.Expression =
+proto.Expression.newBuilder().setUnresolvedFunction(
+  proto.Expression.UnresolvedFunction.newBuilder()
+.addParts("<")
+.addArguments(expr)
+.addArguments(other)
+).build()
 }
+
+implicit def intToLiteral(i: Int): proto.Expression =
+  proto.Expression.newBuilder().setLiteral(
+proto.Expression.Literal.newBuilder().setI32(i)
+  ).build()
   }
 
   object plans { // scalastyle:ignore
@@ -58,6 +72,14 @@ package object dsl {
 ).build()
   }
 
+  def where(condition: proto.Expression): proto.Relation = {
+proto.Relation.newBuilder()
+  .setFilter(
+
proto.Filter.newBuilder().setInput(logicalPlan).setCondition(condition)
+).build()
+  }
+
+
   def join(
   otherPlan: proto.Relation,
   joinType: JoinType = JoinType.JOIN_TYPE_INNER,
diff --git 
a/connector/connect/src/test/scala/org/apache/spark/sql/connect/planner/SparkConnectProtoSuite.scala
 
b/connector/connect/src/test/scala/org/apache/spark/sql/connect/planner/SparkConnectProtoSuite.scala
index 510b54cd250..351cc70852a 100644
--- 
a/connector/connect/src/test/scala/org/apache/spark/sql/connect/planner/SparkConnectProtoSuite.scala
+++ 
b/connector/connect/src/test/scala/org/apache/spark/sql/connect/planner/SparkConnectProtoSuite.scala
@@ -51,6 +51,17 @@ class SparkConnectProtoSuite extends PlanTest with 
SparkConnectPlanTest {
 comparePlans(connectPlan.analyze, sparkPlan.analyze, false)
   }
 
+  test("Basic filter") {
+val connectPlan = {
+  import org.apache.spark.sql.connect.dsl.expressions._
+  import org.apache.spark.sql.connect.dsl.plans._
+  transform(connectTestRelation.where("id".protoAttr < 0))
+}
+
+val sparkPlan = sparkTestRelation.where($"id" < 0).analyze
+comparePlans(connectPlan.analyze, sparkPlan.analyze, false)
+  }
+
   test("Basic joins with different join types") {
 val connectPlan = {
   import org.apache.spark.sql.connect.dsl.plans._
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
index 62d930dcd20..ae65902e8a6 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
@@ -70,7 +70,7 @@ object SimpleAnalyzer extends Analyzer(
 FakeV2SessionCatalog,
 new SessionCatalog(
   new InMemoryCatalog,
-  EmptyFunctionRegistry,
+  FunctionRegistry.builtin,
   EmptyTableFunctionRegistry) {
   override def createDatabase(dbDefinition: CatalogDatabase, 
ignoreIfExists: Boolean): Unit = {}
 })) {



[spark] branch master updated: [MINOR][DOCS][CONNECT] Fix docs to run Spark Connect locally built

2022-10-13 Thread gurwls223
This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
 new 1cb8250d166 [MINOR][DOCS][CONNECT] Fix docs to run Spark Connect 
locally built
1cb8250d166 is described below

commit 1cb8250d166f5d877f24f6ea097d27d7168ecf15
Author: Hyukjin Kwon 
AuthorDate: Thu Oct 13 16:39:26 2022 +0900

[MINOR][DOCS][CONNECT] Fix docs to run Spark Connect locally built

### What changes were proposed in this pull request?

This PR adds some more command examples to run Spark Connect that you built 
locally.

### Why are the changes needed?

To guide developers in running a locally built Spark Connect.

### Does this PR introduce _any_ user-facing change?

No, dev-only and doc-only.

### How was this patch tested?

The commands were tested manually on my local machine.

Closes #38236 from HyukjinKwon/minor-docs-spark-cnnect.

Authored-by: Hyukjin Kwon 
Signed-off-by: Hyukjin Kwon 
---
 python/pyspark/sql/connect/README.md | 36 
 1 file changed, 24 insertions(+), 12 deletions(-)

diff --git a/python/pyspark/sql/connect/README.md 
b/python/pyspark/sql/connect/README.md
index bd42142ac78..b9cfb31d13c 100644
--- a/python/pyspark/sql/connect/README.md
+++ b/python/pyspark/sql/connect/README.md
@@ -9,22 +9,34 @@ of Spark. To enable it, you only need to activate the driver 
plugin for Spark Co
 
 ## Build
 
-1. Build Spark as usual per the documentation.
-
-2. Build and package the Spark Connect package
-
-   ```bash
-   ./build/mvn -Phive package
-   ```
+```bash
+./build/mvn -Phive clean package
+```
 
-   or
+or
 
-   ```bash
-   ./build/sbt -Phive package
-   ```
+```bash
+./build/sbt -Phive clean package
+```

 ## Run Spark Shell
 
+To run the Spark Connect you built locally:
+
+```bash
+# Scala shell
+./bin/spark-shell \
+  --jars `ls connector/connect/target/**/spark-connect*SNAPSHOT.jar | paste 
-sd ',' -` \
+  --conf spark.plugins=org.apache.spark.sql.connect.SparkConnectPlugin
+
+# PySpark shell
+./bin/pyspark \
+  --jars `ls connector/connect/target/**/spark-connect*SNAPSHOT.jar | paste 
-sd ',' -` \
+  --conf spark.plugins=org.apache.spark.sql.connect.SparkConnectPlugin
+```
+
+To use the release version of Spark Connect:
+
 ```bash
 ./bin/spark-shell \
   --packages org.apache.spark:spark-connect_2.12:3.4.0 \
@@ -34,6 +46,6 @@ of Spark. To enable it, you only need to activate the driver 
plugin for Spark Co
 ## Run Tests
 
 ```bash
-./run-tests --testnames 'pyspark.sql.tests.test_connect_basic'
+./python/run-tests --testnames 'pyspark.sql.tests.test_connect_basic'
 ```
 


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



[spark] branch master updated: [SPARK-40601][PYTHON] Assert identical key size when cogrouping groups

2022-10-13 Thread gurwls223
This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
 new d9dd9944bf4 [SPARK-40601][PYTHON] Assert identical key size when 
cogrouping groups
d9dd9944bf4 is described below

commit d9dd9944bf4c3adba4bcb458304793376f083000
Author: Enrico Minack 
AuthorDate: Thu Oct 13 15:20:31 2022 +0900

[SPARK-40601][PYTHON] Assert identical key size when cogrouping groups

Cogrouping two grouped DataFrames in PySpark that have different group key 
cardinalities raises an error that is not very descriptive:

```python
left.groupby("id", "k") \
    .cogroup(right.groupby("id"))
```

```
py4j.protocol.Py4JJavaError: An error occurred while calling o726.collectToPython.
: java.lang.IndexOutOfBoundsException: 1
	at scala.collection.mutable.ResizableArray.apply(ResizableArray.scala:46)
	at scala.collection.mutable.ResizableArray.apply$(ResizableArray.scala:45)
	at scala.collection.mutable.ArrayBuffer.apply(ArrayBuffer.scala:49)
	at org.apache.spark.sql.catalyst.plans.physical.HashShuffleSpec.$anonfun$createPartitioning$5(partitioning.scala:650)
	...
	org.apache.spark.sql.execution.exchange.EnsureRequirements.$anonfun$ensureDistributionAndOrdering$14(EnsureRequirements.scala:159)
```

### What changes were proposed in this pull request?
Assert identical size of groupby keys and provide a meaningful error on 
cogroup.
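
A minimal sketch of the added guard (the real check lives on the Scala side in 
`RelationalGroupedDataset`; the helper name below is illustrative):

```scala
// Illustrative stand-alone version of the new size check; using require()
// is what produces the "requirement failed: ..." prefix asserted in the test.
def checkCogroupKeySizes(leftKeys: Seq[String], rightKeys: Seq[String]): Unit =
  require(
    leftKeys.size == rightKeys.size,
    s"Cogroup keys must have same size: ${leftKeys.size} != ${rightKeys.size}")

// checkCogroupKeySizes(Seq("id", "k"), Seq("id")) throws
// java.lang.IllegalArgumentException:
//   requirement failed: Cogroup keys must have same size: 2 != 1
```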

### Why are the changes needed?
The error does not provide information on how to solve the problem.

### Does this PR introduce _any_ user-facing change?
Yes, raises an `IllegalArgumentException: requirement failed: Cogroup keys must have same size` instead.

### How was this patch tested?
Adds test `test_different_group_key_cardinality` to 
`pyspark.sql.tests.test_pandas_cogrouped_map`.

Closes #38036 from EnricoMi/branch-cogroup-key-mismatch.

Authored-by: Enrico Minack 
Signed-off-by: Hyukjin Kwon 
---
 .../pyspark/sql/tests/test_pandas_cogrouped_map.py | 41 +-
 .../spark/sql/RelationalGroupedDataset.scala   |  3 ++
 .../org/apache/spark/sql/DataFrameSuite.scala  | 32 -
 3 files changed, 74 insertions(+), 2 deletions(-)

diff --git a/python/pyspark/sql/tests/test_pandas_cogrouped_map.py 
b/python/pyspark/sql/tests/test_pandas_cogrouped_map.py
index 3f403d9c9d6..88ba396e3f5 100644
--- a/python/pyspark/sql/tests/test_pandas_cogrouped_map.py
+++ b/python/pyspark/sql/tests/test_pandas_cogrouped_map.py
@@ -20,7 +20,7 @@ from typing import cast
 
 from pyspark.sql.functions import array, explode, col, lit, udf, pandas_udf
 from pyspark.sql.types import DoubleType, StructType, StructField, Row
-from pyspark.sql.utils import PythonException
+from pyspark.sql.utils import IllegalArgumentException, PythonException
 from pyspark.testing.sqlutils import (
 ReusedSQLTestCase,
 have_pandas,
@@ -80,6 +80,29 @@ class CogroupedMapInPandasTests(ReusedSQLTestCase):
 right = self.data2.withColumn("v3", lit("a"))
 self._test_merge(self.data1, right, "id long, k int, v int, v2 int, v3 
string")
 
+def test_different_keys(self):
+left = self.data1
+right = self.data2
+
+def merge_pandas(lft, rgt):
+return pd.merge(lft.rename(columns={"id2": "id"}), rgt, on=["id", 
"k"])
+
+result = (
+left.withColumnRenamed("id", "id2")
+.groupby("id2")
+.cogroup(right.groupby("id"))
+.applyInPandas(merge_pandas, "id long, k int, v int, v2 int")
+.sort(["id", "k"])
+.toPandas()
+)
+
+left = left.toPandas()
+right = right.toPandas()
+
+expected = pd.merge(left, right, on=["id", "k"]).sort_values(by=["id", 
"k"])
+
+assert_frame_equal(expected, result)
+
 def test_complex_group_by(self):
 left = pd.DataFrame.from_dict({"id": [1, 2, 3], "k": [5, 6, 7], "v": 
[9, 10, 11]})
 
@@ -125,6 +148,22 @@ class CogroupedMapInPandasTests(ReusedSQLTestCase):
 
 assert_frame_equal(expected, result)
 
+def test_different_group_key_cardinality(self):
+left = self.data1
+right = self.data2
+
+def merge_pandas(lft, _):
+return lft
+
+with QuietTest(self.sc):
+with self.assertRaisesRegex(
+IllegalArgumentException,
+"requirement failed: Cogroup keys must have same size: 2 != 1",
+):
+(left.groupby("id", 
"k").cogroup(right.groupby("id"))).applyInPandas(
+merge_pandas, "id long, k int, v int"
+)
+
 def test_apply_in_pandas_not_returning_pandas_dataframe(self):
 left = self.data1
 right = self.data2
diff --git