This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git

The following commit(s) were added to refs/heads/master by this push:
     new e2176338c9b  [SPARK-40665][CONNECT] Avoid embedding Spark Connect in the Apache Spark binary release
e2176338c9b is described below

commit e2176338c9b4020b9d5dcd831038d350ce03137f
Author: Hyukjin Kwon <gurwls...@apache.org>
AuthorDate: Thu Oct 6 12:27:15 2022 +0900

    [SPARK-40665][CONNECT] Avoid embedding Spark Connect in the Apache Spark binary release

    ### What changes were proposed in this pull request?

    This PR proposes to:

    1. Move `connect` to `connector/connect`, to be consistent with Kafka and Avro.
    2. Not include this module in the default Apache Spark release binary.
    3. Fix the module dependency in `modules.py`.
    4. Fix the usages in `README.md`, with some cleanup.
    5. Clean up the PySpark test structure to be consistent with other PySpark tests.

    ### Why are the changes needed?

    To make it consistent with Avro and Kafka; see also
    https://github.com/apache/spark/pull/37710/files#r978291019

    ### Does this PR introduce _any_ user-facing change?

    No, this isn't released yet. The usage of this project changes from:

    ```bash
    ./bin/spark-shell \
      --conf spark.plugins=org.apache.spark.sql.connect.SparkConnectPlugin
    ```

    to:

    ```bash
    ./bin/spark-shell \
      --packages org.apache.spark:spark-connect_2.12:3.4.0 \
      --conf spark.plugins=org.apache.spark.sql.connect.SparkConnectPlugin
    ```

    ### How was this patch tested?

    CI in this PR should verify the change.

    Closes #38109 from HyukjinKwon/SPARK-40665.

    Authored-by: Hyukjin Kwon <gurwls...@apache.org>
    Signed-off-by: Hyukjin Kwon <gurwls...@apache.org>
---
 .github/labeler.yml                                 |  2 +-
 .github/workflows/build_and_test.yml                |  2 +-
 assembly/pom.xml                                    |  5 -----
 {connect => connector/connect}/.gitignore           |  0
 .../connect}/dev/generate_protos.sh                 |  4 ++--
 {connect => connector/connect}/pom.xml              |  2 +-
 .../connect}/src/main/buf.gen.yaml                  |  0
 .../connect}/src/main/buf.work.yaml                 |  0
 .../connect}/src/main/protobuf/buf.yaml             |  0
 .../src/main/protobuf/spark/connect/base.proto      |  0
 .../src/main/protobuf/spark/connect/commands.proto  |  0
 .../main/protobuf/spark/connect/expressions.proto   |  0
 .../main/protobuf/spark/connect/relations.proto     |  0
 .../src/main/protobuf/spark/connect/types.proto     |  0
 .../spark/sql/connect/SparkConnectPlugin.scala      |  0
 .../command/SparkConnectCommandPlanner.scala        |  0
 .../apache/spark/sql/connect/config/Connect.scala   |  0
 .../sql/connect/planner/SparkConnectPlanner.scala   |  0
 .../sql/connect/service/SparkConnectService.scala   |  0
 .../service/SparkConnectStreamHandler.scala         |  0
 .../connect}/src/test/resources/log4j2.properties   |  0
 .../connect/planner/SparkConnectPlannerSuite.scala  |  0
 dev/deps/spark-deps-hadoop-2-hive-2.3               | 16 --------------
 dev/deps/spark-deps-hadoop-3-hive-2.3               | 16 --------------
 dev/sparktestsupport/modules.py                     | 25 ++++++++++++++++------
 dev/sparktestsupport/utils.py                       | 20 ++++++++---------
 pom.xml                                             |  2 +-
 python/mypy.ini                                     |  9 +++-----
 python/pyspark/sql/connect/README.md                | 17 ++++++++-------
 python/pyspark/sql/tests/connect/__init__.py        | 16 --------------
 python/pyspark/sql/tests/connect/utils/__init__.py  | 20 -----------------
 ...test_spark_connect.py => test_connect_basic.py}  |  8 +++++--
 ...sions.py => test_connect_column_expressions.py}  |  5 ++---
 ...test_plan_only.py => test_connect_plan_only.py}  |  4 ++--
 ...st_select_ops.py => test_connect_select_ops.py}  |  4 ++--
 .../connectutils.py}                                | 21 ++++++++++++++++++
 python/run-tests.py                                 | 12 +----------
 37 files changed, 80 insertions(+), 130 deletions(-)
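Since the artifact is no longer bundled, the shell flags shown in the commit message also have session-level equivalents: `--packages` corresponds to the `spark.jars.packages` configuration. A minimal illustrative sketch (not part of this commit) of the same setup from PySpark:

```python
# Sketch only: session-level equivalent of the spark-shell flags above.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    # Pull the now-optional Spark Connect artifact, as `--packages` does.
    .config("spark.jars.packages", "org.apache.spark:spark-connect_2.12:3.4.0")
    # Activate the Spark Connect driver plugin described in this PR.
    .config("spark.plugins", "org.apache.spark.sql.connect.SparkConnectPlugin")
    .getOrCreate()
)
```

Either way, the plugin is only loaded when explicitly requested, which is the point of this change.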
diff --git a/.github/labeler.yml b/.github/labeler.yml
index 0d04244f882..cf1d2a71172 100644
--- a/.github/labeler.yml
+++ b/.github/labeler.yml
@@ -152,6 +152,6 @@ WEB UI:
 DEPLOY:
   - "sbin/**/*"
 CONNECT:
-  - "connect/**/*"
+  - "connector/connect/**/*"
   - "**/sql/sparkconnect/**/*"
   - "python/pyspark/sql/**/connect/**/*"
diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index b0847187dff..b7f8b10c00f 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -334,7 +334,7 @@ jobs:
           - >-
             pyspark-pandas-slow
           - >-
-            pyspark-sql-connect
+            pyspark-connect
     env:
       MODULES_TO_TEST: ${{ matrix.modules }}
       HADOOP_PROFILE: ${{ inputs.hadoop }}
diff --git a/assembly/pom.xml b/assembly/pom.xml
index 218bf367950..f37edcd7e49 100644
--- a/assembly/pom.xml
+++ b/assembly/pom.xml
@@ -74,11 +74,6 @@
       <artifactId>spark-repl_${scala.binary.version}</artifactId>
       <version>${project.version}</version>
     </dependency>
-    <dependency>
-      <groupId>org.apache.spark</groupId>
-      <artifactId>spark-connect_${scala.binary.version}</artifactId>
-      <version>${project.version}</version>
-    </dependency>
 
     <!--
       Because we don't shade dependencies anymore, we need to restore Guava to compile scope so
diff --git a/connect/.gitignore b/connector/connect/.gitignore
similarity index 100%
rename from connect/.gitignore
rename to connector/connect/.gitignore
diff --git a/connect/dev/generate_protos.sh b/connector/connect/dev/generate_protos.sh
similarity index 98%
rename from connect/dev/generate_protos.sh
rename to connector/connect/dev/generate_protos.sh
index 39d20d3acc8..0eb76b6cea7 100755
--- a/connect/dev/generate_protos.sh
+++ b/connector/connect/dev/generate_protos.sh
@@ -19,7 +19,7 @@ set -ex
 SPARK_HOME="$(cd "`dirname $0`"/../..; pwd)"
 cd "$SPARK_HOME"
 
-pushd connect/src/main
+pushd connector/connect/src/main
 
 LICENSE=$(cat <<'EOF'
 #
@@ -79,4 +79,4 @@ for f in `find gen/proto/python -name "*.py*"`; do
 done
 
 # Clean up everything.
-rm -Rf gen
\ No newline at end of file
+rm -Rf gen
diff --git a/connect/pom.xml b/connector/connect/pom.xml
similarity index 99%
rename from connect/pom.xml
rename to connector/connect/pom.xml
index 56930210f8e..5a5440842a9 100644
--- a/connect/pom.xml
+++ b/connector/connect/pom.xml
@@ -23,7 +23,7 @@
     <groupId>org.apache.spark</groupId>
     <artifactId>spark-parent_2.12</artifactId>
     <version>3.4.0-SNAPSHOT</version>
-    <relativePath>../pom.xml</relativePath>
+    <relativePath>../../pom.xml</relativePath>
   </parent>
 
   <artifactId>spark-connect_2.12</artifactId>
diff --git a/connect/src/main/buf.gen.yaml b/connector/connect/src/main/buf.gen.yaml
similarity index 100%
rename from connect/src/main/buf.gen.yaml
rename to connector/connect/src/main/buf.gen.yaml
diff --git a/connect/src/main/buf.work.yaml b/connector/connect/src/main/buf.work.yaml
similarity index 100%
rename from connect/src/main/buf.work.yaml
rename to connector/connect/src/main/buf.work.yaml
diff --git a/connect/src/main/protobuf/buf.yaml b/connector/connect/src/main/protobuf/buf.yaml
similarity index 100%
rename from connect/src/main/protobuf/buf.yaml
rename to connector/connect/src/main/protobuf/buf.yaml
diff --git a/connect/src/main/protobuf/spark/connect/base.proto b/connector/connect/src/main/protobuf/spark/connect/base.proto
similarity index 100%
rename from connect/src/main/protobuf/spark/connect/base.proto
rename to connector/connect/src/main/protobuf/spark/connect/base.proto
diff --git a/connect/src/main/protobuf/spark/connect/commands.proto b/connector/connect/src/main/protobuf/spark/connect/commands.proto
similarity index 100%
rename from connect/src/main/protobuf/spark/connect/commands.proto
rename to connector/connect/src/main/protobuf/spark/connect/commands.proto
diff --git a/connect/src/main/protobuf/spark/connect/expressions.proto b/connector/connect/src/main/protobuf/spark/connect/expressions.proto
similarity index 100%
rename from connect/src/main/protobuf/spark/connect/expressions.proto
rename to connector/connect/src/main/protobuf/spark/connect/expressions.proto
diff --git a/connect/src/main/protobuf/spark/connect/relations.proto b/connector/connect/src/main/protobuf/spark/connect/relations.proto
similarity index 100%
rename from connect/src/main/protobuf/spark/connect/relations.proto
rename to connector/connect/src/main/protobuf/spark/connect/relations.proto
diff --git a/connect/src/main/protobuf/spark/connect/types.proto b/connector/connect/src/main/protobuf/spark/connect/types.proto
similarity index 100%
rename from connect/src/main/protobuf/spark/connect/types.proto
rename to connector/connect/src/main/protobuf/spark/connect/types.proto
diff --git a/connect/src/main/scala/org/apache/spark/sql/connect/SparkConnectPlugin.scala b/connector/connect/src/main/scala/org/apache/spark/sql/connect/SparkConnectPlugin.scala
similarity index 100%
rename from connect/src/main/scala/org/apache/spark/sql/connect/SparkConnectPlugin.scala
rename to connector/connect/src/main/scala/org/apache/spark/sql/connect/SparkConnectPlugin.scala
diff --git a/connect/src/main/scala/org/apache/spark/sql/connect/command/SparkConnectCommandPlanner.scala b/connector/connect/src/main/scala/org/apache/spark/sql/connect/command/SparkConnectCommandPlanner.scala
similarity index 100%
rename from connect/src/main/scala/org/apache/spark/sql/connect/command/SparkConnectCommandPlanner.scala
rename to connector/connect/src/main/scala/org/apache/spark/sql/connect/command/SparkConnectCommandPlanner.scala
diff --git a/connect/src/main/scala/org/apache/spark/sql/connect/config/Connect.scala b/connector/connect/src/main/scala/org/apache/spark/sql/connect/config/Connect.scala
similarity index 100%
rename from connect/src/main/scala/org/apache/spark/sql/connect/config/Connect.scala
rename to connector/connect/src/main/scala/org/apache/spark/sql/connect/config/Connect.scala
diff --git a/connect/src/main/scala/org/apache/spark/sql/connect/planner/SparkConnectPlanner.scala b/connector/connect/src/main/scala/org/apache/spark/sql/connect/planner/SparkConnectPlanner.scala
similarity index 100%
rename from connect/src/main/scala/org/apache/spark/sql/connect/planner/SparkConnectPlanner.scala
rename to connector/connect/src/main/scala/org/apache/spark/sql/connect/planner/SparkConnectPlanner.scala
diff --git a/connect/src/main/scala/org/apache/spark/sql/connect/service/SparkConnectService.scala b/connector/connect/src/main/scala/org/apache/spark/sql/connect/service/SparkConnectService.scala
similarity index 100%
rename from connect/src/main/scala/org/apache/spark/sql/connect/service/SparkConnectService.scala
rename to connector/connect/src/main/scala/org/apache/spark/sql/connect/service/SparkConnectService.scala
diff --git a/connect/src/main/scala/org/apache/spark/sql/connect/service/SparkConnectStreamHandler.scala b/connector/connect/src/main/scala/org/apache/spark/sql/connect/service/SparkConnectStreamHandler.scala
similarity index 100%
rename from connect/src/main/scala/org/apache/spark/sql/connect/service/SparkConnectStreamHandler.scala
rename to connector/connect/src/main/scala/org/apache/spark/sql/connect/service/SparkConnectStreamHandler.scala
diff --git a/connect/src/test/resources/log4j2.properties b/connector/connect/src/test/resources/log4j2.properties
similarity index 100%
rename from connect/src/test/resources/log4j2.properties
rename to connector/connect/src/test/resources/log4j2.properties
diff --git a/connect/src/test/scala/org/apache/spark/sql/connect/planner/SparkConnectPlannerSuite.scala b/connector/connect/src/test/scala/org/apache/spark/sql/connect/planner/SparkConnectPlannerSuite.scala
similarity index 100%
rename from connect/src/test/scala/org/apache/spark/sql/connect/planner/SparkConnectPlannerSuite.scala
rename to connector/connect/src/test/scala/org/apache/spark/sql/connect/planner/SparkConnectPlannerSuite.scala
diff --git a/dev/deps/spark-deps-hadoop-2-hive-2.3 b/dev/deps/spark-deps-hadoop-2-hive-2.3
index 6f35033c31d..5d8a79067ef 100644
--- a/dev/deps/spark-deps-hadoop-2-hive-2.3
+++ b/dev/deps/spark-deps-hadoop-2-hive-2.3
@@ -6,9 +6,7 @@ ST4/4.0.4//ST4-4.0.4.jar
 activation/1.1.1//activation-1.1.1.jar
 aircompressor/0.21//aircompressor-0.21.jar
 algebra_2.12/2.0.1//algebra_2.12-2.0.1.jar
-animal-sniffer-annotations/1.19//animal-sniffer-annotations-1.19.jar
 annotations/17.0.0//annotations-17.0.0.jar
-annotations/4.1.1.4//annotations-4.1.1.4.jar
 antlr-runtime/3.5.2//antlr-runtime-3.5.2.jar
 antlr4-runtime/4.9.3//antlr4-runtime-4.9.3.jar
 aopalliance-repackaged/2.6.1//aopalliance-repackaged-2.6.1.jar
@@ -64,19 +62,9 @@ datanucleus-core/4.1.17//datanucleus-core-4.1.17.jar
 datanucleus-rdbms/4.1.19//datanucleus-rdbms-4.1.19.jar
 derby/10.14.2.0//derby-10.14.2.0.jar
 dropwizard-metrics-hadoop-metrics2-reporter/0.1.2//dropwizard-metrics-hadoop-metrics2-reporter-0.1.2.jar
-error_prone_annotations/2.10.0//error_prone_annotations-2.10.0.jar
-failureaccess/1.0.1//failureaccess-1.0.1.jar
 flatbuffers-java/1.12.0//flatbuffers-java-1.12.0.jar
 gcs-connector/hadoop2-2.2.7/shaded/gcs-connector-hadoop2-2.2.7-shaded.jar
 gmetric4j/1.0.10//gmetric4j-1.0.10.jar
-grpc-api/1.47.0//grpc-api-1.47.0.jar
-grpc-context/1.47.0//grpc-context-1.47.0.jar
-grpc-core/1.47.0//grpc-core-1.47.0.jar
-grpc-netty-shaded/1.47.0//grpc-netty-shaded-1.47.0.jar
-grpc-protobuf-lite/1.47.0//grpc-protobuf-lite-1.47.0.jar
-grpc-protobuf/1.47.0//grpc-protobuf-1.47.0.jar
-grpc-services/1.47.0//grpc-services-1.47.0.jar
-grpc-stub/1.47.0//grpc-stub-1.47.0.jar
 gson/2.2.4//gson-2.2.4.jar
 guava/14.0.1//guava-14.0.1.jar
 guice-servlet/3.0//guice-servlet-3.0.jar
@@ -122,7 +110,6 @@ httpclient/4.5.13//httpclient-4.5.13.jar
 httpcore/4.4.14//httpcore-4.4.14.jar
 istack-commons-runtime/3.0.8//istack-commons-runtime-3.0.8.jar
 ivy/2.5.0//ivy-2.5.0.jar
-j2objc-annotations/1.3//j2objc-annotations-1.3.jar
 jackson-annotations/2.13.4//jackson-annotations-2.13.4.jar
 jackson-core-asl/1.9.13//jackson-core-asl-1.9.13.jar
 jackson-core/2.13.4//jackson-core-2.13.4.jar
@@ -245,10 +232,7 @@ parquet-encoding/1.12.3//parquet-encoding-1.12.3.jar
 parquet-format-structures/1.12.3//parquet-format-structures-1.12.3.jar
 parquet-hadoop/1.12.3//parquet-hadoop-1.12.3.jar
 parquet-jackson/1.12.3//parquet-jackson-1.12.3.jar
-perfmark-api/0.25.0//perfmark-api-0.25.0.jar
 pickle/1.2//pickle-1.2.jar
-proto-google-common-protos/2.0.1//proto-google-common-protos-2.0.1.jar
-protobuf-java-util/3.19.2//protobuf-java-util-3.19.2.jar
 protobuf-java/2.5.0//protobuf-java-2.5.0.jar
 py4j/0.10.9.7//py4j-0.10.9.7.jar
 remotetea-oncrpc/1.1.2//remotetea-oncrpc-1.1.2.jar
diff --git a/dev/deps/spark-deps-hadoop-3-hive-2.3 b/dev/deps/spark-deps-hadoop-3-hive-2.3
index de6a3fe5f30..9826d4a90f8 100644
--- a/dev/deps/spark-deps-hadoop-3-hive-2.3
+++ b/dev/deps/spark-deps-hadoop-3-hive-2.3
@@ -10,9 +10,7 @@ aliyun-java-sdk-core/4.5.10//aliyun-java-sdk-core-4.5.10.jar
 aliyun-java-sdk-kms/2.11.0//aliyun-java-sdk-kms-2.11.0.jar
 aliyun-java-sdk-ram/3.1.0//aliyun-java-sdk-ram-3.1.0.jar
 aliyun-sdk-oss/3.13.0//aliyun-sdk-oss-3.13.0.jar
-animal-sniffer-annotations/1.19//animal-sniffer-annotations-1.19.jar
 annotations/17.0.0//annotations-17.0.0.jar
-annotations/4.1.1.4//annotations-4.1.1.4.jar
 antlr-runtime/3.5.2//antlr-runtime-3.5.2.jar
 antlr4-runtime/4.9.3//antlr4-runtime-4.9.3.jar
 aopalliance-repackaged/2.6.1//aopalliance-repackaged-2.6.1.jar
@@ -61,19 +59,9 @@ datanucleus-core/4.1.17//datanucleus-core-4.1.17.jar
 datanucleus-rdbms/4.1.19//datanucleus-rdbms-4.1.19.jar
 derby/10.14.2.0//derby-10.14.2.0.jar
 dropwizard-metrics-hadoop-metrics2-reporter/0.1.2//dropwizard-metrics-hadoop-metrics2-reporter-0.1.2.jar
-error_prone_annotations/2.10.0//error_prone_annotations-2.10.0.jar
-failureaccess/1.0.1//failureaccess-1.0.1.jar
 flatbuffers-java/1.12.0//flatbuffers-java-1.12.0.jar
 gcs-connector/hadoop3-2.2.7/shaded/gcs-connector-hadoop3-2.2.7-shaded.jar
 gmetric4j/1.0.10//gmetric4j-1.0.10.jar
-grpc-api/1.47.0//grpc-api-1.47.0.jar
-grpc-context/1.47.0//grpc-context-1.47.0.jar
-grpc-core/1.47.0//grpc-core-1.47.0.jar
-grpc-netty-shaded/1.47.0//grpc-netty-shaded-1.47.0.jar
-grpc-protobuf-lite/1.47.0//grpc-protobuf-lite-1.47.0.jar
-grpc-protobuf/1.47.0//grpc-protobuf-1.47.0.jar
-grpc-services/1.47.0//grpc-services-1.47.0.jar
-grpc-stub/1.47.0//grpc-stub-1.47.0.jar
 gson/2.2.4//gson-2.2.4.jar
 guava/14.0.1//guava-14.0.1.jar
 hadoop-aliyun/3.3.4//hadoop-aliyun-3.3.4.jar
@@ -110,7 +98,6 @@ httpcore/4.4.14//httpcore-4.4.14.jar
 ini4j/0.5.4//ini4j-0.5.4.jar
 istack-commons-runtime/3.0.8//istack-commons-runtime-3.0.8.jar
 ivy/2.5.0//ivy-2.5.0.jar
-j2objc-annotations/1.3//j2objc-annotations-1.3.jar
 jackson-annotations/2.13.4//jackson-annotations-2.13.4.jar
 jackson-core-asl/1.9.13//jackson-core-asl-1.9.13.jar
 jackson-core/2.13.4//jackson-core-2.13.4.jar
@@ -232,10 +219,7 @@ parquet-encoding/1.12.3//parquet-encoding-1.12.3.jar
 parquet-format-structures/1.12.3//parquet-format-structures-1.12.3.jar
 parquet-hadoop/1.12.3//parquet-hadoop-1.12.3.jar
 parquet-jackson/1.12.3//parquet-jackson-1.12.3.jar
-perfmark-api/0.25.0//perfmark-api-0.25.0.jar
 pickle/1.2//pickle-1.2.jar
-proto-google-common-protos/2.0.1//proto-google-common-protos-2.0.1.jar
-protobuf-java-util/3.19.2//protobuf-java-util-3.19.2.jar
 protobuf-java/2.5.0//protobuf-java-2.5.0.jar
 py4j/0.10.9.7//py4j-0.10.9.7.jar
 remotetea-oncrpc/1.1.2//remotetea-oncrpc-1.1.2.jar
diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py
index f4753f6ddc7..98eb7c0fb1e 100644
--- a/dev/sparktestsupport/modules.py
+++ b/dev/sparktestsupport/modules.py
@@ -271,6 +271,17 @@ sql_kafka = Module(
     ],
 )
 
+connect = Module(
+    name="connect",
+    dependencies=[sql],
+    source_file_regexes=[
+        "connector/connect",
+    ],
+    sbt_test_goals=[
+        "connect/test",
+    ],
+)
+
 sketch = Module(
     name="sketch",
     dependencies=[tags],
@@ -473,18 +484,18 @@ pyspark_sql = Module(
     ],
 )
 
-pyspark_sql = Module(
-    name="pyspark-sql-connect",
-    dependencies=[pyspark_core, hive, avro],
+pyspark_connect = Module(
+    name="pyspark-connect",
+    dependencies=[pyspark_sql, connect],
     source_file_regexes=["python/pyspark/sql/connect"],
     python_test_goals=[
         # doctests
         # No doctests yet.
         # unittests
-        "pyspark.sql.tests.connect.test_column_expressions",
-        "pyspark.sql.tests.connect.test_plan_only",
-        "pyspark.sql.tests.connect.test_select_ops",
-        "pyspark.sql.tests.connect.test_spark_connect",
+        "pyspark.sql.tests.test_connect_column_expressions",
+        "pyspark.sql.tests.test_connect_plan_only",
+        "pyspark.sql.tests.test_connect_select_ops",
+        "pyspark.sql.tests.test_connect_basic",
    ],
     excluded_python_implementations=[
         "PyPy"  # Skip these tests under PyPy since they require numpy, pandas, and pyarrow and
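The new `connect` module above declares `sql` as its dependency and feeds `pyspark-connect`, so a change to `sql` now pulls both into the test matrix, as the updated doctests in `dev/sparktestsupport/utils.py` below show. A simplified sketch of that reverse-dependency selection (a model of what `determine_modules_to_test` computes, not its actual code):

```python
# Simplified model (not the real dev/sparktestsupport code) of how changed
# modules select downstream test modules via declared dependencies.
from dataclasses import dataclass


@dataclass(frozen=True)
class Module:
    name: str
    dependencies: tuple = ()


sql = Module("sql")
connect = Module("connect", (sql,))
pyspark_sql = Module("pyspark-sql", (sql,))
pyspark_connect = Module("pyspark-connect", (pyspark_sql, connect))
all_modules = [sql, connect, pyspark_sql, pyspark_connect]


def modules_to_test(changed):
    # A module is tested if it changed or (transitively) depends on one
    # that is tested; iterate to a fixed point.
    selected = set(changed)
    size = -1
    while len(selected) != size:
        size = len(selected)
        selected.update(m for m in all_modules
                        if any(dep in selected for dep in m.dependencies))
    return sorted(m.name for m in selected)


print(modules_to_test({sql}))  # ['connect', 'pyspark-connect', 'pyspark-sql', 'sql']
```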
diff --git a/dev/sparktestsupport/utils.py b/dev/sparktestsupport/utils.py
index 800b7e0d932..11d64c4f0bc 100755
--- a/dev/sparktestsupport/utils.py
+++ b/dev/sparktestsupport/utils.py
@@ -108,23 +108,23 @@ def determine_modules_to_test(changed_modules, deduplicated=True):
     ['graphx', 'examples']
     >>> [x.name for x in determine_modules_to_test([modules.sql])]
     ... # doctest: +NORMALIZE_WHITESPACE
-    ['sql', 'avro', 'docker-integration-tests', 'hive', 'mllib', 'sql-kafka-0-10', 'examples',
-    'hive-thriftserver', 'pyspark-sql', 'pyspark-sql-connect', 'repl', 'sparkr',
+    ['sql', 'avro', 'connect', 'docker-integration-tests', 'hive', 'mllib', 'sql-kafka-0-10',
+    'examples', 'hive-thriftserver', 'pyspark-sql', 'repl', 'sparkr', 'pyspark-connect',
     'pyspark-mllib', 'pyspark-pandas', 'pyspark-pandas-slow', 'pyspark-ml']
     >>> sorted([x.name for x in determine_modules_to_test(
     ...     [modules.sparkr, modules.sql], deduplicated=False)])
     ... # doctest: +NORMALIZE_WHITESPACE
-    ['avro', 'docker-integration-tests', 'examples', 'hive', 'hive-thriftserver', 'mllib',
-    'pyspark-ml', 'pyspark-mllib', 'pyspark-pandas', 'pyspark-pandas-slow', 'pyspark-sql',
-    'pyspark-sql-connect', 'repl', 'sparkr', 'sql', 'sql-kafka-0-10']
+    ['avro', 'connect', 'docker-integration-tests', 'examples', 'hive', 'hive-thriftserver',
+    'mllib', 'pyspark-connect', 'pyspark-ml', 'pyspark-mllib', 'pyspark-pandas',
+    'pyspark-pandas-slow', 'pyspark-sql', 'repl', 'sparkr', 'sql', 'sql-kafka-0-10']
     >>> sorted([x.name for x in determine_modules_to_test(
     ...     [modules.sql, modules.core], deduplicated=False)])
     ... # doctest: +NORMALIZE_WHITESPACE
-    ['avro', 'catalyst', 'core', 'docker-integration-tests', 'examples', 'graphx', 'hive',
-    'hive-thriftserver', 'mllib', 'mllib-local', 'pyspark-core', 'pyspark-ml', 'pyspark-mllib',
-    'pyspark-pandas', 'pyspark-pandas-slow', 'pyspark-resource', 'pyspark-sql',
-    'pyspark-sql-connect', 'pyspark-streaming', 'repl', 'root', 'sparkr', 'sql',
-    'sql-kafka-0-10', 'streaming', 'streaming-kafka-0-10', 'streaming-kinesis-asl']
+    ['avro', 'catalyst', 'connect', 'core', 'docker-integration-tests', 'examples', 'graphx',
+    'hive', 'hive-thriftserver', 'mllib', 'mllib-local', 'pyspark-connect', 'pyspark-core',
+    'pyspark-ml', 'pyspark-mllib', 'pyspark-pandas', 'pyspark-pandas-slow', 'pyspark-resource',
+    'pyspark-sql', 'pyspark-streaming', 'repl', 'root', 'sparkr', 'sql', 'sql-kafka-0-10',
+    'streaming', 'streaming-kafka-0-10', 'streaming-kinesis-asl']
     """
     modules_to_test = set()
     for module in changed_modules:
diff --git a/pom.xml b/pom.xml
index 371bb0a8e8b..e6f6e999576 100644
--- a/pom.xml
+++ b/pom.xml
@@ -100,7 +100,7 @@
     <module>connector/kafka-0-10-assembly</module>
     <module>connector/kafka-0-10-sql</module>
     <module>connector/avro</module>
-    <module>connect</module>
+    <module>connector/connect</module>
 
     <!-- See additional modules enabled by profiles below -->
   </modules>
diff --git a/python/mypy.ini b/python/mypy.ini
index 1094f33e833..baf1a4048eb 100644
--- a/python/mypy.ini
+++ b/python/mypy.ini
@@ -26,12 +26,6 @@ warn_redundant_casts = True
 [mypy-pyspark.sql.connect.proto.*]
 ignore_errors = True
 
-; TODO(SPARK-40537) reenable mypi support.
-[mypy-pyspark.sql.tests.connect.*]
-disallow_untyped_defs = False
-ignore_missing_imports = True
-ignore_errors = True
-
 ; Allow untyped def in internal modules and tests
 
 [mypy-pyspark.daemon]
@@ -78,6 +72,9 @@ disallow_untyped_defs = False
 
 [mypy-pyspark.sql.tests.*]
 disallow_untyped_defs = False
+; TODO(SPARK-40537) reenable mypi support.
+ignore_missing_imports = True
+ignore_errors = True
 
 [mypy-pyspark.sql.pandas.serializers]
 disallow_untyped_defs = False
diff --git a/python/pyspark/sql/connect/README.md b/python/pyspark/sql/connect/README.md
index e79e9aae9dd..ac3926a28b9 100644
--- a/python/pyspark/sql/connect/README.md
+++ b/python/pyspark/sql/connect/README.md
@@ -1,5 +1,4 @@
-
-# [EXPERIMENTAL] Spark Connect
+# Spark Connect
 
 **Spark Connect is a strictly experimental feature and under heavy development.
 All APIs should be considered volatile and should not be used in production.**
@@ -8,30 +7,32 @@ This module contains the implementation of Spark Connect which is a logical plan
 facade for the implementation in Spark. Spark Connect is directly integrated into the build
 of Spark.
 
 To enable it, you only need to activate the driver plugin for Spark Connect.
-
-
-
 ## Build
 
 1. Build Spark as usual per the documentation.
+
 2. Build and package the Spark Connect package
+
 ```bash
 ./build/mvn -Phive package
 ```
+
 or
-```shell
+
+```bash
 ./build/sbt -Phive package
 ```
 
 ## Run Spark Shell
 
 ```bash
-./bin/spark-shell --conf spark.plugins=org.apache.spark.sql.connect.SparkConnectPlugin
+./bin/spark-shell \
+  --packages org.apache.spark:spark-connect_2.12:3.4.0 \
+  --conf spark.plugins=org.apache.spark.sql.connect.SparkConnectPlugin
 ```
 
 ## Run Tests
-
 ```bash
 ./run-tests --testnames 'pyspark.sql.tests.connect.test_spark_connect'
 ```
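The README changes above pair with the experimental Python client that the relocated tests below exercise. A rough sketch of the round trip those tests perform; the client API is explicitly volatile, and the zero-argument construction here is an assumption, not documented usage:

```python
# Rough sketch of the client flow the tests below exercise; the experimental
# API is volatile, and zero-argument construction is assumed for illustration.
from pyspark.sql.connect.client import RemoteSparkSession

session = RemoteSparkSession()         # assumed: talks to a local plugin-backed server
df = session.read.table("test_table")  # only builds a logical plan client-side
pdf = df.limit(10).toPandas()          # executes remotely over the gRPC service
print(pdf.shape)
```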
diff --git a/python/pyspark/sql/tests/connect/__init__.py b/python/pyspark/sql/tests/connect/__init__.py
deleted file mode 100644
index cce3acad34a..00000000000
--- a/python/pyspark/sql/tests/connect/__init__.py
+++ /dev/null
@@ -1,16 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements.  See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License.  You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
diff --git a/python/pyspark/sql/tests/connect/utils/__init__.py b/python/pyspark/sql/tests/connect/utils/__init__.py
deleted file mode 100644
index b95812c8a29..00000000000
--- a/python/pyspark/sql/tests/connect/utils/__init__.py
+++ /dev/null
@@ -1,20 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements.  See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License.  You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-from pyspark.sql.tests.connect.utils.spark_connect_test_utils import (  # noqa: F401
-    PlanOnlyTestFixture,  # noqa: F401
-)  # noqa: F401
diff --git a/python/pyspark/sql/tests/connect/test_spark_connect.py b/python/pyspark/sql/tests/test_connect_basic.py
similarity index 91%
rename from python/pyspark/sql/tests/connect/test_spark_connect.py
rename to python/pyspark/sql/tests/test_connect_basic.py
index 7e891c5cf19..3e83e1bd6ea 100644
--- a/python/pyspark/sql/tests/connect/test_spark_connect.py
+++ b/python/pyspark/sql/tests/test_connect_basic.py
@@ -22,13 +22,18 @@ import tempfile
 
 from pyspark.sql import SparkSession, Row
 from pyspark.sql.connect.client import RemoteSparkSession
 from pyspark.sql.connect.function_builder import udf
+from pyspark.testing.connectutils import should_test_connect, connect_requirement_message
 from pyspark.testing.utils import ReusedPySparkTestCase
 
 
+@unittest.skipIf(not should_test_connect, connect_requirement_message)
 class SparkConnectSQLTestCase(ReusedPySparkTestCase):
     """Parent test fixture class for all Spark Connect related test cases."""
 
+    connect = RemoteSparkSession
+    tbl_name = str
+
     @classmethod
     def setUpClass(cls: Any) -> None:
         ReusedPySparkTestCase.setUpClass()
@@ -55,7 +60,6 @@ class SparkConnectSQLTestCase(ReusedPySparkTestCase):
 
 class SparkConnectTests(SparkConnectSQLTestCase):
     def test_simple_read(self) -> None:
-        """Tests that we can access the Spark Connect GRPC service locally."""
         df = self.connect.read.table(self.tbl_name)
         data = df.limit(10).toPandas()
         # Check that the limit is applied
@@ -77,7 +81,7 @@ class SparkConnectTests(SparkConnectSQLTestCase):
 
 
 if __name__ == "__main__":
-    from pyspark.sql.tests.connect.test_spark_connect import *  # noqa: F401
+    from pyspark.sql.tests.test_connect_basic import *  # noqa: F401
 
     try:
         import xmlrunner  # type: ignore
diff --git a/python/pyspark/sql/tests/connect/test_column_expressions.py b/python/pyspark/sql/tests/test_connect_column_expressions.py
similarity index 94%
rename from python/pyspark/sql/tests/connect/test_column_expressions.py
rename to python/pyspark/sql/tests/test_connect_column_expressions.py
index 1f067bf7995..fc80d137b6c 100644
--- a/python/pyspark/sql/tests/connect/test_column_expressions.py
+++ b/python/pyspark/sql/tests/test_connect_column_expressions.py
@@ -14,12 +14,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-from pyspark.sql.tests.connect.utils import PlanOnlyTestFixture
+from pyspark.testing.connectutils import PlanOnlyTestFixture
 
 import pyspark.sql.connect as c
 import pyspark.sql.connect.plan as p
 import pyspark.sql.connect.column as col
-
 import pyspark.sql.connect.functions as fun
 
 
@@ -54,7 +53,7 @@ class SparkConnectColumnExpressionSuite(PlanOnlyTestFixture):
 if __name__ == "__main__":
     import unittest
 
-    from pyspark.sql.tests.connect.test_column_expressions import *  # noqa: F401
+    from pyspark.sql.tests.test_connect_column_expressions import *  # noqa: F401
 
     try:
         import xmlrunner  # type: ignore
diff --git a/python/pyspark/sql/tests/connect/test_plan_only.py b/python/pyspark/sql/tests/test_connect_plan_only.py
similarity index 94%
rename from python/pyspark/sql/tests/connect/test_plan_only.py
rename to python/pyspark/sql/tests/test_connect_plan_only.py
index 9e6d30cbe1f..ad59a682e9b 100644
--- a/python/pyspark/sql/tests/connect/test_plan_only.py
+++ b/python/pyspark/sql/tests/test_connect_plan_only.py
@@ -16,10 +16,10 @@
 #
 import unittest
 
+from pyspark.testing.connectutils import PlanOnlyTestFixture
 from pyspark.sql.connect import DataFrame
 from pyspark.sql.connect.plan import Read
 from pyspark.sql.connect.function_builder import UserDefinedFunction, udf
-from pyspark.sql.tests.connect.utils.spark_connect_test_utils import PlanOnlyTestFixture
 from pyspark.sql.types import StringType
 
 
@@ -64,7 +64,7 @@ class SparkConnectTestsPlanOnly(PlanOnlyTestFixture):
 
 
 if __name__ == "__main__":
-    from pyspark.sql.tests.connect.test_plan_only import *  # noqa: F401
+    from pyspark.sql.tests.test_connect_plan_only import *  # noqa: F401
 
     try:
         import xmlrunner  # type: ignore
diff --git a/python/pyspark/sql/tests/connect/test_select_ops.py b/python/pyspark/sql/tests/test_connect_select_ops.py
similarity index 92%
rename from python/pyspark/sql/tests/connect/test_select_ops.py
rename to python/pyspark/sql/tests/test_connect_select_ops.py
index 818f82b33e8..fc624b0d5cc 100644
--- a/python/pyspark/sql/tests/connect/test_select_ops.py
+++ b/python/pyspark/sql/tests/test_connect_select_ops.py
@@ -14,7 +14,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-from pyspark.sql.tests.connect.utils import PlanOnlyTestFixture
+from pyspark.testing.connectutils import PlanOnlyTestFixture
 from pyspark.sql.connect import DataFrame
 from pyspark.sql.connect.functions import col
 from pyspark.sql.connect.plan import Read, InputValidationError
@@ -29,7 +29,7 @@ class SparkConnectSelectOpsSuite(PlanOnlyTestFixture):
 if __name__ == "__main__":
     import unittest
 
-    from pyspark.sql.tests.connect.test_select_ops import *  # noqa: F401
+    from pyspark.sql.tests.test_connect_select_ops import *  # noqa: F401
 
     try:
         import xmlrunner  # type: ignore
diff --git a/python/pyspark/sql/tests/connect/utils/spark_connect_test_utils.py b/python/pyspark/testing/connectutils.py
similarity index 60%
rename from python/pyspark/sql/tests/connect/utils/spark_connect_test_utils.py
rename to python/pyspark/testing/connectutils.py
index 34bf49db494..dc66526010f 100644
--- a/python/pyspark/sql/tests/connect/utils/spark_connect_test_utils.py
+++ b/python/pyspark/testing/connectutils.py
@@ -14,11 +14,31 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+import os
 from typing import Any, Dict
 import functools
 import unittest
 import uuid
 
+from pyspark.testing.utils import search_jar
+
+
+connect_jar = search_jar("connector/connect", "spark-connect-assembly-", "spark-connect")
+if connect_jar is None:
+    connect_requirement_message = (
+        "Skipping all Spark Connect Python tests as the optional Spark Connect project was "
+        "not compiled into a JAR. To run these tests, you need to build Spark with "
+        "'build/sbt package' or 'build/mvn package' before running this test."
+    )
+else:
+    existing_args = os.environ.get("PYSPARK_SUBMIT_ARGS", "pyspark-shell")
+    jars_args = "--jars %s" % connect_jar
+    plugin_args = "--conf spark.plugins=org.apache.spark.sql.connect.SparkConnectPlugin"
+    os.environ["PYSPARK_SUBMIT_ARGS"] = " ".join([jars_args, plugin_args, existing_args])
+    connect_requirement_message = None  # type: ignore
+
+should_test_connect = connect_requirement_message is None
+
 
 class MockRemoteSession:
     def __init__(self) -> None:
@@ -33,6 +53,7 @@ class MockRemoteSession:
         return functools.partial(self.hooks[item])
 
 
+@unittest.skipIf(not should_test_connect, connect_requirement_message)
 class PlanOnlyTestFixture(unittest.TestCase):
     @classmethod
     def setUpClass(cls: Any) -> None:
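With the helper moved to `pyspark.testing.connectutils`, the JAR discovery and plugin wiring happen once at import time, so any test process importing the module inherits a fully configured `PYSPARK_SUBMIT_ARGS`. A standalone sketch of the effect (the JAR path is a placeholder for whatever `search_jar` finds):

```python
# Standalone model of the import-time setup in pyspark.testing.connectutils;
# the JAR path below is a placeholder, not a real build artifact location.
import os

connect_jar = "/path/to/spark-connect-assembly-3.4.0-SNAPSHOT.jar"

existing_args = os.environ.get("PYSPARK_SUBMIT_ARGS", "pyspark-shell")
jars_args = "--jars %s" % connect_jar
plugin_args = "--conf spark.plugins=org.apache.spark.sql.connect.SparkConnectPlugin"
os.environ["PYSPARK_SUBMIT_ARGS"] = " ".join([jars_args, plugin_args, existing_args])

# Resulting value (one line): --jars <jar> --conf spark.plugins=... pyspark-shell
print(os.environ["PYSPARK_SUBMIT_ARGS"])
```

This is also why `run-tests.py` below can drop its per-test plugin wiring: the gating now lives with the tests themselves.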
diff --git a/python/run-tests.py b/python/run-tests.py
index af4c6f1c94b..19e39c822cb 100755
--- a/python/run-tests.py
+++ b/python/run-tests.py
@@ -110,15 +110,6 @@ def run_individual_python_test(target_dir, test_name, pyspark_python, keep_test_
     metastore_dir = os.path.join(metastore_dir, str(uuid.uuid4()))
     os.mkdir(metastore_dir)
 
-    # Check if we should enable the SparkConnectPlugin
-    additional_config = []
-    if test_name.startswith("pyspark.sql.tests.connect"):
-        # Adding Spark Connect JAR and Config
-        additional_config += [
-            "--conf",
-            "spark.plugins=org.apache.spark.sql.connect.SparkConnectPlugin"
-        ]
-
     # Also override the JVM's temp directory by setting driver and executor options.
     java_options = "-Djava.io.tmpdir={0}".format(tmp_dir)
     java_options = java_options + " -Dio.netty.tryReflectionSetAccessible=true -Xss4M"
@@ -126,9 +117,8 @@ def run_individual_python_test(target_dir, test_name, pyspark_python, keep_test_
         "--conf", "spark.driver.extraJavaOptions='{0}'".format(java_options),
         "--conf", "spark.executor.extraJavaOptions='{0}'".format(java_options),
         "--conf", "spark.sql.warehouse.dir='{0}'".format(metastore_dir),
+        "pyspark-shell",
     ]
-    spark_args += additional_config
-    spark_args += ["pyspark-shell"]
 
     env["PYSPARK_SUBMIT_ARGS"] = " ".join(spark_args)

---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org