This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a commit to branch branch-3.2 in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.2 by this push: new 864b89b [SPARK-38563][PYTHON] Upgrade to Py4J 0.10.9.5 864b89b is described below commit 864b89bbea8c2620938b28661fc8bea6271c7048 Author: Hyukjin Kwon <gurwls...@apache.org> AuthorDate: Fri Mar 18 14:00:48 2022 +0900 [SPARK-38563][PYTHON] Upgrade to Py4J 0.10.9.5 This PR is a retry of https://github.com/apache/spark/pull/35871, bumping the version to 0.10.9.5. It was reverted because Python 3.10 was broken, and Python 3.10 was not officially supported in Py4J. In Py4J 0.10.9.5, the issue was fixed (https://github.com/py4j/py4j/pull/475), and Python 3.10 support was officially added with CI set up (https://github.com/py4j/py4j/pull/477). See https://github.com/apache/spark/pull/35871 Py4J sets up Python 3.10 CI now, and I manually tested PySpark with Python 3.10 with this patch: ```bash ./bin/pyspark ``` ``` import py4j py4j.__version__ spark.range(10).show() ``` ``` Using Python version 3.10.0 (default, Mar 3 2022 03:57:21) Spark context Web UI available at http://172.30.5.50:4040 Spark context available as 'sc' (master = local[*], app id = local-1647571387534). SparkSession available as 'spark'. >>> import py4j >>> py4j.__version__ '0.10.9.5' >>> spark.range(10).show() +---+ | id| +---+ ... ``` Closes #35907 from HyukjinKwon/SPARK-38563-followup. 
Authored-by: Hyukjin Kwon <gurwls...@apache.org> Signed-off-by: Hyukjin Kwon <gurwls...@apache.org> (cherry picked from commit 97335ea037a9a036c013c86ef62d74ca638f808e) Signed-off-by: Hyukjin Kwon <gurwls...@apache.org> --- bin/pyspark | 2 +- bin/pyspark2.cmd | 2 +- core/pom.xml | 2 +- .../org/apache/spark/api/python/PythonUtils.scala | 2 +- dev/deps/spark-deps-hadoop-2.7-hive-2.3 | 2 +- dev/deps/spark-deps-hadoop-3.2-hive-2.3 | 2 +- docs/job-scheduling.md | 2 +- python/docs/Makefile | 2 +- python/docs/make2.bat | 2 +- python/docs/source/getting_started/install.rst | 2 +- python/lib/py4j-0.10.9.3-src.zip | Bin 42021 -> 0 bytes python/lib/py4j-0.10.9.5-src.zip | Bin 0 -> 42404 bytes python/pyspark/context.py | 6 ++-- python/pyspark/util.py | 32 +++------------------ python/setup.py | 2 +- sbin/spark-config.sh | 2 +- 16 files changed, 19 insertions(+), 43 deletions(-) diff --git a/bin/pyspark b/bin/pyspark index 4840589..21a514e 100755 --- a/bin/pyspark +++ b/bin/pyspark @@ -50,7 +50,7 @@ export PYSPARK_DRIVER_PYTHON_OPTS # Add the PySpark classes to the Python path: export PYTHONPATH="${SPARK_HOME}/python/:$PYTHONPATH" -export PYTHONPATH="${SPARK_HOME}/python/lib/py4j-0.10.9.3-src.zip:$PYTHONPATH" +export PYTHONPATH="${SPARK_HOME}/python/lib/py4j-0.10.9.5-src.zip:$PYTHONPATH" # Load the PySpark shell.py script when ./pyspark is used interactively: export OLD_PYTHONSTARTUP="$PYTHONSTARTUP" diff --git a/bin/pyspark2.cmd b/bin/pyspark2.cmd index a19627a..eec02a4 100644 --- a/bin/pyspark2.cmd +++ b/bin/pyspark2.cmd @@ -30,7 +30,7 @@ if "x%PYSPARK_DRIVER_PYTHON%"=="x" ( ) set PYTHONPATH=%SPARK_HOME%\python;%PYTHONPATH% -set PYTHONPATH=%SPARK_HOME%\python\lib\py4j-0.10.9.3-src.zip;%PYTHONPATH% +set PYTHONPATH=%SPARK_HOME%\python\lib\py4j-0.10.9.5-src.zip;%PYTHONPATH% set OLD_PYTHONSTARTUP=%PYTHONSTARTUP% set PYTHONSTARTUP=%SPARK_HOME%\python\pyspark\shell.py diff --git a/core/pom.xml b/core/pom.xml index 3833794..4926664 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ 
-433,7 +433,7 @@ <dependency> <groupId>net.sf.py4j</groupId> <artifactId>py4j</artifactId> - <version>0.10.9.3</version> + <version>0.10.9.5</version> </dependency> <dependency> <groupId>org.apache.spark</groupId> diff --git a/core/src/main/scala/org/apache/spark/api/python/PythonUtils.scala b/core/src/main/scala/org/apache/spark/api/python/PythonUtils.scala index 8daba86..6336171 100644 --- a/core/src/main/scala/org/apache/spark/api/python/PythonUtils.scala +++ b/core/src/main/scala/org/apache/spark/api/python/PythonUtils.scala @@ -27,7 +27,7 @@ import org.apache.spark.SparkContext import org.apache.spark.api.java.{JavaRDD, JavaSparkContext} private[spark] object PythonUtils { - val PY4J_ZIP_NAME = "py4j-0.10.9.3-src.zip" + val PY4J_ZIP_NAME = "py4j-0.10.9.5-src.zip" /** Get the PYTHONPATH for PySpark, either from SPARK_HOME, if it is set, or from our JAR */ def sparkPythonPath: String = { diff --git a/dev/deps/spark-deps-hadoop-2.7-hive-2.3 b/dev/deps/spark-deps-hadoop-2.7-hive-2.3 index c2882bd..23da066 100644 --- a/dev/deps/spark-deps-hadoop-2.7-hive-2.3 +++ b/dev/deps/spark-deps-hadoop-2.7-hive-2.3 @@ -208,7 +208,7 @@ parquet-format-structures/1.12.2//parquet-format-structures-1.12.2.jar parquet-hadoop/1.12.2//parquet-hadoop-1.12.2.jar parquet-jackson/1.12.2//parquet-jackson-1.12.2.jar protobuf-java/2.5.0//protobuf-java-2.5.0.jar -py4j/0.10.9.3//py4j-0.10.9.3.jar +py4j/0.10.9.5//py4j-0.10.9.5.jar pyrolite/4.30//pyrolite-4.30.jar rocksdbjni/6.20.3//rocksdbjni-6.20.3.jar scala-collection-compat_2.12/2.1.1//scala-collection-compat_2.12-2.1.1.jar diff --git a/dev/deps/spark-deps-hadoop-3.2-hive-2.3 b/dev/deps/spark-deps-hadoop-3.2-hive-2.3 index be4c7b8..fb69fc5 100644 --- a/dev/deps/spark-deps-hadoop-3.2-hive-2.3 +++ b/dev/deps/spark-deps-hadoop-3.2-hive-2.3 @@ -179,7 +179,7 @@ parquet-format-structures/1.12.2//parquet-format-structures-1.12.2.jar parquet-hadoop/1.12.2//parquet-hadoop-1.12.2.jar parquet-jackson/1.12.2//parquet-jackson-1.12.2.jar 
protobuf-java/2.5.0//protobuf-java-2.5.0.jar -py4j/0.10.9.3//py4j-0.10.9.3.jar +py4j/0.10.9.5//py4j-0.10.9.5.jar pyrolite/4.30//pyrolite-4.30.jar rocksdbjni/6.20.3//rocksdbjni-6.20.3.jar scala-collection-compat_2.12/2.1.1//scala-collection-compat_2.12-2.1.1.jar diff --git a/docs/job-scheduling.md b/docs/job-scheduling.md index 65305fe..40dbd5a 100644 --- a/docs/job-scheduling.md +++ b/docs/job-scheduling.md @@ -301,5 +301,5 @@ via `sc.setJobGroup` in a separate PVM thread, which also disallows to cancel th later. `pyspark.InheritableThread` is recommended to use together for a PVM thread to inherit the inheritable attributes - such as local properties in a JVM thread, and to avoid resource leak. + such as local properties in a JVM thread. diff --git a/python/docs/Makefile b/python/docs/Makefile index 9cb1a17..14e5214 100644 --- a/python/docs/Makefile +++ b/python/docs/Makefile @@ -21,7 +21,7 @@ SPHINXBUILD ?= sphinx-build SOURCEDIR ?= source BUILDDIR ?= build -export PYTHONPATH=$(realpath ..):$(realpath ../lib/py4j-0.10.9.3-src.zip) +export PYTHONPATH=$(realpath ..):$(realpath ../lib/py4j-0.10.9.5-src.zip) # Put it first so that "make" without argument is like "make help". 
help: diff --git a/python/docs/make2.bat b/python/docs/make2.bat index 2e4e2b5..d36b7a1 100644 --- a/python/docs/make2.bat +++ b/python/docs/make2.bat @@ -25,7 +25,7 @@ if "%SPHINXBUILD%" == "" ( set SOURCEDIR=source set BUILDDIR=build -set PYTHONPATH=..;..\lib\py4j-0.10.9.3-src.zip +set PYTHONPATH=..;..\lib\py4j-0.10.9.5-src.zip if "%1" == "" goto help diff --git a/python/docs/source/getting_started/install.rst b/python/docs/source/getting_started/install.rst index ee66067..e98b7d2 100644 --- a/python/docs/source/getting_started/install.rst +++ b/python/docs/source/getting_started/install.rst @@ -157,7 +157,7 @@ Package Minimum supported version Note `pandas` 0.23.2 Optional for Spark SQL `NumPy` 1.7 Required for MLlib DataFrame-based API `pyarrow` 1.0.0 Optional for Spark SQL -`Py4J` 0.10.9.3 Required +`Py4J` 0.10.9.5 Required `pandas` 0.23.2 Required for pandas API on Spark `pyarrow` 1.0.0 Required for pandas API on Spark `Numpy` 1.14 Required for pandas API on Spark diff --git a/python/lib/py4j-0.10.9.3-src.zip b/python/lib/py4j-0.10.9.3-src.zip deleted file mode 100644 index 428f3ac..0000000 Binary files a/python/lib/py4j-0.10.9.3-src.zip and /dev/null differ diff --git a/python/lib/py4j-0.10.9.5-src.zip b/python/lib/py4j-0.10.9.5-src.zip new file mode 100644 index 0000000..478d4b0 Binary files /dev/null and b/python/lib/py4j-0.10.9.5-src.zip differ diff --git a/python/pyspark/context.py b/python/pyspark/context.py index 6c94106..4d58dd1 100644 --- a/python/pyspark/context.py +++ b/python/pyspark/context.py @@ -1113,7 +1113,7 @@ class SparkContext(object): to HDFS-1208, where HDFS may respond to Thread.interrupt() by marking nodes as dead. If you run jobs in parallel, use :class:`pyspark.InheritableThread` for thread - local inheritance, and preventing resource leak. + local inheritance. 
Examples -------- @@ -1153,7 +1153,7 @@ class SparkContext(object): Notes ----- If you run jobs in parallel, use :class:`pyspark.InheritableThread` for thread - local inheritance, and preventing resource leak. + local inheritance. """ self._jsc.setLocalProperty(key, value) @@ -1171,7 +1171,7 @@ class SparkContext(object): Notes ----- If you run jobs in parallel, use :class:`pyspark.InheritableThread` for thread - local inheritance, and preventing resource leak. + local inheritance. """ self._jsc.setJobDescription(value) diff --git a/python/pyspark/util.py b/python/pyspark/util.py index e0933f1..1e2c848 100644 --- a/python/pyspark/util.py +++ b/python/pyspark/util.py @@ -320,12 +320,9 @@ def inheritable_thread_target(f): @functools.wraps(f) def wrapped(*args, **kwargs): - try: - # Set local properties in child thread. - SparkContext._active_spark_context._jsc.sc().setLocalProperties(properties) - return f(*args, **kwargs) - finally: - InheritableThread._clean_py4j_conn_for_current_thread() + # Set local properties in child thread. + SparkContext._active_spark_context._jsc.sc().setLocalProperties(properties) + return f(*args, **kwargs) return wrapped else: return f @@ -360,10 +357,7 @@ class InheritableThread(threading.Thread): # self._props is set before starting the thread to match the behavior with JVM. 
assert hasattr(self, "_props") SparkContext._active_spark_context._jsc.sc().setLocalProperties(self._props) - try: - return target(*a, **k) - finally: - InheritableThread._clean_py4j_conn_for_current_thread() + return target(*a, **k) super(InheritableThread, self).__init__( target=copy_local_properties, *args, **kwargs) @@ -380,24 +374,6 @@ class InheritableThread(threading.Thread): self._props = SparkContext._active_spark_context._jsc.sc().getLocalProperties().clone() return super(InheritableThread, self).start(*args, **kwargs) - @staticmethod - def _clean_py4j_conn_for_current_thread(): - from pyspark import SparkContext - - jvm = SparkContext._jvm - thread_connection = jvm._gateway_client.get_thread_connection() - if thread_connection is not None: - try: - # Dequeue is shared across other threads but it's thread-safe. - # If this function has to be invoked one more time in the same thead - # Py4J will create a new connection automatically. - jvm._gateway_client.deque.remove(thread_connection) - except ValueError: - # Should never reach this point - return - finally: - thread_connection.close() - if __name__ == "__main__": if "pypy" not in platform.python_implementation().lower() and sys.version_info[:2] >= (3, 7): diff --git a/python/setup.py b/python/setup.py index 962f232..8a873c2 100755 --- a/python/setup.py +++ b/python/setup.py @@ -258,7 +258,7 @@ try: license='http://www.apache.org/licenses/LICENSE-2.0', # Don't forget to update python/docs/source/getting_started/install.rst # if you're updating the versions or dependencies. 
- install_requires=['py4j==0.10.9.3'], + install_requires=['py4j==0.10.9.5'], extras_require={ 'ml': ['numpy>=1.7'], 'mllib': ['numpy>=1.7'], diff --git a/sbin/spark-config.sh b/sbin/spark-config.sh index f27b6fe..6044de2 100755 --- a/sbin/spark-config.sh +++ b/sbin/spark-config.sh @@ -28,6 +28,6 @@ export SPARK_CONF_DIR="${SPARK_CONF_DIR:-"${SPARK_HOME}/conf"}" # Add the PySpark classes to the PYTHONPATH: if [ -z "${PYSPARK_PYTHONPATH_SET}" ]; then export PYTHONPATH="${SPARK_HOME}/python:${PYTHONPATH}" - export PYTHONPATH="${SPARK_HOME}/python/lib/py4j-0.10.9.3-src.zip:${PYTHONPATH}" + export PYTHONPATH="${SPARK_HOME}/python/lib/py4j-0.10.9.5-src.zip:${PYTHONPATH}" export PYSPARK_PYTHONPATH_SET=1 fi --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org