This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
     new 5299e5423f4  [SPARK-44823][PYTHON] Update black to 23.9.1 and fix erroneous check
5299e5423f4 is described below

commit 5299e5423f439ce702d14666c79f9d42eb446943
Author: panbingkun <pbk1...@gmail.com>
AuthorDate: Mon Sep 18 16:34:34 2023 +0900

    [SPARK-44823][PYTHON] Update black to 23.9.1 and fix erroneous check

    ### What changes were proposed in this pull request?
    This PR updates black from 22.6.0 to 23.9.1 and fixes an erroneous check.

    ### Why are the changes needed?
    The `22.6.0` release dates back to Jun 28, 2022, more than a year ago
    (https://pypi.org/project/black/23.7.0/#history). Updating brings PySpark's
    code style in line with the requirements of the latest Black release.

    Release notes:
    - 23.9.1: https://github.com/psf/black/blob/main/CHANGES.md#2391
    - 23.9.0: https://github.com/psf/black/blob/main/CHANGES.md#2390
    - 23.7.0: https://github.com/psf/black/blob/main/CHANGES.md#2370
    - 23.3.0: https://github.com/psf/black/blob/main/CHANGES.md#2330
    - 23.1.0: https://github.com/psf/black/blob/main/CHANGES.md#2310
    - 22.12.0: https://github.com/psf/black/blob/main/CHANGES.md#22120
    - 22.10.0: https://github.com/psf/black/blob/main/CHANGES.md#22100
    - 22.8.0: https://github.com/psf/black/blob/main/CHANGES.md#2280

    ### Does this PR introduce _any_ user-facing change?
    No.

    ### How was this patch tested?
    Pass GA.
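    Most of the diff below is mechanical churn from the new Black rules rather
    than the version bump itself. As a rough illustration (hypothetical names,
    not code from this commit), the two changes that account for most hunks are
    the removal of a blank line directly after a `def`/`class` header and the
    removal of parentheses around tuple targets in `for` loops, both visible
    throughout this diff:

    ```python
    # Hypothetical example showing what Black 23.x rewrites that 22.x left alone.


    class Config:
        # Black 22.x kept a blank line between the class header and this
        # statement; 23.x deletes it (hence the many one-line deletions below).
        entries = {"spark.app.name": "demo"}


    def dump(conf: Config) -> None:
        # Previously written as `for (k, v) in ...`; Black 23.x strips the
        # redundant parentheses around the tuple target.
        for k, v in conf.entries.items():
            print("%s=%s" % (k, v))


    if __name__ == "__main__":
        dump(Config())
    ```

    To apply the new style locally, the repo's own `dev/reformat-python`
    script (pinned to 23.9.1 in this commit) runs Black over the tree.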
    Closes #42507 from panbingkun/SPARK-44823.

    Authored-by: panbingkun <pbk1...@gmail.com>
    Signed-off-by: Hyukjin Kwon <gurwls...@apache.org>
---
 .github/workflows/build_and_test.yml               |   2 +-
 dev/pyproject.toml                                 |   2 +-
 dev/reformat-python                                |   2 +-
 dev/requirements.txt                               |   2 +-
 dev/run-tests.py                                   |   3 +-
 python/pyspark/conf.py                             |   4 +-
 python/pyspark/context.py                          |   2 +-
 python/pyspark/instrumentation_utils.py            |   3 -
 python/pyspark/join.py                             |   8 +-
 python/pyspark/ml/connect/evaluation.py            |   1 -
 .../pyspark/ml/deepspeed/deepspeed_distributor.py  |   1 -
 python/pyspark/ml/linalg/__init__.py               |   2 -
 python/pyspark/ml/param/__init__.py                |   1 -
 python/pyspark/ml/tests/test_algorithms.py         |   7 --
 python/pyspark/ml/tests/test_dl_util.py            |   1 -
 python/pyspark/ml/tests/test_feature.py            |   1 -
 python/pyspark/ml/tests/test_linalg.py             |   2 -
 python/pyspark/ml/torch/distributor.py             |   1 -
 python/pyspark/ml/tuning.py                        |   2 +-
 python/pyspark/ml/util.py                          |   1 -
 python/pyspark/ml/wrapper.py                       |   1 -
 python/pyspark/mllib/linalg/__init__.py            |   2 -
 python/pyspark/mllib/tests/test_feature.py         |   1 -
 python/pyspark/mllib/tests/test_linalg.py          |   2 -
 python/pyspark/pandas/frame.py                     |   7 +-
 python/pyspark/pandas/groupby.py                   |   1 -
 python/pyspark/pandas/indexing.py                  |   1 -
 python/pyspark/pandas/numpy_compat.py              |   1 -
 python/pyspark/pandas/plot/matplotlib.py           |   1 -
 python/pyspark/pandas/spark/functions.py           |   4 -
 python/pyspark/pandas/sql_processor.py             |   2 +-
 python/pyspark/pandas/strings.py                   |   1 +
 .../tests/connect/groupby/test_parity_aggregate.py |   1 -
 .../tests/connect/groupby/test_parity_describe.py  |   1 -
 .../tests/connect/groupby/test_parity_head_tail.py |   1 -
 .../tests/connect/groupby/test_parity_index.py     |   1 -
 .../tests/connect/groupby/test_parity_stat.py      |   1 -
 .../tests/connect/series/test_parity_all_any.py    |   1 -
 .../tests/connect/series/test_parity_as_type.py    |   1 -
 .../tests/connect/series/test_parity_conversion.py |   1 -
 .../tests/connect/series/test_parity_sort.py       |   1 -
 .../pandas/tests/data_type_ops/test_string_ops.py  |   1 -
 .../pyspark/pandas/tests/groupby/test_aggregate.py |   1 -
 .../pyspark/pandas/tests/groupby/test_groupby.py   |   1 -
 .../pyspark/pandas/tests/indexes/test_category.py  |   1 -
 .../pyspark/pandas/tests/plot/test_frame_plot.py   |   1 -
 .../pandas/tests/test_dataframe_spark_io.py        |   1 -
 python/pyspark/pandas/tests/test_default_index.py  |   1 -
 python/pyspark/pandas/tests/test_utils.py          |   1 -
 python/pyspark/profiler.py                         |   4 +-
 python/pyspark/rdd.py                              |   2 -
 python/pyspark/serializers.py                      |   4 +-
 python/pyspark/sql/connect/client/core.py          |   1 -
 python/pyspark/sql/connect/conversion.py           |   8 --
 python/pyspark/sql/connect/expressions.py          |   1 -
 python/pyspark/sql/connect/plan.py                 |   1 -
 python/pyspark/sql/connect/proto/base_pb2.py       |   1 -
 python/pyspark/sql/connect/proto/base_pb2.pyi      |  98 ++++++++++--------
 python/pyspark/sql/connect/proto/catalog_pb2.py    |   1 -
 python/pyspark/sql/connect/proto/catalog_pb2.pyi   |  59 +++++------
 python/pyspark/sql/connect/proto/commands_pb2.py   |   1 -
 python/pyspark/sql/connect/proto/commands_pb2.pyi  | 112 ++++++++++++---------
 python/pyspark/sql/connect/proto/common_pb2.py     |   1 -
 .../sql/connect/proto/example_plugins_pb2.py       |   1 -
 .../pyspark/sql/connect/proto/expressions_pb2.py   |   1 -
 .../pyspark/sql/connect/proto/expressions_pb2.pyi  |  92 +++++++++--------
 python/pyspark/sql/connect/proto/relations_pb2.py  |   1 -
 python/pyspark/sql/connect/proto/relations_pb2.pyi | 109 ++++++++++----------
 python/pyspark/sql/connect/proto/types_pb2.py      |   1 -
 python/pyspark/sql/connect/proto/types_pb2.pyi     |  55 +++++-----
 python/pyspark/sql/connect/session.py              |   1 -
 python/pyspark/sql/connect/window.py               |   1 -
 python/pyspark/sql/context.py                      |   1 -
 python/pyspark/sql/pandas/conversion.py            |   1 -
 python/pyspark/sql/pandas/functions.py             |   1 -
 python/pyspark/sql/pandas/types.py                 |   8 --
 python/pyspark/sql/session.py                      |   2 +-
 .../sql/tests/connect/test_connect_function.py     |   4 -
 .../pyspark/sql/tests/connect/test_connect_plan.py |   3 +-
 python/pyspark/sql/tests/connect/test_session.py   |   1 -
 .../sql/tests/pandas/test_pandas_grouped_map.py    |   4 -
 .../sql/tests/streaming/test_streaming_foreach.py  |   1 -
 python/pyspark/sql/tests/test_dataframe.py         |   1 -
 python/pyspark/sql/tests/test_session.py           |   2 -
 python/pyspark/sql/utils.py                        |   6 --
 python/pyspark/streaming/tests/test_context.py     |   1 -
 python/pyspark/streaming/tests/test_dstream.py     |   2 -
 python/pyspark/streaming/tests/test_listener.py    |   1 -
 python/pyspark/testing/mlutils.py                  |   1 -
 python/pyspark/testing/pandasutils.py              |   2 -
 python/pyspark/testing/streamingutils.py           |   1 -
 python/pyspark/tests/test_install_spark.py         |   2 +-
 python/pyspark/tests/test_pin_thread.py            |   1 -
 python/pyspark/worker.py                           |   1 -
 python/pyspark/worker_util.py                      |   2 +-
 95 files changed, 312 insertions(+), 382 deletions(-)

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index ddc4fc6fd2c..9c5d25d30af 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -685,7 +685,7 @@ jobs:
         # See also https://github.com/sphinx-doc/sphinx/issues/7551.
         # Jinja2 3.0.0+ causes error when building with Sphinx.
         # See also https://issues.apache.org/jira/browse/SPARK-35375.
-        python3.9 -m pip install 'flake8==3.9.0' pydata_sphinx_theme 'mypy==0.982' 'pytest==7.1.3' 'pytest-mypy-plugins==1.9.3' numpydoc 'jinja2<3.0.0' 'black==22.6.0'
+        python3.9 -m pip install 'flake8==3.9.0' pydata_sphinx_theme 'mypy==0.982' 'pytest==7.1.3' 'pytest-mypy-plugins==1.9.3' numpydoc 'jinja2<3.0.0' 'black==23.9.1'
         python3.9 -m pip install 'pandas-stubs==1.2.0.53' ipython 'grpcio==1.56.0' 'grpc-stubs==1.24.11' 'googleapis-common-protos-stubs==2.2.0'
     - name: Python linter
       run: PYTHON_EXECUTABLE=python3.9 ./dev/lint-python
diff --git a/dev/pyproject.toml b/dev/pyproject.toml
index ee805ef7330..4f462d14c78 100644
--- a/dev/pyproject.toml
+++ b/dev/pyproject.toml
@@ -27,7 +27,7 @@ testpaths = [
 [tool.black]
 # When changing the version, we have to update
 # GitHub workflow version and dev/reformat-python
-required-version = "22.6.0"
+required-version = "23.9.1"
 line-length = 100
 target-version = ['py38']
 include = '\.pyi?$'
diff --git a/dev/reformat-python b/dev/reformat-python
index 9543f5713d1..d2a56f18c39 100755
--- a/dev/reformat-python
+++ b/dev/reformat-python
@@ -22,7 +22,7 @@ FWDIR="$( cd "$DIR"/.. && pwd )"
 cd "$FWDIR"
 
 BLACK_BUILD="${PYTHON_EXECUTABLE} -m black"
-BLACK_VERSION="22.6.0"
+BLACK_VERSION="23.9.1"
 $PYTHON_EXECUTABLE -c 'import black' 2> /dev/null
 if [ $? -ne 0 ]; then
   echo "The Python library providing the 'black' module was not found. Please install Black, for example, via 'pip install black==$BLACK_VERSION'."
diff --git a/dev/requirements.txt b/dev/requirements.txt
index 51fcb719e99..88cb90da6a8 100644
--- a/dev/requirements.txt
+++ b/dev/requirements.txt
@@ -46,7 +46,7 @@ jira>=3.5.2
 PyGithub
 
 # pandas API on Spark Code formatter.
-black==22.6.0
+black==23.9.1
 py
 
 # Spark Connect (required)
diff --git a/dev/run-tests.py b/dev/run-tests.py
index 9bf3095edb7..559e2017be1 100755
--- a/dev/run-tests.py
+++ b/dev/run-tests.py
@@ -37,7 +37,7 @@ import sparktestsupport.modules as modules
 
 def setup_test_environ(environ):
     print("[info] Setup the following environment variables for tests: ")
-    for (k, v) in environ.items():
+    for k, v in environ.items():
         print("%s=%s" % (k, v))
         os.environ[k] = v
 
@@ -331,7 +331,6 @@ def run_scala_tests_maven(test_profiles):
 
 
 def run_scala_tests_sbt(test_modules, test_profiles):
-
     sbt_test_goals = list(itertools.chain.from_iterable(m.sbt_test_goals for m in test_modules))
 
     if not sbt_test_goals:
diff --git a/python/pyspark/conf.py b/python/pyspark/conf.py
index dd203e2d26b..ddf8c22feea 100644
--- a/python/pyspark/conf.py
+++ b/python/pyspark/conf.py
@@ -190,7 +190,7 @@ class SparkConf:
         elif key is not None:
             self.set("spark.executorEnv.{}".format(key), cast(str, value))
         elif pairs is not None:
-            for (k, v) in pairs:
+            for k, v in pairs:
                 self.set("spark.executorEnv.{}".format(k), v)
         return self
 
@@ -203,7 +203,7 @@ class SparkConf:
         pairs : iterable of tuples
             list of key-value pairs to set
         """
-        for (k, v) in pairs:
+        for k, v in pairs:
             self.set(k, v)
         return self
diff --git a/python/pyspark/context.py b/python/pyspark/context.py
index 81adb6ced33..04bcb07dd8d 100644
--- a/python/pyspark/context.py
+++ b/python/pyspark/context.py
@@ -285,7 +285,7 @@ class SparkContext:
         self.appName = self._conf.get("spark.app.name")
         self.sparkHome = self._conf.get("spark.home", None)
 
-        for (k, v) in self._conf.getAll():
+        for k, v in self._conf.getAll():
             if k.startswith("spark.executorEnv."):
                 varName = k[len("spark.executorEnv.") :]
                 self.environment[varName] = v
diff --git a/python/pyspark/instrumentation_utils.py b/python/pyspark/instrumentation_utils.py
index 256c09068cd..8fe579ac14f 100644
--- a/python/pyspark/instrumentation_utils.py
+++ b/python/pyspark/instrumentation_utils.py
@@ -31,7 +31,6 @@ _local = threading.local()
 
 def _wrap_function(class_name: str, function_name: str, func: Callable, logger: Any) -> Callable:
-
     signature = inspect.signature(func)
 
     @functools.wraps(func)
@@ -91,7 +90,6 @@ def _wrap_property(class_name: str, property_name: str, prop: Any, logger: Any)
 def _wrap_missing_function(
     class_name: str, function_name: str, func: Callable, original: Any, logger: Any
 ) -> Any:
-
     if not hasattr(original, function_name):
         return func
 
@@ -110,7 +108,6 @@ def _wrap_missing_function(
 
 def _wrap_missing_property(class_name: str, property_name: str, prop: Any, logger: Any) -> Any:
-
     is_deprecated = prop.fget.__name__ == "deprecated_property"
 
     @property  # type: ignore[misc]
diff --git a/python/pyspark/join.py b/python/pyspark/join.py
index 040c9462292..003e9ec2fc8 100644
--- a/python/pyspark/join.py
+++ b/python/pyspark/join.py
@@ -44,7 +44,7 @@ def _do_python_join(rdd, other, numPartitions, dispatch):
 def python_join(rdd, other, numPartitions):
     def dispatch(seq):
         vbuf, wbuf = [], []
-        for (n, v) in seq:
+        for n, v in seq:
             if n == 1:
                 vbuf.append(v)
             elif n == 2:
@@ -57,7 +57,7 @@ def python_join(rdd, other, numPartitions):
 def python_right_outer_join(rdd, other, numPartitions):
     def dispatch(seq):
         vbuf, wbuf = [], []
-        for (n, v) in seq:
+        for n, v in seq:
             if n == 1:
                 vbuf.append(v)
             elif n == 2:
@@ -72,7 +72,7 @@ def python_right_outer_join(rdd, other, numPartitions):
 def python_left_outer_join(rdd, other, numPartitions):
     def dispatch(seq):
         vbuf, wbuf = [], []
-        for (n, v) in seq:
+        for n, v in seq:
             if n == 1:
vbuf.append(v) elif n == 2: @@ -87,7 +87,7 @@ def python_left_outer_join(rdd, other, numPartitions): def python_full_outer_join(rdd, other, numPartitions): def dispatch(seq): vbuf, wbuf = [], [] - for (n, v) in seq: + for n, v in seq: if n == 1: vbuf.append(v) elif n == 2: diff --git a/python/pyspark/ml/connect/evaluation.py b/python/pyspark/ml/connect/evaluation.py index 8124f100f31..d728867f086 100644 --- a/python/pyspark/ml/connect/evaluation.py +++ b/python/pyspark/ml/connect/evaluation.py @@ -28,7 +28,6 @@ from pyspark.sql import DataFrame class _TorchMetricEvaluator(Evaluator): - metricName: Param[str] = Param( Params._dummy(), "metricName", diff --git a/python/pyspark/ml/deepspeed/deepspeed_distributor.py b/python/pyspark/ml/deepspeed/deepspeed_distributor.py index 7c2b8c43526..4ac5ff2fb42 100644 --- a/python/pyspark/ml/deepspeed/deepspeed_distributor.py +++ b/python/pyspark/ml/deepspeed/deepspeed_distributor.py @@ -30,7 +30,6 @@ from pyspark.ml.torch.distributor import TorchDistributor class DeepspeedTorchDistributor(TorchDistributor): - _DEEPSPEED_SSL_CONF = "deepspeed.spark.distributor.ignoreSsl" def __init__( diff --git a/python/pyspark/ml/linalg/__init__.py b/python/pyspark/ml/linalg/__init__.py index a5a849e4aa3..0fde6392516 100644 --- a/python/pyspark/ml/linalg/__init__.py +++ b/python/pyspark/ml/linalg/__init__.py @@ -281,7 +281,6 @@ class MatrixUDT(UserDefinedType): class Vector: - __UDT__ = VectorUDT() """ @@ -1024,7 +1023,6 @@ class Vectors: class Matrix: - __UDT__ = MatrixUDT() """ diff --git a/python/pyspark/ml/param/__init__.py b/python/pyspark/ml/param/__init__.py index 74f7b0bc3c7..345b7f7a596 100644 --- a/python/pyspark/ml/param/__init__.py +++ b/python/pyspark/ml/param/__init__.py @@ -373,7 +373,6 @@ class Params(Identifiable, metaclass=ABCMeta): ... def getOrDefault(self, param: Union[str, Param[T]]) -> Union[Any, T]: - """ Gets the value of a param in the user-supplied param map or its default value. Raises an error if neither is set. 
diff --git a/python/pyspark/ml/tests/test_algorithms.py b/python/pyspark/ml/tests/test_algorithms.py index fb2507fe085..eeb342c4238 100644 --- a/python/pyspark/ml/tests/test_algorithms.py +++ b/python/pyspark/ml/tests/test_algorithms.py @@ -38,7 +38,6 @@ from pyspark.testing.mlutils import SparkSessionTestCase class LogisticRegressionTest(SparkSessionTestCase): def test_binomial_logistic_regression_with_bound(self): - df = self.spark.createDataFrame( [ (1.0, 1.0, Vectors.dense(0.0, 5.0)), @@ -60,7 +59,6 @@ class LogisticRegressionTest(SparkSessionTestCase): self.assertTrue(np.isclose(model.intercept, 0.0, atol=1e-4)) def test_multinomial_logistic_regression_with_bound(self): - data_path = "data/mllib/sample_multiclass_classification_data.txt" df = self.spark.read.format("libsvm").load(data_path) @@ -84,7 +82,6 @@ class LogisticRegressionTest(SparkSessionTestCase): ) def test_logistic_regression_with_threshold(self): - df = self.spark.createDataFrame( [ (1.0, 1.0, Vectors.dense(0.0, 5.0)), @@ -121,7 +118,6 @@ class LogisticRegressionTest(SparkSessionTestCase): class MultilayerPerceptronClassifierTest(SparkSessionTestCase): def test_raw_and_probability_prediction(self): - data_path = "data/mllib/sample_multiclass_classification_data.txt" df = self.spark.read.format("libsvm").load(data_path) @@ -375,7 +371,6 @@ class ALSTest(SparkSessionTestCase): class GeneralizedLinearRegressionTest(SparkSessionTestCase): def test_tweedie_distribution(self): - df = self.spark.createDataFrame( [ (1.0, Vectors.dense(0.0, 0.0)), @@ -396,7 +391,6 @@ class GeneralizedLinearRegressionTest(SparkSessionTestCase): self.assertTrue(np.isclose(model2.intercept, 0.6667, atol=1e-4)) def test_offset(self): - df = self.spark.createDataFrame( [ (0.2, 1.0, 2.0, Vectors.dense(0.0, 5.0)), @@ -417,7 +411,6 @@ class GeneralizedLinearRegressionTest(SparkSessionTestCase): class LinearRegressionTest(SparkSessionTestCase): def test_linear_regression_with_huber_loss(self): - data_path = "data/mllib/sample_linear_regression_data.txt" df = self.spark.read.format("libsvm").load(data_path) diff --git a/python/pyspark/ml/tests/test_dl_util.py b/python/pyspark/ml/tests/test_dl_util.py index abb1c6cee7a..c9634b6c79f 100644 --- a/python/pyspark/ml/tests/test_dl_util.py +++ b/python/pyspark/ml/tests/test_dl_util.py @@ -26,7 +26,6 @@ from pyspark.ml.dl_util import FunctionPickler class TestFunctionPickler(unittest.TestCase): - # Function that will be used to test pickling. 
@staticmethod def _test_function(x: float, y: float) -> float: diff --git a/python/pyspark/ml/tests/test_feature.py b/python/pyspark/ml/tests/test_feature.py index 0051d47ae33..4bf6641723d 100644 --- a/python/pyspark/ml/tests/test_feature.py +++ b/python/pyspark/ml/tests/test_feature.py @@ -367,7 +367,6 @@ class FeatureTests(SparkSessionTestCase): class HashingTFTest(SparkSessionTestCase): def test_apply_binary_term_freqs(self): - df = self.spark.createDataFrame([(0, ["a", "a", "b", "c", "c", "c"])], ["id", "words"]) n = 10 hashingTF = HashingTF() diff --git a/python/pyspark/ml/tests/test_linalg.py b/python/pyspark/ml/tests/test_linalg.py index 6632d100ea5..66ba373b12f 100644 --- a/python/pyspark/ml/tests/test_linalg.py +++ b/python/pyspark/ml/tests/test_linalg.py @@ -320,7 +320,6 @@ class VectorTests(MLlibTestCase): class VectorUDTTests(MLlibTestCase): - dv0 = DenseVector([]) dv1 = DenseVector([1.0, 2.0]) sv0 = SparseVector(2, [], []) @@ -367,7 +366,6 @@ class VectorUDTTests(MLlibTestCase): class MatrixUDTTests(MLlibTestCase): - dm1 = DenseMatrix(3, 2, [0, 1, 4, 5, 9, 10]) dm2 = DenseMatrix(3, 2, [0, 1, 4, 5, 9, 10], isTransposed=True) sm1 = SparseMatrix(1, 1, [0, 1], [0], [2.0]) diff --git a/python/pyspark/ml/torch/distributor.py b/python/pyspark/ml/torch/distributor.py index 8d33dabf3c8..11fab4f0778 100644 --- a/python/pyspark/ml/torch/distributor.py +++ b/python/pyspark/ml/torch/distributor.py @@ -896,7 +896,6 @@ class TorchDistributor(Distributor): *args: Any, **kwargs: Any, ) -> Any: - if not run_pytorch_file_fn: run_pytorch_file_fn = TorchDistributor._run_training_on_pytorch_file diff --git a/python/pyspark/ml/tuning.py b/python/pyspark/ml/tuning.py index 63f51229a9f..fa5d604981b 100644 --- a/python/pyspark/ml/tuning.py +++ b/python/pyspark/ml/tuning.py @@ -184,7 +184,7 @@ class ParamGridBuilder: if isinstance(args[0], dict): self.baseOn(*args[0].items()) else: - for (param, value) in args: + for param, value in args: self.addGrid(param, [value]) return self diff --git a/python/pyspark/ml/util.py b/python/pyspark/ml/util.py index 64676947017..59589404790 100644 --- a/python/pyspark/ml/util.py +++ b/python/pyspark/ml/util.py @@ -738,7 +738,6 @@ def try_remote_functions(f: FuncT) -> FuncT: @functools.wraps(f) def wrapped(*args: Any, **kwargs: Any) -> Any: - if is_remote() and "PYSPARK_NO_NAMESPACE_SHARE" not in os.environ: from pyspark.ml.connect import functions diff --git a/python/pyspark/ml/wrapper.py b/python/pyspark/ml/wrapper.py index 5eee3eeef11..6edddb0026d 100644 --- a/python/pyspark/ml/wrapper.py +++ b/python/pyspark/ml/wrapper.py @@ -422,7 +422,6 @@ class JavaModel(JavaTransformer, Model, metaclass=ABCMeta): """ super(JavaModel, self).__init__(java_model) if java_model is not None: - # SPARK-10931: This is a temporary fix to allow models to own params # from estimators. Eventually, these params should be in models through # using common base classes between estimators and models. 
diff --git a/python/pyspark/mllib/linalg/__init__.py b/python/pyspark/mllib/linalg/__init__.py index f752f5458ff..7eacb7bc906 100644 --- a/python/pyspark/mllib/linalg/__init__.py +++ b/python/pyspark/mllib/linalg/__init__.py @@ -297,7 +297,6 @@ class MatrixUDT(UserDefinedType): class Vector: - __UDT__ = VectorUDT() """ @@ -1200,7 +1199,6 @@ class Vectors: class Matrix: - __UDT__ = MatrixUDT() """ diff --git a/python/pyspark/mllib/tests/test_feature.py b/python/pyspark/mllib/tests/test_feature.py index ca06f39da23..756e4fc2436 100644 --- a/python/pyspark/mllib/tests/test_feature.py +++ b/python/pyspark/mllib/tests/test_feature.py @@ -118,7 +118,6 @@ class HashingTFTest(MLlibTestCase): class DimensionalityReductionTests(MLlibTestCase): - denseData = [ Vectors.dense([0.0, 1.0, 2.0]), Vectors.dense([3.0, 4.0, 5.0]), diff --git a/python/pyspark/mllib/tests/test_linalg.py b/python/pyspark/mllib/tests/test_linalg.py index 6d8dfcd4203..a9837b1fdeb 100644 --- a/python/pyspark/mllib/tests/test_linalg.py +++ b/python/pyspark/mllib/tests/test_linalg.py @@ -409,7 +409,6 @@ class VectorTests(MLlibTestCase): class VectorUDTTests(MLlibTestCase): - dv0 = DenseVector([]) dv1 = DenseVector([1.0, 2.0]) sv0 = SparseVector(2, [], []) @@ -471,7 +470,6 @@ class VectorUDTTests(MLlibTestCase): class MatrixUDTTests(MLlibTestCase): - dm1 = DenseMatrix(3, 2, [0, 1, 4, 5, 9, 10]) dm2 = DenseMatrix(3, 2, [0, 1, 4, 5, 9, 10], isTransposed=True) sm1 = SparseMatrix(1, 1, [0, 1], [0], [2.0]) diff --git a/python/pyspark/pandas/frame.py b/python/pyspark/pandas/frame.py index fedbfcdb5f3..b19f55e7eba 100644 --- a/python/pyspark/pandas/frame.py +++ b/python/pyspark/pandas/frame.py @@ -7901,7 +7901,10 @@ defaultdict(<class 'list'>, {'col..., 'col...})] ) column_label_names = self._internal.column_label_names.copy() - column_label_names[i], column_label_names[j], = ( + ( + column_label_names[i], + column_label_names[j], + ) = ( column_label_names[j], column_label_names[i], ) @@ -11306,7 +11309,6 @@ defaultdict(<class 'list'>, {'col..., 'col...})] level: Optional[int] = None, errors: str = "ignore", ) -> Optional["DataFrame"]: - """ Alter axes labels. Function / dict values must be unique (1-to-1). 
Labels not contained in a dict / Series @@ -12976,7 +12978,6 @@ defaultdict(<class 'list'>, {'col..., 'col...})] if ( axis is None or axis == 1 ) and left._internal.column_labels != right._internal.column_labels: - if left._internal.column_labels_level != right._internal.column_labels_level: raise ValueError("cannot join with no overlapping index names") diff --git a/python/pyspark/pandas/groupby.py b/python/pyspark/pandas/groupby.py index 9545beea6b7..bbc292cf744 100644 --- a/python/pyspark/pandas/groupby.py +++ b/python/pyspark/pandas/groupby.py @@ -2010,7 +2010,6 @@ class GroupBy(Generic[FrameLike], metaclass=ABCMeta): return_schema = StructType([field.struct_field for field in data_fields]) def pandas_groupby_apply(pdf: pd.DataFrame) -> pd.DataFrame: - if is_series_groupby: pdf_or_ser = pdf.groupby(groupkey_names)[name].apply(pandas_apply, *args, **kwargs) else: diff --git a/python/pyspark/pandas/indexing.py b/python/pyspark/pandas/indexing.py index cf8a2c0a363..460eb37af78 100644 --- a/python/pyspark/pandas/indexing.py +++ b/python/pyspark/pandas/indexing.py @@ -1050,7 +1050,6 @@ class LocIndexer(LocIndexerLike): if (start is None and rows_sel.start is not None) or ( stop is None and rows_sel.stop is not None ): - inc = index_column.is_monotonic_increasing if inc is False: dec = index_column.is_monotonic_decreasing diff --git a/python/pyspark/pandas/numpy_compat.py b/python/pyspark/pandas/numpy_compat.py index 96f2ab15816..efffaa7042c 100644 --- a/python/pyspark/pandas/numpy_compat.py +++ b/python/pyspark/pandas/numpy_compat.py @@ -215,7 +215,6 @@ def maybe_dispatch_ufunc_to_spark_func( and (op_name in unary_np_spark_mappings or op_name in binary_np_spark_mappings) and kwargs.get("out") is None ): - np_spark_map_func = unary_np_spark_mappings.get(op_name) or binary_np_spark_mappings.get( op_name ) diff --git a/python/pyspark/pandas/plot/matplotlib.py b/python/pyspark/pandas/plot/matplotlib.py index 36cfc759f83..609ad271503 100644 --- a/python/pyspark/pandas/plot/matplotlib.py +++ b/python/pyspark/pandas/plot/matplotlib.py @@ -900,7 +900,6 @@ def _plot(data, x=None, y=None, subplots=False, ax=None, kind="line", **kwds): if kind in ("scatter", "hexbin"): plot_obj = klass(data, x, y, subplots=subplots, ax=ax, kind=kind, **kwds) else: - # check data type and do preprocess before applying plot if isinstance(data, DataFrame): if x is not None: diff --git a/python/pyspark/pandas/spark/functions.py b/python/pyspark/pandas/spark/functions.py index d7f3d81773c..9fef983b46a 100644 --- a/python/pyspark/pandas/spark/functions.py +++ b/python/pyspark/pandas/spark/functions.py @@ -50,7 +50,6 @@ def stddev(col: Column, ddof: int) -> Column: ) else: - sc = SparkContext._active_spark_context return Column(sc._jvm.PythonSQLUtils.pandasStddev(col._jc, ddof)) @@ -66,7 +65,6 @@ def var(col: Column, ddof: int) -> Column: ) else: - sc = SparkContext._active_spark_context return Column(sc._jvm.PythonSQLUtils.pandasVariance(col._jc, ddof)) @@ -81,7 +79,6 @@ def skew(col: Column) -> Column: ) else: - sc = SparkContext._active_spark_context return Column(sc._jvm.PythonSQLUtils.pandasSkewness(col._jc)) @@ -96,7 +93,6 @@ def kurt(col: Column) -> Column: ) else: - sc = SparkContext._active_spark_context return Column(sc._jvm.PythonSQLUtils.pandasKurtosis(col._jc)) diff --git a/python/pyspark/pandas/sql_processor.py b/python/pyspark/pandas/sql_processor.py index 0142e8d267e..bce6921e029 100644 --- a/python/pyspark/pandas/sql_processor.py +++ b/python/pyspark/pandas/sql_processor.py @@ -300,7 +300,7 @@ class 
SQLProcessor: # TODO: use a string builder res = "" try: - for (pre, inner, _, _) in blocks: + for pre, inner, _, _ in blocks: var_next = "" if inner is None else self._convert(inner) res = res + pre + var_next self._normalized_statement = res diff --git a/python/pyspark/pandas/strings.py b/python/pyspark/pandas/strings.py index 37486b0cc81..bf9cafbaf09 100644 --- a/python/pyspark/pandas/strings.py +++ b/python/pyspark/pandas/strings.py @@ -1156,6 +1156,7 @@ class StringMethods: 2 [b, b] dtype: object """ + # type hint does not support to specify array type yet. @pandas_udf( # type: ignore[call-overload] returnType=ArrayType(StringType(), containsNull=True) diff --git a/python/pyspark/pandas/tests/connect/groupby/test_parity_aggregate.py b/python/pyspark/pandas/tests/connect/groupby/test_parity_aggregate.py index e3c9f620e45..758266cc56b 100644 --- a/python/pyspark/pandas/tests/connect/groupby/test_parity_aggregate.py +++ b/python/pyspark/pandas/tests/connect/groupby/test_parity_aggregate.py @@ -24,7 +24,6 @@ from pyspark.testing.pandasutils import PandasOnSparkTestUtils class GroupbyParityAggregateTests( GroupbyAggregateMixin, PandasOnSparkTestUtils, ReusedConnectTestCase ): - pass diff --git a/python/pyspark/pandas/tests/connect/groupby/test_parity_describe.py b/python/pyspark/pandas/tests/connect/groupby/test_parity_describe.py index 826e1b04ceb..21777715312 100644 --- a/python/pyspark/pandas/tests/connect/groupby/test_parity_describe.py +++ b/python/pyspark/pandas/tests/connect/groupby/test_parity_describe.py @@ -24,7 +24,6 @@ from pyspark.testing.pandasutils import PandasOnSparkTestUtils class GroupbyParityDescribeTests( GroupbyDescribeMixin, PandasOnSparkTestUtils, ReusedConnectTestCase ): - pass diff --git a/python/pyspark/pandas/tests/connect/groupby/test_parity_head_tail.py b/python/pyspark/pandas/tests/connect/groupby/test_parity_head_tail.py index 373e18eeb1d..f8aef7e6482 100644 --- a/python/pyspark/pandas/tests/connect/groupby/test_parity_head_tail.py +++ b/python/pyspark/pandas/tests/connect/groupby/test_parity_head_tail.py @@ -24,7 +24,6 @@ from pyspark.testing.pandasutils import PandasOnSparkTestUtils class GroupbyParityHeadTailTests( GroupbyHeadTailMixin, PandasOnSparkTestUtils, ReusedConnectTestCase ): - pass diff --git a/python/pyspark/pandas/tests/connect/groupby/test_parity_index.py b/python/pyspark/pandas/tests/connect/groupby/test_parity_index.py index 6a0c8416140..307a1182f61 100644 --- a/python/pyspark/pandas/tests/connect/groupby/test_parity_index.py +++ b/python/pyspark/pandas/tests/connect/groupby/test_parity_index.py @@ -22,7 +22,6 @@ from pyspark.testing.pandasutils import PandasOnSparkTestUtils class GroupbyParityIndexTests(GroupbyIndexMixin, PandasOnSparkTestUtils, ReusedConnectTestCase): - pass diff --git a/python/pyspark/pandas/tests/connect/groupby/test_parity_stat.py b/python/pyspark/pandas/tests/connect/groupby/test_parity_stat.py index 75f1ed41d61..a7c2e10dc3f 100644 --- a/python/pyspark/pandas/tests/connect/groupby/test_parity_stat.py +++ b/python/pyspark/pandas/tests/connect/groupby/test_parity_stat.py @@ -22,7 +22,6 @@ from pyspark.testing.pandasutils import PandasOnSparkTestUtils class GroupbyParityStatTests(GroupbyStatMixin, PandasOnSparkTestUtils, ReusedConnectTestCase): - pass diff --git a/python/pyspark/pandas/tests/connect/series/test_parity_all_any.py b/python/pyspark/pandas/tests/connect/series/test_parity_all_any.py index 44856fa8c4b..7edde505445 100644 --- a/python/pyspark/pandas/tests/connect/series/test_parity_all_any.py +++ 
b/python/pyspark/pandas/tests/connect/series/test_parity_all_any.py @@ -22,7 +22,6 @@ from pyspark.testing.pandasutils import PandasOnSparkTestUtils class SeriesParityAllAnyTests(SeriesAllAnyMixin, PandasOnSparkTestUtils, ReusedConnectTestCase): - pass diff --git a/python/pyspark/pandas/tests/connect/series/test_parity_as_type.py b/python/pyspark/pandas/tests/connect/series/test_parity_as_type.py index 5fa949c4b9a..8fcbe2aa95b 100644 --- a/python/pyspark/pandas/tests/connect/series/test_parity_as_type.py +++ b/python/pyspark/pandas/tests/connect/series/test_parity_as_type.py @@ -22,7 +22,6 @@ from pyspark.testing.pandasutils import PandasOnSparkTestUtils class SeriesParityAsTypeTests(SeriesAsTypeMixin, PandasOnSparkTestUtils, ReusedConnectTestCase): - pass diff --git a/python/pyspark/pandas/tests/connect/series/test_parity_conversion.py b/python/pyspark/pandas/tests/connect/series/test_parity_conversion.py index 08d2773e9da..bf5d0eebdd9 100644 --- a/python/pyspark/pandas/tests/connect/series/test_parity_conversion.py +++ b/python/pyspark/pandas/tests/connect/series/test_parity_conversion.py @@ -24,7 +24,6 @@ from pyspark.testing.pandasutils import PandasOnSparkTestUtils class SeriesParityConversionTests( SeriesConversionMixin, PandasOnSparkTestUtils, ReusedConnectTestCase ): - pass diff --git a/python/pyspark/pandas/tests/connect/series/test_parity_sort.py b/python/pyspark/pandas/tests/connect/series/test_parity_sort.py index bf124bc513c..359c19f0567 100644 --- a/python/pyspark/pandas/tests/connect/series/test_parity_sort.py +++ b/python/pyspark/pandas/tests/connect/series/test_parity_sort.py @@ -22,7 +22,6 @@ from pyspark.testing.pandasutils import PandasOnSparkTestUtils class SeriesParitySortTests(SeriesSortMixin, PandasOnSparkTestUtils, ReusedConnectTestCase): - pass diff --git a/python/pyspark/pandas/tests/data_type_ops/test_string_ops.py b/python/pyspark/pandas/tests/data_type_ops/test_string_ops.py index 136366d2252..2870aed8e75 100644 --- a/python/pyspark/pandas/tests/data_type_ops/test_string_ops.py +++ b/python/pyspark/pandas/tests/data_type_ops/test_string_ops.py @@ -342,7 +342,6 @@ class StringExtensionOpsTest(StringOpsTests): if __name__ == "__main__": - from pyspark.pandas.tests.data_type_ops.test_string_ops import * # noqa: F401 try: diff --git a/python/pyspark/pandas/tests/groupby/test_aggregate.py b/python/pyspark/pandas/tests/groupby/test_aggregate.py index 15ffb9c8f7a..c2d7872c843 100644 --- a/python/pyspark/pandas/tests/groupby/test_aggregate.py +++ b/python/pyspark/pandas/tests/groupby/test_aggregate.py @@ -186,7 +186,6 @@ class GroupbyAggregateMixin: agg_funcs = ["max", "min", ["min", "max"]] for aggfunc in agg_funcs: - # Since in Koalas groupby, the order of rows might be different # so sort on index to ensure they have same output sorted_agg_psdf = psdf.groupby("kind").agg(aggfunc).sort_index() diff --git a/python/pyspark/pandas/tests/groupby/test_groupby.py b/python/pyspark/pandas/tests/groupby/test_groupby.py index e1ce8abc8aa..b6cfaedaf67 100644 --- a/python/pyspark/pandas/tests/groupby/test_groupby.py +++ b/python/pyspark/pandas/tests/groupby/test_groupby.py @@ -447,7 +447,6 @@ class GroupByTestsMixin: @staticmethod def test_is_multi_agg_with_relabel(): - assert is_multi_agg_with_relabel(a="max") is False assert is_multi_agg_with_relabel(a_min=("a", "max"), a_max=("a", "min")) is True diff --git a/python/pyspark/pandas/tests/indexes/test_category.py b/python/pyspark/pandas/tests/indexes/test_category.py index 6dded734b92..5b6bd2cad07 100644 --- 
a/python/pyspark/pandas/tests/indexes/test_category.py +++ b/python/pyspark/pandas/tests/indexes/test_category.py @@ -344,7 +344,6 @@ class CategoricalIndexTestsMixin: psidxs = [ps.from_pandas(pidx) for pidx in pidxs] for pidx, psidx in zip(pidxs, psidxs): - # Apply dict self.assert_eq( pidx.map({1: "one", 2: "two", 3: "three"}), diff --git a/python/pyspark/pandas/tests/plot/test_frame_plot.py b/python/pyspark/pandas/tests/plot/test_frame_plot.py index 6797a73303f..65c6c57847d 100644 --- a/python/pyspark/pandas/tests/plot/test_frame_plot.py +++ b/python/pyspark/pandas/tests/plot/test_frame_plot.py @@ -50,7 +50,6 @@ class DataFramePlotTestsMixin: getattr(psdf.plot, name)() def test_topn_max_rows(self): - pdf = pd.DataFrame(np.random.rand(2500, 4), columns=["a", "b", "c", "d"]) psdf = ps.from_pandas(pdf) diff --git a/python/pyspark/pandas/tests/test_dataframe_spark_io.py b/python/pyspark/pandas/tests/test_dataframe_spark_io.py index ce60c42d721..1667524910b 100644 --- a/python/pyspark/pandas/tests/test_dataframe_spark_io.py +++ b/python/pyspark/pandas/tests/test_dataframe_spark_io.py @@ -259,7 +259,6 @@ class DataFrameSparkIOTestsMixin: @unittest.skip("openpyxl") def test_read_excel(self): with self.temp_dir() as tmp: - path1 = "{}/file1.xlsx".format(tmp) self.test_pdf[["i32"]].to_excel(path1) diff --git a/python/pyspark/pandas/tests/test_default_index.py b/python/pyspark/pandas/tests/test_default_index.py index 45ceaf5073a..29e489e81f3 100644 --- a/python/pyspark/pandas/tests/test_default_index.py +++ b/python/pyspark/pandas/tests/test_default_index.py @@ -43,7 +43,6 @@ class DefaultIndexTestsMixin: with ps.option_context( "compute.default_index_type", "distributed-sequence" ), ps.option_context("compute.ops_on_diff_frames", True): - with ps.option_context("compute.default_index_cache", "LOCAL_CHECKPOINT"): cached_rdd_ids = [rdd_id for rdd_id in self.spark._jsc.getPersistentRDDs()] diff --git a/python/pyspark/pandas/tests/test_utils.py b/python/pyspark/pandas/tests/test_utils.py index 60961dcf252..37fba9a9c67 100644 --- a/python/pyspark/pandas/tests/test_utils.py +++ b/python/pyspark/pandas/tests/test_utils.py @@ -38,7 +38,6 @@ some_global_variable = 0 class UtilsTestsMixin: - # a dummy to_html version with an extra parameter that pandas does not support # used in test_validate_arguments_and_invoke_function def to_html(self, max_rows=None, unsupported_param=None): diff --git a/python/pyspark/profiler.py b/python/pyspark/profiler.py index d7990d2fd1b..6aa504c7c08 100644 --- a/python/pyspark/profiler.py +++ b/python/pyspark/profiler.py @@ -430,7 +430,7 @@ class MemoryProfiler(Profiler): stream = sys.stdout template = "{0:>6} {1:>12} {2:>12} {3:>10} {4:<}" - for (filename, lines) in code_map.items(): + for filename, lines in code_map.items(): header = template.format( "Line #", "Mem usage", "Increment", "Occurrences", "Line Contents" ) @@ -443,7 +443,7 @@ class MemoryProfiler(Profiler): float_format = "{0}.{1}f".format(precision + 4, precision) template_mem = "{0:" + float_format + "} MiB" - for (lineno, mem) in lines: + for lineno, mem in lines: total_mem: Union[float, str] inc: Union[float, str] occurrences: Union[float, str] diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py index 8ea9a310222..0e0d958f46a 100644 --- a/python/pyspark/rdd.py +++ b/python/pyspark/rdd.py @@ -290,7 +290,6 @@ def _local_iterator_from_socket(sock_info: "JavaArray", serializer: Serializer) # If response is 1 then there is a partition to read, if 0 then fully consumed self._read_status = 
read_int(self._sockfile) if self._read_status == 1: - # Load the partition data as a stream and read each item self._read_iter = self._serializer.load_stream(self._sockfile) for item in self._read_iter: @@ -3859,7 +3858,6 @@ class RDD(Generic[T_co]): limit = self._memory_limit() / 2 def add_shuffle_key(split: int, iterator: Iterable[Tuple[K, V]]) -> Iterable[bytes]: - buckets = defaultdict(list) c, batch = 0, min(10 * numPartitions, 1000) # type: ignore[operator] diff --git a/python/pyspark/serializers.py b/python/pyspark/serializers.py index 80b611429aa..d269d55653c 100644 --- a/python/pyspark/serializers.py +++ b/python/pyspark/serializers.py @@ -304,7 +304,7 @@ class CartesianDeserializer(Serializer): def _load_stream_without_unbatching(self, stream): key_batch_stream = self.key_ser._load_stream_without_unbatching(stream) val_batch_stream = self.val_ser._load_stream_without_unbatching(stream) - for (key_batch, val_batch) in zip(key_batch_stream, val_batch_stream): + for key_batch, val_batch in zip(key_batch_stream, val_batch_stream): # for correctness with repeated cartesian/zip this must be returned as one batch yield product(key_batch, val_batch) @@ -330,7 +330,7 @@ class PairDeserializer(Serializer): def _load_stream_without_unbatching(self, stream): key_batch_stream = self.key_ser._load_stream_without_unbatching(stream) val_batch_stream = self.val_ser._load_stream_without_unbatching(stream) - for (key_batch, val_batch) in zip(key_batch_stream, val_batch_stream): + for key_batch, val_batch in zip(key_batch_stream, val_batch_stream): # For double-zipped RDDs, the batches can be iterators from other PairDeserializer, # instead of lists. We need to convert them to lists if needed. key_batch = key_batch if hasattr(key_batch, "__len__") else list(key_batch) diff --git a/python/pyspark/sql/connect/client/core.py b/python/pyspark/sql/connect/client/core.py index 9bf3eb3f0ac..ce6ea8ba3ee 100644 --- a/python/pyspark/sql/connect/client/core.py +++ b/python/pyspark/sql/connect/client/core.py @@ -177,7 +177,6 @@ class ChannelBuilder: # 'spark.local.connect' is set when we use the local mode in Spark Connect. 
if session is not None and session.conf.get("spark.local.connect", "0") == "1": - jvm = PySparkSession._instantiatedSession._jvm # type: ignore[union-attr] return getattr( getattr( diff --git a/python/pyspark/sql/connect/conversion.py b/python/pyspark/sql/connect/conversion.py index 1afeb3dfd44..dc46e68f532 100644 --- a/python/pyspark/sql/connect/conversion.py +++ b/python/pyspark/sql/connect/conversion.py @@ -99,7 +99,6 @@ class LocalDataToArrowConversion: return lambda value: None elif isinstance(dataType, StructType): - field_names = dataType.fieldNames() dedup_field_names = _dedup_names(dataType.names) @@ -140,7 +139,6 @@ class LocalDataToArrowConversion: return convert_struct elif isinstance(dataType, ArrayType): - element_conv = LocalDataToArrowConversion._create_converter(dataType.elementType) def convert_array(value: Any) -> Any: @@ -153,7 +151,6 @@ class LocalDataToArrowConversion: return convert_array elif isinstance(dataType, MapType): - key_conv = LocalDataToArrowConversion._create_converter(dataType.keyType) value_conv = LocalDataToArrowConversion._create_converter(dataType.valueType) @@ -260,7 +257,6 @@ class LocalDataToArrowConversion: return convert_udt else: - return lambda value: value @staticmethod @@ -349,7 +345,6 @@ class ArrowTableToRowsConversion: return lambda value: None elif isinstance(dataType, StructType): - field_names = dataType.names dedup_field_names = _dedup_names(field_names) @@ -372,7 +367,6 @@ class ArrowTableToRowsConversion: return convert_struct elif isinstance(dataType, ArrayType): - element_conv = ArrowTableToRowsConversion._create_converter(dataType.elementType) def convert_array(value: Any) -> Any: @@ -385,7 +379,6 @@ class ArrowTableToRowsConversion: return convert_array elif isinstance(dataType, MapType): - key_conv = ArrowTableToRowsConversion._create_converter(dataType.keyType) value_conv = ArrowTableToRowsConversion._create_converter(dataType.valueType) @@ -446,7 +439,6 @@ class ArrowTableToRowsConversion: return convert_udt else: - return lambda value: value @staticmethod diff --git a/python/pyspark/sql/connect/expressions.py b/python/pyspark/sql/connect/expressions.py index 3174661b4bf..797910eba0b 100644 --- a/python/pyspark/sql/connect/expressions.py +++ b/python/pyspark/sql/connect/expressions.py @@ -879,7 +879,6 @@ class CastExpression(Expression): class UnresolvedNamedLambdaVariable(Expression): - _lock: Lock = Lock() _nextVarNameId: int = 0 diff --git a/python/pyspark/sql/connect/plan.py b/python/pyspark/sql/connect/plan.py index f641cb4b2fe..3e8db2aae09 100644 --- a/python/pyspark/sql/connect/plan.py +++ b/python/pyspark/sql/connect/plan.py @@ -55,7 +55,6 @@ if TYPE_CHECKING: class LogicalPlan: - _lock: Lock = Lock() _nextPlanId: int = 0 diff --git a/python/pyspark/sql/connect/proto/base_pb2.py b/python/pyspark/sql/connect/proto/base_pb2.py index fa1868b489c..731f4445e15 100644 --- a/python/pyspark/sql/connect/proto/base_pb2.py +++ b/python/pyspark/sql/connect/proto/base_pb2.py @@ -43,7 +43,6 @@ DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile( _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, globals()) _builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, "spark.connect.base_pb2", globals()) if _descriptor._USE_C_DESCRIPTORS == False: - DESCRIPTOR._options = None DESCRIPTOR._serialized_options = ( b"\n\036org.apache.spark.connect.protoP\001Z\022internal/generated" diff --git a/python/pyspark/sql/connect/proto/base_pb2.pyi b/python/pyspark/sql/connect/proto/base_pb2.pyi index 8fd5fa7a056..3dca29230ef 100644 --- 
a/python/pyspark/sql/connect/proto/base_pb2.pyi +++ b/python/pyspark/sql/connect/proto/base_pb2.pyi @@ -642,21 +642,24 @@ class AnalyzePlanRequest(google.protobuf.message.Message): @typing.overload def WhichOneof( self, oneof_group: typing_extensions.Literal["analyze", b"analyze"] - ) -> typing_extensions.Literal[ - "schema", - "explain", - "tree_string", - "is_local", - "is_streaming", - "input_files", - "spark_version", - "ddl_parse", - "same_semantics", - "semantic_hash", - "persist", - "unpersist", - "get_storage_level", - ] | None: ... + ) -> ( + typing_extensions.Literal[ + "schema", + "explain", + "tree_string", + "is_local", + "is_streaming", + "input_files", + "spark_version", + "ddl_parse", + "same_semantics", + "semantic_hash", + "persist", + "unpersist", + "get_storage_level", + ] + | None + ): ... global___AnalyzePlanRequest = AnalyzePlanRequest @@ -979,21 +982,24 @@ class AnalyzePlanResponse(google.protobuf.message.Message): ) -> None: ... def WhichOneof( self, oneof_group: typing_extensions.Literal["result", b"result"] - ) -> typing_extensions.Literal[ - "schema", - "explain", - "tree_string", - "is_local", - "is_streaming", - "input_files", - "spark_version", - "ddl_parse", - "same_semantics", - "semantic_hash", - "persist", - "unpersist", - "get_storage_level", - ] | None: ... + ) -> ( + typing_extensions.Literal[ + "schema", + "explain", + "tree_string", + "is_local", + "is_streaming", + "input_files", + "spark_version", + "ddl_parse", + "same_semantics", + "semantic_hash", + "persist", + "unpersist", + "get_storage_level", + ] + | None + ): ... global___AnalyzePlanResponse = AnalyzePlanResponse @@ -1508,16 +1514,19 @@ class ExecutePlanResponse(google.protobuf.message.Message): ) -> None: ... def WhichOneof( self, oneof_group: typing_extensions.Literal["response_type", b"response_type"] - ) -> typing_extensions.Literal[ - "arrow_batch", - "sql_command_result", - "write_stream_operation_start_result", - "streaming_query_command_result", - "get_resources_command_result", - "streaming_query_manager_command_result", - "result_complete", - "extension", - ] | None: ... + ) -> ( + typing_extensions.Literal[ + "arrow_batch", + "sql_command_result", + "write_stream_operation_start_result", + "streaming_query_command_result", + "get_resources_command_result", + "streaming_query_manager_command_result", + "result_complete", + "extension", + ] + | None + ): ... global___ExecutePlanResponse = ExecutePlanResponse @@ -1637,9 +1646,12 @@ class ConfigRequest(google.protobuf.message.Message): ) -> None: ... def WhichOneof( self, oneof_group: typing_extensions.Literal["op_type", b"op_type"] - ) -> typing_extensions.Literal[ - "set", "get", "get_with_default", "get_option", "get_all", "unset", "is_modifiable" - ] | None: ... + ) -> ( + typing_extensions.Literal[ + "set", "get", "get_with_default", "get_option", "get_all", "unset", "is_modifiable" + ] + | None + ): ... 
class Set(google.protobuf.message.Message): DESCRIPTOR: google.protobuf.descriptor.Descriptor diff --git a/python/pyspark/sql/connect/proto/catalog_pb2.py b/python/pyspark/sql/connect/proto/catalog_pb2.py index 709f0f005c7..833d21b7e46 100644 --- a/python/pyspark/sql/connect/proto/catalog_pb2.py +++ b/python/pyspark/sql/connect/proto/catalog_pb2.py @@ -39,7 +39,6 @@ DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile( _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, globals()) _builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, "spark.connect.catalog_pb2", globals()) if _descriptor._USE_C_DESCRIPTORS == False: - DESCRIPTOR._options = None DESCRIPTOR._serialized_options = ( b"\n\036org.apache.spark.connect.protoP\001Z\022internal/generated" diff --git a/python/pyspark/sql/connect/proto/catalog_pb2.pyi b/python/pyspark/sql/connect/proto/catalog_pb2.pyi index 3d14961329b..2879b44618b 100644 --- a/python/pyspark/sql/connect/proto/catalog_pb2.pyi +++ b/python/pyspark/sql/connect/proto/catalog_pb2.pyi @@ -283,34 +283,37 @@ class Catalog(google.protobuf.message.Message): ) -> None: ... def WhichOneof( self, oneof_group: typing_extensions.Literal["cat_type", b"cat_type"] - ) -> typing_extensions.Literal[ - "current_database", - "set_current_database", - "list_databases", - "list_tables", - "list_functions", - "list_columns", - "get_database", - "get_table", - "get_function", - "database_exists", - "table_exists", - "function_exists", - "create_external_table", - "create_table", - "drop_temp_view", - "drop_global_temp_view", - "recover_partitions", - "is_cached", - "cache_table", - "uncache_table", - "clear_cache", - "refresh_table", - "refresh_by_path", - "current_catalog", - "set_current_catalog", - "list_catalogs", - ] | None: ... + ) -> ( + typing_extensions.Literal[ + "current_database", + "set_current_database", + "list_databases", + "list_tables", + "list_functions", + "list_columns", + "get_database", + "get_table", + "get_function", + "database_exists", + "table_exists", + "function_exists", + "create_external_table", + "create_table", + "drop_temp_view", + "drop_global_temp_view", + "recover_partitions", + "is_cached", + "cache_table", + "uncache_table", + "clear_cache", + "refresh_table", + "refresh_by_path", + "current_catalog", + "set_current_catalog", + "list_catalogs", + ] + | None + ): ... global___Catalog = Catalog diff --git a/python/pyspark/sql/connect/proto/commands_pb2.py b/python/pyspark/sql/connect/proto/commands_pb2.py index 90911e382bf..503eee0803c 100644 --- a/python/pyspark/sql/connect/proto/commands_pb2.py +++ b/python/pyspark/sql/connect/proto/commands_pb2.py @@ -41,7 +41,6 @@ DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile( _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, globals()) _builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, "spark.connect.commands_pb2", globals()) if _descriptor._USE_C_DESCRIPTORS == False: - DESCRIPTOR._options = None DESCRIPTOR._serialized_options = ( b"\n\036org.apache.spark.connect.protoP\001Z\022internal/generated" diff --git a/python/pyspark/sql/connect/proto/commands_pb2.pyi b/python/pyspark/sql/connect/proto/commands_pb2.pyi index f3dca7ab4bb..1ee2602c961 100644 --- a/python/pyspark/sql/connect/proto/commands_pb2.pyi +++ b/python/pyspark/sql/connect/proto/commands_pb2.pyi @@ -177,19 +177,22 @@ class Command(google.protobuf.message.Message): ) -> None: ... 
def WhichOneof( self, oneof_group: typing_extensions.Literal["command_type", b"command_type"] - ) -> typing_extensions.Literal[ - "register_function", - "write_operation", - "create_dataframe_view", - "write_operation_v2", - "sql_command", - "write_stream_operation_start", - "streaming_query_command", - "get_resources_command", - "streaming_query_manager_command", - "register_table_function", - "extension", - ] | None: ... + ) -> ( + typing_extensions.Literal[ + "register_function", + "write_operation", + "create_dataframe_view", + "write_operation_v2", + "sql_command", + "write_stream_operation_start", + "streaming_query_command", + "get_resources_command", + "streaming_query_manager_command", + "register_table_function", + "extension", + ] + | None + ): ... global___Command = Command @@ -843,9 +846,12 @@ class WriteStreamOperationStart(google.protobuf.message.Message): @typing.overload def WhichOneof( self, oneof_group: typing_extensions.Literal["trigger", b"trigger"] - ) -> typing_extensions.Literal[ - "processing_time_interval", "available_now", "once", "continuous_checkpoint_interval" - ] | None: ... + ) -> ( + typing_extensions.Literal[ + "processing_time_interval", "available_now", "once", "continuous_checkpoint_interval" + ] + | None + ): ... global___WriteStreamOperationStart = WriteStreamOperationStart @@ -1092,16 +1098,19 @@ class StreamingQueryCommand(google.protobuf.message.Message): ) -> None: ... def WhichOneof( self, oneof_group: typing_extensions.Literal["command", b"command"] - ) -> typing_extensions.Literal[ - "status", - "last_progress", - "recent_progress", - "stop", - "process_all_available", - "explain", - "exception", - "await_termination", - ] | None: ... + ) -> ( + typing_extensions.Literal[ + "status", + "last_progress", + "recent_progress", + "stop", + "process_all_available", + "explain", + "exception", + "await_termination", + ] + | None + ): ... global___StreamingQueryCommand = StreamingQueryCommand @@ -1330,9 +1339,12 @@ class StreamingQueryCommandResult(google.protobuf.message.Message): ) -> None: ... def WhichOneof( self, oneof_group: typing_extensions.Literal["result_type", b"result_type"] - ) -> typing_extensions.Literal[ - "status", "recent_progress", "explain", "exception", "await_termination" - ] | None: ... + ) -> ( + typing_extensions.Literal[ + "status", "recent_progress", "explain", "exception", "await_termination" + ] + | None + ): ... global___StreamingQueryCommandResult = StreamingQueryCommandResult @@ -1503,15 +1515,18 @@ class StreamingQueryManagerCommand(google.protobuf.message.Message): ) -> None: ... def WhichOneof( self, oneof_group: typing_extensions.Literal["command", b"command"] - ) -> typing_extensions.Literal[ - "active", - "get_query", - "await_any_termination", - "reset_terminated", - "add_listener", - "remove_listener", - "list_listeners", - ] | None: ... + ) -> ( + typing_extensions.Literal[ + "active", + "get_query", + "await_any_termination", + "reset_terminated", + "add_listener", + "remove_listener", + "list_listeners", + ] + | None + ): ... global___StreamingQueryManagerCommand = StreamingQueryManagerCommand @@ -1695,15 +1710,18 @@ class StreamingQueryManagerCommandResult(google.protobuf.message.Message): ) -> None: ... def WhichOneof( self, oneof_group: typing_extensions.Literal["result_type", b"result_type"] - ) -> typing_extensions.Literal[ - "active", - "query", - "await_any_termination", - "reset_terminated", - "add_listener", - "remove_listener", - "list_listeners", - ] | None: ... 
+ ) -> ( + typing_extensions.Literal[ + "active", + "query", + "await_any_termination", + "reset_terminated", + "add_listener", + "remove_listener", + "list_listeners", + ] + | None + ): ... global___StreamingQueryManagerCommandResult = StreamingQueryManagerCommandResult diff --git a/python/pyspark/sql/connect/proto/common_pb2.py b/python/pyspark/sql/connect/proto/common_pb2.py index 9564f634342..7851d410de5 100644 --- a/python/pyspark/sql/connect/proto/common_pb2.py +++ b/python/pyspark/sql/connect/proto/common_pb2.py @@ -35,7 +35,6 @@ DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile( _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, globals()) _builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, "spark.connect.common_pb2", globals()) if _descriptor._USE_C_DESCRIPTORS == False: - DESCRIPTOR._options = None DESCRIPTOR._serialized_options = ( b"\n\036org.apache.spark.connect.protoP\001Z\022internal/generated" diff --git a/python/pyspark/sql/connect/proto/example_plugins_pb2.py b/python/pyspark/sql/connect/proto/example_plugins_pb2.py index 285edd4210e..0059137ecec 100644 --- a/python/pyspark/sql/connect/proto/example_plugins_pb2.py +++ b/python/pyspark/sql/connect/proto/example_plugins_pb2.py @@ -39,7 +39,6 @@ DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile( _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, globals()) _builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, "spark.connect.example_plugins_pb2", globals()) if _descriptor._USE_C_DESCRIPTORS == False: - DESCRIPTOR._options = None DESCRIPTOR._serialized_options = ( b"\n\036org.apache.spark.connect.protoP\001Z\022internal/generated" diff --git a/python/pyspark/sql/connect/proto/expressions_pb2.py b/python/pyspark/sql/connect/proto/expressions_pb2.py index 6d7a4d3bdcd..13f7f2c75de 100644 --- a/python/pyspark/sql/connect/proto/expressions_pb2.py +++ b/python/pyspark/sql/connect/proto/expressions_pb2.py @@ -39,7 +39,6 @@ DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile( _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, globals()) _builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, "spark.connect.expressions_pb2", globals()) if _descriptor._USE_C_DESCRIPTORS == False: - DESCRIPTOR._options = None DESCRIPTOR._serialized_options = ( b"\n\036org.apache.spark.connect.protoP\001Z\022internal/generated" diff --git a/python/pyspark/sql/connect/proto/expressions_pb2.pyi b/python/pyspark/sql/connect/proto/expressions_pb2.pyi index b3b9d62d03f..ba24da263cf 100644 --- a/python/pyspark/sql/connect/proto/expressions_pb2.pyi +++ b/python/pyspark/sql/connect/proto/expressions_pb2.pyi @@ -718,28 +718,31 @@ class Expression(google.protobuf.message.Message): ) -> None: ... def WhichOneof( self, oneof_group: typing_extensions.Literal["literal_type", b"literal_type"] - ) -> typing_extensions.Literal[ - "null", - "binary", - "boolean", - "byte", - "short", - "integer", - "long", - "float", - "double", - "decimal", - "string", - "date", - "timestamp", - "timestamp_ntz", - "calendar_interval", - "year_month_interval", - "day_time_interval", - "array", - "map", - "struct", - ] | None: ... + ) -> ( + typing_extensions.Literal[ + "null", + "binary", + "boolean", + "byte", + "short", + "integer", + "long", + "float", + "double", + "decimal", + "string", + "date", + "timestamp", + "timestamp_ntz", + "calendar_interval", + "year_month_interval", + "day_time_interval", + "array", + "map", + "struct", + ] + | None + ): ... 
class UnresolvedAttribute(google.protobuf.message.Message): """An unresolved attribute that is not explicitly bound to a specific column, but the column @@ -1318,27 +1321,30 @@ class Expression(google.protobuf.message.Message): ) -> None: ... def WhichOneof( self, oneof_group: typing_extensions.Literal["expr_type", b"expr_type"] - ) -> typing_extensions.Literal[ - "literal", - "unresolved_attribute", - "unresolved_function", - "expression_string", - "unresolved_star", - "alias", - "cast", - "unresolved_regex", - "sort_order", - "lambda_function", - "window", - "unresolved_extract_value", - "update_fields", - "unresolved_named_lambda_variable", - "common_inline_user_defined_function", - "call_function", - "named_argument_expression", - "get_column_by_ordinal", - "extension", - ] | None: ... + ) -> ( + typing_extensions.Literal[ + "literal", + "unresolved_attribute", + "unresolved_function", + "expression_string", + "unresolved_star", + "alias", + "cast", + "unresolved_regex", + "sort_order", + "lambda_function", + "window", + "unresolved_extract_value", + "update_fields", + "unresolved_named_lambda_variable", + "common_inline_user_defined_function", + "call_function", + "named_argument_expression", + "get_column_by_ordinal", + "extension", + ] + | None + ): ... global___Expression = Expression diff --git a/python/pyspark/sql/connect/proto/relations_pb2.py b/python/pyspark/sql/connect/proto/relations_pb2.py index 3f7e5794937..a97964c3bb9 100644 --- a/python/pyspark/sql/connect/proto/relations_pb2.py +++ b/python/pyspark/sql/connect/proto/relations_pb2.py @@ -41,7 +41,6 @@ DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile( _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, globals()) _builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, "spark.connect.relations_pb2", globals()) if _descriptor._USE_C_DESCRIPTORS == False: - DESCRIPTOR._options = None DESCRIPTOR._serialized_options = ( b"\n\036org.apache.spark.connect.protoP\001Z\022internal/generated" diff --git a/python/pyspark/sql/connect/proto/relations_pb2.pyi b/python/pyspark/sql/connect/proto/relations_pb2.pyi index 007b92ef5f4..62f96ce3165 100644 --- a/python/pyspark/sql/connect/proto/relations_pb2.pyi +++ b/python/pyspark/sql/connect/proto/relations_pb2.pyi @@ -507,59 +507,62 @@ class Relation(google.protobuf.message.Message): ) -> None: ... def WhichOneof( self, oneof_group: typing_extensions.Literal["rel_type", b"rel_type"] - ) -> typing_extensions.Literal[ - "read", - "project", - "filter", - "join", - "set_op", - "sort", - "limit", - "aggregate", - "sql", - "local_relation", - "sample", - "offset", - "deduplicate", - "range", - "subquery_alias", - "repartition", - "to_df", - "with_columns_renamed", - "show_string", - "drop", - "tail", - "with_columns", - "hint", - "unpivot", - "to_schema", - "repartition_by_expression", - "map_partitions", - "collect_metrics", - "parse", - "group_map", - "co_group_map", - "with_watermark", - "apply_in_pandas_with_state", - "html_string", - "cached_local_relation", - "cached_remote_relation", - "common_inline_user_defined_table_function", - "fill_na", - "drop_na", - "replace", - "summary", - "crosstab", - "describe", - "cov", - "corr", - "approx_quantile", - "freq_items", - "sample_by", - "catalog", - "extension", - "unknown", - ] | None: ... 
+ ) -> ( + typing_extensions.Literal[ + "read", + "project", + "filter", + "join", + "set_op", + "sort", + "limit", + "aggregate", + "sql", + "local_relation", + "sample", + "offset", + "deduplicate", + "range", + "subquery_alias", + "repartition", + "to_df", + "with_columns_renamed", + "show_string", + "drop", + "tail", + "with_columns", + "hint", + "unpivot", + "to_schema", + "repartition_by_expression", + "map_partitions", + "collect_metrics", + "parse", + "group_map", + "co_group_map", + "with_watermark", + "apply_in_pandas_with_state", + "html_string", + "cached_local_relation", + "cached_remote_relation", + "common_inline_user_defined_table_function", + "fill_na", + "drop_na", + "replace", + "summary", + "crosstab", + "describe", + "cov", + "corr", + "approx_quantile", + "freq_items", + "sample_by", + "catalog", + "extension", + "unknown", + ] + | None + ): ... global___Relation = Relation diff --git a/python/pyspark/sql/connect/proto/types_pb2.py b/python/pyspark/sql/connect/proto/types_pb2.py index 36197033480..221600868ab 100644 --- a/python/pyspark/sql/connect/proto/types_pb2.py +++ b/python/pyspark/sql/connect/proto/types_pb2.py @@ -35,7 +35,6 @@ DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile( _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, globals()) _builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, "spark.connect.types_pb2", globals()) if _descriptor._USE_C_DESCRIPTORS == False: - DESCRIPTOR._options = None DESCRIPTOR._serialized_options = ( b"\n\036org.apache.spark.connect.protoP\001Z\022internal/generated" diff --git a/python/pyspark/sql/connect/proto/types_pb2.pyi b/python/pyspark/sql/connect/proto/types_pb2.pyi index 956701b4c36..21c05f54827 100644 --- a/python/pyspark/sql/connect/proto/types_pb2.pyi +++ b/python/pyspark/sql/connect/proto/types_pb2.pyi @@ -950,31 +950,34 @@ class DataType(google.protobuf.message.Message): ) -> None: ... def WhichOneof( self, oneof_group: typing_extensions.Literal["kind", b"kind"] - ) -> typing_extensions.Literal[ - "null", - "binary", - "boolean", - "byte", - "short", - "integer", - "long", - "float", - "double", - "decimal", - "string", - "char", - "var_char", - "date", - "timestamp", - "timestamp_ntz", - "calendar_interval", - "year_month_interval", - "day_time_interval", - "array", - "struct", - "map", - "udt", - "unparsed", - ] | None: ... + ) -> ( + typing_extensions.Literal[ + "null", + "binary", + "boolean", + "byte", + "short", + "integer", + "long", + "float", + "double", + "decimal", + "string", + "char", + "var_char", + "date", + "timestamp", + "timestamp_ntz", + "calendar_interval", + "year_month_interval", + "day_time_interval", + "array", + "struct", + "map", + "udt", + "unparsed", + ] + | None + ): ... 
global___DataType = DataType diff --git a/python/pyspark/sql/connect/session.py b/python/pyspark/sql/connect/session.py index 934386ce954..025eed46647 100644 --- a/python/pyspark/sql/connect/session.py +++ b/python/pyspark/sql/connect/session.py @@ -811,7 +811,6 @@ class SparkSession: """ session = PySparkSession._instantiatedSession if session is None or session._sc._jsc is None: - # Configurations to be overwritten overwrite_conf = opts overwrite_conf["spark.master"] = master diff --git a/python/pyspark/sql/connect/window.py b/python/pyspark/sql/connect/window.py index ad082c6e265..bab476e4782 100644 --- a/python/pyspark/sql/connect/window.py +++ b/python/pyspark/sql/connect/window.py @@ -70,7 +70,6 @@ class WindowSpec: orderSpec: Sequence[SortOrder], frame: Optional[WindowFrame], ) -> None: - assert isinstance(partitionSpec, list) and all( isinstance(p, Expression) for p in partitionSpec ) diff --git a/python/pyspark/sql/context.py b/python/pyspark/sql/context.py index 817c3b97337..eaec42b1ed0 100644 --- a/python/pyspark/sql/context.py +++ b/python/pyspark/sql/context.py @@ -165,7 +165,6 @@ class SQLContext: def _get_or_create( cls: Type["SQLContext"], sc: SparkContext, **static_conf: Any ) -> "SQLContext": - if ( cls._instantiatedContext is None or SQLContext._instantiatedContext._sc._jsc is None # type: ignore[union-attr] diff --git a/python/pyspark/sql/pandas/conversion.py b/python/pyspark/sql/pandas/conversion.py index cc775de0b79..f37d50f57ab 100644 --- a/python/pyspark/sql/pandas/conversion.py +++ b/python/pyspark/sql/pandas/conversion.py @@ -99,7 +99,6 @@ class PandasConversionMixin: require_minimum_pyarrow_version() to_arrow_schema(self.schema) except Exception as e: - if jconf.arrowPySparkFallbackEnabled(): msg = ( "toPandas attempted Arrow optimization because " diff --git a/python/pyspark/sql/pandas/functions.py b/python/pyspark/sql/pandas/functions.py index 652129180df..64969a05163 100644 --- a/python/pyspark/sql/pandas/functions.py +++ b/python/pyspark/sql/pandas/functions.py @@ -415,7 +415,6 @@ def pandas_udf(f=None, returnType=None, functionType=None): PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF_WITH_STATE, None, ]: # None means it should infer the type from type hints. 
- raise PySparkTypeError( error_class="INVALID_PANDAS_UDF_TYPE", message_parameters={ diff --git a/python/pyspark/sql/pandas/types.py b/python/pyspark/sql/pandas/types.py index 92e2ef1dc44..be07f6e50e1 100644 --- a/python/pyspark/sql/pandas/types.py +++ b/python/pyspark/sql/pandas/types.py @@ -553,7 +553,6 @@ def _create_converter_to_pandas( def _converter( dt: DataType, _struct_in_pandas: Optional[str], _ndarray_as_list: bool ) -> Optional[Callable[[Any], Any]]: - if isinstance(dt, ArrayType): _element_conv = _converter(dt.elementType, _struct_in_pandas, _ndarray_as_list) @@ -649,7 +648,6 @@ def _create_converter_to_pandas( ] if _struct_in_pandas == "row": - if all(conv is None for conv in field_convs): def convert_struct_as_row(value: Any) -> Any: @@ -687,7 +685,6 @@ def _create_converter_to_pandas( return convert_struct_as_row elif _struct_in_pandas == "dict": - if all(conv is None for conv in field_convs): def convert_struct_as_dict(value: Any) -> Any: @@ -829,7 +826,6 @@ def _create_converter_from_pandas( return correct_timestamp def _converter(dt: DataType) -> Optional[Callable[[Any], Any]]: - if isinstance(dt, ArrayType): _element_conv = _converter(dt.elementType) @@ -875,7 +871,6 @@ def _create_converter_from_pandas( _value_conv = _converter(dt.valueType) if ignore_unexpected_complex_type_values: - if _key_conv is None and _value_conv is None: def convert_map(value: Any) -> Any: @@ -901,7 +896,6 @@ def _create_converter_from_pandas( return value else: - if _key_conv is None and _value_conv is None: def convert_map(value: Any) -> Any: @@ -923,7 +917,6 @@ def _create_converter_from_pandas( return convert_map elif isinstance(dt, StructType): - field_names = dt.names if error_on_duplicated_field_names and len(set(field_names)) != len(field_names): @@ -937,7 +930,6 @@ def _create_converter_from_pandas( field_convs = [_converter(f.dataType) for f in dt.fields] if ignore_unexpected_complex_type_values: - if all(conv is None for conv in field_convs): def convert_struct(value: Any) -> Any: diff --git a/python/pyspark/sql/session.py b/python/pyspark/sql/session.py index d3f3359acea..1895bf32ccf 100644 --- a/python/pyspark/sql/session.py +++ b/python/pyspark/sql/session.py @@ -273,7 +273,7 @@ class SparkSession(SparkConversionMixin): """ with self._lock: if conf is not None: - for (k, v) in conf.getAll(): + for k, v in conf.getAll(): self._validate_startup_urls() self._options[k] = v elif map is not None: diff --git a/python/pyspark/sql/tests/connect/test_connect_function.py b/python/pyspark/sql/tests/connect/test_connect_function.py index 46b66537299..f5aaf535de7 100644 --- a/python/pyspark/sql/tests/connect/test_connect_function.py +++ b/python/pyspark/sql/tests/connect/test_connect_function.py @@ -728,7 +728,6 @@ class SparkConnectFunctionTests(ReusedConnectTestCase, PandasOnSparkTestUtils, S (CF.ntile(2), SF.ntile(2)), (CF.ntile(4), SF.ntile(4)), ]: - for cwin, swin in [ (CW.orderBy("b"), SW.orderBy("b")), (CW.partitionBy("a").orderBy("b"), SW.partitionBy("a").orderBy("b")), @@ -743,7 +742,6 @@ class SparkConnectFunctionTests(ReusedConnectTestCase, PandasOnSparkTestUtils, S SW.partitionBy("a").orderBy("b", sdf.c.desc()), ), ]: - self.assert_eq( cdf.select(ccol.over(cwin)).toPandas(), sdf.select(scol.over(swin)).toPandas(), @@ -756,7 +754,6 @@ class SparkConnectFunctionTests(ReusedConnectTestCase, PandasOnSparkTestUtils, S (CF.max(cdf.c), SF.max(sdf.c)), (CF.min(cdf.c), SF.min(sdf.c)), ]: - for cwin, swin in [ (CW.orderBy("b"), SW.orderBy("b")), ( @@ -857,7 +854,6 @@ class 
SparkConnectFunctionTests(ReusedConnectTestCase, PandasOnSparkTestUtils, S .rangeBetween(SW.currentRow, SW.unboundedFollowing), ), ]: - self.assert_eq( cdf.select(ccol.over(cwin)).toPandas(), sdf.select(scol.over(swin)).toPandas(), diff --git a/python/pyspark/sql/tests/connect/test_connect_plan.py b/python/pyspark/sql/tests/connect/test_connect_plan.py index c39fb6be24c..bd1c6e03715 100644 --- a/python/pyspark/sql/tests/connect/test_connect_plan.py +++ b/python/pyspark/sql/tests/connect/test_connect_plan.py @@ -782,7 +782,7 @@ class SparkConnectPlanTests(PlanOnlyTestFixture): def test_join_with_join_type(self): df_left = self.connect.with_plan(Read("table")) df_right = self.connect.with_plan(Read("table")) - for (join_type_str, join_type) in [ + for join_type_str, join_type in [ (None, proto.Join.JoinType.JOIN_TYPE_INNER), ("inner", proto.Join.JoinType.JOIN_TYPE_INNER), ("outer", proto.Join.JoinType.JOIN_TYPE_FULL_OUTER), @@ -844,7 +844,6 @@ class SparkConnectPlanTests(PlanOnlyTestFixture): self.assertEqual(bin_lit_p.literal.binary, val) def test_uuid_literal(self): - val = uuid.uuid4() with self.assertRaises(TypeError): lit(val) diff --git a/python/pyspark/sql/tests/connect/test_session.py b/python/pyspark/sql/tests/connect/test_session.py index 365468c8608..1409e861c6b 100644 --- a/python/pyspark/sql/tests/connect/test_session.py +++ b/python/pyspark/sql/tests/connect/test_session.py @@ -116,7 +116,6 @@ class JobCancellationTests(ReusedConnectTestCase): def check_job_cancellation( self, setter, canceller, thread_ids, thread_ids_to_cancel, thread_ids_to_run ): - job_id_a = "job_ids_to_cancel" job_id_b = "job_ids_to_run" threads = [] diff --git a/python/pyspark/sql/tests/pandas/test_pandas_grouped_map.py b/python/pyspark/sql/tests/pandas/test_pandas_grouped_map.py index 742b3657f6e..38650b972ea 100644 --- a/python/pyspark/sql/tests/pandas/test_pandas_grouped_map.py +++ b/python/pyspark/sql/tests/pandas/test_pandas_grouped_map.py @@ -85,7 +85,6 @@ class GroupedApplyInPandasTestsMixin: ) def test_supported_types(self): - values = [ 1, 2, @@ -557,7 +556,6 @@ class GroupedApplyInPandasTestsMixin: self.check_column_order() def check_column_order(self): - # Helper function to set column names from a list def rename_pdf(pdf, names): pdf.rename( @@ -692,7 +690,6 @@ class GroupedApplyInPandasTestsMixin: self.assertEqual(result, expected) def test_grouped_over_window(self): - data = [ (0, 1, "2018-03-10T00:00:00+00:00", [0]), (1, 2, "2018-03-11T00:00:00+00:00", [0]), @@ -724,7 +721,6 @@ class GroupedApplyInPandasTestsMixin: self.assertListEqual([Row(id=key, result=val) for key, val in expected.items()], result) def test_grouped_over_window_with_key(self): - data = [ (0, 1, "2018-03-10T00:00:00+00:00", [0]), (1, 2, "2018-03-11T00:00:00+00:00", [0]), diff --git a/python/pyspark/sql/tests/streaming/test_streaming_foreach.py b/python/pyspark/sql/tests/streaming/test_streaming_foreach.py index 0c5347dd986..5041fefff19 100644 --- a/python/pyspark/sql/tests/streaming/test_streaming_foreach.py +++ b/python/pyspark/sql/tests/streaming/test_streaming_foreach.py @@ -230,7 +230,6 @@ class StreamingTestsForeachMixin: # TODO: Verify whether original error message is inside the exception def test_streaming_foreach_with_invalid_writers(self): - tester = self.ForeachWriterTester(self.spark) def func_with_iterator_input(iter): diff --git a/python/pyspark/sql/tests/test_dataframe.py b/python/pyspark/sql/tests/test_dataframe.py index 2ffd4f312ff..150130f3c1a 100644 --- a/python/pyspark/sql/tests/test_dataframe.py +++ 
b/python/pyspark/sql/tests/test_dataframe.py @@ -1181,7 +1181,6 @@ class DataFrameTestsMixin: # Cartesian products require cross join syntax def test_require_cross(self): - df1 = self.spark.createDataFrame([(1, "1")], ("key", "value")) df2 = self.spark.createDataFrame([(1, "1")], ("key", "value")) diff --git a/python/pyspark/sql/tests/test_session.py b/python/pyspark/sql/tests/test_session.py index dacaff4d2d2..706b041bb51 100644 --- a/python/pyspark/sql/tests/test_session.py +++ b/python/pyspark/sql/tests/test_session.py @@ -33,7 +33,6 @@ class SparkSessionTests(ReusedSQLTestCase): class SparkSessionTests1(ReusedSQLTestCase): - # We can't include this test into SQLTests because we will stop class's SparkContext and cause # other tests failed. def test_sparksession_with_stopped_sparkcontext(self): @@ -49,7 +48,6 @@ class SparkSessionTests1(ReusedSQLTestCase): class SparkSessionTests2(PySparkTestCase): - # This test is separate because it's closely related with session's start and stop. # See SPARK-23228. def test_set_jvm_default_session(self): diff --git a/python/pyspark/sql/utils.py b/python/pyspark/sql/utils.py index 45df4433916..f584440e121 100644 --- a/python/pyspark/sql/utils.py +++ b/python/pyspark/sql/utils.py @@ -184,7 +184,6 @@ def try_remote_functions(f: FuncT) -> FuncT: @functools.wraps(f) def wrapped(*args: Any, **kwargs: Any) -> Any: - if is_remote() and "PYSPARK_NO_NAMESPACE_SHARE" not in os.environ: from pyspark.sql.connect import functions @@ -200,7 +199,6 @@ def try_remote_avro_functions(f: FuncT) -> FuncT: @functools.wraps(f) def wrapped(*args: Any, **kwargs: Any) -> Any: - if is_remote() and "PYSPARK_NO_NAMESPACE_SHARE" not in os.environ: from pyspark.sql.connect.avro import functions @@ -216,7 +214,6 @@ def try_remote_protobuf_functions(f: FuncT) -> FuncT: @functools.wraps(f) def wrapped(*args: Any, **kwargs: Any) -> Any: - if is_remote() and "PYSPARK_NO_NAMESPACE_SHARE" not in os.environ: from pyspark.sql.connect.protobuf import functions @@ -232,7 +229,6 @@ def try_remote_window(f: FuncT) -> FuncT: @functools.wraps(f) def wrapped(*args: Any, **kwargs: Any) -> Any: - if is_remote() and "PYSPARK_NO_NAMESPACE_SHARE" not in os.environ: from pyspark.sql.connect.window import Window # type: ignore[misc] @@ -248,7 +244,6 @@ def try_remote_windowspec(f: FuncT) -> FuncT: @functools.wraps(f) def wrapped(*args: Any, **kwargs: Any) -> Any: - if is_remote() and "PYSPARK_NO_NAMESPACE_SHARE" not in os.environ: from pyspark.sql.connect.window import WindowSpec @@ -289,7 +284,6 @@ def try_remote_session_classmethod(f: FuncT) -> FuncT: @functools.wraps(f) def wrapped(*args: Any, **kwargs: Any) -> Any: - if is_remote() and "PYSPARK_NO_NAMESPACE_SHARE" not in os.environ: from pyspark.sql.connect.session import SparkSession # type: ignore[misc] diff --git a/python/pyspark/streaming/tests/test_context.py b/python/pyspark/streaming/tests/test_context.py index 1afcc90b9e6..f3fc501a57f 100644 --- a/python/pyspark/streaming/tests/test_context.py +++ b/python/pyspark/streaming/tests/test_context.py @@ -24,7 +24,6 @@ from pyspark.testing.streamingutils import PySparkStreamingTestCase class StreamingContextTests(PySparkStreamingTestCase): - duration = 0.1 setupCalled = False diff --git a/python/pyspark/streaming/tests/test_dstream.py b/python/pyspark/streaming/tests/test_dstream.py index d37e64affb5..046247763c0 100644 --- a/python/pyspark/streaming/tests/test_dstream.py +++ b/python/pyspark/streaming/tests/test_dstream.py @@ -435,7 +435,6 @@ class BasicOperationTests(PySparkStreamingTestCase): 
"The tests fail in PyPy3 implementation for an unknown reason.", ) class WindowFunctionTests(PySparkStreamingTestCase): - timeout = 15 def test_window(self): @@ -528,7 +527,6 @@ class WindowFunctionTests(PySparkStreamingTestCase): "The tests fail in PyPy3 implementation for an unknown reason.", ) class CheckpointTests(unittest.TestCase): - setupCalled = False @staticmethod diff --git a/python/pyspark/streaming/tests/test_listener.py b/python/pyspark/streaming/tests/test_listener.py index aeec278b38a..7769f3bedaa 100644 --- a/python/pyspark/streaming/tests/test_listener.py +++ b/python/pyspark/streaming/tests/test_listener.py @@ -19,7 +19,6 @@ from pyspark.testing.streamingutils import PySparkStreamingTestCase class StreamingListenerTests(PySparkStreamingTestCase): - duration = 0.5 class BatchInfoCollector(StreamingListener): diff --git a/python/pyspark/testing/mlutils.py b/python/pyspark/testing/mlutils.py index 503ba7c7696..8981e97ea49 100644 --- a/python/pyspark/testing/mlutils.py +++ b/python/pyspark/testing/mlutils.py @@ -124,7 +124,6 @@ class MockTransformer(Transformer, HasFake): class MockUnaryTransformer(UnaryTransformer, DefaultParamsReadable, DefaultParamsWritable): - shift = Param( Params._dummy(), "shift", diff --git a/python/pyspark/testing/pandasutils.py b/python/pyspark/testing/pandasutils.py index d04f4702851..3cc6b6a8f59 100644 --- a/python/pyspark/testing/pandasutils.py +++ b/python/pyspark/testing/pandasutils.py @@ -560,7 +560,6 @@ class ComparisonTestBase(PandasOnSparkTestCase): def compare_both(f=None, almost=True): - if f is None: return functools.partial(compare_both, almost=almost) elif isinstance(f, bool): @@ -644,7 +643,6 @@ def assert_produces_warning( __tracebackhide__ = True with warnings.catch_warnings(record=True) as w: - saw_warning = False warnings.simplefilter(filter_level) yield w diff --git a/python/pyspark/testing/streamingutils.py b/python/pyspark/testing/streamingutils.py index 57f27cec7ab..dba60b50fcc 100644 --- a/python/pyspark/testing/streamingutils.py +++ b/python/pyspark/testing/streamingutils.py @@ -57,7 +57,6 @@ should_test_kinesis = kinesis_requirement_message is None class PySparkStreamingTestCase(unittest.TestCase): - timeout = 30 # seconds duration = 0.5 diff --git a/python/pyspark/tests/test_install_spark.py b/python/pyspark/tests/test_install_spark.py index e980a17673f..c4188034a10 100644 --- a/python/pyspark/tests/test_install_spark.py +++ b/python/pyspark/tests/test_install_spark.py @@ -83,7 +83,7 @@ class SparkInstallationTestCase(unittest.TestCase): ) # Negative test cases - for (hadoop_version, hive_version) in UNSUPPORTED_COMBINATIONS: + for hadoop_version, hive_version in UNSUPPORTED_COMBINATIONS: with self.assertRaisesRegex(RuntimeError, "Hive.*should.*Hadoop"): checked_versions( spark_version=test_version, diff --git a/python/pyspark/tests/test_pin_thread.py b/python/pyspark/tests/test_pin_thread.py index 975b5498089..17b7167472d 100644 --- a/python/pyspark/tests/test_pin_thread.py +++ b/python/pyspark/tests/test_pin_thread.py @@ -99,7 +99,6 @@ class PinThreadTests(unittest.TestCase): ) def check_job_cancellation(self, setter, canceller): - job_id_a = "job_ids_to_cancel" job_id_b = "job_ids_to_run" diff --git a/python/pyspark/worker.py b/python/pyspark/worker.py index eea6e8fa783..026c07fb998 100644 --- a/python/pyspark/worker.py +++ b/python/pyspark/worker.py @@ -983,7 +983,6 @@ def read_udfs(pickleSer, infile, eval_type): PythonEvalType.SQL_WINDOW_AGG_PANDAS_UDF, PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF_WITH_STATE, ): - # Load 
conf used for pandas_udf evaluation num_conf = read_int(infile) for i in range(num_conf): diff --git a/python/pyspark/worker_util.py b/python/pyspark/worker_util.py index 722713b6f54..9ee18758ce4 100644 --- a/python/pyspark/worker_util.py +++ b/python/pyspark/worker_util.py @@ -176,5 +176,5 @@ def send_accumulator_updates(outfile: IO) -> None: Send the accumulator updates back to JVM. """ write_int(len(_accumulatorRegistry), outfile) - for (aid, accum) in _accumulatorRegistry.items(): + for aid, accum in _accumulatorRegistry.items(): pickleSer._write_with_length((aid, accum._value), outfile)
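
For reviewers skimming the diff above, the hunks reduce to three mechanical rewrites that black 23.9.1 applies and 22.6.0 did not: long return annotations ending in `| None` are wrapped in parentheses, the blank line directly after a block opener (`def ...:`, `class ...:`, `if ...:`) is removed, and redundant parentheses around tuple targets in `for` statements are dropped. The following minimal sketch reproduces all three on an invented toy class; the names `Sketch`, `state`, and `data` are illustrative only and do not appear in the Spark tree, and the exact line breaks depend on the configured line length.

    from __future__ import annotations

    from typing import Literal


    class Sketch:
        data = {"active": 1, "query": 0}

        # black 22.6.0 kept this signature on one over-long line, allowed a
        # blank line after the "def ...:" opener, and kept the parentheses in
        # "for (k, v) in ...". black 23.9.1 rewrites all three, matching the
        # hunks above: the return annotation is parenthesized before "| None",
        # the blank line after the block opener is removed, and the tuple
        # target loses its redundant parentheses.
        def state(
            self, key: str
        ) -> (
            Literal[
                "active",
                "query",
                "await_any_termination",
                "reset_terminated",
            ]
            | None
        ):
            for k, v in self.data.items():
                if k == key and v:
                    return "active"
            return None


    print(Sketch().state("active"))  # -> active
    print(Sketch().state("query"))   # -> None (the stored value is falsy)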