This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
     new 462d4565cd2 [SPARK-43111][PS][CONNECT][PYTHON] Merge nested `if` statements into single `if` statements
462d4565cd2 is described below

commit 462d4565cd2782fe805c9871eeab2d969c79369f
Author: Bjørn Jørgensen <bjornjorgen...@gmail.com>
AuthorDate: Tue Apr 18 13:13:10 2023 +0900

    [SPARK-43111][PS][CONNECT][PYTHON] Merge nested `if` statements into single `if` statements

    ### What changes were proposed in this pull request?
    This PR simplifies the code by merging nested `if` statements into single `if` statements using the `and` operator. There are 7 of these according to [Sonarcloud](https://sonarcloud.io/project/issues?languages=py&resolved=false&rules=python%3AS1066&id=spark-python&open=AYQdnXXBRrJbVxW9ZDpw), and this PR fixes them all.

    ### Why are the changes needed?
    The changes do not affect the functionality of the code, but they improve readability and maintainability.

    ### Does this PR introduce _any_ user-facing change?
    No.

    ### How was this patch tested?
    Pass GA.

    Closes #40759 from bjornjorgensen/Merge-if-with-the-enclosing-one.

    Lead-authored-by: Bjørn Jørgensen <bjornjorgen...@gmail.com>
    Co-authored-by: bjornjorgensen <bjornjorgen...@gmail.com>
    Signed-off-by: Hyukjin Kwon <gurwls...@apache.org>
---
 python/pyspark/accumulators.py                     |  5 ++---
 python/pyspark/pandas/frame.py                     | 17 ++++++++++-------
 python/pyspark/pandas/groupby.py                   | 17 ++++++++---------
 python/pyspark/pandas/indexes/base.py              |  9 ++++-----
 python/pyspark/pandas/namespace.py                 |  5 ++---
 python/pyspark/sql/connect/streaming/readwriter.py | 11 +++++------
 6 files changed, 31 insertions(+), 33 deletions(-)

diff --git a/python/pyspark/accumulators.py b/python/pyspark/accumulators.py
index fe775a37ed8..ce4bb561814 100644
--- a/python/pyspark/accumulators.py
+++ b/python/pyspark/accumulators.py
@@ -249,9 +249,8 @@ class _UpdateRequestHandler(SocketServer.StreamRequestHandler):
             while not self.server.server_shutdown:  # type: ignore[attr-defined]
                 # Poll every 1 second for new data -- don't block in case of shutdown.
                 r, _, _ = select.select([self.rfile], [], [], 1)
-                if self.rfile in r:
-                    if func():
-                        break
+                if self.rfile in r and func():
+                    break
 
         def accum_updates() -> bool:
             num_updates = read_int(self.rfile)
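A note on why this class of rewrite is behavior-preserving: Python's `and` short-circuits, so in the merged form the second condition still runs only when the first is true, exactly as in the nested version. A minimal standalone sketch of the pattern this patch applies (the names below are illustrative, not taken from the patch):

    def nested(ready, func):
        # Before: nested `if` statements (flagged by Sonar rule python:S1066).
        if ready:
            if func():
                return True
        return False

    def merged(ready, func):
        # After: a single `if` joined with `and`; func() is still evaluated lazily.
        if ready and func():
            return True
        return False

    assert nested(True, lambda: True) == merged(True, lambda: True)
    # With ready=False, func is never called in either version:
    assert nested(False, lambda: 1 / 0) == merged(False, lambda: 1 / 0)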
diff --git a/python/pyspark/pandas/frame.py b/python/pyspark/pandas/frame.py
index 8bddcb6bae8..d1c10223432 100644
--- a/python/pyspark/pandas/frame.py
+++ b/python/pyspark/pandas/frame.py
@@ -8915,15 +8915,19 @@ defaultdict(<class 'list'>, {'col..., 'col...})]
         if len(index_scols) != other._internal.index_level:
             raise ValueError("Both DataFrames have to have the same number of index levels")
 
-        if verify_integrity and len(index_scols) > 0:
-            if (
+        if (
+            verify_integrity
+            and len(index_scols) > 0
+            and (
                 self._internal.spark_frame.select(index_scols)
                 .intersect(
                     other._internal.spark_frame.select(other._internal.index_spark_columns)
                 )
                 .count()
-            ) > 0:
-                raise ValueError("Indices have overlapping values")
+            )
+            > 0
+        ):
+            raise ValueError("Indices have overlapping values")
 
         # Lazy import to avoid circular dependency issues
         from pyspark.pandas.namespace import concat
@@ -11581,9 +11585,8 @@ defaultdict(<class 'list'>, {'col..., 'col...})]
         index_columns = psdf._internal.index_spark_column_names
         num_indices = len(index_columns)
 
-        if level:
-            if level < 0 or level >= num_indices:
-                raise ValueError("level should be an integer between [0, %s)" % num_indices)
+        if level is not None and (level < 0 or level >= num_indices):
+            raise ValueError("level should be an integer between [0, %s)" % num_indices)
 
         @pandas_udf(returnType=index_mapper_ret_stype)  # type: ignore[call-overload]
         def index_mapper_udf(s: pd.Series) -> pd.Series:
diff --git a/python/pyspark/pandas/groupby.py b/python/pyspark/pandas/groupby.py
index 01687c3fd16..01bc72cd809 100644
--- a/python/pyspark/pandas/groupby.py
+++ b/python/pyspark/pandas/groupby.py
@@ -3550,15 +3550,14 @@ class GroupBy(Generic[FrameLike], metaclass=ABCMeta):
             if isinstance(self, SeriesGroupBy):
                 raise TypeError("Only numeric aggregation column is accepted.")
 
-        if not numeric_only:
-            if has_non_numeric:
-                warnings.warn(
-                    "Dropping invalid columns in DataFrameGroupBy.%s is deprecated. "
-                    "In a future version, a TypeError will be raised. "
-                    "Before calling .%s, select only columns which should be "
-                    "valid for the function." % (function_name, function_name),
-                    FutureWarning,
-                )
+        if not numeric_only and has_non_numeric:
+            warnings.warn(
+                "Dropping invalid columns in DataFrameGroupBy.%s is deprecated. "
+                "In a future version, a TypeError will be raised. "
+                "Before calling .%s, select only columns which should be "
+                "valid for the function." % (function_name, function_name),
+                FutureWarning,
+            )
 
     def _reduce_for_stat_function(
         self,
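The frame.py hunk at line 11581 above is slightly more than a mechanical merge: `if level:` became `if level is not None`, which matters because `level=0` is falsy and would previously bypass the range check entirely. A small sketch of the difference, using a hypothetical helper to record whether the check ran (not Spark code):

    calls = []

    def out_of_range(level, num_indices):
        calls.append(level)  # record that the range check actually ran
        return level < 0 or level >= num_indices

    def old_guard(level, num_indices):
        # Truthiness test: level=0 is falsy, so validation was skipped.
        return bool(level) and out_of_range(level, num_indices)

    def new_guard(level, num_indices):
        # Explicit None test: level=0 is validated like any other level.
        return level is not None and out_of_range(level, num_indices)

    old_guard(0, 3)
    assert calls == []      # old: the range check never ran for level=0
    new_guard(0, 3)
    assert calls == [0]     # new: the check runs; 0 is in range, so no error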
diff --git a/python/pyspark/pandas/indexes/base.py b/python/pyspark/pandas/indexes/base.py
index 4e8de350998..e38cf267720 100644
--- a/python/pyspark/pandas/indexes/base.py
+++ b/python/pyspark/pandas/indexes/base.py
@@ -2095,11 +2095,10 @@ class Index(IndexOpsMixin):
         """
         from pyspark.pandas.indexes.multi import MultiIndex
 
-        if isinstance(self, MultiIndex):
-            if level is not None:
-                self_names = self.names
-                self_names[level] = names  # type: ignore[index]
-                names = self_names
+        if isinstance(self, MultiIndex) and level is not None:
+            self_names = self.names
+            self_names[level] = names  # type: ignore[index]
+            names = self_names
         return self.rename(name=names, inplace=inplace)
 
     def difference(self, other: "Index", sort: Optional[bool] = None) -> "Index":
diff --git a/python/pyspark/pandas/namespace.py b/python/pyspark/pandas/namespace.py
index fdd9f86e402..b0a6073b813 100644
--- a/python/pyspark/pandas/namespace.py
+++ b/python/pyspark/pandas/namespace.py
@@ -2190,9 +2190,8 @@ def get_dummies(
     if sparse is not False:
         raise NotImplementedError("get_dummies currently does not support sparse")
 
-    if columns is not None:
-        if not is_list_like(columns):
-            raise TypeError("Input must be a list-like for parameter `columns`")
+    if columns is not None and not is_list_like(columns):
+        raise TypeError("Input must be a list-like for parameter `columns`")
 
     if dtype is None:
         dtype = "byte"
diff --git a/python/pyspark/sql/connect/streaming/readwriter.py b/python/pyspark/sql/connect/streaming/readwriter.py
index eb78c374965..0775c1ab4a3 100644
--- a/python/pyspark/sql/connect/streaming/readwriter.py
+++ b/python/pyspark/sql/connect/streaming/readwriter.py
@@ -94,12 +94,11 @@ class DataStreamReader(OptionUtils):
         if schema is not None:
             self.schema(schema)
         self.options(**options)
-        if path is not None:
-            if type(path) != str or len(path.strip()) == 0:
-                raise ValueError(
-                    "If the path is provided for stream, it needs to be a "
-                    + "non-empty string. List of paths are not supported."
-                )
+        if path is not None and (type(path) != str or len(path.strip()) == 0):
+            raise ValueError(
+                "If the path is provided for stream, it needs to be a "
+                + "non-empty string. List of paths are not supported."
+            )
 
         plan = DataSource(
             format=self._format,
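The namespace.py and readwriter.py hunks are both instances of the same merged-guard idiom for optional arguments: `x is not None and <invalid(x)>` raises only when the argument was actually supplied and is malformed, and short-circuiting keeps the validation expression from ever touching `None`. A self-contained sketch of the idiom under assumed names (`load_stream` is hypothetical, not a Spark API):

    from typing import Optional

    def load_stream(path: Optional[str] = None) -> str:
        # Raise only when a path was supplied and is invalid. `and` ensures
        # the inner test never runs for None, and the inner `or` short-circuits
        # so .strip() is never called on a non-string.
        if path is not None and (not isinstance(path, str) or len(path.strip()) == 0):
            raise ValueError("path must be a non-empty string when provided")
        return path if path else "<default source>"

    assert load_stream() == "<default source>"
    assert load_stream("/tmp/events") == "/tmp/events"
    try:
        load_stream("   ")
    except ValueError:
        pass  # a whitespace-only path is rejected, same as in the patch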