This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
     new 7323bae6df1 [MINOR][DOC] Fix typo under python directory
7323bae6df1 is described below

commit 7323bae6df1def2cfcf9509baf699ca6d0ba20f5
Author: Kazuaki Ishizaki <ishiz...@jp.ibm.com>
AuthorDate: Sun Dec 4 20:29:38 2022 +0900

    [MINOR][DOC] Fix typo under python directory

    ### What changes were proposed in this pull request?
    Fix typo in pydoc and messages under `python` directory

    ### Why are the changes needed?
    Better documentation

    ### Does this PR introduce _any_ user-facing change?
    No

    ### How was this patch tested?
    N/A

    Closes #38900 from kiszk/typo-pydocs.

    Authored-by: Kazuaki Ishizaki <ishiz...@jp.ibm.com>
    Signed-off-by: Hyukjin Kwon <gurwls...@apache.org>
---
 python/pyspark/ml/tuning.py                    |  2 +-
 python/pyspark/pandas/frame.py                 |  2 +-
 python/pyspark/pandas/groupby.py               |  4 ++--
 python/pyspark/pandas/spark/accessors.py       |  2 +-
 python/pyspark/pandas/supported_api_gen.py     | 10 +++++-----
 python/pyspark/rdd.py                          |  4 ++--
 python/pyspark/sql/catalog.py                  |  4 ++--
 python/pyspark/sql/connect/function_builder.py |  2 +-
 python/pyspark/sql/dataframe.py                |  6 +++---
 python/pyspark/sql/functions.py                |  4 ++--
 python/pyspark/sql/session.py                  |  2 +-
 python/pyspark/sql/streaming/query.py          |  4 ++--
 python/pyspark/sql/streaming/readwriter.py     |  4 ++--
 13 files changed, 25 insertions(+), 25 deletions(-)

diff --git a/python/pyspark/ml/tuning.py b/python/pyspark/ml/tuning.py
index 44a8b51ef8e..0dabcdd7f27 100644
--- a/python/pyspark/ml/tuning.py
+++ b/python/pyspark/ml/tuning.py
@@ -405,7 +405,7 @@ class _ValidatorSharedReadWrite:
             elif isinstance(v, MLWritable):
                 raise RuntimeError(
                     "ValidatorSharedReadWrite.saveImpl does not handle parameters of type: "
-                    "MLWritable that are not Estimaor/Evaluator/Transformer, and if parameter "
+                    "MLWritable that are not Estimator/Evaluator/Transformer, and if parameter "
                     "is estimator, it cannot be meta estimator such as Validator or OneVsRest"
                 )
             else:
diff --git a/python/pyspark/pandas/frame.py b/python/pyspark/pandas/frame.py
index 835c13d6fdd..f044634da0b 100644
--- a/python/pyspark/pandas/frame.py
+++ b/python/pyspark/pandas/frame.py
@@ -382,7 +382,7 @@ class DataFrame(Frame, Generic[T]):
     .. versionchanged:: 3.4.0
         Since 3.4.0, it deals with `data` and `index` in this approach:
         1, when `data` is a distributed dataset (Internal DataFrame/Spark DataFrame/
-        pandas-on-Spark DataFrame/pandas-on-Spark Series), it will first parallize
+        pandas-on-Spark DataFrame/pandas-on-Spark Series), it will first parallelize
         the `index` if necessary, and then try to combine the `data` and `index`;
         Note that if `data` and `index` doesn't have the same anchor, then
         `compute.ops_on_diff_frames` should be turned on;
diff --git a/python/pyspark/pandas/groupby.py b/python/pyspark/pandas/groupby.py
index c5dbcb79710..baa5f0ae146 100644
--- a/python/pyspark/pandas/groupby.py
+++ b/python/pyspark/pandas/groupby.py
@@ -1989,7 +1989,7 @@ class GroupBy(Generic[FrameLike], metaclass=ABCMeta):
         if should_infer_schema:
             # Here we execute with the first 1000 to get the return type.
             log_advice(
-                "If the type hints is not specified for `grouby.apply`, "
+                "If the type hints is not specified for `groupby.apply`, "
                 "it is expensive to infer the data type internally."
             )
             limit = get_option("compute.shortcut_limit")
@@ -3107,7 +3107,7 @@ class GroupBy(Generic[FrameLike], metaclass=ABCMeta):
             # Here we execute with the first 1000 to get the return type.
             # If the records were less than 1000, it uses pandas API directly for a shortcut.
             log_advice(
-                "If the type hints is not specified for `grouby.transform`, "
+                "If the type hints is not specified for `groupby.transform`, "
                 "it is expensive to infer the data type internally."
             )
             limit = get_option("compute.shortcut_limit")
diff --git a/python/pyspark/pandas/spark/accessors.py b/python/pyspark/pandas/spark/accessors.py
index c8e7f507c6f..4e1caa2432b 100644
--- a/python/pyspark/pandas/spark/accessors.py
+++ b/python/pyspark/pandas/spark/accessors.py
@@ -144,7 +144,7 @@ class SparkSeriesMethods(SparkIndexOpsMethods["ps.Series"]):
         .. note:: It forces to lose the index and end up with using default index. It is
             preferred to use :meth:`Series.spark.transform` or `:meth:`DataFrame.spark.apply`
-            with specifying the `inedx_col`.
+            with specifying the `index_col`.

         .. note:: It does not require to have the same length of the input and output.
             However, it requires to create a new DataFrame internally which will require
diff --git a/python/pyspark/pandas/supported_api_gen.py b/python/pyspark/pandas/supported_api_gen.py
index e60cba204f9..301e6a2f9b7 100644
--- a/python/pyspark/pandas/supported_api_gen.py
+++ b/python/pyspark/pandas/supported_api_gen.py
@@ -171,8 +171,8 @@ def _organize_by_implementation_status(
     """
     Check the implementation status and parameters of both modules.

-    Parmeters
-    ---------
+    Parameters
+    ----------
     module_name : str
         Class name that exists in the path of the module.
     pd_funcs: Dict[str, Callable]
@@ -284,11 +284,11 @@ def _update_all_supported_status(
     """
     Updates supported status across multiple module paths.

-    Parmeters
-    ---------
+    Parameters
+    ----------
     all_supported_status: Dict[Tuple[str, str], Dict[str, SupportedStatus]]
         Data that stores the supported status across multiple module paths.
-    pd_modles: List[str]
+    pd_modules: List[str]
         Name list of pandas modules.
     pd_module_group : Any
         Specific path of importable pandas module.
diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py
index 7f5e4e603f4..829f3d08b63 100644
--- a/python/pyspark/rdd.py
+++ b/python/pyspark/rdd.py
@@ -4289,7 +4289,7 @@ class RDD(Generic[T_co]):
         Returns
         -------
         :class:`RDD`
-            a :class:`RDD` containing the keys and cogouped values
+            a :class:`RDD` containing the keys and cogrouped values

         See Also
         --------
@@ -4330,7 +4330,7 @@ class RDD(Generic[T_co]):
         Returns
         -------
         :class:`RDD`
-            a :class:`RDD` containing the keys and cogouped values
+            a :class:`RDD` containing the keys and cogrouped values

         See Also
         --------
diff --git a/python/pyspark/sql/catalog.py b/python/pyspark/sql/catalog.py
index 4a49ef1fa04..6b97aa1db48 100644
--- a/python/pyspark/sql/catalog.py
+++ b/python/pyspark/sql/catalog.py
@@ -803,7 +803,7 @@ class Catalog:
         --------
         >>> spark.createDataFrame([(1, 1)]).createTempView("my_table")

-        Droppping the temporary view.
+        Dropping the temporary view.

         >>> spark.catalog.dropTempView("my_table")
         True
@@ -840,7 +840,7 @@ class Catalog:
         --------
         >>> spark.createDataFrame([(1, 1)]).createGlobalTempView("my_table")

-        Droppping the global view.
+        Dropping the global view.

         >>> spark.catalog.dropGlobalTempView("my_table")
         True
diff --git a/python/pyspark/sql/connect/function_builder.py b/python/pyspark/sql/connect/function_builder.py
index 1b3450786a3..3c59312888a 100644
--- a/python/pyspark/sql/connect/function_builder.py
+++ b/python/pyspark/sql/connect/function_builder.py
@@ -64,7 +64,7 @@ functions = FunctionBuilder()


 class UserDefinedFunction(Expression):
-    """A user defied function is an expresison that has a reference to the actual
+    """A user defied function is an expression that has a reference to the actual
     Python callable attached. During plan generation, the client sends a command to
     the server to register the UDF before execution. The expression object can be
     reused and is not attached to a specific execution. If the internal name of
diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py
index f3873e3c8cd..bd5df07a50f 100644
--- a/python/pyspark/sql/dataframe.py
+++ b/python/pyspark/sql/dataframe.py
@@ -2375,7 +2375,7 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin):
         |  2|Alice|
         +---+-----+

-        Specify miltiple columns
+        Specify multiple columns

         >>> df = spark.createDataFrame([
         ...     (2, "Alice"), (2, "Bob"), (5, "Bob")], schema=["age", "name"])
@@ -2388,7 +2388,7 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin):
         |  2|  Bob|
         +---+-----+

-        Specify miltiple columns for sorting order at `ascending`.
+        Specify multiple columns for sorting order at `ascending`.

         >>> df.orderBy(["age", "name"], ascending=[False, False]).show()
         +---+-----+
@@ -3209,7 +3209,7 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin):
         Parameters
         ----------
         exprs : :class:`Column` or dict of key and value strings
-            Columns or expressions to aggreate DataFrame by.
+            Columns or expressions to aggregate DataFrame by.

         Returns
         -------
diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
index 3aeb48adea7..9746196dc94 100644
--- a/python/pyspark/sql/functions.py
+++ b/python/pyspark/sql/functions.py
@@ -5517,7 +5517,7 @@ def concat_ws(sep: str, *cols: "ColumnOrName") -> Column:
     Parameters
     ----------
     sep : str
-        words seperator.
+        words separator.
     cols : :class:`~pyspark.sql.Column` or str
         list of columns to work on.

@@ -7916,7 +7916,7 @@ def arrays_zip(*cols: "ColumnOrName") -> Column:
     """
     Collection function: Returns a merged array of structs in which the N-th struct contains all
     N-th values of input arrays. If one of the arrays is shorter than others then
-    resulting struct type value will be a `null` for missing elemets.
+    resulting struct type value will be a `null` for missing elements.

     .. versionadded:: 2.4.0
diff --git a/python/pyspark/sql/session.py b/python/pyspark/sql/session.py
index eec3246cac3..ebad3224f02 100644
--- a/python/pyspark/sql/session.py
+++ b/python/pyspark/sql/session.py
@@ -1393,7 +1393,7 @@ class SparkSession(SparkConversionMixin):
         >>> spark.readStream
         <pyspark.sql.streaming.readwriter.DataStreamReader object ...>

-        The example below uses Rate source that generates rows continously.
+        The example below uses Rate source that generates rows continuously.
         After that, we operate a modulo by 3, and then write the stream out to the console.
         The streaming query stops in 3 seconds.
diff --git a/python/pyspark/sql/streaming/query.py b/python/pyspark/sql/streaming/query.py
index 5ee7a4790b0..c1f0e734800 100644
--- a/python/pyspark/sql/streaming/query.py
+++ b/python/pyspark/sql/streaming/query.py
@@ -183,7 +183,7 @@ class StreamingQuery:
         >>> sdf = spark.readStream.format("rate").load()
         >>> sq = sdf.writeStream.format('memory').queryName('query_awaitTermination').start()

-        Return wheter the query has terminated or not within 5 seconds
+        Return whether the query has terminated or not within 5 seconds

         >>> sq.awaitTermination(5)
         False
@@ -517,7 +517,7 @@ class StreamingQueryManager:
         >>> sdf = spark.readStream.format("rate").load()
         >>> sq = sdf.writeStream.format('memory').queryName('this_query').start()

-        Return wheter any of the query on the associated SparkSession
+        Return whether any of the query on the associated SparkSession
         has terminated or not within 5 seconds

         >>> spark.streams.awaitAnyTermination(5)
diff --git a/python/pyspark/sql/streaming/readwriter.py b/python/pyspark/sql/streaming/readwriter.py
index ef3b7e525e3..c58848dc508 100644
--- a/python/pyspark/sql/streaming/readwriter.py
+++ b/python/pyspark/sql/streaming/readwriter.py
@@ -52,7 +52,7 @@ class DataStreamReader(OptionUtils):
         >>> spark.readStream
         <pyspark.sql.streaming.readwriter.DataStreamReader object ...>

-        The example below uses Rate source that generates rows continously.
+        The example below uses Rate source that generates rows continuously.
         After that, we operate a modulo by 3, and then writes the stream out to the console.
         The streaming query stops in 3 seconds.
@@ -721,7 +721,7 @@ class DataStreamWriter:
         Examples
         --------
-        The example below uses Rate source that generates rows continously.
+        The example below uses Rate source that generates rows continuously.
         After that, we operate a modulo by 3, and then writes the stream out to the console.
         The streaming query stops in 3 seconds.

---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org