This is an automated email from the ASF dual-hosted git repository.

ruifengz pushed a commit to branch branch-3.5
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.5 by this push:
     new a9db96e0d51 [SPARK-44928][PYTHON][DOCS][3.5] Use the module alias 'sf' instead of 'F' when importing pyspark.sql.functions
a9db96e0d51 is described below

commit a9db96e0d51a4f746a802c148c70ddb2bfe06ee0
Author: Hyukjin Kwon <gurwls...@apache.org>
AuthorDate: Thu Aug 24 14:57:55 2023 +0800

    [SPARK-44928][PYTHON][DOCS][3.5] Use the module alias 'sf' instead of 'F' when importing pyspark.sql.functions

    ### What changes were proposed in this pull request?

    Cherry-pick https://github.com/apache/spark/pull/42628 for 3.5.

    ### Why are the changes needed?

    For better documentation.

    ### Does this PR introduce _any_ user-facing change?

    Yes.

    ### How was this patch tested?

    CI.

    ### Was this patch authored or co-authored using generative AI tooling?

    No.

    Closes #42640 from zhengruifeng/replace_F_35.

    Authored-by: Hyukjin Kwon <gurwls...@apache.org>
    Signed-off-by: Ruifeng Zheng <ruife...@apache.org>
---
 docs/quick-start.md                                |  8 +++---
 docs/structured-streaming-programming-guide.md     |  4 +--
 python/docs/source/development/debugging.rst       |  4 +--
 .../source/user_guide/pandas_on_spark/options.rst  | 10 ++++----
 python/pyspark/pandas/namespace.py                 |  5 ++--
 python/pyspark/pandas/utils.py                     |  7 ++---
 python/pyspark/sql/column.py                       |  8 +++---
 python/pyspark/sql/dataframe.py                    | 10 ++++----
 python/pyspark/sql/functions.py                    | 30 +++++++++++-----------
 python/pyspark/sql/group.py                        |  4 +--
 10 files changed, 46 insertions(+), 44 deletions(-)

diff --git a/docs/quick-start.md b/docs/quick-start.md
index 91b23851f72..cab541a0351 100644
--- a/docs/quick-start.md
+++ b/docs/quick-start.md
@@ -130,8 +130,8 @@ Dataset actions and transformations can be used for more complex computations.
 <div data-lang="python" markdown="1">
 
 {% highlight python %}
->>> from pyspark.sql.functions import *
->>> textFile.select(size(split(textFile.value, "\s+")).name("numWords")).agg(max(col("numWords"))).collect()
+>>> from pyspark.sql import functions as sf
+>>> textFile.select(sf.size(sf.split(textFile.value, "\s+")).name("numWords")).agg(sf.max(sf.col("numWords"))).collect()
 [Row(max(numWords)=15)]
 {% endhighlight %}
 
@@ -140,7 +140,7 @@ This first maps a line to an integer value and aliases it as "numWords", creatin
 One common data flow pattern is MapReduce, as popularized by Hadoop. Spark can implement MapReduce flows easily:
 
 {% highlight python %}
->>> wordCounts = textFile.select(explode(split(textFile.value, "\s+")).alias("word")).groupBy("word").count()
+>>> wordCounts = textFile.select(sf.explode(sf.split(textFile.value, "\s+")).alias("word")).groupBy("word").count()
 {% endhighlight %}
 
 Here, we use the `explode` function in `select`, to transform a Dataset of lines to a Dataset of words, and then combine `groupBy` and `count` to compute the per-word counts in the file as a DataFrame of 2 columns: "word" and "count".
 To collect the word counts in our shell, we can call `collect`:
@@ -313,7 +313,7 @@ named `SimpleApp.scala`:
 import org.apache.spark.sql.SparkSession
 
 object SimpleApp {
-  def main(args: Array[String]) {
+  def main(args: Array[String]): Unit = {
     val logFile = "YOUR_SPARK_HOME/README.md" // Should be some file on your system
     val spark = SparkSession.builder.appName("Simple Application").getOrCreate()
     val logData = spark.read.textFile(logFile).cache()
diff --git a/docs/structured-streaming-programming-guide.md b/docs/structured-streaming-programming-guide.md
index dc25adbdfd3..76a22621a0e 100644
--- a/docs/structured-streaming-programming-guide.md
+++ b/docs/structured-streaming-programming-guide.md
@@ -1215,12 +1215,12 @@ event start time and evaluated gap duration during the query execution.
 <div data-lang="python" markdown="1">
 
 {% highlight python %}
-from pyspark.sql import functions as F
+from pyspark.sql import functions as sf
 
 events = ... # streaming DataFrame of schema { timestamp: Timestamp, userId: String }
 
 session_window = session_window(events.timestamp, \
-    F.when(events.userId == "user1", "5 seconds") \
+    sf.when(events.userId == "user1", "5 seconds") \
     .when(events.userId == "user2", "20 seconds").otherwise("5 minutes"))
 
 # Group the data by session window and userId, and compute the count of each group
diff --git a/python/docs/source/development/debugging.rst b/python/docs/source/development/debugging.rst
index a188d3f3e78..ef848a6e961 100644
--- a/python/docs/source/development/debugging.rst
+++ b/python/docs/source/development/debugging.rst
@@ -475,10 +475,10 @@ Example:
 
 .. code-block:: python
 
-    >>> import pyspark.sql.functions as F
+    >>> import pyspark.sql.functions as sf
     >>> from pyspark.sql.functions import udf
     >>> def f(x):
-    ...   return F.abs(x)
+    ...   return sf.abs(x)
     ...
     >>> spark.range(-1, 1).withColumn("abs", udf(f)("id")).collect()
     22/04/12 14:52:31 ERROR Executor: Exception in task 7.0 in stage 37.0 (TID 232)
diff --git a/python/docs/source/user_guide/pandas_on_spark/options.rst b/python/docs/source/user_guide/pandas_on_spark/options.rst
index 3f99e059431..92b572b9d80 100644
--- a/python/docs/source/user_guide/pandas_on_spark/options.rst
+++ b/python/docs/source/user_guide/pandas_on_spark/options.rst
@@ -175,11 +175,11 @@ This is conceptually equivalent to the PySpark example as below:
 
 .. code-block:: python
 
-    >>> from pyspark.sql import functions as F, Window
+    >>> from pyspark.sql import functions as sf, Window
     >>> import pyspark.pandas as ps
     >>> spark_df = ps.range(3).to_spark()
-    >>> sequential_index = F.row_number().over(
-    ...     Window.orderBy(F.monotonically_increasing_id().asc())) - 1
+    >>> sequential_index = sf.row_number().over(
+    ...     Window.orderBy(sf.monotonically_increasing_id().asc())) - 1
     >>> spark_df.select(sequential_index).rdd.map(lambda r: r[0]).collect()
     [0, 1, 2]
@@ -225,10 +225,10 @@ This is conceptually equivalent to the PySpark example as below:
 
 .. code-block:: python
 
-    >>> from pyspark.sql import functions as F
+    >>> from pyspark.sql import functions as sf
     >>> import pyspark.pandas as ps
     >>> spark_df = ps.range(3).to_spark()
-    >>> spark_df.select(F.monotonically_increasing_id()) \
+    >>> spark_df.select(sf.monotonically_increasing_id()) \
     ...     .rdd.map(lambda r: r[0]).collect()
     [25769803776, 60129542144, 94489280512]
diff --git a/python/pyspark/pandas/namespace.py b/python/pyspark/pandas/namespace.py
index 3563a6d81b4..5689471eb2f 100644
--- a/python/pyspark/pandas/namespace.py
+++ b/python/pyspark/pandas/namespace.py
@@ -3278,7 +3278,7 @@ def merge_asof(
     ...     quotes,
     ...     on="time",
     ...     by="ticker",
-    ...     tolerance=F.expr("INTERVAL 2 MILLISECONDS")  # pd.Timedelta("2ms")
+    ...     tolerance=sf.expr("INTERVAL 2 MILLISECONDS")  # pd.Timedelta("2ms")
     ... ).sort_values(["time", "ticker", "price"]).reset_index(drop=True)
                          time ticker   price  quantity    bid    ask
     0 2016-05-25 13:30:00.023   MSFT   51.95        75  51.95  51.96
@@ -3296,7 +3296,7 @@ def merge_asof(
     ...     quotes,
     ...     on="time",
     ...     by="ticker",
-    ...     tolerance=F.expr("INTERVAL 10 MILLISECONDS"),  # pd.Timedelta("10ms")
+    ...     tolerance=sf.expr("INTERVAL 10 MILLISECONDS"),  # pd.Timedelta("10ms")
     ...     allow_exact_matches=False
     ... ).sort_values(["time", "ticker", "price"]).reset_index(drop=True)
                          time ticker   price  quantity    bid    ask
@@ -3776,6 +3776,7 @@ def _test() -> None:
     globs = pyspark.pandas.namespace.__dict__.copy()
     globs["ps"] = pyspark.pandas
+    globs["sf"] = F
     spark = (
         SparkSession.builder.master("local[4]")
         .appName("pyspark.pandas.namespace tests")
diff --git a/python/pyspark/pandas/utils.py b/python/pyspark/pandas/utils.py
index 55b9a57ef61..ebeb1d69d1b 100644
--- a/python/pyspark/pandas/utils.py
+++ b/python/pyspark/pandas/utils.py
@@ -928,11 +928,11 @@ def spark_column_equals(left: Column, right: Column) -> bool:
     """
     Check both `left` and `right` have the same expressions.
 
-    >>> spark_column_equals(F.lit(0), F.lit(0))
+    >>> spark_column_equals(sf.lit(0), sf.lit(0))
     True
-    >>> spark_column_equals(F.lit(0) + 1, F.lit(0) + 1)
+    >>> spark_column_equals(sf.lit(0) + 1, sf.lit(0) + 1)
     True
-    >>> spark_column_equals(F.lit(0) + 1, F.lit(0) + 2)
+    >>> spark_column_equals(sf.lit(0) + 1, sf.lit(0) + 2)
     False
     >>> sdf1 = ps.DataFrame({"x": ['a', 'b', 'c']}).to_spark()
     >>> spark_column_equals(sdf1["x"] + 1, sdf1["x"] + 1)
@@ -1045,6 +1045,7 @@ def _test() -> None:
     globs = pyspark.pandas.utils.__dict__.copy()
     globs["ps"] = pyspark.pandas
+    globs["sf"] = F
     spark = (
         SparkSession.builder.master("local[4]").appName("pyspark.pandas.utils tests").getOrCreate()
     )
diff --git a/python/pyspark/sql/column.py b/python/pyspark/sql/column.py
index 087cfaaa20b..a559cf2bab9 100644
--- a/python/pyspark/sql/column.py
+++ b/python/pyspark/sql/column.py
@@ -1281,10 +1281,10 @@ class Column:
 
         Examples
         --------
-        >>> from pyspark.sql import functions as F
+        >>> from pyspark.sql import functions as sf
         >>> df = spark.createDataFrame(
         ...     [(2, "Alice"), (5, "Bob")], ["age", "name"])
-        >>> df.select(df.name, F.when(df.age > 4, 1).when(df.age < 3, -1).otherwise(0)).show()
+        >>> df.select(df.name, sf.when(df.age > 4, 1).when(df.age < 3, -1).otherwise(0)).show()
         +-----+------------------------------------------------------------+
         | name|CASE WHEN (age > 4) THEN 1 WHEN (age < 3) THEN -1 ELSE 0 END|
         +-----+------------------------------------------------------------+
@@ -1327,10 +1327,10 @@ class Column:
 
         Examples
         --------
-        >>> from pyspark.sql import functions as F
+        >>> from pyspark.sql import functions as sf
         >>> df = spark.createDataFrame(
[(2, "Alice"), (5, "Bob")], ["age", "name"]) - >>> df.select(df.name, F.when(df.age > 3, 1).otherwise(0)).show() + >>> df.select(df.name, sf.when(df.age > 3, 1).otherwise(0)).show() +-----+-------------------------------------+ | name|CASE WHEN (age > 3) THEN 1 ELSE 0 END| +-----+-------------------------------------+ diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py index 35f2c70f8c9..30ed73d3c47 100644 --- a/python/pyspark/sql/dataframe.py +++ b/python/pyspark/sql/dataframe.py @@ -2543,14 +2543,14 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): Row(a=5, left_val='b', right_val=3), Row(a=10, left_val='c', right_val=7)] - >>> from pyspark.sql import functions as F + >>> from pyspark.sql import functions as sf >>> left._joinAsOf( - ... right, leftAsOfColumn="a", rightAsOfColumn="a", tolerance=F.lit(1) + ... right, leftAsOfColumn="a", rightAsOfColumn="a", tolerance=sf.lit(1) ... ).select(left.a, 'left_val', 'right_val').sort("a").collect() [Row(a=1, left_val='a', right_val=1)] >>> left._joinAsOf( - ... right, leftAsOfColumn="a", rightAsOfColumn="a", how="left", tolerance=F.lit(1) + ... right, leftAsOfColumn="a", rightAsOfColumn="a", how="left", tolerance=sf.lit(1) ... ).select(left.a, 'left_val', 'right_val').sort("a").collect() [Row(a=1, left_val='a', right_val=1), Row(a=5, left_val='b', right_val=None), @@ -3679,7 +3679,7 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): Examples -------- - >>> from pyspark.sql import functions as F + >>> from pyspark.sql import functions as sf >>> df = spark.createDataFrame([(2, "Alice"), (5, "Bob")], schema=["age", "name"]) >>> df.agg({"age": "max"}).show() +--------+ @@ -3687,7 +3687,7 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): +--------+ | 5| +--------+ - >>> df.agg(F.min(df.age)).show() + >>> df.agg(sf.min(df.age)).show() +--------+ |min(age)| +--------+ diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py index c5c84518eb9..398ad15676c 100644 --- a/python/pyspark/sql/functions.py +++ b/python/pyspark/sql/functions.py @@ -4311,8 +4311,8 @@ def monotonically_increasing_id() -> Column: Examples -------- - >>> from pyspark.sql import functions as F - >>> spark.range(0, 10, 1, 2).select(F.monotonically_increasing_id()).show() + >>> from pyspark.sql import functions as sf + >>> spark.range(0, 10, 1, 2).select(sf.monotonically_increasing_id()).show() +-----------------------------+ |monotonically_increasing_id()| +-----------------------------+ @@ -4611,8 +4611,8 @@ def rand(seed: Optional[int] = None) -> Column: Examples -------- - >>> from pyspark.sql import functions as F - >>> spark.range(0, 2, 1, 1).withColumn('rand', F.rand(seed=42) * 3).show() + >>> from pyspark.sql import functions as sf + >>> spark.range(0, 2, 1, 1).withColumn('rand', sf.rand(seed=42) * 3).show() +---+------------------+ | id| rand| +---+------------------+ @@ -4652,8 +4652,8 @@ def randn(seed: Optional[int] = None) -> Column: Examples -------- - >>> from pyspark.sql import functions as F - >>> spark.range(0, 2, 1, 1).withColumn('randn', F.randn(seed=42)).show() + >>> from pyspark.sql import functions as sf + >>> spark.range(0, 2, 1, 1).withColumn('randn', sf.randn(seed=42)).show() +---+------------------+ | id| randn| +---+------------------+ @@ -5154,9 +5154,9 @@ def log(arg1: Union["ColumnOrName", float], arg2: Optional["ColumnOrName"] = Non Examples -------- - >>> from pyspark.sql import functions as F + >>> from pyspark.sql import functions as sf >>> df = spark.sql("SELECT * 
     >>> df = spark.sql("SELECT * FROM VALUES (1), (2), (4) AS t(value)")
-    >>> df.select(F.log(2.0, df.value).alias('log2_value')).show()
+    >>> df.select(sf.log(2.0, df.value).alias('log2_value')).show()
     +----------+
     |log2_value|
     +----------+
@@ -5167,7 +5167,7 @@ def log(arg1: Union["ColumnOrName", float], arg2: Optional["ColumnOrName"] = Non
 
     And Natural logarithm
 
-    >>> df.select(F.log(df.value).alias('ln_value')).show()
+    >>> df.select(sf.log(df.value).alias('ln_value')).show()
     +------------------+
     |          ln_value|
     +------------------+
@@ -11571,11 +11571,11 @@ def explode(col: "ColumnOrName") -> Column:
     Examples
     --------
     >>> from pyspark.sql import Row
-    >>> eDF = spark.createDataFrame([Row(a=1, intlist=[1,2,3], mapfield={"a": "b"})])
-    >>> eDF.select(explode(eDF.intlist).alias("anInt")).collect()
+    >>> df = spark.createDataFrame([Row(a=1, intlist=[1,2,3], mapfield={"a": "b"})])
+    >>> df.select(explode(df.intlist).alias("anInt")).collect()
     [Row(anInt=1), Row(anInt=2), Row(anInt=3)]
 
-    >>> eDF.select(explode(eDF.mapfield).alias("key", "value")).show()
+    >>> df.select(explode(df.mapfield).alias("key", "value")).show()
     +---+-----+
     |key|value|
     +---+-----+
@@ -11610,11 +11610,11 @@ def posexplode(col: "ColumnOrName") -> Column:
     Examples
     --------
     >>> from pyspark.sql import Row
-    >>> eDF = spark.createDataFrame([Row(a=1, intlist=[1,2,3], mapfield={"a": "b"})])
-    >>> eDF.select(posexplode(eDF.intlist)).collect()
+    >>> df = spark.createDataFrame([Row(a=1, intlist=[1,2,3], mapfield={"a": "b"})])
+    >>> df.select(posexplode(df.intlist)).collect()
     [Row(pos=0, col=1), Row(pos=1, col=2), Row(pos=2, col=3)]
 
-    >>> eDF.select(posexplode(eDF.mapfield)).show()
+    >>> df.select(posexplode(df.mapfield)).show()
     +---+---+-----+
     |pos|key|value|
     +---+---+-----+
diff --git a/python/pyspark/sql/group.py b/python/pyspark/sql/group.py
index 1b64e7666fd..730f4736909 100644
--- a/python/pyspark/sql/group.py
+++ b/python/pyspark/sql/group.py
@@ -125,7 +125,7 @@ class GroupedData(PandasGroupedOpsMixin):
 
         Examples
         --------
-        >>> from pyspark.sql import functions as F
+        >>> from pyspark.sql import functions as sf
        >>> from pyspark.sql.functions import pandas_udf, PandasUDFType
         >>> df = spark.createDataFrame(
         ...      [(2, "Alice"), (3, "Alice"), (5, "Bob"), (10, "Bob")], ["age", "name"])
@@ -154,7 +154,7 @@ class GroupedData(PandasGroupedOpsMixin):
 
         Group-by name, and calculate the minimum age.
 
-        >>> df.groupBy(df.name).agg(F.min(df.age)).sort("name").show()
+        >>> df.groupBy(df.name).agg(sf.min(df.age)).sort("name").show()
         +-----+--------+
         | name|min(age)|
         +-----+--------+
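
For readers skimming the diff: the convention this patch standardizes is importing pyspark.sql.functions under the alias `sf` rather than `F` (or a star import). Below is a minimal, self-contained sketch of the style the docs now use; the DataFrame, app name, and column names are illustrative and not taken from the patch.

    # Illustrative only: the import style this patch moves the docs toward.
    from pyspark.sql import SparkSession
    from pyspark.sql import functions as sf  # 'sf' rather than the old 'F'

    spark = SparkSession.builder.appName("sf-alias-demo").getOrCreate()
    df = spark.createDataFrame([(2, "Alice"), (5, "Bob")], ["age", "name"])

    # Every functions call is qualified with `sf`, so its origin stays explicit
    # and builtins such as `max` are not shadowed (a risk with star imports,
    # as in the old quick-start example replaced above).
    df.select(sf.upper(df.name).alias("name"), sf.col("age") + sf.lit(1)).show()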