This is an automated email from the ASF dual-hosted git repository.

yangjie01 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
     new 6d2ffaa4ea8 [SPARK-45353][PYTHON][DOCS] Refine docstring of `create_map/slice/array_join`
6d2ffaa4ea8 is described below

commit 6d2ffaa4ea87679ce527512f11d04d136a1d536a
Author: yangjie01 <yangji...@baidu.com>
AuthorDate: Thu Sep 28 11:03:47 2023 +0800

    [SPARK-45353][PYTHON][DOCS] Refine docstring of `create_map/slice/array_join`

    ### What changes were proposed in this pull request?
    This PR refines the docstrings of `create_map/slice/array_join` and adds some new examples.

    ### Why are the changes needed?
    To improve the PySpark documentation.

    ### Does this PR introduce _any_ user-facing change?
    No

    ### How was this patch tested?
    Pass GitHub Actions

    ### Was this patch authored or co-authored using generative AI tooling?
    No

    Closes #43145 from LuciferYang/collection-functions-2.

    Authored-by: yangjie01 <yangji...@baidu.com>
    Signed-off-by: yangjie01 <yangji...@baidu.com>
---
 python/pyspark/sql/functions.py | 191 ++++++++++++++++++++++++++++++++++------
 1 file changed, 163 insertions(+), 28 deletions(-)

diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
index f54ce66e39f..04968440e39 100644
--- a/python/pyspark/sql/functions.py
+++ b/python/pyspark/sql/functions.py
@@ -11684,7 +11684,12 @@ def create_map(__cols: Union[List["ColumnOrName_"], Tuple["ColumnOrName_", ...]]
 def create_map(
     *cols: Union["ColumnOrName", Union[List["ColumnOrName_"], Tuple["ColumnOrName_", ...]]]
 ) -> Column:
-    """Creates a new map column.
+    """
+    Map function: Creates a new map column from an even number of input columns or
+    column references. The input columns are grouped into key-value pairs to form a map.
+    For instance, the input (key1, value1, key2, value2, ...) would produce a map that
+    associates key1 with value1, key2 with value2, and so on. The function supports
+    grouping columns as a list as well.

     .. versionadded:: 2.0.0

@@ -11694,16 +11699,54 @@ def create_map(
     Parameters
     ----------
     cols : :class:`~pyspark.sql.Column` or str
-        column names or :class:`~pyspark.sql.Column`\\s that are
-        grouped as key-value pairs, e.g. (key1, value1, key2, value2, ...).
+        The input column names or :class:`~pyspark.sql.Column` objects grouped into
+        key-value pairs. These can also be expressed as a list of columns.
+
+    Returns
+    -------
+    :class:`~pyspark.sql.Column`
+        A new Column of Map type, where each value is a map formed from the corresponding
+        key-value pairs provided in the input arguments.

     Examples
     --------
+    Example 1: Basic usage of create_map function.
+
+    >>> from pyspark.sql import functions as sf
     >>> df = spark.createDataFrame([("Alice", 2), ("Bob", 5)], ("name", "age"))
-    >>> df.select(create_map('name', 'age').alias("map")).collect()
-    [Row(map={'Alice': 2}), Row(map={'Bob': 5})]
-    >>> df.select(create_map([df.name, df.age]).alias("map")).collect()
-    [Row(map={'Alice': 2}), Row(map={'Bob': 5})]
+    >>> df.select(sf.create_map('name', 'age')).show()
+    +--------------+
+    |map(name, age)|
+    +--------------+
+    |  {Alice -> 2}|
+    |    {Bob -> 5}|
+    +--------------+
+
+    Example 2: Usage of create_map function with a list of columns.
+
+    >>> from pyspark.sql import functions as sf
+    >>> df = spark.createDataFrame([("Alice", 2), ("Bob", 5)], ("name", "age"))
+    >>> df.select(sf.create_map([df.name, df.age])).show()
+    +--------------+
+    |map(name, age)|
+    +--------------+
+    |  {Alice -> 2}|
+    |    {Bob -> 5}|
+    +--------------+
+
+    Example 3: Usage of create_map function with more than one key-value pair.
+
+    >>> from pyspark.sql import functions as sf
+    >>> df = spark.createDataFrame([("Alice", 2, "female"),
+    ...     ("Bob", 5, "male")], ("name", "age", "gender"))
+    >>> df.select(sf.create_map(sf.lit('name'), df['name'],
+    ...     sf.lit('age'), df['age'])).show(truncate=False)
+    +-------------------------+
+    |map(name, name, age, age)|
+    +-------------------------+
+    |{name -> Alice, age -> 2}|
+    |{name -> Bob, age -> 5}  |
+    +-------------------------+
     """
     if len(cols) == 1 and isinstance(cols[0], (list, set)):
         cols = cols[0]  # type: ignore[assignment]
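For readers trying the `create_map` examples above, the (key1, value1, key2, value2, ...)
argument shape can also be built programmatically. A minimal sketch, not part of the patch,
assuming an active `spark` session as in the doctests; the `pairs` dict is a hypothetical
name introduced here for illustration:

    from itertools import chain
    from pyspark.sql import functions as sf

    df = spark.createDataFrame([("Alice", 2), ("Bob", 5)], ("name", "age"))

    # Hypothetical mapping of literal key names to value columns.
    pairs = {"name": df.name, "age": df.age}

    # Flatten the dict into (key1, value1, key2, value2, ...), the shape
    # create_map expects, producing one map per row.
    df.select(
        sf.create_map(*chain.from_iterable((sf.lit(k), v) for k, v in pairs.items()))
    ).show(truncate=False)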
@@ -12002,8 +12045,9 @@ def slice(
     x: "ColumnOrName", start: Union["ColumnOrName", int], length: Union["ColumnOrName", int]
 ) -> Column:
     """
-    Collection function: returns an array containing all the elements in `x` from index `start`
-    (array indices start at 1, or from the end if `start` is negative) with the specified `length`.
+    Array function: Returns a new array column by slicing the input array column from
+    a start index to a specific length. The indices start at 1, and can be negative to index
+    from the end of the array. The length specifies the number of elements in the resulting array.

     .. versionadded:: 2.4.0

@@ -12013,22 +12057,56 @@
     Parameters
     ----------
     x : :class:`~pyspark.sql.Column` or str
-        column name or column containing the array to be sliced
-    start : :class:`~pyspark.sql.Column` or str or int
-        column name, column, or int containing the starting index
-    length : :class:`~pyspark.sql.Column` or str or int
-        column name, column, or int containing the length of the slice
+        Input array column or column name to be sliced.
+    start : :class:`~pyspark.sql.Column`, str, or int
+        The start index for the slice operation. If negative, starts the index from the
+        end of the array.
+    length : :class:`~pyspark.sql.Column`, str, or int
+        The length of the slice, representing the number of elements in the resulting array.

     Returns
     -------
     :class:`~pyspark.sql.Column`
-        a column of array type. Subset of array.
+        A new Column object of Array type, where each value is a slice of the corresponding
+        list from the input column.

     Examples
     --------
+    Example 1: Basic usage of the slice function.
+
+    >>> from pyspark.sql import functions as sf
+    >>> df = spark.createDataFrame([([1, 2, 3],), ([4, 5],)], ['x'])
+    >>> df.select(sf.slice(df.x, 2, 2)).show()
+    +--------------+
+    |slice(x, 2, 2)|
+    +--------------+
+    |        [2, 3]|
+    |           [5]|
+    +--------------+
+
+    Example 2: Slicing with negative start index.
+
+    >>> from pyspark.sql import functions as sf
     >>> df = spark.createDataFrame([([1, 2, 3],), ([4, 5],)], ['x'])
-    >>> df.select(slice(df.x, 2, 2).alias("sliced")).collect()
-    [Row(sliced=[2, 3]), Row(sliced=[5])]
+    >>> df.select(sf.slice(df.x, -1, 1)).show()
+    +---------------+
+    |slice(x, -1, 1)|
+    +---------------+
+    |            [3]|
+    |            [5]|
+    +---------------+
+
+    Example 3: Slice function with column inputs for start and length.
+
+    >>> from pyspark.sql import functions as sf
+    >>> df = spark.createDataFrame([([1, 2, 3], 2, 2), ([4, 5], 1, 3)], ['x', 'start', 'length'])
+    >>> df.select(sf.slice(df.x, df.start, df.length)).show()
+    +-----------------------+
+    |slice(x, start, length)|
+    +-----------------------+
+    |                 [2, 3]|
+    |                 [4, 5]|
+    +-----------------------+
     """
     start = lit(start) if isinstance(start, int) else start
     length = lit(length) if isinstance(length, int) else length
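A companion sketch for `slice`, not part of the patch: since `start` and `length` accept
Column arguments (as Example 3 above shows), the slice bounds can be computed per row.
This assumes an active `spark` session as in the doctests, and uses the standard
`pyspark.sql.functions.size` array-length function:

    from pyspark.sql import functions as sf

    df = spark.createDataFrame([([1, 2, 3],), ([4, 5],)], ['x'])

    # start=2 skips the first element; size(x) - 1 keeps everything after it,
    # so each row yields the array's "tail": [2, 3] and [5].
    df.select(sf.slice(df.x, 2, sf.size(df.x) - 1).alias('tail')).show()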
@@ -12041,8 +12119,10 @@ def array_join(
     col: "ColumnOrName", delimiter: str, null_replacement: Optional[str] = None
 ) -> Column:
     """
-    Concatenates the elements of `column` using the `delimiter`. Null values are replaced with
-    `null_replacement` if set, otherwise they are ignored.
+    Array function: Returns a string column by concatenating the elements of the input
+    array column using the delimiter. Null values within the array can be replaced with
+    a specified string through the null_replacement argument. If null_replacement is
+    not set, null values are ignored.

     .. versionadded:: 2.4.0

@@ -12052,24 +12132,79 @@
     Parameters
     ----------
     col : :class:`~pyspark.sql.Column` or str
-        target column to work on.
+        The input column containing the arrays to be joined.
     delimiter : str
-        delimiter used to concatenate elements
+        The string to be used as the delimiter when joining the array elements.
     null_replacement : str, optional
-        if set then null values will be replaced by this value
+        The string to replace null values within the array. If not set, null values are ignored.

     Returns
     -------
     :class:`~pyspark.sql.Column`
-        a column of string type. Concatenated values.
+        A new column of string type, where each value is the result of joining the corresponding
+        array from the input column.

     Examples
     --------
-    >>> df = spark.createDataFrame([(["a", "b", "c"],), (["a", None],)], ['data'])
-    >>> df.select(array_join(df.data, ",").alias("joined")).collect()
-    [Row(joined='a,b,c'), Row(joined='a')]
-    >>> df.select(array_join(df.data, ",", "NULL").alias("joined")).collect()
-    [Row(joined='a,b,c'), Row(joined='a,NULL')]
+    Example 1: Basic usage of array_join function.
+
+    >>> from pyspark.sql import functions as sf
+    >>> df = spark.createDataFrame([(["a", "b", "c"],), (["a", "b"],)], ['data'])
+    >>> df.select(sf.array_join(df.data, ",")).show()
+    +-------------------+
+    |array_join(data, ,)|
+    +-------------------+
+    |              a,b,c|
+    |                a,b|
+    +-------------------+
+
+    Example 2: Usage of array_join function with null_replacement argument.
+
+    >>> from pyspark.sql import functions as sf
+    >>> df = spark.createDataFrame([(["a", None, "c"],)], ['data'])
+    >>> df.select(sf.array_join(df.data, ",", "NULL")).show()
+    +-------------------------+
+    |array_join(data, ,, NULL)|
+    +-------------------------+
+    |                 a,NULL,c|
+    +-------------------------+
+
+    Example 3: Usage of array_join function without null_replacement argument.
+
+    >>> from pyspark.sql import functions as sf
+    >>> df = spark.createDataFrame([(["a", None, "c"],)], ['data'])
+    >>> df.select(sf.array_join(df.data, ",")).show()
+    +-------------------+
+    |array_join(data, ,)|
+    +-------------------+
+    |                a,c|
+    +-------------------+
+
+    Example 4: Usage of array_join function with an array that is null.
+
+    >>> from pyspark.sql import functions as sf
+    >>> from pyspark.sql.types import StructType, StructField, ArrayType, StringType
+    >>> schema = StructType([StructField("data", ArrayType(StringType()), True)])
+    >>> df = spark.createDataFrame([(None,)], schema)
+    >>> df.select(sf.array_join(df.data, ",")).show()
+    +-------------------+
+    |array_join(data, ,)|
+    +-------------------+
+    |               NULL|
+    +-------------------+
+
+    Example 5: Usage of array_join function with an array containing only null values.
+
+    >>> from pyspark.sql import functions as sf
+    >>> from pyspark.sql.types import StructType, StructField, ArrayType, StringType
+    >>> schema = StructType([StructField("data", ArrayType(StringType()), True)])
+    >>> df = spark.createDataFrame([([None, None],)], schema)
+    >>> df.select(sf.array_join(df.data, ",", "NULL")).show()
+    +-------------------------+
+    |array_join(data, ,, NULL)|
+    +-------------------------+
+    |                NULL,NULL|
+    +-------------------------+
     """
     _get_active_spark_context()
     if null_replacement is None:
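One caveat worth noting alongside the `array_join` examples above: the function expects an
array of strings, so joining a numeric array generally needs an element-wise cast first.
A minimal sketch, not part of the patch, assuming Spark 3.1+ for
`pyspark.sql.functions.transform` and an active `spark` session as in the doctests:

    from pyspark.sql import functions as sf

    df = spark.createDataFrame([([1, 2, 3],)], ['nums'])

    # Cast each element to string with transform, then join with "-";
    # the single row comes out as the string "1-2-3".
    df.select(
        sf.array_join(sf.transform('nums', lambda x: x.cast('string')), '-').alias('joined')
    ).show()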