This is an automated email from the ASF dual-hosted git repository. yangjie01 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new b96e13d09361 [SPARK-46595][PYTHON][DOCS] Refine docstring of `map_from_arrays/map_from_entries/map_concat` b96e13d09361 is described below commit b96e13d09361b7aaf084333ff07ed5ad610addfe Author: yangjie01 <yangji...@baidu.com> AuthorDate: Fri Jan 5 13:55:07 2024 +0800 [SPARK-46595][PYTHON][DOCS] Refine docstring of `map_from_arrays/map_from_entries/map_concat` ### What changes were proposed in this pull request? This PR refines the docstrings of `map_from_arrays/map_from_entries/map_concat` and adds some new examples. ### Why are the changes needed? To improve PySpark documentation ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? Pass GitHub Actions ### Was this patch authored or co-authored using generative AI tooling? No Closes #44595 from LuciferYang/SPARK-46595. Authored-by: yangjie01 <yangji...@baidu.com> Signed-off-by: yangjie01 <yangji...@baidu.com> --- python/pyspark/sql/functions/builtin.py | 199 ++++++++++++++++++++++++++------ 1 file changed, 165 insertions(+), 34 deletions(-) diff --git a/python/pyspark/sql/functions/builtin.py b/python/pyspark/sql/functions/builtin.py index 50d3b6cc01ce..bd151050e77f 100644 --- a/python/pyspark/sql/functions/builtin.py +++ b/python/pyspark/sql/functions/builtin.py @@ -11989,8 +11989,9 @@ def create_map( @_try_remote_functions def map_from_arrays(col1: "ColumnOrName", col2: "ColumnOrName") -> Column: - """Creates a new map from two arrays. - + """ + Map function: Creates a new map from two arrays. This function takes two arrays of + keys and values respectively, and returns a new map column. .. versionadded:: 2.4.0 .. versionchanged:: 3.4.0 @@ -11999,30 +12000,59 @@ def map_from_arrays(col1: "ColumnOrName", col2: "ColumnOrName") -> Column: Parameters ---------- col1 : :class:`~pyspark.sql.Column` or str - name of column containing a set of keys. 
All elements should not be null + Name of column containing a set of keys. All elements should not be null. col2 : :class:`~pyspark.sql.Column` or str - name of column containing a set of values + Name of column containing a set of values. Returns ------- :class:`~pyspark.sql.Column` - a column of map type. + A column of map type. + + Notes + ----- + The input arrays for keys and values must have the same length and all elements + in keys should not be null. If these conditions are not met, an exception will be thrown. Examples -------- + Example 1: Basic usage of map_from_arrays + + >>> from pyspark.sql import functions as sf >>> df = spark.createDataFrame([([2, 5], ['a', 'b'])], ['k', 'v']) - >>> df = df.select(map_from_arrays(df.k, df.v).alias("col")) - >>> df.show() - +----------------+ - | col| - +----------------+ - |{2 -> a, 5 -> b}| - +----------------+ - >>> df.printSchema() - root - |-- col: map (nullable = true) - | |-- key: long - | |-- value: string (valueContainsNull = true) + >>> df.select(sf.map_from_arrays(df.k, df.v)).show() + +---------------------+ + |map_from_arrays(k, v)| + +---------------------+ + | {2 -> a, 5 -> b}| + +---------------------+ + + Example 2: map_from_arrays with null values + + >>> from pyspark.sql import functions as sf + >>> df = spark.createDataFrame([([1, 2], ['a', None])], ['k', 'v']) + >>> df.select(sf.map_from_arrays(df.k, df.v)).show() + +---------------------+ + |map_from_arrays(k, v)| + +---------------------+ + | {1 -> a, 2 -> NULL}| + +---------------------+ + + Example 3: map_from_arrays with empty arrays + + >>> from pyspark.sql import functions as sf + >>> from pyspark.sql.types import ArrayType, StringType, IntegerType, StructType, StructField + >>> schema = StructType([ + ... StructField('k', ArrayType(IntegerType())), + ... StructField('v', ArrayType(StringType())) + ... 
]) + >>> df = spark.createDataFrame([([], [])], schema=schema) + >>> df.select(sf.map_from_arrays(df.k, df.v)).show() + +---------------------+ + |map_from_arrays(k, v)| + +---------------------+ + | {}| + +---------------------+ """ return _invoke_function_over_columns("map_from_arrays", col1, col2) @@ -15581,8 +15611,9 @@ def map_entries(col: "ColumnOrName") -> Column: @_try_remote_functions def map_from_entries(col: "ColumnOrName") -> Column: """ - Collection function: Converts an array of entries (key value struct types) to a map - of values. + Map function: Transforms an array of key-value pair entries (structs with two fields) + into a map. The first field of each entry is used as the key and the second field + as the value in the resulting map column .. versionadded:: 2.4.0 @@ -15592,23 +15623,68 @@ def map_from_entries(col: "ColumnOrName") -> Column: Parameters ---------- col : :class:`~pyspark.sql.Column` or str - name of column or expression + Name of column or expression Returns ------- :class:`~pyspark.sql.Column` - a map created from the given array of entries. + A map created from the given array of entries. 
Examples -------- - >>> from pyspark.sql.functions import map_from_entries + Example 1: Basic usage of map_from_entries + + >>> from pyspark.sql import functions as sf >>> df = spark.sql("SELECT array(struct(1, 'a'), struct(2, 'b')) as data") - >>> df.select(map_from_entries("data").alias("map")).show() - +----------------+ - | map| - +----------------+ - |{1 -> a, 2 -> b}| - +----------------+ + >>> df.select(sf.map_from_entries(df.data)).show() + +----------------------+ + |map_from_entries(data)| + +----------------------+ + | {1 -> a, 2 -> b}| + +----------------------+ + + Example 2: map_from_entries with null values + + >>> from pyspark.sql import functions as sf + >>> df = spark.sql("SELECT array(struct(1, null), struct(2, 'b')) as data") + >>> df.select(sf.map_from_entries(df.data)).show() + +----------------------+ + |map_from_entries(data)| + +----------------------+ + | {1 -> NULL, 2 -> b}| + +----------------------+ + + Example 3: map_from_entries with a DataFrame + + >>> from pyspark.sql import Row, functions as sf + >>> df = spark.createDataFrame([([Row(1, "a"), Row(2, "b")],), ([Row(3, "c")],)], ['data']) + >>> df.select(sf.map_from_entries(df.data)).show() + +----------------------+ + |map_from_entries(data)| + +----------------------+ + | {1 -> a, 2 -> b}| + | {3 -> c}| + +----------------------+ + + Example 4: map_from_entries with empty array + + >>> from pyspark.sql import functions as sf + >>> from pyspark.sql.types import ArrayType, StringType, IntegerType, StructType, StructField + >>> schema = StructType([ + ... StructField("data", ArrayType( + ... StructType([ + ... StructField("key", IntegerType()), + ... StructField("value", StringType()) + ... ]) + ... ), True) + ... 
]) + >>> df = spark.createDataFrame([([],)], schema=schema) + >>> df.select(sf.map_from_entries(df.data)).show() + +----------------------+ + |map_from_entries(data)| + +----------------------+ + | {}| + +----------------------+ """ return _invoke_function_over_columns("map_from_entries", col) @@ -15778,7 +15854,8 @@ def map_concat(__cols: Union[List["ColumnOrName_"], Tuple["ColumnOrName_", ...]] def map_concat( *cols: Union["ColumnOrName", Union[List["ColumnOrName_"], Tuple["ColumnOrName_", ...]]] ) -> Column: - """Returns the union of all the given maps. + """ + Map function: Returns the union of all given maps. .. versionadded:: 2.4.0 @@ -15788,23 +15865,77 @@ def map_concat( Parameters ---------- cols : :class:`~pyspark.sql.Column` or str - column names or :class:`~pyspark.sql.Column`\\s + Column names or :class:`~pyspark.sql.Column` Returns ------- :class:`~pyspark.sql.Column` - a map of merged entries from other maps. + A map of merged entries from other maps. + + Notes + ----- + For duplicate keys in input maps, the handling is governed by `spark.sql.mapKeyDedupPolicy`. + By default, it throws an exception. If set to `LAST_WIN`, it uses the last map's value. 
Examples -------- - >>> from pyspark.sql.functions import map_concat + Example 1: Basic usage of map_concat + + >>> from pyspark.sql import functions as sf >>> df = spark.sql("SELECT map(1, 'a', 2, 'b') as map1, map(3, 'c') as map2") - >>> df.select(map_concat("map1", "map2").alias("map3")).show(truncate=False) + >>> df.select(sf.map_concat("map1", "map2")).show(truncate=False) +------------------------+ - |map3 | + |map_concat(map1, map2) | +------------------------+ |{1 -> a, 2 -> b, 3 -> c}| +------------------------+ + + Example 2: map_concat with overlapping keys + + >>> from pyspark.sql import functions as sf + >>> originalmapKeyDedupPolicy = spark.conf.get("spark.sql.mapKeyDedupPolicy") + >>> spark.conf.set("spark.sql.mapKeyDedupPolicy", "LAST_WIN") + >>> df = spark.sql("SELECT map(1, 'a', 2, 'b') as map1, map(2, 'c', 3, 'd') as map2") + >>> df.select(sf.map_concat("map1", "map2")).show(truncate=False) + +------------------------+ + |map_concat(map1, map2) | + +------------------------+ + |{1 -> a, 2 -> c, 3 -> d}| + +------------------------+ + >>> spark.conf.set("spark.sql.mapKeyDedupPolicy", originalmapKeyDedupPolicy) + + Example 3: map_concat with three maps + + >>> from pyspark.sql import functions as sf + >>> df = spark.sql("SELECT map(1, 'a') as map1, map(2, 'b') as map2, map(3, 'c') as map3") + >>> df.select(sf.map_concat("map1", "map2", "map3")).show(truncate=False) + +----------------------------+ + |map_concat(map1, map2, map3)| + +----------------------------+ + |{1 -> a, 2 -> b, 3 -> c} | + +----------------------------+ + + Example 4: map_concat with empty map + + >>> from pyspark.sql import functions as sf + >>> df = spark.sql("SELECT map(1, 'a', 2, 'b') as map1, map() as map2") + >>> df.select(sf.map_concat("map1", "map2")).show(truncate=False) + +----------------------+ + |map_concat(map1, map2)| + +----------------------+ + |{1 -> a, 2 -> b} | + +----------------------+ + + Example 5: map_concat with null values + + >>> from pyspark.sql 
import functions as sf + >>> df = spark.sql("SELECT map(1, 'a', 2, 'b') as map1, map(3, null) as map2") + >>> df.select(sf.map_concat("map1", "map2")).show(truncate=False) + +---------------------------+ + |map_concat(map1, map2) | + +---------------------------+ + |{1 -> a, 2 -> b, 3 -> NULL}| + +---------------------------+ """ if len(cols) == 1 and isinstance(cols[0], (list, set)): cols = cols[0] # type: ignore[assignment] --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org