Copilot commented on code in PR #1461:
URL:
https://github.com/apache/datafusion-python/pull/1461#discussion_r3010881901
##########
python/tests/test_functions.py:
##########
@@ -668,6 +668,154 @@ def test_array_function_obj_tests(stmt, py_expr):
assert a == b
+def test_map_from_dict():
+ ctx = SessionContext()
+ batch = pa.RecordBatch.from_arrays([pa.array([1])], names=["a"])
+ df = ctx.create_dataframe([[batch]])
+
+ result = df.select(f.map({"x": 1, "y":
2}).alias("m")).collect()[0].column(0)
+ assert result[0].as_py() == [("x", 1), ("y", 2)]
+
+
+def test_map_from_dict_with_expr_values():
+ ctx = SessionContext()
+ batch = pa.RecordBatch.from_arrays([pa.array([1])], names=["a"])
+ df = ctx.create_dataframe([[batch]])
+
+ result = (
+ df.select(f.map({"x": literal(1), "y": literal(2)}).alias("m"))
+ .collect()[0]
+ .column(0)
+ )
+ assert result[0].as_py() == [("x", 1), ("y", 2)]
+
+
+def test_map_from_two_lists():
+ ctx = SessionContext()
+ batch = pa.RecordBatch.from_arrays(
+ [
+ pa.array(["k1", "k2", "k3"]),
+ pa.array([10, 20, 30]),
+ ],
+ names=["keys", "vals"],
+ )
+ df = ctx.create_dataframe([[batch]])
+
+ m = f.map([column("keys")], [column("vals")])
+ result = df.select(f.map_keys(m).alias("k")).collect()[0].column(0)
+ for i, expected in enumerate(["k1", "k2", "k3"]):
+ assert result[i].as_py() == [expected]
+
+ result = df.select(f.map_values(m).alias("v")).collect()[0].column(0)
+ for i, expected in enumerate([10, 20, 30]):
+ assert result[i].as_py() == [expected]
+
+
+def test_map_from_variadic_pairs():
+ ctx = SessionContext()
+ batch = pa.RecordBatch.from_arrays([pa.array([1])], names=["a"])
+ df = ctx.create_dataframe([[batch]])
+
+ result = df.select(f.map("x", 1, "y", 2).alias("m")).collect()[0].column(0)
+ assert result[0].as_py() == [("x", 1), ("y", 2)]
+
+
+def test_map_variadic_with_exprs():
+ ctx = SessionContext()
+ batch = pa.RecordBatch.from_arrays([pa.array([1])], names=["a"])
+ df = ctx.create_dataframe([[batch]])
+
+ result = (
+ df.select(f.map(literal("x"), literal(1), literal("y"),
literal(2)).alias("m"))
+ .collect()[0]
+ .column(0)
+ )
+ assert result[0].as_py() == [("x", 1), ("y", 2)]
+
+
+def test_map_odd_args_raises():
+ with pytest.raises(ValueError, match="map expects"):
+ f.map("x", 1, "y")
+
+
+def test_make_map_is_alias():
+ ctx = SessionContext()
+ batch = pa.RecordBatch.from_arrays([pa.array([1])], names=["a"])
+ df = ctx.create_dataframe([[batch]])
+
+ result = df.select(f.make_map({"x": 1, "y":
2}).alias("m")).collect()[0].column(0)
+ assert result[0].as_py() == [("x", 1), ("y", 2)]
+
+
+def test_map_keys():
+ ctx = SessionContext()
+ batch = pa.RecordBatch.from_arrays([pa.array([1])], names=["a"])
+ df = ctx.create_dataframe([[batch]])
+
+ m = f.map({"x": 1, "y": 2})
+ result = df.select(f.map_keys(m).alias("keys")).collect()[0].column(0)
+ assert result[0].as_py() == ["x", "y"]
+
+
+def test_map_values():
+ ctx = SessionContext()
+ batch = pa.RecordBatch.from_arrays([pa.array([1])], names=["a"])
+ df = ctx.create_dataframe([[batch]])
+
+ m = f.map({"x": 1, "y": 2})
+ result = df.select(f.map_values(m).alias("vals")).collect()[0].column(0)
+ assert result[0].as_py() == [1, 2]
+
+
+def test_map_extract():
+ ctx = SessionContext()
+ batch = pa.RecordBatch.from_arrays([pa.array([1])], names=["a"])
+ df = ctx.create_dataframe([[batch]])
+
+ m = f.map({"x": 1, "y": 2})
+ result = (
+ df.select(f.map_extract(m,
literal("x")).alias("val")).collect()[0].column(0)
+ )
+ assert result[0].as_py() == [1]
+
+
+def test_map_extract_missing_key():
+ ctx = SessionContext()
+ batch = pa.RecordBatch.from_arrays([pa.array([1])], names=["a"])
+ df = ctx.create_dataframe([[batch]])
+
+ m = f.map({"x": 1})
+ result = (
+ df.select(f.map_extract(m,
literal("z")).alias("val")).collect()[0].column(0)
+ )
+ assert result[0].as_py() == [None]
Review Comment:
`map_extract` is documented as returning an “empty list if absent”, but the
new test `test_map_extract_missing_key` asserts `[None]` for a missing key.
These two behaviors are different (empty list vs list containing null). Please
verify the actual DataFusion semantics and update either the docstring or the
test expectation so they match.
```suggestion
assert result[0].as_py() == []
```
##########
python/datafusion/functions.py:
##########
@@ -3338,6 +3345,157 @@ def empty(array: Expr) -> Expr:
return array_empty(array)
+# map functions
+
+
+def map(*args: Any) -> Expr:
+ """Returns a map expression.
+
+ Supports three calling conventions:
+
+ - ``map({"a": 1, "b": 2})`` — from a Python dictionary.
+ - ``map([keys], [values])`` — from a list of keys and a list of
+ their associated values. Both lists must be the same length.
+ - ``map(k1, v1, k2, v2, ...)`` — from alternating keys and their
+ associated values.
+
+ Keys and values that are not already :py:class:`~datafusion.expr.Expr`
+ are automatically converted to literal expressions.
+
+ Examples:
+ From a dictionary:
+
+ >>> ctx = dfn.SessionContext()
+ >>> df = ctx.from_pydict({"a": [1]})
+ >>> result = df.select(
+ ... dfn.functions.map({"a": 1, "b": 2}).alias("m"))
+ >>> result.collect_column("m")[0].as_py()
+ [('a', 1), ('b', 2)]
+
+ From two lists:
+
+ >>> df = ctx.from_pydict({"key": ["x", "y"], "val": [10, 20]})
+ >>> df = df.select(
+ ... dfn.functions.map(
+ ... [dfn.col("key")], [dfn.col("val")]
+ ... ).alias("m"))
+ >>> df.collect_column("m")[0].as_py()
+ [('x', 10)]
+
+ From alternating keys and values:
+
+ >>> df = ctx.from_pydict({"a": [1]})
+ >>> result = df.select(
+ ... dfn.functions.map("x", 1, "y", 2).alias("m"))
+ >>> result.collect_column("m")[0].as_py()
+ [('x', 1), ('y', 2)]
+ """
+ if len(args) == 1 and isinstance(args[0], dict):
+ key_list = list(args[0].keys())
+ value_list = list(args[0].values())
+ elif (
+ len(args) == 2 # noqa: PLR2004
+ and isinstance(args[0], list)
+ and isinstance(args[1], list)
+ ):
+ key_list = args[0]
+ value_list = args[1]
+ elif len(args) >= 2 and len(args) % 2 == 0: # noqa: PLR2004
Review Comment:
When called as `map([keys], [values])`, the docstring states both lists must
be the same length, but the implementation doesn’t validate `len(key_list) ==
len(value_list)`. If lengths differ, the error will be deferred to the
Rust/DataFusion layer (likely with a less actionable message). Consider adding
an explicit length check here and raising `ValueError` with a clear message.
##########
python/datafusion/functions.py:
##########
@@ -3338,6 +3345,157 @@ def empty(array: Expr) -> Expr:
return array_empty(array)
+# map functions
+
+
+def map(*args: Any) -> Expr:
+ """Returns a map expression.
+
+ Supports three calling conventions:
+
+ - ``map({"a": 1, "b": 2})`` — from a Python dictionary.
+ - ``map([keys], [values])`` — from a list of keys and a list of
+ their associated values. Both lists must be the same length.
+ - ``map(k1, v1, k2, v2, ...)`` — from alternating keys and their
+ associated values.
+
+ Keys and values that are not already :py:class:`~datafusion.expr.Expr`
+ are automatically converted to literal expressions.
+
+ Examples:
+ From a dictionary:
+
+ >>> ctx = dfn.SessionContext()
+ >>> df = ctx.from_pydict({"a": [1]})
+ >>> result = df.select(
+ ... dfn.functions.map({"a": 1, "b": 2}).alias("m"))
+ >>> result.collect_column("m")[0].as_py()
+ [('a', 1), ('b', 2)]
+
+ From two lists:
+
+ >>> df = ctx.from_pydict({"key": ["x", "y"], "val": [10, 20]})
+ >>> df = df.select(
+ ... dfn.functions.map(
+ ... [dfn.col("key")], [dfn.col("val")]
+ ... ).alias("m"))
+ >>> df.collect_column("m")[0].as_py()
+ [('x', 10)]
+
+ From alternating keys and values:
+
+ >>> df = ctx.from_pydict({"a": [1]})
+ >>> result = df.select(
+ ... dfn.functions.map("x", 1, "y", 2).alias("m"))
+ >>> result.collect_column("m")[0].as_py()
+ [('x', 1), ('y', 2)]
+ """
+ if len(args) == 1 and isinstance(args[0], dict):
+ key_list = list(args[0].keys())
+ value_list = list(args[0].values())
+ elif (
+ len(args) == 2 # noqa: PLR2004
+ and isinstance(args[0], list)
+ and isinstance(args[1], list)
+ ):
+ key_list = args[0]
+ value_list = args[1]
+ elif len(args) >= 2 and len(args) % 2 == 0: # noqa: PLR2004
+ key_list = list(args[0::2])
+ value_list = list(args[1::2])
+ else:
+ msg = "map expects a dict, two lists, or an even number of key-value
arguments"
+ raise ValueError(msg)
+
+ key_exprs = [k if isinstance(k, Expr) else Expr.literal(k) for k in
key_list]
+ val_exprs = [v if isinstance(v, Expr) else Expr.literal(v) for v in
value_list]
+ return Expr(f.make_map([k.expr for k in key_exprs], [v.expr for v in
val_exprs]))
+
+
+def make_map(*args: Any) -> Expr:
+ """Returns a map expression.
+
+ See Also:
+ This is an alias for :py:func:`map`.
+ """
+ return map(*args)
+
+
+def map_keys(map: Expr) -> Expr:
+ """Returns a list of all keys in the map.
+
+ Examples:
+ >>> ctx = dfn.SessionContext()
+ >>> df = ctx.from_pydict({"a": [1]})
+ >>> df = df.select(
+ ... dfn.functions.map({"x": 1, "y": 2}).alias("m"))
+ >>> result = df.select(
+ ... dfn.functions.map_keys(dfn.col("m")).alias("keys"))
+ >>> result.collect_column("keys")[0].as_py()
+ ['x', 'y']
+ """
+ return Expr(f.map_keys(map.expr))
+
+
+def map_values(map: Expr) -> Expr:
+ """Returns a list of all values in the map.
+
+ Examples:
+ >>> ctx = dfn.SessionContext()
+ >>> df = ctx.from_pydict({"a": [1]})
+ >>> df = df.select(
+ ... dfn.functions.map({"x": 1, "y": 2}).alias("m"))
+ >>> result = df.select(
+ ... dfn.functions.map_values(dfn.col("m")).alias("vals"))
+ >>> result.collect_column("vals")[0].as_py()
+ [1, 2]
+ """
+ return Expr(f.map_values(map.expr))
+
+
+def map_extract(map: Expr, key: Expr) -> Expr:
+ """Returns the value for the given key in the map, or an empty list if
absent.
+
+ Examples:
+ >>> ctx = dfn.SessionContext()
+ >>> df = ctx.from_pydict({"a": [1]})
+ >>> df = df.select(
+ ... dfn.functions.map({"x": 1, "y": 2}).alias("m"))
+ >>> result = df.select(
+ ... dfn.functions.map_extract(
+ ... dfn.col("m"), dfn.lit("x")
+ ... ).alias("val"))
+ >>> result.collect_column("val")[0].as_py()
+ [1]
+ """
+ return Expr(f.map_extract(map.expr, key.expr))
+
+
+def map_entries(map: Expr) -> Expr:
+ """Returns a list of all entries (key-value struct pairs) in the map.
+
+ Examples:
+ >>> ctx = dfn.SessionContext()
+ >>> df = ctx.from_pydict({"a": [1]})
+ >>> df = df.select(
+ ... dfn.functions.map({"x": 1, "y": 2}).alias("m"))
+ >>> result = df.select(
+ ... dfn.functions.map_entries(dfn.col("m")).alias("entries"))
+ >>> result.collect_column("entries")[0].as_py()
+ [{'key': 'x', 'value': 1}, {'key': 'y', 'value': 2}]
+ """
+ return Expr(f.map_entries(map.expr))
+
+
+def element_at(map: Expr, key: Expr) -> Expr:
+ """Returns the value for the given key in the map, or an empty list if
absent.
+
+ See Also:
+ This is an alias for :py:func:`map_extract`.
+ """
Review Comment:
The issue/PR context describes `element_at` as “access element in a map or
array by key/index”, but this implementation is a direct alias for
`map_extract`, which appears map-specific and returns a list of matches. If
upstream DataFusion provides a dedicated `element_at` function (and/or supports
array indexing), consider binding that directly so `element_at` matches
upstream semantics rather than being limited to `map_extract` behavior.
```suggestion
"""Returns the element for the given key/index in a map or array.
On DataFusion versions that provide a dedicated ``element_at`` function,
this will use that function and thus match upstream semantics (including
support for both maps and arrays). On older versions, this falls back to
:py:func:`map_extract`, which returns a list of matching values for maps.
"""
# Prefer the upstream `element_at` implementation when available so that
# semantics (map/array support, scalar return) match DataFusion. Fall
back
# to the existing `map_extract`-based behavior for compatibility with
# older backends that do not yet expose `element_at`.
element_at_fn = getattr(f, "element_at", None)
if element_at_fn is not None:
return Expr(element_at_fn(map.expr, key.expr))
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]