This is an automated email from the ASF dual-hosted git repository.
timsaucer pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion-python.git
The following commit(s) were added to refs/heads/main by this push:
new 89d89306 feat: reduce duplicate fields on join (#1184)
89d89306 is described below
commit 89d8930625c7a903fb0e5f1d4f5310341e0b4fea
Author: Tim Saucer <[email protected]>
AuthorDate: Mon Nov 10 08:52:19 2025 -0500
feat: reduce duplicate fields on join (#1184)
* Add field to dataframe join to indicate if we should keep duplicate keys
* Suppress expected warning
* Minor: small tables rendered way too large
* Rename from keep_duplicate_keys to drop_duplicate_keys
* Add unit tests for dropping duplicate keys or not
* Update online docs
* Update docs/source/user-guide/common-operations/joins.rst
Co-authored-by: kosiew <[email protected]>
---------
Co-authored-by: kosiew <[email protected]>
---
docs/source/user-guide/common-operations/joins.rst | 34 +++++++++++++++++++-
python/datafusion/dataframe.py | 32 ++++++++++++-------
python/datafusion/dataframe_formatter.py | 2 +-
python/tests/test_dataframe.py | 28 ++++++++++++-----
python/tests/test_sql.py | 3 ++
src/dataframe.rs | 36 +++++++++++++++++++++-
6 files changed, 113 insertions(+), 22 deletions(-)
diff --git a/docs/source/user-guide/common-operations/joins.rst
b/docs/source/user-guide/common-operations/joins.rst
index 40d92215..035d7488 100644
--- a/docs/source/user-guide/common-operations/joins.rst
+++ b/docs/source/user-guide/common-operations/joins.rst
@@ -101,4 +101,36 @@ the right table.
.. ipython:: python
- left.join(right, left_on="customer_id", right_on="id", how="anti")
\ No newline at end of file
+ left.join(right, left_on="customer_id", right_on="id", how="anti")
+
+Duplicate Keys
+--------------
+
+It is common to join two DataFrames on a common column name. Starting in
+version 51.0.0, ``datafusion-python``` will now drop duplicate column names by
+default. This reduces problems with ambiguous column selection after joins.
+You can disable this feature by setting the parameter ``drop_duplicate_keys``
+to ``False``.
+
+.. ipython:: python
+
+ left = ctx.from_pydict(
+ {
+ "id": [1, 2, 3],
+ "customer": ["Alice", "Bob", "Charlie"],
+ }
+ )
+
+ right = ctx.from_pylist([
+ {"id": 1, "name": "CityCabs"},
+ {"id": 2, "name": "MetroRide"},
+ {"id": 5, "name": "UrbanGo"},
+ ])
+
+ left.join(right, "id", how="inner")
+
+In contrast to the above example, if we wish to get both columns:
+
+.. ipython:: python
+
+ left.join(right, "id", how="inner", drop_duplicate_keys=False)
diff --git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py
index c6ff7eda..b3b48e96 100644
--- a/python/datafusion/dataframe.py
+++ b/python/datafusion/dataframe.py
@@ -774,6 +774,7 @@ class DataFrame:
left_on: None = None,
right_on: None = None,
join_keys: None = None,
+ drop_duplicate_keys: bool = True,
) -> DataFrame: ...
@overload
@@ -786,6 +787,7 @@ class DataFrame:
left_on: str | Sequence[str],
right_on: str | Sequence[str],
join_keys: tuple[list[str], list[str]] | None = None,
+ drop_duplicate_keys: bool = True,
) -> DataFrame: ...
@overload
@@ -798,6 +800,7 @@ class DataFrame:
join_keys: tuple[list[str], list[str]],
left_on: None = None,
right_on: None = None,
+ drop_duplicate_keys: bool = True,
) -> DataFrame: ...
def join(
@@ -809,6 +812,7 @@ class DataFrame:
left_on: str | Sequence[str] | None = None,
right_on: str | Sequence[str] | None = None,
join_keys: tuple[list[str], list[str]] | None = None,
+ drop_duplicate_keys: bool = True,
) -> DataFrame:
"""Join this :py:class:`DataFrame` with another :py:class:`DataFrame`.
@@ -821,11 +825,23 @@ class DataFrame:
"right", "full", "semi", "anti".
left_on: Join column of the left dataframe.
right_on: Join column of the right dataframe.
+ drop_duplicate_keys: When True, the columns from the right
DataFrame
+ that have identical names in the ``on`` fields to the left
DataFrame
+ will be dropped.
join_keys: Tuple of two lists of column names to join on.
[Deprecated]
Returns:
DataFrame after join.
"""
+ if join_keys is not None:
+ warnings.warn(
+ "`join_keys` is deprecated, use `on` or `left_on` with
`right_on`",
+ category=DeprecationWarning,
+ stacklevel=2,
+ )
+ left_on = join_keys[0]
+ right_on = join_keys[1]
+
# This check is to prevent breaking API changes where users prior to
# DF 43.0.0 would pass the join_keys as a positional argument instead
# of a keyword argument.
@@ -836,18 +852,10 @@ class DataFrame:
and isinstance(on[1], list)
):
# We know this is safe because we've checked the types
- join_keys = on # type: ignore[assignment]
+ left_on = on[0]
+ right_on = on[1]
on = None
- if join_keys is not None:
- warnings.warn(
- "`join_keys` is deprecated, use `on` or `left_on` with
`right_on`",
- category=DeprecationWarning,
- stacklevel=2,
- )
- left_on = join_keys[0]
- right_on = join_keys[1]
-
if on is not None:
if left_on is not None or right_on is not None:
error_msg = "`left_on` or `right_on` should not provided with
`on`"
@@ -866,7 +874,9 @@ class DataFrame:
if isinstance(right_on, str):
right_on = [right_on]
- return DataFrame(self.df.join(right.df, how, left_on, right_on))
+ return DataFrame(
+ self.df.join(right.df, how, left_on, right_on, drop_duplicate_keys)
+ )
def join_on(
self,
diff --git a/python/datafusion/dataframe_formatter.py
b/python/datafusion/dataframe_formatter.py
index 4082ff4e..bb53d323 100644
--- a/python/datafusion/dataframe_formatter.py
+++ b/python/datafusion/dataframe_formatter.py
@@ -370,7 +370,7 @@ if (!document.getElementById('df-styles')) {{
f"max-height: {self.max_height}px; overflow: auto; border: "
'1px solid #ccc;">'
)
- html.append('<table style="border-collapse: collapse; min-width:
100%">')
+ html.append('<table style="border-collapse: collapse">')
return html
def _build_table_header(self, schema: Any) -> list[str]:
diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py
index 101dfc5b..8292e258 100644
--- a/python/tests/test_dataframe.py
+++ b/python/tests/test_dataframe.py
@@ -647,7 +647,6 @@ def test_unnest_without_nulls(nested_df):
assert result.column(1) == pa.array([7, 8, 8, 9, 9, 9])
[email protected]("ignore:`join_keys`:DeprecationWarning")
def test_join():
ctx = SessionContext()
@@ -664,25 +663,38 @@ def test_join():
df1 = ctx.create_dataframe([[batch]], "r")
df2 = df.join(df1, on="a", how="inner")
- df2.show()
df2 = df2.sort(column("l.a"))
table = pa.Table.from_batches(df2.collect())
expected = {"a": [1, 2], "c": [8, 10], "b": [4, 5]}
assert table.to_pydict() == expected
- df2 = df.join(df1, left_on="a", right_on="a", how="inner")
- df2.show()
+ # Test the default behavior for dropping duplicate keys
+ # Since we may have a duplicate column name and pa.Table()
+ # hides the fact, instead we need to explicitly check the
+ # resultant arrays.
+ df2 = df.join(df1, left_on="a", right_on="a", how="inner",
drop_duplicate_keys=True)
df2 = df2.sort(column("l.a"))
- table = pa.Table.from_batches(df2.collect())
+ result = df2.collect()[0]
+ assert result.num_columns == 3
+ assert result.column(0) == pa.array([1, 2], pa.int64())
+ assert result.column(1) == pa.array([4, 5], pa.int64())
+ assert result.column(2) == pa.array([8, 10], pa.int64())
- expected = {"a": [1, 2], "c": [8, 10], "b": [4, 5]}
- assert table.to_pydict() == expected
+ df2 = df.join(
+ df1, left_on="a", right_on="a", how="inner", drop_duplicate_keys=False
+ )
+ df2 = df2.sort(column("l.a"))
+ result = df2.collect()[0]
+ assert result.num_columns == 4
+ assert result.column(0) == pa.array([1, 2], pa.int64())
+ assert result.column(1) == pa.array([4, 5], pa.int64())
+ assert result.column(2) == pa.array([1, 2], pa.int64())
+ assert result.column(3) == pa.array([8, 10], pa.int64())
# Verify we don't make a breaking change to pre-43.0.0
# where users would pass join_keys as a positional argument
df2 = df.join(df1, (["a"], ["a"]), how="inner")
- df2.show()
df2 = df2.sort(column("l.a"))
table = pa.Table.from_batches(df2.collect())
diff --git a/python/tests/test_sql.py b/python/tests/test_sql.py
index 8f57992d..8d1f3009 100644
--- a/python/tests/test_sql.py
+++ b/python/tests/test_sql.py
@@ -157,6 +157,9 @@ def test_register_parquet(ctx, tmp_path):
assert result.to_pydict() == {"cnt": [100]}
[email protected](
+ "ignore:using literals for table_partition_cols data
types:DeprecationWarning"
+)
@pytest.mark.parametrize(
("path_to_str", "legacy_data_type"), [(True, False), (False, False),
(False, True)]
)
diff --git a/src/dataframe.rs b/src/dataframe.rs
index a93aa018..187bb0ac 100644
--- a/src/dataframe.rs
+++ b/src/dataframe.rs
@@ -629,6 +629,7 @@ impl PyDataFrame {
how: &str,
left_on: Vec<PyBackedStr>,
right_on: Vec<PyBackedStr>,
+ drop_duplicate_keys: bool,
) -> PyDataFusionResult<Self> {
let join_type = match how {
"inner" => JoinType::Inner,
@@ -647,13 +648,46 @@ impl PyDataFrame {
let left_keys = left_on.iter().map(|s|
s.as_ref()).collect::<Vec<&str>>();
let right_keys = right_on.iter().map(|s|
s.as_ref()).collect::<Vec<&str>>();
- let df = self.df.as_ref().clone().join(
+ let mut df = self.df.as_ref().clone().join(
right.df.as_ref().clone(),
join_type,
&left_keys,
&right_keys,
None,
)?;
+
+ if drop_duplicate_keys {
+ let mutual_keys = left_keys
+ .iter()
+ .zip(right_keys.iter())
+ .filter(|(l, r)| l == r)
+ .map(|(key, _)| *key)
+ .collect::<Vec<_>>();
+
+ let fields_to_drop = mutual_keys
+ .iter()
+ .map(|name| {
+ df.logical_plan()
+ .schema()
+ .qualified_fields_with_unqualified_name(name)
+ })
+ .filter(|r| r.len() == 2)
+ .map(|r| r[1])
+ .collect::<Vec<_>>();
+
+ let expr: Vec<Expr> = df
+ .logical_plan()
+ .schema()
+ .fields()
+ .into_iter()
+ .enumerate()
+ .map(|(idx, _)|
df.logical_plan().schema().qualified_field(idx))
+ .filter(|(qualifier, f)|
!fields_to_drop.contains(&(*qualifier, f)))
+ .map(|(qualifier, field)|
Expr::Column(Column::from((qualifier, field))))
+ .collect();
+ df = df.select(expr)?;
+ }
+
Ok(Self::new(df))
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]