Re: [PR] Use a join for upsert deduplication [iceberg-python]

via GitHub Fri, 21 Feb 2025 08:28:49 -0800


Fokko commented on code in PR #1685:
URL: https://github.com/apache/iceberg-python/pull/1685#discussion_r1965782385



##########
pyiceberg/table/upsert_util.py:
##########
@@ -53,42 +53,18 @@ def get_rows_to_update(source_table: pa.Table, 
target_table: pa.Table, join_cols
     """
     all_columns = set(source_table.column_names)
     join_cols_set = set(join_cols)
-
-    non_key_cols = list(all_columns - join_cols_set)
-
-    match_expr = functools.reduce(operator.and_, 
[pc.field(col).isin(target_table.column(col).to_pylist()) for col in join_cols])
-
-    matching_source_rows = source_table.filter(match_expr)
-
-    rows_to_update = []
-
-    for index in range(matching_source_rows.num_rows):
-        source_row = matching_source_rows.slice(index, 1)
-
-        target_filter = functools.reduce(operator.and_, [pc.field(col) == 
source_row.column(col)[0].as_py() for col in join_cols])
-
-        matching_target_row = target_table.filter(target_filter)
-
-        if matching_target_row.num_rows > 0:
-            needs_update = False
-
-            for non_key_col in non_key_cols:
-                source_value = source_row.column(non_key_col)[0].as_py()
-                target_value = 
matching_target_row.column(non_key_col)[0].as_py()
-
-                if source_value != target_value:
-                    needs_update = True
-                    break
-
-            if needs_update:
-                rows_to_update.append(source_row)
-
-    if rows_to_update:
-        rows_to_update_table = pa.concat_tables(rows_to_update)
-    else:
-        rows_to_update_table = pa.Table.from_arrays([], 
names=source_table.column_names)
-
-    common_columns = 
set(source_table.column_names).intersection(set(target_table.column_names))
-    rows_to_update_table = rows_to_update_table.select(list(common_columns))
-
-    return rows_to_update_table
+    non_key_cols = all_columns - join_cols_set
+
+    diff_expr = functools.reduce(operator.or_, [pc.field(f"{col}-lhs") != 
pc.field(f"{col}-rhs") for col in non_key_cols])
+
+    return (
+        source_table
+        # We already know that the schema is compatible, this is to fix large_ 
types
+        .cast(target_table.schema)
+        .join(target_table, keys=list(join_cols_set), join_type="inner", 
left_suffix="-lhs", right_suffix="-rhs")
+        .filter(diff_expr)
+        .drop_columns([f"{col}-rhs" for col in non_key_cols])
+        .rename_columns({f"{col}-lhs" if col not in join_cols else col: col 
for col in source_table.column_names})

Review Comment:
   Yes, only the non-join columns get postfixed :)



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Re: [PR] Use a join for upsert deduplication [iceberg-python]

Reply via email to