SanJSp commented on code in PR #55583:
URL: https://github.com/apache/spark/pull/55583#discussion_r3161085592
##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveChangelogTable.scala:
##########
@@ -277,13 +294,174 @@ object ResolveChangelogTable extends Rule[LogicalPlan] {
//
---------------------------------------------------------------------------
/**
- * Collapses multiple changes per row identity into the net effect.
- * Not yet implemented.
+ * Collapses multiple changes per row identity into the net effect:
+ *
+ * | existedBefore | existsAfter | output |
+ * |---------------|-------------|-------------------------------------|
+ * | false | false | (cancel) |
+ * | false | true | insert |
+ * | true | false | delete |
+ * | true | true | update_preimage + update_postimage |
+ *
+ * If `computeUpdates = false`, the `update_preimage` + `update_postimage`
pair is
+ * emitted as `delete` + `insert` instead.
+ *
+ * `existedBefore` is true iff the partition's first event is `delete` or
+ * `update_preimage`. `existsAfter` is true iff the partition's last event is
+ * `insert` or `update_postimage`.
+ *
+ * Pipeline: Window (per-rowId aggregates) -> Filter (keep first/last per
partition)
+ * -> Project (relabel `_change_type` and drop helper columns).
*/
private def injectNetChangeComputation(
plan: LogicalPlan,
- cl: Changelog): LogicalPlan = {
- plan
+ cl: Changelog,
+ computeUpdates: Boolean): LogicalPlan = {
+ val windowedPlan = addNetChangesWindow(plan, cl)
+ val filteredAndRelabeledPlan =
+ removeIntermediateChangelogEntriesAndRelabelChangeTypes(windowedPlan,
computeUpdates)
+ filteredAndRelabeledPlan
+ }
+
+ /**
+ * Adds a Window node partitioned by `rowId` and ordered by
+ * `(_commit_version, change_type_rank)` where pre-events (`update_preimage`,
+ * `delete`) sort before post-events (`update_postimage`, `insert`) within
the same
+ * commit. Computes per-partition helper columns:
+ * - `__spark_cdc_row_number` (1..n) answers: "is this the first or last
row?".
+ * - `__spark_cdc_row_count` is the partition size which combined with
row_number is
+ * used to detect the last row.
+ * - `__spark_cdc_first_row_change_type_value` and
+ * `__spark_cdc_last_row_change_type_value` drive the first/last
classification at
+ * filter and relabel time.
+ */
+ private def addNetChangesWindow(plan: LogicalPlan, cl: Changelog):
LogicalPlan = {
+ val changeTypeAttr = getAttribute(plan, "_change_type")
+ val rowIdExprs =
V2ExpressionUtils.resolveRefs[NamedExpression](cl.rowId().toSeq, plan)
+ val commitVersionAttr = getAttribute(plan, "_commit_version")
+ val changeTypeRank = CaseWhen(Seq(
+ EqualTo(changeTypeAttr, Literal(Changelog.CHANGE_TYPE_UPDATE_PREIMAGE))
-> Literal(0),
+ EqualTo(changeTypeAttr, Literal(Changelog.CHANGE_TYPE_DELETE)) ->
Literal(0),
+ EqualTo(changeTypeAttr, Literal(Changelog.CHANGE_TYPE_INSERT)) ->
Literal(1),
+ EqualTo(changeTypeAttr, Literal(Changelog.CHANGE_TYPE_UPDATE_POSTIMAGE))
-> Literal(1)),
+ Literal(2))
Review Comment:
Done. Added a new error subclass
`CHANGELOG_CONTRACT_VIOLATION.UNEXPECTED_CHANGE_TYPE` and used `RaiseError` as
the else branch of `changeTypeRank`.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]