lostluck commented on code in PR #36163:
URL: https://github.com/apache/beam/pull/36163#discussion_r2349965073
##########
sdks/go/pkg/beam/runners/prism/internal/engine/elementmanager.go:
##########
@@ -892,70 +892,75 @@ func (em *ElementManager) PersistBundle(rb RunBundle,
col2Coders map[string]PCol
// Clear out the inprogress elements associated with the completed
bundle.
// Must be done after adding the new pending elements to avoid an
incorrect
// watermark advancement.
- stage.mu.Lock()
- completed := stage.inprogress[rb.BundleID]
- em.addPending(-len(completed.es))
- delete(stage.inprogress, rb.BundleID)
- for k := range stage.inprogressKeysByBundle[rb.BundleID] {
- delete(stage.inprogressKeys, k)
- }
- delete(stage.inprogressKeysByBundle, rb.BundleID)
-
- // Adjust holds as needed.
- for h, c := range newHolds {
- if c > 0 {
- stage.watermarkHolds.Add(h, c)
- } else if c < 0 {
- stage.watermarkHolds.Drop(h, -c)
- }
- }
- for hold, v := range stage.inprogressHoldsByBundle[rb.BundleID] {
- stage.watermarkHolds.Drop(hold, v)
- }
- delete(stage.inprogressHoldsByBundle, rb.BundleID)
-
- // Clean up OnWindowExpiration bundle accounting, so window state
- // may be garbage collected.
- if stage.expiryWindowsByBundles != nil {
- win, ok := stage.expiryWindowsByBundles[rb.BundleID]
- if ok {
- stage.inProgressExpiredWindows[win] -= 1
- if stage.inProgressExpiredWindows[win] == 0 {
- delete(stage.inProgressExpiredWindows, win)
+ func() {
+ stage.mu.Lock()
+ // Defer unlocking the mutex within an anonymous function to
ensure it's released
+ // even if a panic occurs during `em.addPending`. This prevents
potential deadlocks
+ // if the waitgroup unexpectedly drops below zero due to a
runner bug.
+ defer stage.mu.Unlock()
+ completed := stage.inprogress[rb.BundleID]
+ em.addPending(-len(completed.es))
Review Comment:
Hmmm. Avoiding the deadlock is good, but this is definitely something that
should cause the pipeline to stop and error out.
I'd recommend at least adding a log or something to em.addPending, to log
when the livePending count returns <= 0. Then we can at least log that the
subsequent panic indicates a logic error in prism, while not interrupting it.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]