[ https://issues.apache.org/jira/browse/YUNIKORN-2550?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Peter Bacsko resolved YUNIKORN-2550. ------------------------------------ Fix Version/s: 1.6.0 1.5.1 Target Version: 1.6.0, 1.5.1 Resolution: Fixed > Fix locking in PartitionContext > ------------------------------- > > Key: YUNIKORN-2550 > URL: https://issues.apache.org/jira/browse/YUNIKORN-2550 > Project: Apache YuniKorn > Issue Type: Sub-task > Components: core - common > Reporter: Peter Bacsko > Assignee: Peter Bacsko > Priority: Major > Labels: pull-request-available > Fix For: 1.6.0, 1.5.1 > > > Possible deadlock was detected: > {noformat} > placement.(*AppPlacementManager).initialise { m.Lock() } <<<<< > placement.(*AppPlacementManager).initialise { } } > placement.(*AppPlacementManager).UpdateRules { > log.Log(log.Config).Info("Building new rule list for placement manager") } > scheduler.(*PartitionContext).updatePartitionDetails { err := > pc.placementManager.UpdateRules(conf.PlacementRules) } > scheduler.(*ClusterContext).updateSchedulerConfig { err = > part.updatePartitionDetails(p) } > scheduler.(*ClusterContext).processRMConfigUpdateEvent { err = > cc.updateSchedulerConfig(conf, rmID) } > scheduler.(*Scheduler).handleRMEvent { case *rmevent.RMConfigUpdateEvent: } > scheduler.(*PartitionContext).GetQueue { pc.RLock() } <<<<< > scheduler.(*PartitionContext).GetQueue { func (pc *PartitionContext) > GetQueue(name string) *objects.Queue { } > placement.(*providedRule).placeApplication { // if we cannot create the queue > must exist } > placement.(*AppPlacementManager).PlaceApplication { queueName, err = > checkRule.placeApplication(app, m.queueFn) } > scheduler.(*PartitionContext).AddApplication { err := > pc.getPlacementManager().PlaceApplication(app) } > scheduler.(*ClusterContext).handleRMUpdateApplicationEvent { schedApp := > objects.NewApplication(app, ugi, cc.rmEventHandler, request.RmID) } > scheduler.(*Scheduler).handleRMEvent { case ev := <-s.pendingEvents: } > {noformat} > Lock order is different between {{PartitionContext}} and > 
{{AppPlacementManager}}. > There's also interference between {{PartitionContext}} and an > {{Application}} object: > {noformat} > objects.(*Application).SetTerminatedCallback { sa.Lock() } <<<<< > objects.(*Application).SetTerminatedCallback { func (sa *Application) > SetTerminatedCallback(callback func(appID string)) { } > scheduler.(*PartitionContext).AddApplication { > app.SetTerminatedCallback(pc.moveTerminatedApp) } > scheduler.(*ClusterContext).handleRMUpdateApplicationEvent { schedApp := > objects.NewApplication(app, ugi, cc.rmEventHandler, request.RmID) } > scheduler.(*Scheduler).handleRMEvent { case ev := <-s.pendingEvents: } > scheduler.(*PartitionContext).GetNode { pc.RLock() } <<<<< > scheduler.(*PartitionContext).GetNode { func (pc *PartitionContext) > GetNode(nodeID string) *objects.Node { } > objects.(*Application).tryPlaceholderAllocate { // resource usage should not > change anyway between placeholder and real one at this point } > objects.(*Queue).TryPlaceholderAllocate { for _, app := range > sq.sortApplications(true) { } > objects.(*Queue).TryPlaceholderAllocate { for _, child := range > sq.sortQueues() { } > scheduler.(*PartitionContext).tryPlaceholderAllocate { alloc := > pc.root.TryPlaceholderAllocate(pc.GetNodeIterator, pc.GetNode) } > scheduler.(*ClusterContext).schedule { // nothing reserved that can be > allocated try normal allocate } > scheduler.(*Scheduler).MultiStepSchedule { // Note, this sleep only works in > tests. } > tests.TestDupReleasesInGangScheduling { // and it waits for the shim's > confirmation } > {noformat} > There's no need for locked access to {{PartitionContext.nodes}}. The > base implementation of {{NodeCollection}} ({{baseNodeCollection}}) is already > internally synchronized. The "nodes" field is set once. Therefore, no locking > is necessary when accessing it. 
-- This message was sent by Atlassian Jira (v8.20.10#820010) --------------------------------------------------------------------- To unsubscribe, e-mail: dev-unsubscr...@yunikorn.apache.org For additional commands, e-mail: dev-h...@yunikorn.apache.org