This is an automated email from the ASF dual-hosted git repository.

wilfreds pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/yunikorn-core.git


The following commit(s) were added to refs/heads/master by this push:
     new 70dbc45d [YUNIKORN-2025] Mismatched running container count (#675)
70dbc45d is described below

commit 70dbc45d0e64ef6f452bfa5e2f8ec95f7badcb07
Author: Yu-Lin Chen <[email protected]>
AuthorDate: Wed Oct 18 13:29:41 2023 +1100

    [YUNIKORN-2025] Mismatched running container count (#675)
    
    Preempted containers are counted twice as released. This causes the
    running container count to be incorrect when preempting containers.
    
    Handle PREEMPTED_BY_SCHEDULER just like TIMEOUT: do not send the release
    back to the shim in removeAllocation. The removal was triggered by the
    core in the first place.
    
    Closes: #675
    
    Signed-off-by: Wilfred Spiegelenburg <[email protected]>
---
 pkg/scheduler/partition.go      | 6 +++---
 pkg/scheduler/partition_test.go | 4 ++--
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/pkg/scheduler/partition.go b/pkg/scheduler/partition.go
index f08558d1..dcfcc116 100644
--- a/pkg/scheduler/partition.go
+++ b/pkg/scheduler/partition.go
@@ -1370,9 +1370,9 @@ func (pc *PartitionContext) removeAllocation(release 
*si.AllocationRelease) ([]*
        pc.updateAllocationCount(-len(released))
        
metrics.GetQueueMetrics(queue.GetQueuePath()).AddReleasedContainers(len(released))
 
-       // if the termination type is timeout, we don't notify the shim, 
because it's
-       // originated from that side
-       if release.TerminationType == si.TerminationType_TIMEOUT {
+       // if the termination type is TIMEOUT/PREEMPTED_BY_SCHEDULER, we don't 
notify the shim,
+       // because it's originated from that side
+       if release.TerminationType == si.TerminationType_TIMEOUT || 
release.TerminationType == si.TerminationType_PREEMPTED_BY_SCHEDULER {
                released = nil
        }
        return released, confirmed
diff --git a/pkg/scheduler/partition_test.go b/pkg/scheduler/partition_test.go
index 59e91f86..bc3f411d 100644
--- a/pkg/scheduler/partition_test.go
+++ b/pkg/scheduler/partition_test.go
@@ -563,7 +563,7 @@ func TestPlaceholderDataWithPlaceholderPreemption(t 
*testing.T) {
                TerminationType: si.TerminationType_PREEMPTED_BY_SCHEDULER,
        }
        releases, _ := partition.removeAllocation(release)
-       assert.Equal(t, 1, len(releases), "unexpected number of allocations 
released")
+       assert.Equal(t, 0, len(releases), "not expecting any released 
allocations")
        assertPlaceholderData(t, gangApp, 7, 1)
 }
 
@@ -2120,7 +2120,7 @@ func setupPreemptionForRequiredNode(t *testing.T) 
(*PartitionContext, *objects.A
                TerminationType: si.TerminationType_PREEMPTED_BY_SCHEDULER,
        }
        releases, _ := partition.removeAllocation(release)
-       assert.Equal(t, 1, len(releases), "unexpected number of allocations 
released")
+       assert.Equal(t, 0, len(releases), "not expecting any released 
allocations")
        assertUserGroupResourceMaxLimits(t, getTestUserGroup(), 
resources.NewResourceFromMap(map[string]resources.Quantity{"vcore": 0}), 
getExpectedQueuesLimitsForPreemptionWithRequiredNode())
        return partition, app
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to