This is an automated email from the ASF dual-hosted git repository.
wilfreds pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/yunikorn-core.git
The following commit(s) were added to refs/heads/master by this push:
new 70dbc45d [YUNIKORN-2025] Mismatched running container count (#675)
70dbc45d is described below
commit 70dbc45d0e64ef6f452bfa5e2f8ec95f7badcb07
Author: Yu-Lin Chen <[email protected]>
AuthorDate: Wed Oct 18 13:29:41 2023 +1100
[YUNIKORN-2025] Mismatched running container count (#675)
Preempted containers are counted twice as released. This causes the
running container count to be incorrect when preempting containers.
Handle PREEMPTED_BY_SCHEDULER just like TIMEOUT: do not send the release
back to the shim in removeAllocation. The removal was triggered by the
core in the first place.
Closes: #675
Signed-off-by: Wilfred Spiegelenburg <[email protected]>
---
pkg/scheduler/partition.go | 6 +++---
pkg/scheduler/partition_test.go | 4 ++--
2 files changed, 5 insertions(+), 5 deletions(-)
diff --git a/pkg/scheduler/partition.go b/pkg/scheduler/partition.go
index f08558d1..dcfcc116 100644
--- a/pkg/scheduler/partition.go
+++ b/pkg/scheduler/partition.go
@@ -1370,9 +1370,9 @@ func (pc *PartitionContext) removeAllocation(release
*si.AllocationRelease) ([]*
pc.updateAllocationCount(-len(released))
metrics.GetQueueMetrics(queue.GetQueuePath()).AddReleasedContainers(len(released))
- // if the termination type is timeout, we don't notify the shim,
because it's
- // originated from that side
- if release.TerminationType == si.TerminationType_TIMEOUT {
+ // if the termination type is TIMEOUT/PREEMPTED_BY_SCHEDULER, we don't
notify the shim,
+ // because it's originated from that side
+ if release.TerminationType == si.TerminationType_TIMEOUT ||
release.TerminationType == si.TerminationType_PREEMPTED_BY_SCHEDULER {
released = nil
}
return released, confirmed
diff --git a/pkg/scheduler/partition_test.go b/pkg/scheduler/partition_test.go
index 59e91f86..bc3f411d 100644
--- a/pkg/scheduler/partition_test.go
+++ b/pkg/scheduler/partition_test.go
@@ -563,7 +563,7 @@ func TestPlaceholderDataWithPlaceholderPreemption(t
*testing.T) {
TerminationType: si.TerminationType_PREEMPTED_BY_SCHEDULER,
}
releases, _ := partition.removeAllocation(release)
- assert.Equal(t, 1, len(releases), "unexpected number of allocations
released")
+ assert.Equal(t, 0, len(releases), "not expecting any released
allocations")
assertPlaceholderData(t, gangApp, 7, 1)
}
@@ -2120,7 +2120,7 @@ func setupPreemptionForRequiredNode(t *testing.T)
(*PartitionContext, *objects.A
TerminationType: si.TerminationType_PREEMPTED_BY_SCHEDULER,
}
releases, _ := partition.removeAllocation(release)
- assert.Equal(t, 1, len(releases), "unexpected number of allocations
released")
+ assert.Equal(t, 0, len(releases), "not expecting any released
allocations")
assertUserGroupResourceMaxLimits(t, getTestUserGroup(),
resources.NewResourceFromMap(map[string]resources.Quantity{"vcore": 0}),
getExpectedQueuesLimitsForPreemptionWithRequiredNode())
return partition, app
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]