This is an automated email from the ASF dual-hosted git repository.

wilfreds pushed a commit to branch branch-1.5
in repository https://gitbox.apache.org/repos/asf/yunikorn-core.git

commit 381bc9a8a64d4fac99e37d04de6be4e436adf1a9
Author: Yongjun Zhang <yongjunzh...@pinterest.com>
AuthorDate: Mon Feb 26 20:12:25 2024 +1100

    [YUNIKORN-2030] headroom check for reserved allocations (#793)
    
    During the allocation cycle for reservations we fall back to trying all
    reservations on all nodes for an application. During this fallback a
    headroom check was missing which caused queue update failures to be
    logged.
    The allocation did not succeed but it caused scheduling delays and log
    spew.
    
    Closes: #793
    
    Signed-off-by: Wilfred Spiegelenburg <wilfr...@apache.org>
    (cherry picked from commit 908d1cb07083b989230a9a18bb0b66180122fb73)
---
 pkg/scheduler/objects/application.go      | 23 ++++++++++++---------
 pkg/scheduler/objects/application_test.go | 33 +++++++++++++++++++++++++++++++
 pkg/scheduler/objects/utilities_test.go   |  1 +
 3 files changed, 48 insertions(+), 9 deletions(-)

diff --git a/pkg/scheduler/objects/application.go 
b/pkg/scheduler/objects/application.go
index cde6fbd9..a064d321 100644
--- a/pkg/scheduler/objects/application.go
+++ b/pkg/scheduler/objects/application.go
@@ -1223,6 +1223,12 @@ func (sa *Application) 
tryPlaceholderAllocate(nodeIterator func() NodeIterator,
        return allocResult
 }
 
+// check ask against both user headRoom and queue headRoom
+func (sa *Application) checkHeadRooms(ask *AllocationAsk, userHeadroom 
*resources.Resource, headRoom *resources.Resource) bool {
+       // check if this fits in the users' headroom first, if that fits check 
the queues' headroom
+       return userHeadroom.FitInMaxUndef(ask.GetAllocatedResource()) && 
headRoom.FitInMaxUndef(ask.GetAllocatedResource())
+}
+
 // Try a reserved allocation of an outstanding reservation
 func (sa *Application) tryReservedAllocate(headRoom *resources.Resource, 
nodeIterator func() NodeIterator) *Allocation {
        sa.Lock()
@@ -1250,13 +1256,8 @@ func (sa *Application) tryReservedAllocate(headRoom 
*resources.Resource, nodeIte
                        alloc := newUnreservedAllocation(reserve.nodeID, 
unreserveAsk)
                        return alloc
                }
-               // check if this fits in the users' headroom first, if that 
fits check the queues' headroom
-               if !userHeadroom.FitInMaxUndef(ask.GetAllocatedResource()) {
-                       continue
-               }
 
-               // check if this fits in the queue's headroom
-               if !headRoom.FitInMaxUndef(ask.GetAllocatedResource()) {
+               if !sa.checkHeadRooms(ask, userHeadroom, headRoom) {
                        continue
                }
 
@@ -1280,12 +1281,16 @@ func (sa *Application) tryReservedAllocate(headRoom 
*resources.Resource, nodeIte
        // lets try this on all other nodes
        for _, reserve := range sa.reservations {
                // Other nodes cannot be tried if the ask has a required node
-               if reserve.ask.GetRequiredNode() != "" {
+               ask := reserve.ask
+               if ask.GetRequiredNode() != "" {
                        continue
                }
                iterator := nodeIterator()
                if iterator != nil {
-                       alloc := sa.tryNodesNoReserve(reserve.ask, iterator, 
reserve.nodeID)
+                       if !sa.checkHeadRooms(ask, userHeadroom, headRoom) {
+                               continue
+                       }
+                       alloc := sa.tryNodesNoReserve(ask, iterator, 
reserve.nodeID)
                        // have a candidate return it, including the node that 
was reserved
                        if alloc != nil {
                                return alloc
@@ -1490,7 +1495,7 @@ func (sa *Application) tryNode(node *Node, ask 
*AllocationAsk) *Allocation {
        alloc := NewAllocation(node.NodeID, ask)
        if node.AddAllocation(alloc) {
                if err := 
sa.queue.IncAllocatedResource(alloc.GetAllocatedResource(), false); err != nil {
-                       log.Log(log.SchedApplication).Warn("queue update failed 
unexpectedly",
+                       log.Log(log.SchedApplication).DPanic("queue update 
failed unexpectedly",
                                zap.Error(err))
                        // revert the node update
                        node.RemoveAllocation(alloc.GetAllocationID())
diff --git a/pkg/scheduler/objects/application_test.go 
b/pkg/scheduler/objects/application_test.go
index bc055ee6..04439672 100644
--- a/pkg/scheduler/objects/application_test.go
+++ b/pkg/scheduler/objects/application_test.go
@@ -2601,6 +2601,39 @@ func TestGetRateLimitedAppLog(t *testing.T) {
        assert.Check(t, l != nil)
 }
 
+func TestTryAllocateWithReservedHeadRoomChecking(t *testing.T) {
+       defer func() {
+               if r := recover(); r != nil {
+                       t.Fatalf("reserved headroom test regression: %v", r)
+               }
+       }()
+
+       res, err := resources.NewResourceFromConf(map[string]string{"memory": 
"2G"})
+       assert.NilError(t, err, "failed to create basic resource")
+       var headRoom *resources.Resource
+       headRoom, err = 
resources.NewResourceFromConf(map[string]string{"memory": "1G"})
+       assert.NilError(t, err, "failed to create basic resource")
+
+       app := newApplication(appID1, "default", "root")
+       ask := newAllocationAsk(aKey, appID1, res)
+       var queue *Queue
+       queue, err = createRootQueue(map[string]string{"memory": "1G"})
+       assert.NilError(t, err, "queue create failed")
+       app.queue = queue
+       err = app.AddAllocationAsk(ask)
+       assert.NilError(t, err, "ask should have been added to app")
+
+       node1 := newNodeRes(nodeID1, res)
+       node2 := newNodeRes(nodeID2, res)
+       // reserve that works
+       err = app.Reserve(node1, ask)
+       assert.NilError(t, err, "reservation should not have failed")
+
+       iter := getNodeIteratorFn(node1, node2)
+       alloc := app.tryReservedAllocate(headRoom, iter)
+       assert.Assert(t, alloc == nil, "Alloc is expected to be nil due to 
insufficient headroom")
+}
+
 func TestUpdateRunnableStatus(t *testing.T) {
        app := newApplication(appID0, "default", "root.unknown")
        assert.Assert(t, app.runnableInQueue)
diff --git a/pkg/scheduler/objects/utilities_test.go 
b/pkg/scheduler/objects/utilities_test.go
index e66e4e98..577e26ff 100644
--- a/pkg/scheduler/objects/utilities_test.go
+++ b/pkg/scheduler/objects/utilities_test.go
@@ -42,6 +42,7 @@ const (
        aKey          = "alloc-1"
        aAllocationID = "alloc-allocationid-1"
        nodeID1       = "node-1"
+       nodeID2       = "node-2"
        instType1     = "itype-1"
 )
 


---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@yunikorn.apache.org
For additional commands, e-mail: issues-h...@yunikorn.apache.org

Reply via email to