[ https://issues.apache.org/jira/browse/YUNIKORN-1632?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Craig Condit resolved YUNIKORN-1632. ------------------------------------ Fix Version/s: 1.3.0 Target Version: 1.3.0 Resolution: Fixed Merged to master. > Yunikorn fails to account for the max number of pods on a node > -------------------------------------------------------------- > > Key: YUNIKORN-1632 > URL: https://issues.apache.org/jira/browse/YUNIKORN-1632 > Project: Apache YuniKorn > Issue Type: Bug > Reporter: Eli Schiff > Assignee: Eli Schiff > Priority: Major > Fix For: 1.3.0 > > > In my cluster I am seeing occasional events like this on some of my pods > which cause them to get stuck. > {code:java} > 54m Warning OutOfpods > pod/tg-spark-executor-640b4349263cc74570ae3a1e-0 Node didn't have enough > resource: pods, requested: 1, used: 12, capacity: 12{code} > > It mostly happens when a bunch of small pods get created all at once and all > get assigned to the same node. That node could fit the pods based on > cpu/memory alone. The limiting factor here is the pod limit. > Here is an example of a node from a state dump (note that this node is > not the one that was out of capacity; I could not get the state dump in time. > This is just a random example node.) > Note how the capacity section shows vcore, memory, and pods among others. The > available section subtracts the vcore and memory correctly but pods is still > at 12, which is the same as the capacity even though there are pods on this > node. 
> {code:java} > { > "nodeID": "node-1", > "hostName": "", > "rackName": "", > "capacity": { > "attachable-volumes-gce-pd": 127, > "ephemeral-storage": 1426128608967, > "hugepages-1Gi": 0, > "hugepages-2Mi": 0, > "memory": 257603937977, > "pods": 12, > "vcore": 31800 > }, > "allocated": { > "memory": 211619414016, > "vcore": 28400 > }, > "occupied": { > "memory": 648019968, > "vcore": 220 > }, > "available": { > "attachable-volumes-gce-pd": 127, > "ephemeral-storage": 1426128608967, > "hugepages-1Gi": 0, > "hugepages-2Mi": 0, > "memory": 45336503993, > "pods": 12, > "vcore": 3180 > }, > "utilized": { > "memory": 82, > "vcore": 89 > }, > "allocations": [ > { > "allocationKey": "44883a88-342f-47ff-ad89-013f420be4a2", > "allocationTags": {REDACTED}, > "requestTime": 1678499763840497296, > "allocationTime": 1678499790893957853, > "allocationDelay": 27053460557, > "uuid": "7097888a-ba67-4eeb-abf4-aba4c8960abc", > "resource": { > "memory": 52904853504, > "vcore": 7100 > }, > "priority": "0", > "nodeId": "node-1", > "applicationId": "640bdfa8dcd4e8dd542d1767", > "partition": "default", > "placeholder": false, > "placeholderUsed": true, > "taskGroupName": "", > "preempted": false > }, > { > "allocationKey": "b30b2d21-8442-43ec-a49c-7f69d4cabe62", > "allocationTags": {REDACTED}, > "requestTime": 1678499763846638434, > "allocationTime": 1678499789888691963, > "allocationDelay": 26042053529, > "uuid": "a6077f71-db66-4cb3-b98b-5f86e3323085", > "resource": { > "memory": 52904853504, > "vcore": 7100 > }, > "priority": "0", > "nodeId": "node-1", > "applicationId": "640bdfa8dcd4e8dd542d1767", > "partition": "default", > "placeholder": false, > "placeholderUsed": true, > "taskGroupName": "", > "preempted": false > }, > { > "allocationKey": "fb5a3169-7d68-4bfd-9009-4b7373fd5daf", > "allocationTags": {REDACTED}, > "requestTime": 1678499763852070372, > "allocationTime": 1678499790930270840, > "allocationDelay": 27078200468, > "uuid": "a593c7a7-4434-44a0-b144-f8e29232162c", > 
"resource": { > "memory": 52904853504, > "vcore": 7100 > }, > "priority": "0", > "nodeId": "node-1", > "applicationId": "640bdfa8dcd4e8dd542d1767", > "partition": "default", > "placeholder": false, > "placeholderUsed": true, > "taskGroupName": "", > "preempted": false > }, > { > "allocationKey": "c1066dbf-94c9-414b-8bd8-d686cfafa554", > "allocationTags": {REDACTED}, > "requestTime": 1678499893886008831, > "allocationTime": 1678499893889194430, > "allocationDelay": 3185599, > "uuid": "51e5fd09-6224-4c63-ba58-d0be8c6a9dee", > "resource": { > "memory": 52904853504, > "vcore": 7100 > }, > "priority": "0", > "nodeId": "node-1", > "applicationId": "640bdfa7ea72a327e004222b", > "partition": "default", > "placeholder": false, > "placeholderUsed": false, > "taskGroupName": "", > "preempted": false > } > ], > "schedulable": true, > "isReserved": false, > "reservations": [] > },{code} > -- This message was sent by Atlassian Jira (v8.20.10#820010) --------------------------------------------------------------------- To unsubscribe, e-mail: dev-unsubscr...@yunikorn.apache.org For additional commands, e-mail: dev-h...@yunikorn.apache.org