[ https://issues.apache.org/jira/browse/YUNIKORN-575?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Weiwei Yang resolved YUNIKORN-575. ---------------------------------- Fix Version/s: 0.10 Resolution: Fixed > Skip creating placeholders for the completed apps post restart > -------------------------------------------------------------- > > Key: YUNIKORN-575 > URL: https://issues.apache.org/jira/browse/YUNIKORN-575 > Project: Apache YuniKorn > Issue Type: Sub-task > Components: shim - kubernetes > Affects Versions: 0.10 > Reporter: Ayub Pathan > Assignee: Weiwei Yang > Priority: Major > Labels: pull-request-available > Fix For: 0.10 > > Attachments: Screen Shot 2021-03-15 at 9.27.10 PM.png, yk_recover.log > > > * Post restart, YK tries to recover the completed apps and schedules > placeholder pods (even though the real pods are in completed state), which may > not be needed. This leads to resource mismanagement. > {noformat} > gang-app-timeout-1006-5jqqk 0/1 Completed 0 69m > gang-app-timeout-1007-tw44t 0/1 Completed 0 66m > gang-app-timeout-1008-dmzc4 0/1 Completed 0 64m > gang-app-timeout-1008-dwxgq 0/1 Completed 0 64m > gang-app-timeout-1008-sl2x9 0/1 Completed 0 64m > tg-timeout-1006-gang-app-timeout-1006-0 1/1 Running 0 60s > tg-timeout-1006-gang-app-timeout-1006-1 1/1 Running 0 60s > tg-timeout-1006-gang-app-timeout-1006-2 1/1 Running 0 60s > tg-timeout-1007-gang-app-timeout-1007-0 1/1 Running 0 60s > tg-timeout-1007-gang-app-timeout-1007-1 1/1 Running 0 60s > tg-timeout-1007-gang-app-timeout-1007-2 0/1 Pending 0 60s > tg-timeout-1008-gang-app-timeout-1008-0 1/1 Running 0 60s > tg-timeout-1008-gang-app-timeout-1008-1 1/1 Running 0 60s > tg-timeout-1008-gang-app-timeout-1008-2 1/1 Running 0 60s > {noformat} > * *All the completed apps are marked as failed post restart, and the > allocations are not released. 
This could be a resource leak post restart.* > {noformat} > [ > { > "allocations": null, > "applicationID": "gang-app-timeout-1009", > "applicationState": "Accepted", > "partition": "[mycluster]default", > "queueName": "root.fifo", > "submissionTime": 1615868052062417676, > "usedResource": "[]" > }, > { > "allocations": null, > "applicationID": "gang-app-timeout-1011", > "applicationState": "Accepted", > "partition": "[mycluster]default", > "queueName": "root.fifo", > "submissionTime": 1615868052062788287, > "usedResource": "[]" > }, > { > "allocations": null, > "applicationID": "gang-app-timeout-1010", > "applicationState": "Accepted", > "partition": "[mycluster]default", > "queueName": "root.fifo", > "submissionTime": 1615868052057156621, > "usedResource": "[]" > }, > { > "allocations": null, > "applicationID": "gang-app-timeout-1003", > "applicationState": "Accepted", > "partition": "[mycluster]default", > "queueName": "root.fifo", > "submissionTime": 1615868052062023562, > "usedResource": "[]" > }, > { > "allocations": [ > { > "allocationKey": "0a761a05-4b00-4e34-a54d-22411007553a", > "allocationTags": { > "kubernetes.io/label/applicationId": > "gang-app-timeout-1008", > "kubernetes.io/label/queue": "fifo", > "kubernetes.io/meta/namespace": "fifo", > "kubernetes.io/meta/podName": > "tg-timeout-1008-gang-app-timeout-1008-0" > }, > "applicationId": "gang-app-timeout-1008", > "nodeId": "ip-10-192-131-213.ca-central-1.compute.internal", > "partition": "default", > "priority": "0", > "queueName": "root.fifo", > "resource": "[memory:300 vcore:300]", > "uuid": "9704811c-422d-4efa-bb42-ab565fb5f16b" > }, > { > "allocationKey": "2505258b-3358-4143-b2a2-9084ffa0977b", > "allocationTags": { > "kubernetes.io/label/applicationId": > "gang-app-timeout-1008", > "kubernetes.io/label/queue": "fifo", > "kubernetes.io/meta/namespace": "fifo", > "kubernetes.io/meta/podName": > "tg-timeout-1008-gang-app-timeout-1008-1" > }, > "applicationId": "gang-app-timeout-1008", > "nodeId": 
"ip-10-192-131-213.ca-central-1.compute.internal", > "partition": "default", > "priority": "0", > "queueName": "root.fifo", > "resource": "[memory:300 vcore:300]", > "uuid": "e0ff467d-ec18-4d5b-b981-861835f1604a" > }, > { > "allocationKey": "29dbfaec-7632-4bff-b4ea-e313521497f1", > "allocationTags": { > "kubernetes.io/label/applicationId": > "gang-app-timeout-1008", > "kubernetes.io/label/queue": "fifo", > "kubernetes.io/meta/namespace": "fifo", > "kubernetes.io/meta/podName": > "tg-timeout-1008-gang-app-timeout-1008-2" > }, > "applicationId": "gang-app-timeout-1008", > "nodeId": "ip-10-192-142-84.ca-central-1.compute.internal", > "partition": "default", > "priority": "0", > "queueName": "root.fifo", > "resource": "[memory:300 vcore:300]", > "uuid": "6723d3ac-c7c8-4935-bb23-3b443909a252" > } > ], > "applicationID": "gang-app-timeout-1008", > "applicationState": "Failed", > "partition": "[mycluster]default", > "queueName": "root.fifo", > "submissionTime": 1615868050004448061, > "usedResource": "[]" > }, > { > "allocations": [ > { > "allocationKey": "05d87d17-a6dc-4bc0-b495-c76f1cd0a3cb", > "allocationTags": { > "kubernetes.io/label/applicationId": > "gang-app-timeout-1007", > "kubernetes.io/label/queue": "fifo", > "kubernetes.io/meta/namespace": "fifo", > "kubernetes.io/meta/podName": > "tg-timeout-1007-gang-app-timeout-1007-0" > }, > "applicationId": "gang-app-timeout-1007", > "nodeId": "ip-10-192-131-213.ca-central-1.compute.internal", > "partition": "default", > "priority": "0", > "queueName": "root.fifo", > "resource": "[memory:300 vcore:300]", > "uuid": "67401008-61b0-4957-8361-6d0e8917c21f" > }, > { > "allocationKey": "1af95692-0186-44fe-b712-30edb51b85c2", > "allocationTags": { > "kubernetes.io/label/applicationId": > "gang-app-timeout-1007", > "kubernetes.io/label/queue": "fifo", > "kubernetes.io/meta/namespace": "fifo", > "kubernetes.io/meta/podName": > "tg-timeout-1007-gang-app-timeout-1007-1" > }, > "applicationId": "gang-app-timeout-1007", > "nodeId": 
"ip-10-192-142-84.ca-central-1.compute.internal", > "partition": "default", > "priority": "0", > "queueName": "root.fifo", > "resource": "[memory:300 vcore:300]", > "uuid": "5d1f129e-3e40-4103-b2e6-53daf408465f" > } > ], > "applicationID": "gang-app-timeout-1007", > "applicationState": "Failed", > "partition": "[mycluster]default", > "queueName": "root.fifo", > "submissionTime": 1615868050004840460, > "usedResource": "[]" > }, > { > "allocations": [ > { > "allocationKey": "8524d2ab-a591-4fca-8a5f-3847e8d173ab", > "allocationTags": { > "kubernetes.io/label/applicationId": > "gang-app-timeout-1006", > "kubernetes.io/label/queue": "fifo", > "kubernetes.io/meta/namespace": "fifo", > "kubernetes.io/meta/podName": > "tg-timeout-1006-gang-app-timeout-1006-1" > }, > "applicationId": "gang-app-timeout-1006", > "nodeId": "ip-10-192-142-84.ca-central-1.compute.internal", > "partition": "default", > "priority": "0", > "queueName": "root.fifo", > "resource": "[memory:300 vcore:300]", > "uuid": "909735f0-607b-4799-bf4c-8b45f59c174b" > }, > { > "allocationKey": "b33078a1-aac6-4217-afd5-3c80248782dd", > "allocationTags": { > "kubernetes.io/label/applicationId": > "gang-app-timeout-1006", > "kubernetes.io/label/queue": "fifo", > "kubernetes.io/meta/namespace": "fifo", > "kubernetes.io/meta/podName": > "tg-timeout-1006-gang-app-timeout-1006-2" > }, > "applicationId": "gang-app-timeout-1006", > "nodeId": "ip-10-192-131-213.ca-central-1.compute.internal", > "partition": "default", > "priority": "0", > "queueName": "root.fifo", > "resource": "[memory:300 vcore:300]", > "uuid": "80f04647-ada2-4851-9361-d6bcb5c18c65" > }, > { > "allocationKey": "e7aa1b09-fac8-43bf-aae9-48215086ae36", > "allocationTags": { > "kubernetes.io/label/applicationId": > "gang-app-timeout-1006", > "kubernetes.io/label/queue": "fifo", > "kubernetes.io/meta/namespace": "fifo", > "kubernetes.io/meta/podName": > "tg-timeout-1006-gang-app-timeout-1006-0" > }, > "applicationId": "gang-app-timeout-1006", > "nodeId": 
"ip-10-192-142-84.ca-central-1.compute.internal", > "partition": "default", > "priority": "0", > "queueName": "root.fifo", > "resource": "[memory:300 vcore:300]", > "uuid": "f6172318-7e4a-4252-8bf5-8346de4a4d48" > } > ], > "applicationID": "gang-app-timeout-1006", > "applicationState": "Failed", > "partition": "[mycluster]default", > "queueName": "root.fifo", > "submissionTime": 1615868050003595376, > "usedResource": "[]" > } > ] > {noformat} > YK UI snapshot showing apps marked as failed. > !image-2021-03-15-21-37-56-129.png|thumbnail! > Attached log. [^yk_recover.log] -- This message was sent by Atlassian Jira (v8.3.4#803005) --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@yunikorn.apache.org For additional commands, e-mail: issues-h...@yunikorn.apache.org