[ https://issues.apache.org/jira/browse/FLINK-4485?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15436917#comment-15436917 ]
Niels Basjes commented on FLINK-4485: ------------------------------------- I just reproduced the effect on a non-secure Yarn cluster. After having run a few jobs I see this on the node where the jobmanager runs: {code} [root@node1 ~]# lsof | fgrep '/tmp/blobStore' java 15358 yarn mem REG 8,3 70243224 25936270 /tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/cache/blob_501262b25ff9158ff07ee1f4264b5e3afeaaf69f java 15358 yarn DEL REG 8,3 25936269 /tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000027 java 15358 yarn DEL REG 8,3 25936268 /tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000026 java 15358 yarn DEL REG 8,3 25936267 /tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000025 java 15358 yarn DEL REG 8,3 25936266 /tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000024 java 15358 yarn DEL REG 8,3 25936265 /tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000023 java 15358 yarn DEL REG 8,3 25936264 /tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000022 java 15358 yarn DEL REG 8,3 25936263 /tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000021 java 15358 yarn DEL REG 8,3 25936258 /tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000020 java 15358 yarn DEL REG 8,3 25936257 /tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000019 java 15358 yarn DEL REG 8,3 25936260 /tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000018 java 15358 yarn DEL REG 8,3 25936259 /tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000017 java 15358 yarn DEL REG 8,3 25936256 /tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000016 java 15358 yarn DEL REG 8,3 25936255 /tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000015 java 15358 yarn DEL REG 8,3 25936254 /tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000014 java 15358 yarn 
DEL REG 8,3 25936253 /tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000013 java 15358 yarn DEL REG 8,3 25936252 /tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000012 java 15358 yarn DEL REG 8,3 25936251 /tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000011 java 15358 yarn DEL REG 8,3 25936250 /tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000010 java 15358 yarn DEL REG 8,3 25936249 /tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000009 java 15358 yarn DEL REG 8,3 25936248 /tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000008 java 15358 yarn DEL REG 8,3 25936247 /tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000007 java 15358 yarn DEL REG 8,3 25936246 /tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000006 java 15358 yarn DEL REG 8,3 25936244 /tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000005 java 15358 yarn DEL REG 8,3 25936222 /tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000004 java 15358 yarn DEL REG 8,3 25936221 /tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000003 java 15358 yarn DEL REG 8,3 25936220 /tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000002 java 15358 yarn DEL REG 8,3 25936215 /tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000001 java 15358 yarn 422r REG 8,3 70243224 25936222 /tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000004 (deleted) java 15358 yarn 581u REG 8,3 70243224 25936265 /tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000023 (deleted) java 15358 yarn 582u REG 8,3 70243224 25936267 /tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000025 (deleted) java 15358 yarn 583r REG 8,3 70243224 25936246 /tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000006 (deleted) java 15358 yarn 584r REG 8,3 
70243224 25936215 /tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000001 (deleted) java 15358 yarn 590u REG 8,3 70243224 25936266 /tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000024 (deleted) java 15358 yarn 591r REG 8,3 70243224 25936220 /tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000002 (deleted) java 15358 yarn 593r REG 8,3 70243224 25936221 /tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000003 (deleted) java 15358 yarn 594u REG 8,3 70243224 25936268 /tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000026 (deleted) java 15358 yarn 595u REG 8,3 70243224 25936270 /tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/cache/blob_501262b25ff9158ff07ee1f4264b5e3afeaaf69f java 15358 yarn 597r REG 8,3 70243224 25936255 /tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000015 (deleted) java 15358 yarn 598u REG 8,3 70243224 25936269 /tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000027 (deleted) java 15358 yarn 599r REG 8,3 70243224 25936252 /tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000012 (deleted) java 15358 yarn 600r REG 8,3 70243224 25936250 /tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000010 (deleted) java 15358 yarn 601r REG 8,3 70243224 25936254 /tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000014 (deleted) java 15358 yarn 602r REG 8,3 70243224 25936244 /tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000005 (deleted) java 15358 yarn 603r REG 8,3 70243224 25936259 /tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000017 (deleted) java 15358 yarn 604r REG 8,3 70243224 25936248 /tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000008 (deleted) java 15358 yarn 605r REG 8,3 70243224 25936260 /tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000018 (deleted) java 15358 yarn 607r REG 8,3 
70243224 25936257 /tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000019 (deleted) java 15358 yarn 608r REG 8,3 70243224 25936258 /tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000020 (deleted) java 15358 yarn 609r REG 8,3 70243224 25936263 /tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000021 (deleted) java 15358 yarn 610r REG 8,3 70243224 25936264 /tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000022 (deleted) java 15358 yarn 613r REG 8,3 70243224 25936247 /tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000007 (deleted) java 15358 yarn 617r REG 8,3 70243224 25936253 /tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000013 (deleted) java 15358 yarn 618r REG 8,3 70243224 25936251 /tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000011 (deleted) java 15358 yarn 619r REG 8,3 70243224 25936249 /tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000009 (deleted) java 15358 yarn 631r REG 8,3 70243224 25936256 /tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000016 (deleted) java 15454 yarn mem REG 8,3 70243224 25936219 /tmp/blobStore-087a0b08-ee59-4d21-8523-c78a79984a4a/cache/blob_501262b25ff9158ff07ee1f4264b5e3afeaaf69f java 15454 yarn 490r REG 8,3 70243224 25936219 /tmp/blobStore-087a0b08-ee59-4d21-8523-c78a79984a4a/cache/blob_501262b25ff9158ff07ee1f4264b5e3afeaaf69f {code} The two process ids you see here are: {code}yarn 15358 4.9 0.3 1362160 431128 ? Sl 15:24 1:52 | \_ /usr/lib/jvm/jre/bin/java -Xmx424M -Dlog.file=/var/log/hadoop-yarn/containers/application_1464009968005_2639/container_1464009968005_2639_01_000001/jobmanager.log -Dlogback.configurationFile=file:logback.xml -Dlog4j.configuration=file:log4j.properties org.apache.flink.yarn.YarnApplicationMasterRunner{code} {code}yarn 15454 10.1 0.6 1306404 801228 ? 
Sl 15:24 3:51 \_ /usr/lib/jvm/jre/bin/java -Xms424m -Xmx424m -XX:MaxDirectMemorySize=424m -Dlog.file=/var/log/hadoop-yarn/containers/application_1464009968005_2639/container_1464009968005_2639_01_000002/taskmanager.log -Dlogback.configurationFile=file:./logback.xml -Dlog4j.configuration=file:./log4j.properties org.apache.flink.yarn.YarnTaskManager --configDir .{code} > Finished jobs in yarn session fill /tmp filesystem > -------------------------------------------------- > > Key: FLINK-4485 > URL: https://issues.apache.org/jira/browse/FLINK-4485 > Project: Flink > Issue Type: Bug > Components: JobManager > Affects Versions: 1.1.0 > Reporter: Niels Basjes > Priority: Blocker > > On a Yarn cluster I start a yarn-session with a few containers and task slots. > Then I fire a 'large' number of Flink batch jobs in sequence against this > yarn session. It is the exact same job (java code) yet it gets different > parameters. > In this scenario it is exporting HBase tables to files in HDFS and the > parameters are about which data from which tables and the name of the target > directory. > After running several dozen jobs, job submissions started to fail and we > investigated. > We found that the cause was that on the Yarn node which was hosting the > jobmanager the /tmp file system was full (4GB was 100% full). > However, the output of {{du -hcs /tmp}} showed only 200MB in use. > We found that a very large file (we guess it is the jar of the job) was put > in /tmp, used, and deleted, yet the file handle was not closed by the jobmanager. > As soon as we killed the jobmanager the disk space was freed. > The summary of the impact of this is that a yarn-session that receives enough > jobs brings down the Yarn node for all users. > See parts of the output we got from {{lsof}} below. 
> {code} > COMMAND PID USER FD TYPE DEVICE SIZE > NODE NAME > java 15034 nbasjes 550r REG 253,17 66219695 > 245 > /tmp/blobStore-fbe9c4cf-1f85-48cb-aad9-180e8d4ec7ce/incoming/temp-00000003 > (deleted) > java 15034 nbasjes 551r REG 253,17 66219695 > 252 > /tmp/blobStore-fbe9c4cf-1f85-48cb-aad9-180e8d4ec7ce/incoming/temp-00000007 > (deleted) > java 15034 nbasjes 552r REG 253,17 66219695 > 267 > /tmp/blobStore-fbe9c4cf-1f85-48cb-aad9-180e8d4ec7ce/incoming/temp-00000012 > (deleted) > java 15034 nbasjes 553r REG 253,17 66219695 > 250 > /tmp/blobStore-fbe9c4cf-1f85-48cb-aad9-180e8d4ec7ce/incoming/temp-00000005 > (deleted) > java 15034 nbasjes 554r REG 253,17 66219695 > 288 > /tmp/blobStore-fbe9c4cf-1f85-48cb-aad9-180e8d4ec7ce/incoming/temp-00000018 > (deleted) > java 15034 nbasjes 555r REG 253,17 66219695 > 298 > /tmp/blobStore-fbe9c4cf-1f85-48cb-aad9-180e8d4ec7ce/incoming/temp-00000025 > (deleted) > java 15034 nbasjes 557r REG 253,17 66219695 > 254 > /tmp/blobStore-fbe9c4cf-1f85-48cb-aad9-180e8d4ec7ce/incoming/temp-00000008 > (deleted) > java 15034 nbasjes 558r REG 253,17 66219695 > 292 > /tmp/blobStore-fbe9c4cf-1f85-48cb-aad9-180e8d4ec7ce/incoming/temp-00000019 > (deleted) > java 15034 nbasjes 559r REG 253,17 66219695 > 275 > /tmp/blobStore-fbe9c4cf-1f85-48cb-aad9-180e8d4ec7ce/incoming/temp-00000013 > (deleted) > java 15034 nbasjes 560r REG 253,17 66219695 > 159 > /tmp/blobStore-fbe9c4cf-1f85-48cb-aad9-180e8d4ec7ce/incoming/temp-00000002 > (deleted) > java 15034 nbasjes 562r REG 253,17 66219695 > 238 > /tmp/blobStore-fbe9c4cf-1f85-48cb-aad9-180e8d4ec7ce/incoming/temp-00000001 > (deleted) > java 15034 nbasjes 568r REG 253,17 66219695 > 246 > /tmp/blobStore-fbe9c4cf-1f85-48cb-aad9-180e8d4ec7ce/incoming/temp-00000004 > (deleted) > java 15034 nbasjes 569r REG 253,17 66219695 > 255 > /tmp/blobStore-fbe9c4cf-1f85-48cb-aad9-180e8d4ec7ce/incoming/temp-00000009 > (deleted) > java 15034 nbasjes 571r REG 253,17 66219695 > 299 > 
/tmp/blobStore-fbe9c4cf-1f85-48cb-aad9-180e8d4ec7ce/incoming/temp-00000026 > (deleted) > java 15034 nbasjes 572r REG 253,17 66219695 > 293 > /tmp/blobStore-fbe9c4cf-1f85-48cb-aad9-180e8d4ec7ce/incoming/temp-00000020 > (deleted) > java 15034 nbasjes 574r REG 253,17 66219695 > 256 > /tmp/blobStore-fbe9c4cf-1f85-48cb-aad9-180e8d4ec7ce/incoming/temp-00000010 > (deleted) > java 15034 nbasjes 575r REG 253,17 66219695 > 302 > /tmp/blobStore-fbe9c4cf-1f85-48cb-aad9-180e8d4ec7ce/incoming/temp-00000029 > (deleted) > java 15034 nbasjes 576r REG 253,17 66219695 > 294 > /tmp/blobStore-fbe9c4cf-1f85-48cb-aad9-180e8d4ec7ce/incoming/temp-00000021 > (deleted) > java 15034 nbasjes 577r REG 253,17 66219695 > 262 > /tmp/blobStore-fbe9c4cf-1f85-48cb-aad9-180e8d4ec7ce/incoming/temp-00000011 > (deleted) > java 15034 nbasjes 578r REG 253,17 66219695 > 251 > /tmp/blobStore-fbe9c4cf-1f85-48cb-aad9-180e8d4ec7ce/incoming/temp-00000006 > (deleted) > java 15034 nbasjes 580r REG 253,17 66219695 > 295 > /tmp/blobStore-fbe9c4cf-1f85-48cb-aad9-180e8d4ec7ce/incoming/temp-00000022 > (deleted) > java 15034 nbasjes 581r REG 253,17 66219695 > 300 > /tmp/blobStore-fbe9c4cf-1f85-48cb-aad9-180e8d4ec7ce/incoming/temp-00000027 > (deleted) > java 15034 nbasjes 582r REG 253,17 66219695 > 188 > /tmp/blobStore-fbe9c4cf-1f85-48cb-aad9-180e8d4ec7ce/cache/blob_e318d1698aa6e7dc91e5f4a9f8ba29781aebd8c4 > (deleted) > java 15034 nbasjes 585r REG 253,17 66219695 > 279 > /tmp/blobStore-fbe9c4cf-1f85-48cb-aad9-180e8d4ec7ce/incoming/temp-00000014 > (deleted) > java 15034 nbasjes 586r REG 253,17 66219695 > 296 > /tmp/blobStore-fbe9c4cf-1f85-48cb-aad9-180e8d4ec7ce/incoming/temp-00000023 > (deleted) > java 15034 nbasjes 588r REG 253,17 66219695 > 301 > /tmp/blobStore-fbe9c4cf-1f85-48cb-aad9-180e8d4ec7ce/incoming/temp-00000028 > (deleted) > java 15034 nbasjes 589r REG 253,17 66219695 > 297 > /tmp/blobStore-fbe9c4cf-1f85-48cb-aad9-180e8d4ec7ce/incoming/temp-00000024 > (deleted) > java 15034 nbasjes 598r REG 253,17 66219695 > 
280 > /tmp/blobStore-fbe9c4cf-1f85-48cb-aad9-180e8d4ec7ce/incoming/temp-00000015 > (deleted) > java 15034 nbasjes 601r REG 253,17 66219695 > 289 > /tmp/blobStore-fbe9c4cf-1f85-48cb-aad9-180e8d4ec7ce/incoming/temp-00000016 > (deleted) > java 15034 nbasjes 604r REG 253,17 66219695 > 284 > /tmp/blobStore-fbe9c4cf-1f85-48cb-aad9-180e8d4ec7ce/incoming/temp-00000017 > (deleted) > {code} -- This message was sent by Atlassian JIRA (v6.3.4#6332)