[
https://issues.apache.org/jira/browse/FLINK-4485?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15436917#comment-15436917
]
Niels Basjes commented on FLINK-4485:
-------------------------------------
I just reproduced the effect on a non-secure Yarn cluster.
After having run a few jobs I see this on the node where the jobmanager runs:
{code}
[root@node1 ~]# lsof | fgrep '/tmp/blobStore'
java 15358 yarn mem REG 8,3 70243224
25936270
/tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/cache/blob_501262b25ff9158ff07ee1f4264b5e3afeaaf69f
java 15358 yarn DEL REG 8,3
25936269
/tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000027
java 15358 yarn DEL REG 8,3
25936268
/tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000026
java 15358 yarn DEL REG 8,3
25936267
/tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000025
java 15358 yarn DEL REG 8,3
25936266
/tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000024
java 15358 yarn DEL REG 8,3
25936265
/tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000023
java 15358 yarn DEL REG 8,3
25936264
/tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000022
java 15358 yarn DEL REG 8,3
25936263
/tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000021
java 15358 yarn DEL REG 8,3
25936258
/tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000020
java 15358 yarn DEL REG 8,3
25936257
/tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000019
java 15358 yarn DEL REG 8,3
25936260
/tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000018
java 15358 yarn DEL REG 8,3
25936259
/tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000017
java 15358 yarn DEL REG 8,3
25936256
/tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000016
java 15358 yarn DEL REG 8,3
25936255
/tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000015
java 15358 yarn DEL REG 8,3
25936254
/tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000014
java 15358 yarn DEL REG 8,3
25936253
/tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000013
java 15358 yarn DEL REG 8,3
25936252
/tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000012
java 15358 yarn DEL REG 8,3
25936251
/tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000011
java 15358 yarn DEL REG 8,3
25936250
/tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000010
java 15358 yarn DEL REG 8,3
25936249
/tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000009
java 15358 yarn DEL REG 8,3
25936248
/tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000008
java 15358 yarn DEL REG 8,3
25936247
/tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000007
java 15358 yarn DEL REG 8,3
25936246
/tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000006
java 15358 yarn DEL REG 8,3
25936244
/tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000005
java 15358 yarn DEL REG 8,3
25936222
/tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000004
java 15358 yarn DEL REG 8,3
25936221
/tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000003
java 15358 yarn DEL REG 8,3
25936220
/tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000002
java 15358 yarn DEL REG 8,3
25936215
/tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000001
java 15358 yarn 422r REG 8,3 70243224
25936222
/tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000004
(deleted)
java 15358 yarn 581u REG 8,3 70243224
25936265
/tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000023
(deleted)
java 15358 yarn 582u REG 8,3 70243224
25936267
/tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000025
(deleted)
java 15358 yarn 583r REG 8,3 70243224
25936246
/tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000006
(deleted)
java 15358 yarn 584r REG 8,3 70243224
25936215
/tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000001
(deleted)
java 15358 yarn 590u REG 8,3 70243224
25936266
/tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000024
(deleted)
java 15358 yarn 591r REG 8,3 70243224
25936220
/tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000002
(deleted)
java 15358 yarn 593r REG 8,3 70243224
25936221
/tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000003
(deleted)
java 15358 yarn 594u REG 8,3 70243224
25936268
/tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000026
(deleted)
java 15358 yarn 595u REG 8,3 70243224
25936270
/tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/cache/blob_501262b25ff9158ff07ee1f4264b5e3afeaaf69f
java 15358 yarn 597r REG 8,3 70243224
25936255
/tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000015
(deleted)
java 15358 yarn 598u REG 8,3 70243224
25936269
/tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000027
(deleted)
java 15358 yarn 599r REG 8,3 70243224
25936252
/tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000012
(deleted)
java 15358 yarn 600r REG 8,3 70243224
25936250
/tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000010
(deleted)
java 15358 yarn 601r REG 8,3 70243224
25936254
/tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000014
(deleted)
java 15358 yarn 602r REG 8,3 70243224
25936244
/tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000005
(deleted)
java 15358 yarn 603r REG 8,3 70243224
25936259
/tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000017
(deleted)
java 15358 yarn 604r REG 8,3 70243224
25936248
/tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000008
(deleted)
java 15358 yarn 605r REG 8,3 70243224
25936260
/tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000018
(deleted)
java 15358 yarn 607r REG 8,3 70243224
25936257
/tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000019
(deleted)
java 15358 yarn 608r REG 8,3 70243224
25936258
/tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000020
(deleted)
java 15358 yarn 609r REG 8,3 70243224
25936263
/tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000021
(deleted)
java 15358 yarn 610r REG 8,3 70243224
25936264
/tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000022
(deleted)
java 15358 yarn 613r REG 8,3 70243224
25936247
/tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000007
(deleted)
java 15358 yarn 617r REG 8,3 70243224
25936253
/tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000013
(deleted)
java 15358 yarn 618r REG 8,3 70243224
25936251
/tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000011
(deleted)
java 15358 yarn 619r REG 8,3 70243224
25936249
/tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000009
(deleted)
java 15358 yarn 631r REG 8,3 70243224
25936256
/tmp/blobStore-0864a537-f6fa-4b27-9b7f-8cb5a3722c3e/incoming/temp-00000016
(deleted)
java 15454 yarn mem REG 8,3 70243224
25936219
/tmp/blobStore-087a0b08-ee59-4d21-8523-c78a79984a4a/cache/blob_501262b25ff9158ff07ee1f4264b5e3afeaaf69f
java 15454 yarn 490r REG 8,3 70243224
25936219
/tmp/blobStore-087a0b08-ee59-4d21-8523-c78a79984a4a/cache/blob_501262b25ff9158ff07ee1f4264b5e3afeaaf69f
{code}
The two process ids you see here are:
{code}yarn 15358 4.9 0.3 1362160 431128 ? Sl 15:24 1:52 |
\_ /usr/lib/jvm/jre/bin/java -Xmx424M
-Dlog.file=/var/log/hadoop-yarn/containers/application_1464009968005_2639/container_1464009968005_2639_01_000001/jobmanager.log
-Dlogback.configurationFile=file:logback.xml
-Dlog4j.configuration=file:log4j.properties
org.apache.flink.yarn.YarnApplicationMasterRunner{code}
{code}yarn 15454 10.1 0.6 1306404 801228 ? Sl 15:24 3:51
\_ /usr/lib/jvm/jre/bin/java -Xms424m -Xmx424m -XX:MaxDirectMemorySize=424m
-Dlog.file=/var/log/hadoop-yarn/containers/application_1464009968005_2639/container_1464009968005_2639_01_000002/taskmanager.log
-Dlogback.configurationFile=file:./logback.xml
-Dlog4j.configuration=file:./log4j.properties
org.apache.flink.yarn.YarnTaskManager --configDir .{code}
> Finished jobs in yarn session fill /tmp filesystem
> --------------------------------------------------
>
> Key: FLINK-4485
> URL: https://issues.apache.org/jira/browse/FLINK-4485
> Project: Flink
> Issue Type: Bug
> Components: JobManager
> Affects Versions: 1.1.0
> Reporter: Niels Basjes
> Priority: Blocker
>
> On a Yarn cluster I start a yarn-session with a few containers and task slots.
> Then I fire a 'large' number of Flink batch jobs in sequence against this
> yarn session. It is the exact same job (java code) yet it gets different
> parameters.
> In this scenario it is exporting HBase tables to files in HDFS and the
> parameters are about which data from which tables and the name of the target
> directory.
> After running several dozen jobs, job submission started to fail and we
> investigated.
> We found that the cause was that on the Yarn node which was hosting the
> jobmanager the /tmp file system was full (4GB was 100% full).
> However, the output of {{du -hcs /tmp}} showed only 200MB in use.
> We found that a very large file (we guess it is the jar of the job) was put
> in /tmp, used, and deleted, yet the file handle was not closed by the jobmanager.
> As soon as we killed the jobmanager the disk space was freed.
> The summary of the impact of this is that a yarn-session that receives enough
> jobs brings down the Yarn node for all users.
> See parts of the output we got from {{lsof}} below.
> {code}
> COMMAND PID USER FD TYPE DEVICE SIZE
> NODE NAME
> java 15034 nbasjes 550r REG 253,17 66219695
> 245
> /tmp/blobStore-fbe9c4cf-1f85-48cb-aad9-180e8d4ec7ce/incoming/temp-00000003
> (deleted)
> java 15034 nbasjes 551r REG 253,17 66219695
> 252
> /tmp/blobStore-fbe9c4cf-1f85-48cb-aad9-180e8d4ec7ce/incoming/temp-00000007
> (deleted)
> java 15034 nbasjes 552r REG 253,17 66219695
> 267
> /tmp/blobStore-fbe9c4cf-1f85-48cb-aad9-180e8d4ec7ce/incoming/temp-00000012
> (deleted)
> java 15034 nbasjes 553r REG 253,17 66219695
> 250
> /tmp/blobStore-fbe9c4cf-1f85-48cb-aad9-180e8d4ec7ce/incoming/temp-00000005
> (deleted)
> java 15034 nbasjes 554r REG 253,17 66219695
> 288
> /tmp/blobStore-fbe9c4cf-1f85-48cb-aad9-180e8d4ec7ce/incoming/temp-00000018
> (deleted)
> java 15034 nbasjes 555r REG 253,17 66219695
> 298
> /tmp/blobStore-fbe9c4cf-1f85-48cb-aad9-180e8d4ec7ce/incoming/temp-00000025
> (deleted)
> java 15034 nbasjes 557r REG 253,17 66219695
> 254
> /tmp/blobStore-fbe9c4cf-1f85-48cb-aad9-180e8d4ec7ce/incoming/temp-00000008
> (deleted)
> java 15034 nbasjes 558r REG 253,17 66219695
> 292
> /tmp/blobStore-fbe9c4cf-1f85-48cb-aad9-180e8d4ec7ce/incoming/temp-00000019
> (deleted)
> java 15034 nbasjes 559r REG 253,17 66219695
> 275
> /tmp/blobStore-fbe9c4cf-1f85-48cb-aad9-180e8d4ec7ce/incoming/temp-00000013
> (deleted)
> java 15034 nbasjes 560r REG 253,17 66219695
> 159
> /tmp/blobStore-fbe9c4cf-1f85-48cb-aad9-180e8d4ec7ce/incoming/temp-00000002
> (deleted)
> java 15034 nbasjes 562r REG 253,17 66219695
> 238
> /tmp/blobStore-fbe9c4cf-1f85-48cb-aad9-180e8d4ec7ce/incoming/temp-00000001
> (deleted)
> java 15034 nbasjes 568r REG 253,17 66219695
> 246
> /tmp/blobStore-fbe9c4cf-1f85-48cb-aad9-180e8d4ec7ce/incoming/temp-00000004
> (deleted)
> java 15034 nbasjes 569r REG 253,17 66219695
> 255
> /tmp/blobStore-fbe9c4cf-1f85-48cb-aad9-180e8d4ec7ce/incoming/temp-00000009
> (deleted)
> java 15034 nbasjes 571r REG 253,17 66219695
> 299
> /tmp/blobStore-fbe9c4cf-1f85-48cb-aad9-180e8d4ec7ce/incoming/temp-00000026
> (deleted)
> java 15034 nbasjes 572r REG 253,17 66219695
> 293
> /tmp/blobStore-fbe9c4cf-1f85-48cb-aad9-180e8d4ec7ce/incoming/temp-00000020
> (deleted)
> java 15034 nbasjes 574r REG 253,17 66219695
> 256
> /tmp/blobStore-fbe9c4cf-1f85-48cb-aad9-180e8d4ec7ce/incoming/temp-00000010
> (deleted)
> java 15034 nbasjes 575r REG 253,17 66219695
> 302
> /tmp/blobStore-fbe9c4cf-1f85-48cb-aad9-180e8d4ec7ce/incoming/temp-00000029
> (deleted)
> java 15034 nbasjes 576r REG 253,17 66219695
> 294
> /tmp/blobStore-fbe9c4cf-1f85-48cb-aad9-180e8d4ec7ce/incoming/temp-00000021
> (deleted)
> java 15034 nbasjes 577r REG 253,17 66219695
> 262
> /tmp/blobStore-fbe9c4cf-1f85-48cb-aad9-180e8d4ec7ce/incoming/temp-00000011
> (deleted)
> java 15034 nbasjes 578r REG 253,17 66219695
> 251
> /tmp/blobStore-fbe9c4cf-1f85-48cb-aad9-180e8d4ec7ce/incoming/temp-00000006
> (deleted)
> java 15034 nbasjes 580r REG 253,17 66219695
> 295
> /tmp/blobStore-fbe9c4cf-1f85-48cb-aad9-180e8d4ec7ce/incoming/temp-00000022
> (deleted)
> java 15034 nbasjes 581r REG 253,17 66219695
> 300
> /tmp/blobStore-fbe9c4cf-1f85-48cb-aad9-180e8d4ec7ce/incoming/temp-00000027
> (deleted)
> java 15034 nbasjes 582r REG 253,17 66219695
> 188
> /tmp/blobStore-fbe9c4cf-1f85-48cb-aad9-180e8d4ec7ce/cache/blob_e318d1698aa6e7dc91e5f4a9f8ba29781aebd8c4
> (deleted)
> java 15034 nbasjes 585r REG 253,17 66219695
> 279
> /tmp/blobStore-fbe9c4cf-1f85-48cb-aad9-180e8d4ec7ce/incoming/temp-00000014
> (deleted)
> java 15034 nbasjes 586r REG 253,17 66219695
> 296
> /tmp/blobStore-fbe9c4cf-1f85-48cb-aad9-180e8d4ec7ce/incoming/temp-00000023
> (deleted)
> java 15034 nbasjes 588r REG 253,17 66219695
> 301
> /tmp/blobStore-fbe9c4cf-1f85-48cb-aad9-180e8d4ec7ce/incoming/temp-00000028
> (deleted)
> java 15034 nbasjes 589r REG 253,17 66219695
> 297
> /tmp/blobStore-fbe9c4cf-1f85-48cb-aad9-180e8d4ec7ce/incoming/temp-00000024
> (deleted)
> java 15034 nbasjes 598r REG 253,17 66219695
> 280
> /tmp/blobStore-fbe9c4cf-1f85-48cb-aad9-180e8d4ec7ce/incoming/temp-00000015
> (deleted)
> java 15034 nbasjes 601r REG 253,17 66219695
> 289
> /tmp/blobStore-fbe9c4cf-1f85-48cb-aad9-180e8d4ec7ce/incoming/temp-00000016
> (deleted)
> java 15034 nbasjes 604r REG 253,17 66219695
> 284
> /tmp/blobStore-fbe9c4cf-1f85-48cb-aad9-180e8d4ec7ce/incoming/temp-00000017
> (deleted)
> {code}
--
This message was sent by Atlassian JIRA
(v6.3.4#6332)