[ https://issues.apache.org/jira/browse/MESOS-2546?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=14380824#comment-14380824 ]
Sunil Shah commented on MESOS-2546: ----------------------------------- Okay, some more information to help with reproducing this bug. + I'm running a Vagrant single node installation as per [playa-mesos|https://github.com/mesosphere/playa-mesos]. + Mesos package version: 0.21.1-1.1.ubuntu1404 + Marathon package version: 0.8.1-1.0.171.ubuntu1404 + Chronos package version: 2.3.2-0.1.20150207000917.ubuntu1404 (this has the 0.1 second interval) I try to run a single instance of a "sleep 5000" Marathon application but this is unable to deploy. There are no apps running on Chronos. !http://i.imgur.com/iCRjNMy.png! Mesos master state: {code:json} { "activated_slaves": 1, "build_date": "2015-01-09 02:25:56", "build_time": 1420770356, "build_user": "root", "completed_frameworks": [], "deactivated_slaves": 0, "elected_time": 1427305924.02203, "failed_tasks": 0, "finished_tasks": 0, "flags": { "allocation_interval": "1secs", "authenticate": "false", "authenticate_slaves": "false", "authenticators": "crammd5", "framework_sorter": "drf", "help": "false", "initialize_driver_logging": "true", "log_auto_initialize": "true", "log_dir": "/var/log/mesos", "logbufsecs": "0", "logging_level": "INFO", "port": "5050", "quiet": "false", "quorum": "1", "recovery_slave_removal_limit": "100%", "registry": "replicated_log", "registry_fetch_timeout": "1mins", "registry_store_timeout": "5secs", "registry_strict": "false", "root_submissions": "true", "slave_reregister_timeout": "10mins", "user_sorter": "drf", "version": "false", "webui_dir": "/usr/share/mesos/webui", "whitelist": "*", "work_dir": "/var/lib/mesos", "zk": "zk://localhost:2181/mesos", "zk_session_timeout": "10secs" }, "frameworks": [ { "active": true, "checkpoint": false, "completed_tasks": [], "failover_timeout": 604800, "hostname": "mesos.vm", "id": "20150214-011825-16842879-5050-5990-0000", "name": "chronos-2.3.2_mesos-0.20.1-SNAPSHOT", "offered_resources": { "cpus": 0, "disk": 0, "mem": 0 }, "offers": [], "registered_time": 
1427305924.17473, "reregistered_time": 1427317980.74858, "resources": { "cpus": 0, "disk": 0, "mem": 0 }, "role": "*", "tasks": [], "unregistered_time": 0, "used_resources": { "cpus": 0, "disk": 0, "mem": 0 }, "user": "root", "webui_url": "" }, { "active": true, "checkpoint": true, "completed_tasks": [ { "executor_id": "", "framework_id": "20140904-220314-16842879-5050-1241-0000", "id": "test-app.6c3c20aa-d333-11e4-8c08-56847afe9799", "name": "test-app", "resources": { "cpus": 0.1, "disk": 0, "mem": 16, "ports": "[31087-31087]" }, "slave_id": "20150325-175150-16842879-5050-1232-S0", "state": "TASK_KILLED", "statuses": [ { "state": "TASK_RUNNING", "timestamp": 1427317854.31803 }, { "state": "TASK_KILLED", "timestamp": 1427317980.24426 } ] }, { "executor_id": "", "framework_id": "20140904-220314-16842879-5050-1241-0000", "id": "test-app.b20e940c-d333-11e4-8c08-56847afe9799", "name": "test-app", "resources": { "cpus": 0.1, "disk": 0, "mem": 16, "ports": "[31125-31125]" }, "slave_id": "20150325-175150-16842879-5050-1232-S0", "state": "TASK_KILLED", "statuses": [ { "state": "TASK_RUNNING", "timestamp": 1427317971.43239 }, { "state": "TASK_KILLED", "timestamp": 1427317980.26088 } ] }, { "executor_id": "", "framework_id": "20140904-220314-16842879-5050-1241-0000", "id": "test-app.ae79ad7b-d333-11e4-8c08-56847afe9799", "name": "test-app", "resources": { "cpus": 0.1, "disk": 0, "mem": 16, "ports": "[31084-31084]" }, "slave_id": "20150325-175150-16842879-5050-1232-S0", "state": "TASK_KILLED", "statuses": [ { "state": "TASK_RUNNING", "timestamp": 1427317965.34085 }, { "state": "TASK_KILLED", "timestamp": 1427317980.31041 } ] } ], "failover_timeout": 604800, "hostname": "mesos.vm", "id": "20140904-220314-16842879-5050-1241-0000", "name": "marathon", "offered_resources": { "cpus": 0, "disk": 0, "mem": 0 }, "offers": [], "registered_time": 1427305924.07617, "reregistered_time": 1427305924.07618, "resources": { "cpus": 2.77555756156289E-17, "disk": 0, "mem": 0 }, "role": "*", 
"tasks": [], "unregistered_time": 0, "used_resources": { "cpus": 2.77555756156289E-17, "disk": 0, "mem": 0 }, "user": "root", "webui_url": "http://mesos:8080" } ], "git_sha": "2ae1ba91e64f92ec71d327e10e6ba9e8ad5477e8", "git_tag": "0.21.1", "hostname": "mesos.vm", "id": "20150325-175150-16842879-5050-1232", "killed_tasks": 3, "leader": "master@127.0.1.1:5050", "log_dir": "/var/log/mesos", "lost_tasks": 0, "orphan_tasks": [], "pid": "master@127.0.1.1:5050", "slaves": [ { "attributes": {}, "hostname": "10.141.141.10", "id": "20150325-175150-16842879-5050-1232-S0", "pid": "slave(1)@127.0.1.1:5051", "registered_time": 1427305924.94195, "resources": { "cpus": 2, "disk": 34068, "mem": 1000, "ports": "[31000-32000]" } } ], "staged_tasks": 3, "start_time": 1427305910.8905, "started_tasks": 0, "unregistered_frameworks": [], "version": "0.21.1" } {code} Mesos slave state: {code:json} { "attributes": {}, "build_date": "2015-01-09 02:25:56", "build_time": 1420770356, "build_user": "root", "completed_frameworks": [ { "checkpoint": true, "completed_executors": [ { "completed_tasks": [ { "executor_id": "", "framework_id": "20140904-220314-16842879-5050-1241-0000", "id": "test-app.6c3c20aa-d333-11e4-8c08-56847afe9799", "name": "test-app", "resources": { "cpus": 0.1, "disk": 0, "mem": 16, "ports": "[31087-31087]" }, "slave_id": "20150325-175150-16842879-5050-1232-S0", "state": "TASK_KILLED", "statuses": [ { "state": "TASK_RUNNING", "timestamp": 1427317854.31803 }, { "state": "TASK_KILLED", "timestamp": 1427317980.24426 } ] } ], "container": "353d1d50-76c5-42be-bb30-4761738b906e", "directory": "/tmp/mesos/slaves/20150325-175150-16842879-5050-1232-S0/frameworks/20140904-220314-16842879-5050-1241-0000/executors/test-app.6c3c20aa-d333-11e4-8c08-56847afe9799/runs/353d1d50-76c5-42be-bb30-4761738b906e", "id": "test-app.6c3c20aa-d333-11e4-8c08-56847afe9799", "name": "Command Executor (Task: test-app.6c3c20aa-d333-11e4-8c08-56847afe9799) (Command: sh -c 'sleep 5000')", "queued_tasks": [], 
"resources": { "cpus": 0.1, "disk": 0, "mem": 32 }, "source": "test-app.6c3c20aa-d333-11e4-8c08-56847afe9799", "tasks": [] }, { "completed_tasks": [ { "executor_id": "", "framework_id": "20140904-220314-16842879-5050-1241-0000", "id": "test-app.ae79ad7b-d333-11e4-8c08-56847afe9799", "name": "test-app", "resources": { "cpus": 0.1, "disk": 0, "mem": 16, "ports": "[31084-31084]" }, "slave_id": "20150325-175150-16842879-5050-1232-S0", "state": "TASK_KILLED", "statuses": [ { "state": "TASK_RUNNING", "timestamp": 1427317965.34085 }, { "state": "TASK_KILLED", "timestamp": 1427317980.31041 } ] } ], "container": "a5530e4d-4f66-47a9-88db-9c5b2002b847", "directory": "/tmp/mesos/slaves/20150325-175150-16842879-5050-1232-S0/frameworks/20140904-220314-16842879-5050-1241-0000/executors/test-app.ae79ad7b-d333-11e4-8c08-56847afe9799/runs/a5530e4d-4f66-47a9-88db-9c5b2002b847", "id": "test-app.ae79ad7b-d333-11e4-8c08-56847afe9799", "name": "Command Executor (Task: test-app.ae79ad7b-d333-11e4-8c08-56847afe9799) (Command: sh -c 'sleep 5000')", "queued_tasks": [], "resources": { "cpus": 0.1, "disk": 0, "mem": 32 }, "source": "test-app.ae79ad7b-d333-11e4-8c08-56847afe9799", "tasks": [] }, { "completed_tasks": [ { "executor_id": "", "framework_id": "20140904-220314-16842879-5050-1241-0000", "id": "test-app.b20e940c-d333-11e4-8c08-56847afe9799", "name": "test-app", "resources": { "cpus": 0.1, "disk": 0, "mem": 16, "ports": "[31125-31125]" }, "slave_id": "20150325-175150-16842879-5050-1232-S0", "state": "TASK_KILLED", "statuses": [ { "state": "TASK_RUNNING", "timestamp": 1427317971.43239 }, { "state": "TASK_KILLED", "timestamp": 1427317980.26088 } ] } ], "container": "ee50b560-d9b3-4544-88f1-ad7acda6dbb8", "directory": "/tmp/mesos/slaves/20150325-175150-16842879-5050-1232-S0/frameworks/20140904-220314-16842879-5050-1241-0000/executors/test-app.b20e940c-d333-11e4-8c08-56847afe9799/runs/ee50b560-d9b3-4544-88f1-ad7acda6dbb8", "id": "test-app.b20e940c-d333-11e4-8c08-56847afe9799", "name": 
"Command Executor (Task: test-app.b20e940c-d333-11e4-8c08-56847afe9799) (Command: sh -c 'sleep 5000')", "queued_tasks": [], "resources": { "cpus": 0.1, "disk": 0, "mem": 32 }, "source": "test-app.b20e940c-d333-11e4-8c08-56847afe9799", "tasks": [] } ], "executors": [], "failover_timeout": 604800, "hostname": "mesos.vm", "id": "20140904-220314-16842879-5050-1241-0000", "name": "marathon", "role": "*", "user": "root" } ], "failed_tasks": 0, "finished_tasks": 0, "flags": { "cgroups_enable_cfs": "true", "cgroups_hierarchy": "/sys/fs/cgroup", "cgroups_limit_swap": "false", "cgroups_root": "mesos", "checkpoint": "true", "containerizers": "docker,mesos", "default_role": "*", "disk_watch_interval": "1mins", "docker": "docker", "docker_remove_delay": "6hrs", "docker_sandbox_directory": "/mnt/mesos/sandbox", "docker_stop_timeout": "0ns", "executor_registration_timeout": "1mins", "executor_shutdown_grace_period": "5secs", "frameworks_home": "", "gc_delay": "1weeks", "hadoop_home": "", "help": "false", "hostname": "10.141.141.10", "initialize_driver_logging": "true", "isolation": "cgroups/cpu,cgroups/mem", "launcher_dir": "/usr/libexec/mesos", "log_dir": "/var/log/mesos", "logbufsecs": "0", "logging_level": "INFO", "master": "zk://localhost:2181/mesos", "perf_duration": "10secs", "perf_interval": "1mins", "port": "5051", "quiet": "false", "recover": "reconnect", "recovery_timeout": "15mins", "registration_backoff_factor": "1secs", "resource_monitoring_interval": "1secs", "strict": "true", "switch_user": "true", "version": "false", "work_dir": "/tmp/mesos" }, "frameworks": [], "git_sha": "2ae1ba91e64f92ec71d327e10e6ba9e8ad5477e8", "git_tag": "0.21.1", "hostname": "10.141.141.10", "id": "20150325-175150-16842879-5050-1232-S0", "killed_tasks": 3, "log_dir": "/var/log/mesos", "lost_tasks": 0, "master_hostname": "mesos.vm", "pid": "slave(1)@127.0.1.1:5051", "resources": { "cpus": 2, "disk": 34068, "mem": 1000, "ports": "[31000-32000]" }, "staged_tasks": 6, "start_time": 
1427305910.94885, "started_tasks": 0, "version": "0.21.1" } {code} Marathon apps JSON: {code:json} { "apps": [ { "id": "/test-app", "cmd": "sleep 5000", "args": null, "user": null, "env": {}, "instances": 1, "cpus": 0.1, "mem": 16, "disk": 0, "executor": "", "constraints": [], "uris": [], "storeUrls": [], "ports": [ 10000 ], "requirePorts": false, "backoffSeconds": 1, "backoffFactor": 1.15, "maxLaunchDelaySeconds": 3600, "container": null, "healthChecks": [], "dependencies": [], "upgradeStrategy": { "minimumHealthCapacity": 1, "maximumOverCapacity": 1 }, "labels": {}, "version": "2015-03-25T21:13:08.827Z", "tasksStaged": 0, "tasksRunning": 0, "tasksHealthy": 0, "tasksUnhealthy": 0, "deployments": [ { "id": "e4d9ea33-b01d-41f3-932c-cfd7326d1368" } ] } ] } {code} > Mesos 0.20.1 causes framework starvation on single node clusters when using > Chronos and Marathon > ------------------------------------------------------------------------------------------------ > > Key: MESOS-2546 > URL: https://issues.apache.org/jira/browse/MESOS-2546 > Project: Mesos > Issue Type: Bug > Components: framework > Affects Versions: 0.20.1 > Reporter: Sunil Shah > > Tracking an issue raised by Chronos users that appears to be a regression in > Mesos: https://github.com/mesos/chronos/issues/381#issuecomment-83647539 > 1) Chronos's interval between refusing offers and receiving the next one is > at 0.1 seconds to allow finer grained scheduling of jobs. > 2) On single node clusters running Mesos 0.20.1 with both Chronos and > Marathon installed, Marathon did not receive any offers. On multi-node > clusters, this behaviour was not observed. This behaviour was not observed > when using previous versions of Mesos. > 3) Changing this interval back to the default value (i.e., by not setting it) > fixed this problem. (See > [commit|https://github.com/mesos/chronos/commit/fb1ab1c42207b12c8663457d07c322fc81a8ec2e].) 
> This can be replicated using an installation of playa-mesos and running the > latest Mesosphere packages of both Chronos and Marathon. -- This message was sent by Atlassian JIRA (v6.3.4#6332)