Hi Guys,
I was reading about agent restarts at
http://mesos.apache.org/documentation/latest/agent-recovery/
From what I understood, if I have a task running and we restart the mesos-agent,
I should not lose any running tasks.
This is not the case when the agent is restarted via systemctl (or the service command) on Ubuntu 18.04.
Our framework has checkpointing enabled.
My config:
# systemd unit that runs the Mesos agent as a long-lived service.
[Unit]
Description=Mesos Agent
After=network.target
Wants=network.target

[Service]
# libprocess TLS settings, handed to the agent through its environment.
Environment=LIBPROCESS_SSL_ENABLED=true
Environment=LIBPROCESS_SSL_SUPPORT_DOWNGRADE=false
Environment=LIBPROCESS_SSL_CIPHERS=AES128-SHA:AES256-SHA:DHE-RSA-AES128-SHA:DHE-DSS-AES128-SHA:DHE-RSA-AES256-SHA:DHE-DSS-AES256-SHA
Environment=LIBPROCESS_SSL_KEY_FILE=/etc/ssl/private/server_2048.key
Environment=LIBPROCESS_SSL_CERT_FILE=/etc/ssl/server.crt
Environment=LIBPROCESS_SSL_CA_FILE=/etc/pki/trust/anchors/it4ad.pem

# Agent command line. Every line except the last must end in a backslash so
# that systemd joins all flags into a single ExecStart= value.
ExecStart=/usr/local/sbin/mesos-agent \
--master=<zookeeper> \
--work_dir=/data/mesos/work \
--log_dir=/var/log/mesos \
--executor_registration_timeout=20mins \
--executor_environment_variables=file:///etc/mesos/executor_envs.json \
--resources=file:///etc/mesos/resources.txt \
--image_gc_config=file:///etc/mesos/image-gc-config.json \
--isolation=cgroups/cpu,cgroups/mem,cgroups/devices,filesystem/linux,gpu/nvidia,docker/runtime,namespaces/pid,namespaces/ipc \
--image_providers=docker \
--docker_store_dir=/data/mesos/store/docker \
--gc_delay=3weeks \
--attributes=<attr>

# Valid KillMode values are control-group, mixed, process and none
# (the value is spelled "control-group", not "control-cgroup").
# NOTE(review): with control-group, stopping or restarting the unit signals
# every process still inside this unit's control group, not only mesos-agent.
# Whether executors/tasks are affected presumably depends on whether the agent
# moved them out of this cgroup (its systemd support) - verify with
# `systemd-cgls` while a task is running before changing this setting.
KillMode=control-group

# Always bring the agent back, waiting 20 seconds between attempts.
Restart=always
RestartSec=20

# Lift the file-descriptor and task-count limits; enable CPU/memory accounting.
LimitNOFILE=infinity
CPUAccounting=true
MemoryAccounting=true
TasksMax=infinity

[Install]
WantedBy=multi-user.target
Any tips? Thanks.
Jorge Machado
www.jmachado.me