Added inspect retries to the docker containerizer in `update` method. This patch fixes the bug when a terminal status update is never sent after termination of the docker executor, when the Docker daemon hangs for `inspect` command. A terminal status update is postponed until containerizer `update` completes that uses `inspect` command to get a PID of the docker container. To address the issue, we retry `inspect` in the loop.
Review: https://reviews.apache.org/r/65868/ Project: http://git-wip-us.apache.org/repos/asf/mesos/repo Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/76c38f9d Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/76c38f9d Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/76c38f9d Branch: refs/heads/master Commit: 76c38f9d03ee6854e6bcd00a959d697472e0ea58 Parents: 84c3b4c Author: Andrei Budnik <abud...@mesosphere.com> Authored: Fri Mar 2 15:39:09 2018 -0800 Committer: Gilbert Song <songzihao1...@gmail.com> Committed: Fri Mar 2 15:40:31 2018 -0800 ---------------------------------------------------------------------- src/slave/constants.hpp | 3 +++ src/slave/containerizer/docker.cpp | 42 ++++++++++++++++++++++++++++++++- 2 files changed, 44 insertions(+), 1 deletion(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/mesos/blob/76c38f9d/src/slave/constants.hpp ---------------------------------------------------------------------- diff --git a/src/slave/constants.hpp b/src/slave/constants.hpp index 030fb05..b79c084 100644 --- a/src/slave/constants.hpp +++ b/src/slave/constants.hpp @@ -135,6 +135,9 @@ constexpr Duration DOCKER_REMOVE_DELAY = Hours(6); // container. constexpr Duration DOCKER_INSPECT_DELAY = Seconds(1); +// Default duration to wait for `inspect` command completion. +constexpr Duration DOCKER_INSPECT_TIMEOUT = Seconds(5); + // Default maximum number of docker inspect calls docker ps will invoke // in parallel to prevent hitting system's open file descriptor limit. constexpr size_t DOCKER_PS_MAX_INSPECT_CALLS = 100; http://git-wip-us.apache.org/repos/asf/mesos/blob/76c38f9d/src/slave/containerizer/docker.cpp ---------------------------------------------------------------------- diff --git a/src/slave/containerizer/docker.cpp b/src/slave/containerizer/docker.cpp index 1f4eeb4..31d64a7 100644 --- a/src/slave/containerizer/docker.cpp +++ b/src/slave/containerizer/docker.cpp @@ -1711,7 +1711,47 @@ Future<Nothing> DockerContainerizerProcess::update( return __update(containerId, _resources, container->pid.get()); } - return docker->inspect(containers_.at(containerId)->containerName) + string containerName = containers_.at(containerId)->containerName; + + // Since the Docker daemon might hang, we have to retry the inspect command. + // + // NOTE: This code is duplicated from the built-in docker executor, but + // the retry interval is not passed to `inspect`, because the container might + // be terminated. + // TODO(abudnik): Consider using a class helper for retrying docker commands. + auto inspectLoop = loop( + self(), + [=]() { + return await( + docker->inspect(containerName) + .after( + slave::DOCKER_INSPECT_TIMEOUT, + [=](Future<Docker::Container> future) { + LOG(WARNING) << "Docker inspect timed out after " + << slave::DOCKER_INSPECT_TIMEOUT + << " for container " + << "'" << containerName << "'"; + + // We need to clean up the hanging Docker CLI process. + // Discarding the inspect future triggers a callback in + // the Docker library that kills the subprocess and + // transitions the future. + future.discard(); + return future; + })); + }, + [](const Future<Docker::Container>& future) + -> Future<ControlFlow<Docker::Container>> { + if (future.isReady()) { + return Break(future.get()); + } + if (future.isFailed()) { + return Failure(future.failure()); + } + return Continue(); + }); + + return inspectLoop .then(defer(self(), &Self::_update, containerId, _resources, lambda::_1)); #else return Nothing();