The attached patches implement a service watchdog based on timers and a
custom SIGRT signal (of which there are 30/32 available to use) and
removes the ping based solution.
In case a child gets stuck in a tevent loop the timer will eventually
kill it (in 30 sec. by default) and the monitor will catch the child has
terminated (via SGICHLD) and restart it. This makes the ping based
infrastructure obsolet so the monitor now stops setting it up.
In order to avoid changes to the dbus interface the ping method is still
in places for responders/providers, but simply never invoked.

Resolves:
https://fedorahosted.org/sssd/ticket/2921

-- 
Simo Sorce * Red Hat, Inc * New York
From 8820926905b9bfb188b6be6766e932be49aa3e0b Mon Sep 17 00:00:00 2001
From: Simo Sorce <s...@redhat.com>
Date: Wed, 13 Jan 2016 11:51:09 -0500
Subject: [PATCH 3/3] Monitor: Remove ping infrastructure

Now thast services use an internal watchdog we do not need pings anymore,
this will cut down the chatter and allow more flexible process management,
for example socket activation and exit-on-idle.

Resolves:
https://fedorahosted.org/sssd/ticket/2921
---
 src/config/SSSDConfig/__init__.py.in |   2 +-
 src/monitor/monitor.c                | 231 +----------------------------------
 src/monitor/monitor_iface.xml        |   2 +-
 3 files changed, 8 insertions(+), 227 deletions(-)

diff --git a/src/config/SSSDConfig/__init__.py.in b/src/config/SSSDConfig/__init__.py.in
index fe2971d993ccf2975bbddbe6c74572ab9f5d4e51..05b6f529479a0787f9fb685ac3897deabecb9e88 100644
--- a/src/config/SSSDConfig/__init__.py.in
+++ b/src/config/SSSDConfig/__init__.py.in
@@ -44,7 +44,7 @@ option_strings = {
     'debug_timestamps' : _('Include timestamps in debug logs'),
     'debug_microseconds' : _('Include microseconds in timestamps in debug logs'),
     'debug_to_files' : _('Write debug messages to logfiles'),
-    'timeout' : _('Ping timeout before restarting service'),
+    'timeout' : _('Watchdog timeout before restarting service'),
     'force_timeout' : _('Timeout between three failed ping checks and forcibly killing the service'),
     'command' : _('Command to start service'),
     'reconnection_retries' : _('Number of times to attempt connection to Data Providers'),
diff --git a/src/monitor/monitor.c b/src/monitor/monitor.c
index ac3af282d82d79a046fe0a9227a3cd14946ac595..6a0707f0b8f5886d50080074f4faccd10d89af80 100644
--- a/src/monitor/monitor.c
+++ b/src/monitor/monitor.c
@@ -55,9 +55,6 @@
 #include <keyutils.h>
 #endif
 
-/* ping time cannot be less then once every few seconds or the
- * monitor will get crazy hammering children with messages */
-#define MONITOR_DEF_PING_TIME 10
 /* terminate the child after this interval by default if it
  * doesn't shutdown on receiving SIGTERM */
 #define MONITOR_DEF_FORCE_TIME 60
@@ -117,7 +114,6 @@ struct mt_svc {
     pid_t pid;
 
     char *diag_cmd;
-    int ping_time;
     int kill_time;
 
     struct tevent_timer *kill_timer;
@@ -126,13 +122,10 @@ struct mt_svc {
 
     int restarts;
     time_t last_restart;
-    int failed_pongs;
     DBusPendingCall *pending;
 
     int debug_level;
 
-    struct tevent_timer *ping_ev;
-
     struct sss_child_ctx *child_ctx;
 };
 
@@ -183,11 +176,8 @@ static int start_service(struct mt_svc *mt_svc);
 
 static int monitor_service_init(struct sbus_connection *conn, void *data);
 
-static int service_send_ping(struct mt_svc *svc);
 static int service_signal_reset_offline(struct mt_svc *svc);
-static void ping_check(DBusPendingCall *pending, void *data);
 
-static void set_tasks_checker(struct mt_svc *srv);
 static int monitor_kill_service (struct mt_svc *svc);
 
 static int get_service_config(struct mt_ctx *ctx, const char *name,
@@ -337,7 +327,7 @@ static int svc_destructor(void *mem)
         DLIST_REMOVE(svc->mt_ctx->svc_list, svc);
     }
 
-    /* Cancel any pending pings */
+    /* Cancel any pending calls */
     if (svc->pending) {
         dbus_pending_call_cancel(svc->pending);
     }
@@ -700,67 +690,6 @@ static int monitor_dbus_init(struct mt_ctx *ctx)
     return ret;
 }
 
-static void tasks_check_handler(struct tevent_context *ev,
-                                struct tevent_timer *te,
-                                struct timeval t, void *ptr)
-{
-    struct mt_svc *svc = talloc_get_type(ptr, struct mt_svc);
-    int ret;
-
-    ret = service_send_ping(svc);
-    switch (ret) {
-    case EOK:
-        /* all fine */
-        break;
-
-    case ENXIO:
-        DEBUG(SSSDBG_CRIT_FAILURE,
-              "Child (%s) not responding! (yet)\n", svc->name);
-        break;
-
-    default:
-        /* TODO: should we tear it down ? */
-        DEBUG(SSSDBG_CRIT_FAILURE,
-              "Sending a message to service (%s) failed!!\n", svc->name);
-        break;
-    }
-
-    if (svc->failed_pongs >= 3) {
-        /* too long since we last heard of this process */
-        DEBUG(SSSDBG_CRIT_FAILURE,
-              "Killing service [%s], not responding to pings!\n",
-               svc->name);
-        sss_log(SSS_LOG_ERR,
-                "Killing service [%s], not responding to pings!\n",
-                svc->name);
-
-        /* Kill the service. The SIGCHLD handler will restart it */
-        monitor_kill_service(svc);
-        return;
-    }
-
-    /* all fine, set up the task checker again */
-    set_tasks_checker(svc);
-}
-
-static void set_tasks_checker(struct mt_svc *svc)
-{
-    struct tevent_timer *te = NULL;
-    struct timeval tv;
-
-    gettimeofday(&tv, NULL);
-    tv.tv_sec += svc->ping_time;
-    tv.tv_usec = 0;
-    te = tevent_add_timer(svc->mt_ctx->ev, svc, tv, tasks_check_handler, svc);
-    if (te == NULL) {
-        DEBUG(SSSDBG_FATAL_FAILURE,
-              "failed to add event, monitor offline for [%s]!\n",
-                  svc->name);
-        /* FIXME: shutdown ? */
-    }
-    svc->ping_ev = te;
-}
-
 static void monitor_restart_service(struct mt_svc *svc);
 static void mt_svc_sigkill(struct tevent_context *ev,
                            struct tevent_timer *te,
@@ -1214,29 +1143,11 @@ static int get_monitor_config(struct mt_ctx *ctx)
     return EOK;
 }
 
-static errno_t get_ping_config(struct mt_ctx *ctx, const char *path,
+static errno_t get_kill_config(struct mt_ctx *ctx, const char *path,
                                struct mt_svc *svc)
 {
     errno_t ret;
 
-    ret = confdb_get_int(ctx->cdb, path,
-                         CONFDB_DOMAIN_TIMEOUT,
-                         MONITOR_DEF_PING_TIME, &svc->ping_time);
-    if (ret != EOK) {
-        DEBUG(SSSDBG_CRIT_FAILURE,
-               "Failed to get ping timeout for '%s'\n", svc->name);
-        return ret;
-    }
-
-    /* 'timeout = 0' should be translated to the default */
-    if (svc->ping_time == 0) {
-        svc->ping_time = MONITOR_DEF_PING_TIME;
-    }
-
-    DEBUG(SSSDBG_CONF_SETTINGS,
-          "Time between service pings for [%s]: [%d]\n",
-           svc->name, svc->ping_time);
-
     ret = confdb_get_string(ctx->cdb, svc, path,
                             CONFDB_MONITOR_PRE_KILL_CMD,
                             NULL, &svc->diag_cmd);
@@ -1407,10 +1318,10 @@ static int get_service_config(struct mt_ctx *ctx, const char *name,
         }
     }
 
-    ret = get_ping_config(ctx, path, svc);
+    ret = get_kill_config(ctx, path, svc);
     if (ret != EOK) {
         DEBUG(SSSDBG_CRIT_FAILURE,
-              "Failed to get ping timeouts for %s\n", svc->name);
+              "Failed to get kill timeouts for %s\n", svc->name);
         talloc_free(svc);
         return ret;
     }
@@ -1502,10 +1413,10 @@ static int get_provider_config(struct mt_ctx *ctx, const char *name,
         return ret;
     }
 
-    ret = get_ping_config(ctx, path, svc);
+    ret = get_kill_config(ctx, path, svc);
     if (ret != EOK) {
         DEBUG(SSSDBG_CRIT_FAILURE,
-              "Failed to get ping timeouts for %s\n", svc->name);
+              "Failed to get kill timeouts for %s\n", svc->name);
         talloc_free(svc);
         return ret;
     }
@@ -2699,134 +2610,6 @@ static int monitor_service_init(struct sbus_connection *conn, void *data)
                                     MON_SRV_PATH, mini);
 }
 
-/* service_send_ping
- * this function send a dbus ping to a service.
- * It returns EOK if all is fine or ENXIO if the connection is
- * not available (either not yet set up or teared down).
- * Returns e generic error in other cases.
- */
-static int service_send_ping(struct mt_svc *svc)
-{
-    DBusMessage *msg;
-    int ret;
-
-    if (!svc->conn) {
-        DEBUG(SSSDBG_TRACE_INTERNAL, "Service not yet initialized\n");
-        return ENXIO;
-    }
-
-    DEBUG(SSSDBG_TRACE_INTERNAL, "Pinging %s\n", svc->name);
-
-    /*
-     * Set up identity request
-     * This should be a well-known path and method
-     * for all services
-     */
-    msg = dbus_message_new_method_call(NULL,
-                                       MONITOR_PATH,
-                                       MON_CLI_IFACE,
-                                       MON_CLI_IFACE_PING);
-    if (!msg) {
-        DEBUG(SSSDBG_FATAL_FAILURE,"Out of memory?!\n");
-        talloc_zfree(svc->conn);
-        return ENOMEM;
-    }
-
-    ret = sbus_conn_send(svc->conn, msg,
-                         svc->ping_time * 1000, /* milliseconds */
-                         ping_check, svc, &svc->pending);
-    dbus_message_unref(msg);
-    return ret;
-}
-
-static void ping_check(DBusPendingCall *pending, void *data)
-{
-    struct mt_svc *svc;
-    DBusMessage *reply;
-    const char *dbus_error_name;
-    size_t len;
-    int type;
-
-    svc = talloc_get_type(data, struct mt_svc);
-    if (!svc) {
-        /* The connection probably went down before the callback fired.
-         * Not much we can do. */
-        DEBUG(SSSDBG_CRIT_FAILURE, "Invalid service pointer.\n");
-        return;
-    }
-    svc->pending = NULL;
-
-    reply = dbus_pending_call_steal_reply(pending);
-    if (!reply) {
-        /* reply should never be null. This function shouldn't be called
-         * until reply is valid or timeout has occurred. If reply is NULL
-         * here, something is seriously wrong and we should bail out.
-         */
-        DEBUG(SSSDBG_FATAL_FAILURE,
-              "A reply callback was called but no reply was received"
-                  " and no timeout occurred\n");
-
-        /* Destroy this connection */
-        sbus_disconnect(svc->conn);
-        goto done;
-    }
-
-    type = dbus_message_get_type(reply);
-    switch (type) {
-    case DBUS_MESSAGE_TYPE_METHOD_RETURN:
-        /* ok peer replied,
-         * make sure we reset the failure counter in the service structure */
-
-        DEBUG(SSSDBG_TRACE_INTERNAL,
-              "Service %s replied to ping\n", svc->name);
-
-        svc->failed_pongs = 0;
-        break;
-
-    case DBUS_MESSAGE_TYPE_ERROR:
-
-        dbus_error_name = dbus_message_get_error_name(reply);
-        if (!dbus_error_name) {
-            dbus_error_name = "<UNKNOWN>";
-        }
-
-        len = strlen(DBUS_ERROR_NO_REPLY);
-
-        /* Increase failed pong count */
-        if (strnlen(dbus_error_name, len + 1) == len
-                && strncmp(dbus_error_name, DBUS_ERROR_NO_REPLY, len) == 0) {
-            DEBUG(SSSDBG_CRIT_FAILURE,
-                  "A service PING timed out on [%s]. "
-                   "Attempt [%d]\n",
-                   svc->name, svc->failed_pongs);
-            svc->failed_pongs++;
-
-            if (debug_level & SSSDBG_TRACE_LIBS) {
-                svc_run_diag_cmd(svc);
-            }
-            break;
-        }
-
-        DEBUG(SSSDBG_FATAL_FAILURE,
-              "A service PING returned an error [%s], closing connection.\n",
-               dbus_error_name);
-        /* Falling through to default intentionally*/
-    default:
-        /*
-         * Timeout or other error occurred or something
-         * unexpected happened.
-         * It doesn't matter which, because either way we
-         * know that this connection isn't trustworthy.
-         * We'll destroy it now.
-         */
-        sbus_disconnect(svc->conn);
-    }
-
-done:
-    dbus_pending_call_unref(pending);
-    dbus_message_unref(reply);
-}
-
 static void service_startup_handler(struct tevent_context *ev,
                                     struct tevent_timer *te,
                                     struct timeval t, void *ptr);
@@ -2881,7 +2664,6 @@ static void service_startup_handler(struct tevent_context *ev,
 
         /* Parent */
         mt_svc->mt_ctx->check_children = true;
-        mt_svc->failed_pongs = 0;
 
         /* Handle process exit */
         ret = sss_child_register(mt_svc,
@@ -2899,7 +2681,6 @@ static void service_startup_handler(struct tevent_context *ev,
         }
 
         DLIST_ADD(mt_svc->mt_ctx->svc_list, mt_svc);
-        set_tasks_checker(mt_svc);
 
         return;
     }
diff --git a/src/monitor/monitor_iface.xml b/src/monitor/monitor_iface.xml
index 3d0e67f7147f4f824b6cb367564c756a21256a34..5d6b16f320477e7b8c0df1e15d0c68087019622e 100644
--- a/src/monitor/monitor_iface.xml
+++ b/src/monitor/monitor_iface.xml
@@ -15,7 +15,7 @@
 
     <interface name="org.freedesktop.sssd.service">
         <annotation value="mon_cli_iface" name="org.freedesktop.DBus.GLib.CSymbol"/>
-        <method name="ping">
+        <method name="ping"> <!-- deprecated -->
             <!-- no arguments, raw handler -->
             <annotation name="org.freedesktop.sssd.RawHandler" value="true"/>
         </method>
-- 
2.5.0

From 6cf163011b17712e74d5d404c3c7f3af8ba5f5c6 Mon Sep 17 00:00:00 2001
From: Simo Sorce <s...@redhat.com>
Date: Tue, 12 Jan 2016 20:13:28 -0500
Subject: [PATCH 2/3] Server: Enable Watchdog in all daemons

This allows the services to self monitor.

Related:
https://fedorahosted.org/sssd/ticket/2921
---
 src/util/server.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/src/util/server.c b/src/util/server.c
index 67a25955783c30dc03f3d6d9193c8547c625f134..cb84d8a4fc4040b9bca352d8a8f78be48504f2dd 100644
--- a/src/util/server.c
+++ b/src/util/server.c
@@ -459,6 +459,7 @@ int server_setup(const char *name, int flags,
     struct tevent_signal *tes;
     struct logrotate_ctx *lctx;
     char *locale;
+    int watchdog_interval;
 
     ret = chown_debug_file(NULL, uid, gid);
     if (ret != EOK) {
@@ -643,6 +644,16 @@ int server_setup(const char *name, int flags,
         }
     }
 
+    /* Setup the internal watchdog */
+    ret = confdb_get_int(ctx->confdb_ctx, conf_entry,
+                         CONFDB_DOMAIN_TIMEOUT,
+                         0, &watchdog_interval);
+    ret = setup_watchdog(ctx->event_ctx, watchdog_interval);
+    if (ret != EOK) {
+        DEBUG(SSSDBG_CRIT_FAILURE, "Watchdog setup failed.\n");
+        return ret;
+    }
+
     sss_log(SSS_LOG_INFO, "Starting up");
 
     DEBUG(SSSDBG_TRACE_FUNC, "CONFDB: %s\n", conf_db);
-- 
2.5.0

From 2a94645726f5fa8e1dfb4f2d33d63f2e97529515 Mon Sep 17 00:00:00 2001
From: Simo Sorce <s...@redhat.com>
Date: Tue, 12 Jan 2016 20:07:59 -0500
Subject: [PATCH 1/3] Util: Add watchdog helper

The watchdog uses a kernel timer to issue a signal to the process.
It checks if the ticker is not being reset by the main event loop, which
would indicate that the process got stuck.
At the same time it sets a tevent timer to clear the watchdog ticker, so
that the watchdog handler is kept happy.

If the watchdog detects that the timer event failed to reset the watchdog for
three times in a row then the process is killed.
Normally the monitor will detect the child terminated and will rescheduled it.

Related:
https://fedorahosted.org/sssd/ticket/2921
---
 Makefile.am              |   1 +
 src/util/util.h          |   4 ++
 src/util/util_watchdog.c | 142 +++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 147 insertions(+)
 create mode 100644 src/util/util_watchdog.c

diff --git a/Makefile.am b/Makefile.am
index a9d3f25d3775f6ac824b9f9b85dd0412417c33d3..407053b1a6dcd0255be76ae7f9252a671b965ea2 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -858,6 +858,7 @@ libsss_util_la_SOURCES = \
     src/util/well_known_sids.c \
     src/util/string_utils.c \
     src/util/become_user.c \
+    src/util/util_watchdog.c \
     $(NULL)
 libsss_util_la_CFLAGS = \
     $(AM_CFLAGS) \
diff --git a/src/util/util.h b/src/util/util.h
index e1245bb0fbab742dd58522f1892c66d08a45b59c..00a06635e2449d86006b65f05a11b0a48628ca3d 100644
--- a/src/util/util.h
+++ b/src/util/util.h
@@ -698,4 +698,8 @@ int sss_unique_file(TALLOC_CTX *owner,
  */
 int sss_unique_filename(TALLOC_CTX *owner, char *path_tmpl);
 
+/* from util_watchdog.c */
+int setup_watchdog(struct tevent_context *ev, int interval);
+void teardown_watchdog(void);
+
 #endif /* __SSSD_UTIL_H__ */
diff --git a/src/util/util_watchdog.c b/src/util/util_watchdog.c
new file mode 100644
index 0000000000000000000000000000000000000000..64faaaf03213319d3127fde3946124ee26c7794a
--- /dev/null
+++ b/src/util/util_watchdog.c
@@ -0,0 +1,142 @@
+/*
+   SSSD
+
+   Timer Watchdog routines
+
+   Copyright (C) Simo Sorce             2016
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "util/util.h"
+
+#define WATCHDOG_DEF_INTERVAL 10
+
+/* this is intentionally a global variable */
+struct watchdog_ctx {
+    timer_t timerid;
+    struct timeval interval;
+    struct tevent_timer *te;
+    volatile int ticks;
+} watchdog_ctx;
+
+/* the watchdog is purposefully *not* handled by the tevent
+ * signal handler as it is meant to check if the daemon is
+ * still processing the event queue itself. A stuck process
+ * may not handle the event queue at all and thus not handle
+ * signals either */
+static void watchdog_handler(int sig)
+{
+    /* if 3 ticks passed by kills itself */
+
+    if (__sync_add_and_fetch(&watchdog_ctx.ticks, 1) > 3) {
+        DEBUG(SSSDBG_FATAL_FAILURE,
+              "Watchdog timer overflow, killing process!\n");
+        orderly_shutdown(1);
+    }
+}
+
+static void watchdog_reset(void)
+{
+    __sync_and_and_fetch(&watchdog_ctx.ticks, 0);
+}
+
+static void watchdog_event_handler(struct tevent_context *ev,
+                                   struct tevent_timer *te,
+                                   struct timeval current_time,
+                                   void *private_data)
+{
+    /* first thing reset the watchdog ticks */
+    watchdog_reset();
+
+    /* then set a new watchodg event */
+    watchdog_ctx.te = tevent_add_timer(ev, ev,
+        tevent_timeval_current_ofs(watchdog_ctx.interval.tv_sec, 0),
+        watchdog_event_handler, NULL);
+    /* if the function fails the watchdog will kill the
+     * process soon enough, so we just warn */
+    if (!watchdog_ctx.te) {
+        DEBUG(SSSDBG_FATAL_FAILURE,
+              "Failed to create a watchdog timer event!\n");
+    }
+}
+
+int setup_watchdog(struct tevent_context *ev, int interval)
+{
+    struct sigevent sev;
+    struct itimerspec its;
+    int signum = SIGRTMIN;
+    int ret;
+
+    CatchSignal(signum, watchdog_handler);
+
+    sev.sigev_notify = SIGEV_SIGNAL;
+    sev.sigev_signo = signum;
+    sev.sigev_value.sival_ptr = &watchdog_ctx.timerid;
+    errno = 0;
+    ret = timer_create(CLOCK_MONOTONIC, &sev, &watchdog_ctx.timerid);
+    if (ret == -1) {
+        ret = errno;
+        DEBUG(SSSDBG_FATAL_FAILURE,
+              "Failed to create watchdog timer (%d) [%s]\n",
+              ret, strerror(ret));
+        return ret;
+    }
+
+    if (interval == 0) {
+        interval = WATCHDOG_DEF_INTERVAL;
+    }
+    watchdog_ctx.interval.tv_sec = interval;
+    watchdog_ctx.interval.tv_usec = 0;
+
+    /* Start the timer */
+    /* we give 1 second head start to the watchdog event */
+    its.it_value.tv_sec = interval + 1;
+    its.it_value.tv_nsec = 0;
+    its.it_interval.tv_sec = interval;
+    its.it_interval.tv_nsec = 0;
+    errno = 0;
+    ret = timer_settime(watchdog_ctx.timerid, 0, &its, NULL);
+    if (ret == -1) {
+        ret = errno;
+        DEBUG(SSSDBG_FATAL_FAILURE,
+              "Failed to create watchdog timer (%d) [%s]\n",
+              ret, strerror(ret));
+        return ret;
+    }
+
+    /* Add the watchdog event and make it fire as fast as the timer */
+    watchdog_event_handler(ev, NULL, tevent_timeval_zero(), NULL);
+
+    return EOK;
+}
+
+void teardown_watchdog(void)
+{
+    int ret;
+
+    /* Disarm the timer */
+    errno = 0;
+    ret = timer_delete(watchdog_ctx.timerid);
+    if (ret == -1) {
+        ret = errno;
+        DEBUG(SSSDBG_FATAL_FAILURE,
+              "Failed to destroy watchdog timer (%d) [%s]\n",
+             ret, strerror(ret));
+    }
+
+    /* and kill the watchdog event */
+    talloc_free(watchdog_ctx.te);
+}
+
-- 
2.5.0

_______________________________________________
sssd-devel mailing list
sssd-devel@lists.fedorahosted.org
https://lists.fedorahosted.org/admin/lists/sssd-devel@lists.fedorahosted.org

Reply via email to