CVSROOT:        /cvs/cluster
Module name:    cluster
Branch:         RHEL5
Changes by:     [EMAIL PROTECTED]       2007-11-26 21:46:27

Modified files:
        rgmanager      : ChangeLog 
        rgmanager/include: reslist.h 
        rgmanager/src/daemons: Makefile fo_domain.c groups.c main.c 
                               reslist.c resrules.c restree.c rg_state.c 
                               test.c 
        rgmanager/src/resources: service.sh vm.sh 
Added files:
        rgmanager/include: restart_counter.h 
        rgmanager/src/daemons: restart_counter.c 

Log message:
        Implement restart counters per #247139

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/ChangeLog.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.31.2.28&r2=1.31.2.29
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/include/restart_counter.h.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=NONE&r2=1.1.2.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/include/reslist.h.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.15.2.6&r2=1.15.2.7
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/restart_counter.c.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=NONE&r2=1.1.2.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/Makefile.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.14.2.3&r2=1.14.2.4
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/fo_domain.c.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.11&r2=1.11.2.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/groups.c.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.25.2.12&r2=1.25.2.13
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/main.c.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.34.2.9&r2=1.34.2.10
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/reslist.c.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.14.2.4&r2=1.14.2.5
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/resrules.c.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.16.2.7&r2=1.16.2.8
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/restree.c.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.23.2.12&r2=1.23.2.13
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/rg_state.c.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.24.2.13&r2=1.24.2.14
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/test.c.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.6.2.5&r2=1.6.2.6
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/resources/service.sh.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.7.2.6&r2=1.7.2.7
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/resources/vm.sh.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.1.2.8&r2=1.1.2.9

--- cluster/rgmanager/ChangeLog 2007/11/26 21:37:17     1.31.2.28
+++ cluster/rgmanager/ChangeLog 2007/11/26 21:46:26     1.31.2.29
@@ -1,3 +1,21 @@
+2007-11-26 Lon Hohberger <lhh at redhat.com>
+       * include/reslist.h: Add restart counters to resource node structure
+       (intended for top-level resources, i.e. services, vms...)
+       * include/restart_counter.h: Add header file for restart counter
+       * src/daemons/Makefile: Fix build to include restart counters
+       * src/daemons/restart_counter.c: Implement restart counters #247139
+       * src/daemons/fo_domain.c, groups.c, restart_counter.c, resrules.c,
+       restree.c, test.c: Glue for restart counters.
+       * src/daemons/reslist.c: Glue for restart counters.  Make expand_time
+       parser more robust to allow things like '1h30m' as a time value.
+       * src/daemons/main.c: Mark quorum disk offline in the correct
+       place to avoid extraneous log messages
+       * src/daemons/rg_state.c: Allow marking service as stopped if
+       stuck in recover state.  Make service which failed to start
+       go to stopped state.  Glue for restart counters.
+       * src/resources/service.sh, vm.sh: Add parameters for restart
+       counters #247139
+
 2007-11-14 Lon Hohberger <lhh at redhat.com>
        * src/utils/clulog.c: Make clulog honor rgmanager log levels
        (#289501)
--- cluster/rgmanager/include/reslist.h 2007/08/02 14:46:51     1.15.2.6
+++ cluster/rgmanager/include/reslist.h 2007/11/26 21:46:26     1.15.2.7
@@ -126,6 +126,7 @@
        struct _rg_node *rn_child, *rn_parent;
        resource_t      *rn_resource;
        resource_act_t  *rn_actions;
+       restart_counter_t rn_restart_counter;
        int     rn_state; /* State of this instance of rn_resource */
        int     rn_flags;
        int     rn_last_status;
--- cluster/rgmanager/src/daemons/Makefile      2007/07/24 13:53:08     1.14.2.3
+++ cluster/rgmanager/src/daemons/Makefile      2007/11/26 21:46:27     1.14.2.4
@@ -38,7 +38,8 @@
 clurgmgrd: rg_thread.o rg_locks.o main.o groups.o  \
                rg_queue.o rg_forward.o reslist.o \
                resrules.o restree.o fo_domain.o nodeevent.o \
-               rg_event.o watchdog.o rg_state.o ../clulib/libclulib.a
+               rg_event.o watchdog.o rg_state.o \
+               restart_counter.o ../clulib/libclulib.a
        $(CC) -o $@ $^ $(INCLUDE) $(CFLAGS) $(LDFLAGS) -lccs -lcman -lpthread -ldlm
 
 #
@@ -56,7 +57,8 @@
 # packages should run 'make check' as part of the build process.
 #
 rg_test: rg_locks-noccs.o test-noccs.o reslist-noccs.o \
-               resrules-noccs.o restree-noccs.o fo_domain-noccs.o
+               resrules-noccs.o restree-noccs.o fo_domain-noccs.o \
+               restart_counter.o 
        $(CC) -o $@ $^ $(INCLUDE) $(CFLAGS) -llalloc $(LDFLAGS) -lccs -lcman
 
 clurmtabd: clurmtabd.o clurmtabd_lib.o
--- cluster/rgmanager/src/daemons/fo_domain.c   2006/09/27 16:28:41     1.11
+++ cluster/rgmanager/src/daemons/fo_domain.c   2007/11/26 21:46:27     1.11.2.1
@@ -27,6 +27,7 @@
 #include <list.h>
 #include <clulog.h>
 #include <resgroup.h>
+#include <restart_counter.h>
 #include <reslist.h>
 #include <ccs.h>
 #include <pthread.h>
--- cluster/rgmanager/src/daemons/groups.c      2007/08/02 14:46:51     1.25.2.12
+++ cluster/rgmanager/src/daemons/groups.c      2007/11/26 21:46:27     1.25.2.13
@@ -20,6 +20,7 @@
 //#define DEBUG
 #include <platform.h>
 #include <resgroup.h>
+#include <restart_counter.h>
 #include <reslist.h>
 #include <vf.h>
 #include <message.h>
@@ -178,6 +179,29 @@
 }
 
 
+resource_node_t *
+node_by_ref(resource_node_t **tree, char *name)
+{
+       resource_t *res;
+       resource_node_t *node, *ret = NULL;
+       char rgname[64];
+       int x;
+
+       list_for(&_tree, node, x) {
+
+               res = node->rn_resource;
+               res_build_name(rgname, sizeof(rgname), res);
+
+               if (!strcasecmp(name, rgname)) {
+                       ret = node;
+                       break;
+               }
+       }
+
+       return ret;
+}
+
+
 int
 count_resource_groups_local(cman_node_t *mp)
 {
@@ -1583,6 +1607,28 @@
 }
 
 
+int
+check_restart(char *rg_name)
+{
+       resource_node_t *node;
+       int ret = 1;
+
+       pthread_rwlock_rdlock(&resource_lock);
+       node = node_by_ref(&_tree, rg_name);
+       if (node) {
+               ret = restart_add(node->rn_restart_counter);
+               if (ret) {
+                       /* Clear it out - caller is about 
+                          to relocate the service anyway */
+                       restart_clear(node->rn_restart_counter);
+               }
+       }
+       pthread_rwlock_unlock(&resource_lock);
+
+       return ret;
+}
+
+
 void
 kill_resource_groups(void)
 {
--- cluster/rgmanager/src/daemons/main.c        2007/08/21 16:39:02     1.34.2.9
+++ cluster/rgmanager/src/daemons/main.c        2007/11/26 21:46:27     1.34.2.10
@@ -165,6 +165,7 @@
 
        old_membership = member_list();
        new_ml = get_member_list(h);
+       memb_mark_down(new_ml, 0);
 
        for (x = 0; x < new_ml->cml_count; x++) {
 
@@ -181,19 +182,25 @@
                        quorate = cman_is_listening(h,
                                        new_ml->cml_members[x].cn_nodeid,
                                        port);
+
                        if (quorate == 0) {
                                clulog(LOG_DEBUG, "Node %d is not listening\n",
                                        new_ml->cml_members[x].cn_nodeid);
                                new_ml->cml_members[x].cn_member = 0;
                        } else if (quorate < 0) {
+                               if (errno == ENOTCONN) {
+                                       new_ml->cml_members[x].cn_member = 0;
+                                       break;
+                               }
                                perror("cman_is_listening");
                                usleep(50000);
                                continue;
                        }
-
 #ifdef DEBUG
-                       printf("Node %d IS listening\n",
-                              new_ml->cml_members[x].cn_nodeid);
+                       else {
+                               printf("Node %d IS listening\n",
+                                      new_ml->cml_members[x].cn_nodeid);
+                       }
 #endif
                        break;
                } while(1);
@@ -201,7 +208,6 @@
 
        cman_finish(h);
        member_list_update(new_ml);
-       member_set_state(0, 0);         /* Mark qdisk as dead */
 
        /*
         * Handle nodes lost.  Do our local node event first.
--- cluster/rgmanager/src/daemons/reslist.c     2007/07/31 17:54:54     1.14.2.4
+++ cluster/rgmanager/src/daemons/reslist.c     2007/11/26 21:46:27     1.14.2.5
@@ -26,6 +26,7 @@
 #include <sys/types.h>
 #include <sys/stat.h>
 #include <list.h>
+#include <restart_counter.h>
 #include <reslist.h>
 #include <pthread.h>
 #ifndef NO_CCS
--- cluster/rgmanager/src/daemons/resrules.c    2007/07/31 17:54:54     1.16.2.7
+++ cluster/rgmanager/src/daemons/resrules.c    2007/11/26 21:46:27     1.16.2.8
@@ -27,6 +27,7 @@
 #include <sys/types.h>
 #include <sys/stat.h>
 #include <list.h>
+#include <restart_counter.h>
 #include <reslist.h>
 #include <pthread.h>
 #include <dirent.h>
@@ -218,43 +219,70 @@
 
 
 int
-expand_time(char *val)
+expand_time (char *val)
 {
-       int l = strlen(val);
-       char c = val[l - 1];
-       int ret = atoi(val);
+       int curval, len;
+       int ret = 0;
+       char *start = val, ival[16];
 
-       if (ret <= 0)
-               return 0;
+       if (!val)
+               return (time_t)0;
+
+       while (start[0]) {
+
+               len = 0;
+               curval = 0;
+               memset(ival, 0, sizeof(ival));
+
+               while (isdigit(start[len])) {
+                       ival[len] = start[len];
+                       len++;
+               }
+
+               if (len) {
+                       curval = atoi(ival);
+               } else {
+                       len = 1;
+               }
 
-       if ((c >= '0') && (c <= '9'))
-               return ret;
+               switch(start[len]) {
+               case 0:
+               case 'S':
+               case 's':
+                       break;
+               case 'M':
+               case 'm':
+                       curval *= 60;
+                       break;
+               case 'h':
+               case 'H':
+                       curval *= 3600;
+                       break;
+               case 'd':
+               case 'D':
+                       curval *= 86400;
+                       break;
+               case 'w':
+               case 'W':
+                       curval *= 604800;
+                       break;
+               case 'y':
+               case 'Y':
+                       curval *= 31536000;
+                       break;
+               default:
+                       curval = 0;
+               }
 
-       switch(c) {
-       case 'S':
-       case 's':
-               return (ret);
-       case 'M':
-       case 'm':
-               return (ret * 60);
-       case 'h':
-       case 'H':
-               return (ret * 3600);
-       case 'd':
-       case 'D':
-               return (ret * 86400);
-       case 'w':
-       case 'W':
-               return (ret * 604800);
-       case 'y':
-       case 'Y':
-               return (ret * 31536000);
+               ret += (time_t)curval;
+               start += len;
        }
 
        return ret;
 }
 
 
+
 /**
  * Store a resource action
  * @param actsp                Action array; may be modified and returned!
--- cluster/rgmanager/src/daemons/restree.c     2007/09/25 21:09:23     1.23.2.12
+++ cluster/rgmanager/src/daemons/restree.c     2007/11/26 21:46:27     1.23.2.13
@@ -30,6 +30,7 @@
 #include <sys/types.h>
 #include <sys/stat.h>
 #include <list.h>
+#include <restart_counter.h>
 #include <reslist.h>
 #include <pthread.h>
 #include <clulog.h>
@@ -432,6 +433,39 @@
 }
 
 
+static inline void
+assign_restart_policy(resource_t *curres, resource_node_t *parent,
+                     resource_node_t *node)
+{
+       char *val;
+       int max_restarts = 0;
+       time_t restart_expire_time = 0;
+
+       node->rn_restart_counter = NULL;
+
+       if (!curres || !node)
+               return;
+       if (parent) /* Non-parents don't get one for now */
+               return;
+
+       val = res_attr_value(curres, "max_restarts");
+       if (!val)
+               return;
+       max_restarts = atoi(val);
+       if (max_restarts <= 0)
+               return;
+       val = res_attr_value(curres, "restart_expire_time");
+       if (val) {
+               restart_expire_time = (time_t)expand_time(val);
+               if (!restart_expire_time)
+                       return;
+       }
+
+       node->rn_restart_counter = restart_init(restart_expire_time,
+                                               max_restarts);
+}
+
+
 static inline int
 do_load_resource(int ccsfd, char *base,
                 resource_rule_t *rule,
@@ -514,6 +548,7 @@
        node->rn_state = RES_STOPPED;
        node->rn_flags = 0;
        node->rn_actions = (resource_act_t *)act_dup(curres->r_actions);
+       assign_restart_policy(curres, parent, node);
 
        snprintf(tok, sizeof(tok), "%s/@__independent_subtree", base);
 #ifndef NO_CCS
@@ -768,6 +803,11 @@
                        destroy_resource_tree(&(*tree)->rn_child);
 
                list_remove(tree, node);
+
+               if (node->rn_restart_counter) {
+                       restart_cleanup(node->rn_restart_counter);
+               }
+
                if(node->rn_actions){
                        free(node->rn_actions);
                }
--- cluster/rgmanager/src/daemons/rg_state.c    2007/08/30 16:03:03     1.24.2.13
+++ cluster/rgmanager/src/daemons/rg_state.c    2007/11/26 21:46:27     1.24.2.14
@@ -1315,7 +1315,8 @@
        }
 
        if ((svcStatus.rs_state != RG_STATE_STOPPING) &&
-            (svcStatus.rs_state != RG_STATE_ERROR)) {
+           (svcStatus.rs_state != RG_STATE_ERROR) &&
+           (svcStatus.rs_state != RG_STATE_RECOVER)) {
                rg_unlock(&lockp);
                return 0;
        }
@@ -1721,8 +1722,10 @@
         * We got sent here from handle_start_req.
         * We're DONE.
         */
-       if (request == RG_START_RECOVER)
+       if (request == RG_START_RECOVER) {
+               _svc_stop_finish(svcName, 0, RG_STATE_STOPPED);
                return RG_EFAIL;
+       }
 
        /*
         * All potential places for the service to start have been exhausted.
@@ -1731,7 +1734,7 @@
 exhausted:
        if (!rg_locked()) {
                clulog(LOG_WARNING,
-                      "#70: Attempting to restart service %s locally.\n",
+                      "#70: Failed to relocate %s; restarting locally\n",
                       svcName);
                if (svc_start(svcName, RG_START_RECOVER) == 0) {
                        *new_owner = me;
@@ -1969,6 +1972,14 @@
                                           new_owner);
        }
 
+       /* Check restart counter/timer for this resource */
+       if (check_restart(svcName) > 0) {
+               clulog(LOG_NOTICE, "Restart threshold for %s exceeded; "
+                      "attempting to relocate\n", svcName);
+               return handle_relocate_req(svcName, RG_START_RECOVER, -1,
+                                          new_owner);
+       }
+
        return handle_start_req(svcName, RG_START_RECOVER, new_owner);
 }
 
--- cluster/rgmanager/src/daemons/test.c        2007/07/31 17:54:54     1.6.2.5
+++ cluster/rgmanager/src/daemons/test.c        2007/11/26 21:46:27     1.6.2.6
@@ -25,6 +25,7 @@
 #include <sys/types.h>
 #include <sys/stat.h>
 #include <list.h>
+#include <restart_counter.h>
 #include <reslist.h>
 #include <pthread.h>
 
--- cluster/rgmanager/src/resources/service.sh  2007/11/13 17:38:43     1.7.2.6
+++ cluster/rgmanager/src/resources/service.sh  2007/11/26 21:46:27     1.7.2.7
@@ -154,6 +154,32 @@
             </shortdesc>
             <content type="string"/>
         </parameter>
+
+        <parameter name="max_restarts">
+            <longdesc lang="en">
+               Maximum restarts for this service.
+            </longdesc>
+            <shortdesc lang="en">
+               Maximum restarts for this service.
+            </shortdesc>
+            <content type="string"/>
+        </parameter>
+
+        <parameter name="restart_expire_time">
+            <longdesc lang="en">
+               Restart expiration time
+            </longdesc>
+            <shortdesc lang="en">
+               Restart expiration time.  A restart is forgotten
+               after this time.  When combined with the max_restarts
+               option, this lets administrators specify a threshold
+               for when to fail over services.  If max_restarts
+               is exceeded in this given expiration time, the service
+               is relocated instead of restarted again.
+            </shortdesc>
+            <content type="string"/>
+        </parameter>
+
     </parameters>
 
     <actions>
--- cluster/rgmanager/src/resources/vm.sh       2007/11/14 18:58:26     1.1.2.8
+++ cluster/rgmanager/src/resources/vm.sh       2007/11/26 21:46:27     1.1.2.9
@@ -184,6 +184,31 @@
             <content type="string" default="live"/>
         </parameter>
 
+        <parameter name="max_restarts">
+            <longdesc lang="en">
+               Maximum restarts for this service.
+            </longdesc>
+            <shortdesc lang="en">
+               Maximum restarts for this service.
+            </shortdesc>
+            <content type="string"/>
+        </parameter>
+
+        <parameter name="restart_expire_time">
+            <longdesc lang="en">
+               Restart expiration time
+            </longdesc>
+            <shortdesc lang="en">
+               Restart expiration time.  A restart is forgotten
+               after this time.  When combined with the max_restarts
+               option, this lets administrators specify a threshold
+               for when to fail over services.  If max_restarts
+               is exceeded in this given expiration time, the service
+               is relocated instead of restarted again.
+            </shortdesc>
+            <content type="string"/>
+        </parameter>
+
     </parameters>
 
     <actions>

Reply via email to