Package: release.debian.org Severity: normal User: release.debian....@packages.debian.org Usertags: unblock
Please unblock package sbd. The new version contains upstream fixes for some use cases and updates the package tests to work with the Corosync/Pacemaker versions in buster. unblock sbd/1.4.0-18-g5e3283c-1 -- System Information: Debian Release: buster/sid APT prefers unstable APT policy: (500, 'unstable') Architecture: amd64 (x86_64) Kernel: Linux 4.19.0-3-amd64 (SMP w/8 CPU cores) Locale: LANG=en_US.UTF-8, LC_CTYPE=en_US.UTF-8 (charmap=UTF-8), LANGUAGE=en_US.UTF-8 (charmap=UTF-8) Shell: /bin/sh linked to /bin/bash Init: systemd (via /run/systemd/system) LSM: AppArmor: enabled
diff -Nru sbd-1.4.0/debian/changelog sbd-1.4.0-18-g5e3283c/debian/changelog --- sbd-1.4.0/debian/changelog 2019-01-15 09:25:28.000000000 +0100 +++ sbd-1.4.0-18-g5e3283c/debian/changelog 2019-05-08 10:55:44.000000000 +0200 @@ -1,3 +1,12 @@ +sbd (1.4.0-18-g5e3283c-1) unstable; urgency=medium + + * New upstream version 1.4.0-18-g5e3283c (Closes: #925821) + * debian/sbd.lintian-overrides: update manpage line + * debian/patches: use /run for PIDFile location + * debian/tests: update for corosync v3 + + -- Valentin Vidic <vvi...@debian.org> Wed, 08 May 2019 10:55:44 +0200 + sbd (1.4.0-1) unstable; urgency=medium * New upstream version 1.4.0 diff -Nru sbd-1.4.0/debian/patches/pidfile-in-runstatedir.patch sbd-1.4.0-18-g5e3283c/debian/patches/pidfile-in-runstatedir.patch --- sbd-1.4.0/debian/patches/pidfile-in-runstatedir.patch 1970-01-01 01:00:00.000000000 +0100 +++ sbd-1.4.0-18-g5e3283c/debian/patches/pidfile-in-runstatedir.patch 2019-05-08 10:55:20.000000000 +0200 @@ -0,0 +1,28 @@ +Description: Use /run for PIDFile location + systemd complains if PIDFile uses /var/run +Author: Valentin Vidic <vvi...@debian.org> +Last-Update: 2019-04-26 +--- +This patch header follows DEP-3: http://dep.debian.net/deps/dep3/ +--- a/src/sbd.service.in ++++ b/src/sbd.service.in +@@ -10,7 +10,7 @@ + + [Service] + Type=forking +-PIDFile=@localstatedir@/run/sbd.pid ++PIDFile=@runstatedir@/sbd.pid + EnvironmentFile=-@CONFIGDIR@/sbd + ExecStart=@sbindir@/sbd $SBD_OPTS -p @localstatedir@/run/sbd.pid watch + ExecStop=@bindir@/kill -TERM $MAINPID +--- a/src/sbd_remote.service.in ++++ b/src/sbd_remote.service.in +@@ -8,7 +8,7 @@ + + [Service] + Type=forking +-PIDFile=@localstatedir@/run/sbd.pid ++PIDFile=@runstatedir@/sbd.pid + EnvironmentFile=-@CONFIGDIR@/sbd + ExecStart=@sbindir@/sbd $SBD_OPTS -p @localstatedir@/run/sbd.pid watch + ExecStop=@bindir@/kill -TERM $MAINPID diff -Nru sbd-1.4.0/debian/patches/series sbd-1.4.0-18-g5e3283c/debian/patches/series --- sbd-1.4.0/debian/patches/series 
1970-01-01 01:00:00.000000000 +0100 +++ sbd-1.4.0-18-g5e3283c/debian/patches/series 2019-05-08 10:55:20.000000000 +0200 @@ -0,0 +1 @@ +pidfile-in-runstatedir.patch diff -Nru sbd-1.4.0/debian/sbd.lintian-overrides sbd-1.4.0-18-g5e3283c/debian/sbd.lintian-overrides --- sbd-1.4.0/debian/sbd.lintian-overrides 2019-01-15 09:12:00.000000000 +0100 +++ sbd-1.4.0-18-g5e3283c/debian/sbd.lintian-overrides 2019-05-08 10:55:01.000000000 +0200 @@ -1 +1 @@ -manpage-has-errors-from-man usr/share/man/man8/sbd.8.gz 185: warning [p 1, 8.7i]: can't break line +manpage-has-errors-from-man usr/share/man/man8/sbd.8.gz 189: warning [p 1, 8.7i]: can't break line diff -Nru sbd-1.4.0/debian/tests/control sbd-1.4.0-18-g5e3283c/debian/tests/control --- sbd-1.4.0/debian/tests/control 2019-01-15 09:12:00.000000000 +0100 +++ sbd-1.4.0-18-g5e3283c/debian/tests/control 2019-05-08 10:55:28.000000000 +0200 @@ -14,10 +14,10 @@ Restrictions: needs-root, allow-stderr, isolation-machine Tests: regression -Depends: @, pacemaker, crmsh +Depends: @ Restrictions: needs-root, isolation-machine, breaks-testbed Tests: fence-external -Depends: @, pacemaker, crmsh, fence-agents +Depends: @ Restrictions: needs-root, isolation-machine, breaks-testbed Tests: fence-agents diff -Nru sbd-1.4.0/debian/tests/fence-agents sbd-1.4.0-18-g5e3283c/debian/tests/fence-agents --- sbd-1.4.0/debian/tests/fence-agents 2019-01-15 09:12:00.000000000 +0100 +++ sbd-1.4.0-18-g5e3283c/debian/tests/fence-agents 2019-05-08 10:55:28.000000000 +0200 @@ -14,21 +14,24 @@ LOOP=$(losetup --find --show $DISK) echo "=== create ===" +hostname node1 # must match corosync for fence to work sbd -d $LOOP create -echo "SBD_OPTS='-d $LOOP -W -W'" > /etc/default/sbd +sed -i "s|^#\\?\\(SBD_DEVICE=\\).*|\\1$LOOP|" /etc/default/sbd +sed -i "s|^\\(SBD_WATCHDOG_DEV=\\).*|\\1/dev/null|" /etc/default/sbd echo "=== cluster ===" -service corosync start -service pacemaker start -sleep 60 +apt-get --yes --quiet install pacemaker crmsh fence-agents service sbd status 
-crm status -echo "=== crm ===" -HOSTNAME=$(uname -n) -crm configure primitive sbd stonith:fence_sbd params devices=$LOOP plug=$HOSTNAME sbd_path=/usr/sbin/sbd +echo -n "Waiting for cluster to start... " +for x in `seq 60 -1 1`; do echo -n "$x "; sleep 1; done; echo +crm configure primitive sbd stonith:fence_sbd params devices=$LOOP plug=node1 sbd_path=/usr/sbin/sbd crm configure show +echo -n "Waiting for resource to start... " +for x in `seq 10 -1 1`; do echo -n "$x "; sleep 1; done; echo +crm status + echo "=== fence ===" /tmp/autopkgtest-reboot-prepare fenced -crm --force node fence $HOSTNAME +crm --force node fence node1 diff -Nru sbd-1.4.0/debian/tests/fence-external sbd-1.4.0-18-g5e3283c/debian/tests/fence-external --- sbd-1.4.0/debian/tests/fence-external 2019-01-15 09:12:00.000000000 +0100 +++ sbd-1.4.0-18-g5e3283c/debian/tests/fence-external 2019-05-08 10:55:28.000000000 +0200 @@ -14,20 +14,24 @@ LOOP=$(losetup --find --show $DISK) echo "=== create ===" +hostname node1 # must match corosync for fence to work sbd -d $LOOP create -echo "SBD_OPTS='-d $LOOP -W -W'" > /etc/default/sbd +sed -i "s|^#\\?\\(SBD_DEVICE=\\).*|\\1$LOOP|" /etc/default/sbd +sed -i "s|^\\(SBD_WATCHDOG_DEV=\\).*|\\1/dev/null|" /etc/default/sbd echo "=== cluster ===" -service corosync start -service pacemaker start -sleep 60 +apt-get --yes --quiet install pacemaker crmsh service sbd status -crm status -echo "=== crm ===" +echo -n "Waiting for cluster to start... " +for x in `seq 60 -1 1`; do echo -n "$x "; sleep 1; done; echo crm configure primitive sbd stonith:external/sbd params sbd_device=$LOOP crm configure show +echo -n "Waiting for resource to start... 
" +for x in `seq 10 -1 1`; do echo -n "$x "; sleep 1; done; echo +crm status + echo "=== fence ===" /tmp/autopkgtest-reboot-prepare fenced -crm --force node fence $(uname -n) +crm --force node fence node1 diff -Nru sbd-1.4.0/man/sbd.8.pod sbd-1.4.0-18-g5e3283c/man/sbd.8.pod --- sbd-1.4.0/man/sbd.8.pod 2019-01-14 14:27:27.000000000 +0100 +++ sbd-1.4.0-18-g5e3283c/man/sbd.8.pod 2019-04-16 14:38:22.000000000 +0200 @@ -493,7 +493,7 @@ introduce an additional single point of failure then. If the SBD device is not accessible, the daemon will fail to start and -inhibit openais startup. +inhibit startup of cluster services. =item Two devices diff -Nru sbd-1.4.0/src/sbd-cluster.c sbd-1.4.0-18-g5e3283c/src/sbd-cluster.c --- sbd-1.4.0/src/sbd-cluster.c 2019-01-14 14:27:27.000000000 +0100 +++ sbd-1.4.0-18-g5e3283c/src/sbd-cluster.c 2019-04-16 14:38:22.000000000 +0200 @@ -174,6 +174,25 @@ return TRUE; } +static void +cmap_destroy(void) +{ + if (cmap_source) { + g_source_destroy(cmap_source); + cmap_source = NULL; + } + + if (track_handle) { + cmap_track_delete(cmap_handle, track_handle); + track_handle = 0; + } + + if (cmap_handle) { + cmap_finalize(cmap_handle); + cmap_handle = 0; + } +} + static gboolean sbd_get_two_node(void) { @@ -217,18 +236,7 @@ return TRUE; out: - if (cmap_source) { - g_source_destroy(cmap_source); - cmap_source = NULL; - } - if (track_handle) { - cmap_track_delete(cmap_handle, track_handle); - track_handle = 0; - } - if (cmap_handle) { - cmap_finalize(cmap_handle); - cmap_handle = 0; - } + cmap_destroy(); return FALSE; } @@ -327,6 +335,12 @@ { cl_log(LOG_WARNING, "Lost connection to %s", name_for_cluster_type(get_cluster_type())); + if (get_cluster_type() != pcmk_cluster_unknown) { +#if SUPPORT_COROSYNC && CHECK_TWO_NODE + cmap_destroy(); +#endif + } + set_servant_health(pcmk_health_unclean, LOG_ERR, "Cluster connection terminated"); notify_parent(); diff -Nru sbd-1.4.0/src/sbd-common.c sbd-1.4.0-18-g5e3283c/src/sbd-common.c --- 
sbd-1.4.0/src/sbd-common.c 2019-01-14 14:27:27.000000000 +0100 +++ sbd-1.4.0-18-g5e3283c/src/sbd-common.c 2019-04-16 14:38:22.000000000 +0200 @@ -568,13 +568,13 @@ #define IOPRIO_PRIO_DATA(mask) ((mask) & IOPRIO_PRIO_MASK) #define IOPRIO_PRIO_VALUE(class, data) (((class) << IOPRIO_CLASS_SHIFT) | data) -static unsigned char +static void sbd_stack_hogger(unsigned char * inbuf, int kbytes) { unsigned char buf[1024]; if(kbytes <= 0) { - return HOG_CHAR; + return; } if (inbuf == NULL) { @@ -584,10 +584,10 @@ } if (kbytes > 0) { - return sbd_stack_hogger(buf, kbytes-1); - } else { - return buf[sizeof(buf)-1]; + sbd_stack_hogger(buf, kbytes-1); } + + return; } static void diff -Nru sbd-1.4.0/src/sbd.h sbd-1.4.0-18-g5e3283c/src/sbd.h --- sbd-1.4.0/src/sbd.h 2019-01-14 14:27:27.000000000 +0100 +++ sbd-1.4.0-18-g5e3283c/src/sbd.h 2019-04-16 14:38:22.000000000 +0200 @@ -54,10 +54,13 @@ /* FIXME: should add dynamic check of SIG_XX >= SIGRTMAX */ /* exit status for disk-servant */ -#define EXIT_MD_IO_FAIL 20 -#define EXIT_MD_REQUEST_RESET 21 -#define EXIT_MD_REQUEST_SHUTOFF 22 -#define EXIT_MD_REQUEST_CRASHDUMP 23 +#define EXIT_MD_SERVANT_IO_FAIL 20 +#define EXIT_MD_SERVANT_REQUEST_RESET 21 +#define EXIT_MD_SERVANT_REQUEST_SHUTOFF 22 +#define EXIT_MD_SERVANT_REQUEST_CRASHDUMP 23 + +/* exit status for pcmk-servant */ +#define EXIT_PCMK_SERVANT_GRACEFUL_SHUTDOWN 30 #define HOG_CHAR 0xff #define SECTOR_NAME_MAX 63 @@ -175,7 +178,7 @@ int dump_headers(struct servants_list_item *servants); unsigned long get_first_msgwait(struct servants_list_item *servants); int messenger(const char *name, const char *msg, struct servants_list_item *servants); -int servant(const char *diskname, int mode, const void* argp); +int servant_md(const char *diskname, int mode, const void* argp); #endif int servant_pcmk(const char *diskname, int mode, const void* argp); diff -Nru sbd-1.4.0/src/sbd-inquisitor.c sbd-1.4.0-18-g5e3283c/src/sbd-inquisitor.c --- sbd-1.4.0/src/sbd-inquisitor.c 2019-01-14 
14:27:27.000000000 +0100 +++ sbd-1.4.0-18-g5e3283c/src/sbd-inquisitor.c 2019-04-16 14:38:22.000000000 +0200 @@ -42,19 +42,36 @@ struct servants_list_item *newbie; if (lookup_servant_by_dev(devname)) { - cl_log(LOG_DEBUG, "Servant %s already exists", devname); - return; + cl_log(LOG_DEBUG, "Servant %s already exists", devname); + return; } newbie = malloc(sizeof(*newbie)); - if (!newbie) { - fprintf(stderr, "malloc failed in recruit_servant.\n"); - exit(1); + if (newbie) { + memset(newbie, 0, sizeof(*newbie)); + newbie->devname = strdup(devname); + newbie->pid = pid; + newbie->first_start = 1; + } + if (!newbie || !newbie->devname) { + fprintf(stderr, "heap allocation failed in recruit_servant.\n"); + exit(1); + } + + /* some sanity-check on our newbie */ + if (sbd_is_disk(newbie)) { + cl_log(LOG_INFO, "Monitoring %s", devname); + disk_count++; + } else if (sbd_is_pcmk(newbie) || sbd_is_cluster(newbie)) { + /* alive just after pcmk and cluster servants have shown up */ + newbie->outdated = 1; + } else { + /* toss our newbie */ + cl_log(LOG_ERR, "Refusing to recruit unrecognized servant %s", devname); + free((void *) newbie->devname); + free(newbie); + return; } - memset(newbie, 0, sizeof(*newbie)); - newbie->devname = strdup(devname); - newbie->pid = pid; - newbie->first_start = 1; if (!s) { servants_leader = newbie; @@ -65,12 +82,6 @@ } servant_count++; - if(sbd_is_disk(newbie)) { - cl_log(LOG_INFO, "Monitoring %s", devname); - disk_count++; - } else { - newbie->outdated = 1; - } } int assign_servant(const char* devname, functionp_t functionp, int mode, const void* argp) @@ -148,7 +159,7 @@ if (sbd_is_disk(s)) { #if SUPPORT_SHARED_DISK DBGLOG(LOG_INFO, "Starting servant for device %s", s->devname); - s->pid = assign_servant(s->devname, servant, start_mode, s); + s->pid = assign_servant(s->devname, servant_md, start_mode, s); #else cl_log(LOG_ERR, "Shared disk functionality not supported"); return; @@ -479,19 +490,19 @@ if (sbd_is_disk(s)) { if (WIFEXITED(status)) { 
switch(WEXITSTATUS(status)) { - case EXIT_MD_IO_FAIL: + case EXIT_MD_SERVANT_IO_FAIL: DBGLOG(LOG_INFO, "Servant for %s requests to be disowned", s->devname); break; - case EXIT_MD_REQUEST_RESET: + case EXIT_MD_SERVANT_REQUEST_RESET: cl_log(LOG_WARNING, "%s requested a reset", s->devname); do_reset(); break; - case EXIT_MD_REQUEST_SHUTOFF: + case EXIT_MD_SERVANT_REQUEST_SHUTOFF: cl_log(LOG_WARNING, "%s requested a shutoff", s->devname); do_off(); break; - case EXIT_MD_REQUEST_CRASHDUMP: + case EXIT_MD_SERVANT_REQUEST_CRASHDUMP: cl_log(LOG_WARNING, "%s requested a crashdump", s->devname); do_crashdump(); break; @@ -499,6 +510,22 @@ break; } } + } else if (sbd_is_pcmk(s)) { + if (WIFEXITED(status)) { + switch(WEXITSTATUS(status)) { + case EXIT_PCMK_SERVANT_GRACEFUL_SHUTDOWN: + DBGLOG(LOG_INFO, "PCMK-Servant has exited gracefully"); + /* revert to state prior to pacemaker-detection */ + s->restarts = 0; + s->restart_blocked = 0; + cluster_appeared = 0; + s->outdated = 1; + s->t_last.tv_sec = 0; + break; + default: + break; + } + } } cleanup_servant_by_pid(pid); } @@ -753,54 +780,56 @@ int parse_device_line(const char *line) { - int lpc = 0; - int last = 0; - int max = 0; + size_t lpc = 0; + size_t last = 0; + size_t max = 0; int found = 0; + bool skip_space = true; + int space_run = 0; - if(line) { - max = strlen(line); + if (!line) { + return 0; } - if (max <= 0) { - return found; - } + max = strlen(line); - cl_log(LOG_DEBUG, "Processing %d bytes: [%s]", max, line); - /* Skip initial whitespace */ - for (lpc = 0; lpc <= max && isspace(line[lpc]); lpc++) { - last = lpc + 1; - } + cl_log(LOG_DEBUG, "Processing %d bytes: [%s]", (int) max, line); - /* Now the actual content */ for (lpc = 0; lpc <= max; lpc++) { - int a_space = isspace(line[lpc]); - - if (a_space && lpc < max && isspace(line[lpc + 1])) { - /* fast-forward to the end of the spaces */ - - } else if (a_space || line[lpc] == ';' || line[lpc] == 0) { - int rc = 1; - char *entry = NULL; + if (isspace(line[lpc])) 
{ + if (skip_space) { + last = lpc + 1; + } else { + space_run++; + } + continue; + } + skip_space = false; + if (line[lpc] == ';' || line[lpc] == 0) { + int rc = 0; + char *entry = calloc(1, 1 + lpc - last); - if (lpc > last) { - entry = calloc(1, 1 + lpc - last); + if (entry) { rc = sscanf(line + last, "%[^;]", entry); + } else { + fprintf(stderr, "Heap allocation failed parsing device-line.\n"); + exit(1); } - if (entry == NULL) { - /* Skip */ - } else if (rc != 1) { - cl_log(LOG_WARNING, "Could not parse (%d %d): %s", last, lpc, line + last); + if (rc != 1) { + cl_log(LOG_WARNING, "Could not parse: '%s'", line + last); } else { + entry[strlen(entry)-space_run] = '\0'; cl_log(LOG_DEBUG, "Adding '%s'", entry); recruit_servant(entry, 0); found++; } free(entry); + skip_space = true; last = lpc + 1; } + space_run = 0; } return found; } @@ -861,7 +890,7 @@ int devices = parse_device_line(value); if(devices < 1) { fprintf(stderr, "Invalid device line: %s\n", value); - exit_status = -2; + exit_status = -2; goto out; } #else @@ -1059,7 +1088,8 @@ break; case 'h': usage(); - return (0); + goto out; + break; default: exit_status = -2; goto out; @@ -1212,6 +1242,9 @@ } out: + if (timeout_action) { + free(timeout_action); + } if (exit_status < 0) { if (exit_status == -2) { usage(); diff -Nru sbd-1.4.0/src/sbd-md.c sbd-1.4.0-18-g5e3283c/src/sbd-md.c --- sbd-1.4.0/src/sbd-md.c 2019-01-14 14:27:27.000000000 +0100 +++ sbd-1.4.0-18-g5e3283c/src/sbd-md.c 2019-04-16 14:38:22.000000000 +0200 @@ -162,9 +162,9 @@ memset(&st->io, 0, sizeof(struct iocb)); if (rw) { - io_prep_pwrite(&st->io, st->devfd, data, sector_size, sector_size * sector); + io_prep_pwrite(&st->io, st->devfd, data, sector_size, (long long) sector_size * sector); } else { - io_prep_pread(&st->io, st->devfd, data, sector_size, sector_size * sector); + io_prep_pread(&st->io, st->devfd, data, sector_size, (long long) sector_size * sector); } if (io_submit(st->ioctx, 1, ios) != 1) { @@ -373,7 +373,6 @@ struct 
sector_header_s *s_header; struct sector_node_s *s_node; struct sector_mbox_s *s_mbox; - struct stat s; char uuid[37]; int i; int rc = 0; @@ -394,10 +393,6 @@ uuid_generate(s_header->uuid); uuid_unparse_lower(s_header->uuid, uuid); - fstat(st->devfd, &s); - /* printf("st_size = %ld, st_blksize = %ld, st_blocks = %ld\n", - s.st_size, s.st_blksize, s.st_blocks); */ - cl_log(LOG_INFO, "Creating version %d.%d header on device %d (uuid: %s)", s_header->version, s_header->minor_version, st->devfd, uuid); @@ -1031,7 +1026,7 @@ return 0; } -int servant(const char *diskname, int mode, const void* argp) +int servant_md(const char *diskname, int mode, const void* argp) { struct sector_mbox_s *s_mbox = NULL; struct sector_node_s *s_node = NULL; @@ -1046,11 +1041,6 @@ char uuid[37]; const struct servants_list_item *s = argp; - if (!diskname) { - cl_log(LOG_ERR, "Empty disk name %s.", diskname); - return -1; - } - cl_log(LOG_INFO, "Servant starting for device %s", diskname); /* Block most of the signals */ @@ -1066,19 +1056,19 @@ st = open_device(diskname, LOG_WARNING); if (!st) { - exit(EXIT_MD_IO_FAIL); + exit(EXIT_MD_SERVANT_IO_FAIL); } s_header = header_get(st); if (!s_header) { cl_log(LOG_ERR, "Not a valid header on %s", diskname); - exit(EXIT_MD_IO_FAIL); + exit(EXIT_MD_SERVANT_IO_FAIL); } if (servant_check_timeout_inconsistent(s_header) < 0) { cl_log(LOG_ERR, "Timeouts on %s do not match first device", diskname); - exit(EXIT_MD_IO_FAIL); + exit(EXIT_MD_SERVANT_IO_FAIL); } if (s_header->minor_version > 0) { @@ -1091,14 +1081,14 @@ cl_log(LOG_ERR, "No slot allocated, and automatic allocation failed for disk %s.", diskname); - rc = EXIT_MD_IO_FAIL; + rc = EXIT_MD_SERVANT_IO_FAIL; goto out; } s_node = sector_alloc(); if (slot_read(st, mbox, s_node) < 0) { cl_log(LOG_ERR, "Unable to read node entry on %s", diskname); - exit(EXIT_MD_IO_FAIL); + exit(EXIT_MD_SERVANT_IO_FAIL); } cl_log(LOG_NOTICE, "Monitoring slot %d on disk %s", mbox, diskname); @@ -1114,7 +1104,7 @@ if (mode > 
0) { if (mbox_read(st, mbox, s_mbox) < 0) { cl_log(LOG_ERR, "mbox read failed during start-up in servant."); - rc = EXIT_MD_IO_FAIL; + rc = EXIT_MD_SERVANT_IO_FAIL; goto out; } if (s_mbox->cmd != SBD_MSG_EXIT && @@ -1130,7 +1120,7 @@ DBGLOG(LOG_INFO, "First servant start - zeroing inbox"); memset(s_mbox, 0, sizeof(*s_mbox)); if (mbox_write(st, mbox, s_mbox) < 0) { - rc = EXIT_MD_IO_FAIL; + rc = EXIT_MD_SERVANT_IO_FAIL; goto out; } } @@ -1159,28 +1149,28 @@ s_header_retry = header_get(st); if (!s_header_retry) { cl_log(LOG_ERR, "No longer found a valid header on %s", diskname); - exit(EXIT_MD_IO_FAIL); + exit(EXIT_MD_SERVANT_IO_FAIL); } if (memcmp(s_header, s_header_retry, sizeof(*s_header)) != 0) { cl_log(LOG_ERR, "Header on %s changed since start-up!", diskname); - exit(EXIT_MD_IO_FAIL); + exit(EXIT_MD_SERVANT_IO_FAIL); } free(s_header_retry); s_node_retry = sector_alloc(); if (slot_read(st, mbox, s_node_retry) < 0) { cl_log(LOG_ERR, "slot read failed in servant."); - exit(EXIT_MD_IO_FAIL); + exit(EXIT_MD_SERVANT_IO_FAIL); } if (memcmp(s_node, s_node_retry, sizeof(*s_node)) != 0) { cl_log(LOG_ERR, "Node entry on %s changed since start-up!", diskname); - exit(EXIT_MD_IO_FAIL); + exit(EXIT_MD_SERVANT_IO_FAIL); } free(s_node_retry); if (mbox_read(st, mbox, s_mbox) < 0) { cl_log(LOG_ERR, "mbox read failed in servant."); - exit(EXIT_MD_IO_FAIL); + exit(EXIT_MD_SERVANT_IO_FAIL); } if (s_mbox->cmd > 0) { @@ -1195,14 +1185,14 @@ sigqueue(ppid, SIG_TEST, signal_value); break; case SBD_MSG_RESET: - exit(EXIT_MD_REQUEST_RESET); + exit(EXIT_MD_SERVANT_REQUEST_RESET); case SBD_MSG_OFF: - exit(EXIT_MD_REQUEST_SHUTOFF); + exit(EXIT_MD_SERVANT_REQUEST_SHUTOFF); case SBD_MSG_EXIT: sigqueue(ppid, SIG_EXITREQ, signal_value); break; case SBD_MSG_CRASHDUMP: - exit(EXIT_MD_REQUEST_CRASHDUMP); + exit(EXIT_MD_SERVANT_REQUEST_CRASHDUMP); default: /* FIXME: An "unknown" message might result diff -Nru sbd-1.4.0/src/sbd-pacemaker.c sbd-1.4.0-18-g5e3283c/src/sbd-pacemaker.c --- 
sbd-1.4.0/src/sbd-pacemaker.c 2019-01-14 14:27:27.000000000 +0100 +++ sbd-1.4.0-18-g5e3283c/src/sbd-pacemaker.c 2019-04-16 14:38:22.000000000 +0200 @@ -103,6 +103,9 @@ static long last_refresh = 0; +static int pcmk_clean_shutdown = 0; +static int pcmk_shutdown = 0; + static gboolean mon_timer_reconnect(gpointer data) { @@ -128,10 +131,26 @@ { if (cib) { cib->cmds->signoff(cib); + /* retrigger as last one might have been skipped */ + mon_refresh_state(NULL); + if (pcmk_clean_shutdown) { + /* assume a graceful pacemaker-shutdown */ + clean_up(EXIT_PCMK_SERVANT_GRACEFUL_SHUTDOWN); + } + /* getting here we aren't sure about the pacemaker-state + so try to use the timeout to reconnect and get + everything sorted out again + */ + pcmk_shutdown = 0; set_servant_health(pcmk_health_transient, LOG_WARNING, "Disconnected from CIB"); timer_id_reconnect = g_timeout_add(reconnect_msec, mon_timer_reconnect, NULL); } cib_connected = 0; + /* no sense in looking into outdated cib, trying to apply patch, ... 
*/ + if (current_cib) { + free_xml(current_cib); + current_cib = NULL; + } return; } @@ -171,7 +190,7 @@ mon_timer_notify(gpointer data) { static int counter = 0; - int counter_max = timeout_watchdog / timeout_loop; + int counter_max = timeout_watchdog / timeout_loop / 2; if (timer_id_notify > 0) { g_source_remove(timer_id_notify); @@ -257,7 +276,7 @@ static int updates = 0; static int ever_had_quorum = FALSE; - node_t *node = pe_find_node(data_set->nodes, local_uname); + node_t *node = NULL; updates++; @@ -267,11 +286,15 @@ return; } + node = pe_find_node(data_set->nodes, local_uname); - if (node == NULL) { + if ((node == NULL) || (node->details == NULL)) { set_servant_health(pcmk_health_unknown, LOG_WARNING, "Node state: %s is UNKNOWN", local_uname); + notify_parent(); + return; + } - } else if (node->details->online == FALSE) { + if (node->details->online == FALSE) { set_servant_health(pcmk_health_unknown, LOG_WARNING, "Node state: OFFLINE"); } else if (node->details->unclean) { @@ -280,11 +303,6 @@ } else if (node->details->pending) { set_servant_health(pcmk_health_pending, LOG_WARNING, "Node state: pending"); -#if 0 - } else if (node->details->shutdown) { - set_servant_health(pcmk_health_shutdown, LOG_WARNING, "Node state: shutting down"); -#endif - } else if (data_set->flags & pe_flag_have_quorum) { set_servant_health(pcmk_health_online, LOG_INFO, "Node state: online"); ever_had_quorum = TRUE; @@ -315,6 +333,12 @@ } } + if (node->details->shutdown) { + pcmk_shutdown = 1; + } + if (pcmk_shutdown && !(node->details->running_rsc)) { + pcmk_clean_shutdown = 1; + } notify_parent(); return; } @@ -339,7 +363,7 @@ static mainloop_timer_t *refresh_timer = NULL; if(refresh_timer == NULL) { - refresh_timer = mainloop_timer_add("refresh", 2000, FALSE, mon_trigger_refresh, NULL); + refresh_timer = mainloop_timer_add("refresh", reconnect_msec, FALSE, mon_trigger_refresh, NULL); refresh_trigger = mainloop_add_trigger(G_PRIORITY_LOW, mon_refresh_state, refresh_timer); } @@ 
-369,9 +393,9 @@ } /* Refresh - * - immediately if the last update was more than 5s ago + * - immediately if the last update was more than 1s ago * - every 10 updates - * - at most 2s after the last update + * - at most 1s after the last update */ if (updates > 10 || (now - last_refresh) > (reconnect_msec / 1000)) { mon_refresh_state(refresh_timer); diff -Nru sbd-1.4.0/src/sbd.sysconfig sbd-1.4.0-18-g5e3283c/src/sbd.sysconfig --- sbd-1.4.0/src/sbd.sysconfig 2019-01-14 14:27:27.000000000 +0100 +++ sbd-1.4.0-18-g5e3283c/src/sbd.sysconfig 2019-04-16 14:38:22.000000000 +0200 @@ -68,6 +68,9 @@ # If your sbd device(s) reside on a multipath setup or iSCSI, this # should be the time required to detect a path failure. # +# Be aware that watchdog timeout set in the on-disk metadata takes +# precedence. +# SBD_WATCHDOG_TIMEOUT=5 ## Type: string