Author: aconway Date: Mon Aug 10 21:10:53 2009 New Revision: 802927 URL: http://svn.apache.org/viewvc?rev=802927&view=rev Log: Watchdog feature to remove unresponsive cluster nodes.
In some instances (e.g. while resolving an error) it's possible for a hung process to hang the entire cluster as the other nodes wait for its response. The cluster can handle terminated processes but hung processes present a problem. If the watchdog plugin is loaded and --watchdog-interval is set then the broker forks a child process that runs a very simple watchdog program, and starts a timer in the broker process to signal the watchdog every interval/2 seconds. The watchdog kills its parent if it does not receive a signal for interval seconds. This allows a stuck broker to be removed from the cluster so other cluster members can continue. Added: qpid/trunk/qpid/cpp/src/qpid/cluster/WatchDogPlugin.cpp qpid/trunk/qpid/cpp/src/qpid/cluster/qpidd_watchdog.cpp qpid/trunk/qpid/cpp/src/tests/test_watchdog (with props) Modified: qpid/trunk/.gitignore qpid/trunk/qpid/cpp/src/Makefile.am qpid/trunk/qpid/cpp/src/cluster.mk qpid/trunk/qpid/cpp/src/tests/cluster.mk Modified: qpid/trunk/.gitignore URL: http://svn.apache.org/viewvc/qpid/trunk/.gitignore?rev=802927&r1=802926&r2=802927&view=diff ============================================================================== --- qpid/trunk/.gitignore (original) +++ qpid/trunk/.gitignore Mon Aug 10 21:10:53 2009 @@ -20,7 +20,7 @@ qpid/cpp/libtool qpidc.spec qpid/cpp/src/gen/ -*.mk +*gen.mk *.timestamp rgen.timestamp *.pcl Modified: qpid/trunk/qpid/cpp/src/Makefile.am URL: http://svn.apache.org/viewvc/qpid/trunk/qpid/cpp/src/Makefile.am?rev=802927&r1=802926&r2=802927&view=diff ============================================================================== --- qpid/trunk/qpid/cpp/src/Makefile.am (original) +++ qpid/trunk/qpid/cpp/src/Makefile.am Mon Aug 10 21:10:53 2009 @@ -115,6 +115,7 @@ # Destination for intalled programs and tests defined here # qpidexecdir = $(libexecdir)/qpid +AM_CXXFLAGS += -DQPID_EXEC_DIR=\"$(qpidexecdir)\" qpidexec_PROGRAMS = qpidexec_SCRIPTS = qpidtestdir = $(qpidexecdir)/tests Modified: 
qpid/trunk/qpid/cpp/src/cluster.mk URL: http://svn.apache.org/viewvc/qpid/trunk/qpid/cpp/src/cluster.mk?rev=802927&r1=802926&r2=802927&view=diff ============================================================================== --- qpid/trunk/qpid/cpp/src/cluster.mk (original) +++ qpid/trunk/qpid/cpp/src/cluster.mk Mon Aug 10 21:10:53 2009 @@ -89,4 +89,13 @@ cluster_la_CXXFLAGS = $(AM_CXXFLAGS) -fno-strict-aliasing cluster_la_LDFLAGS = $(PLUGINLDFLAGS) +# The watchdog plugin and helper executable +dmodule_LTLIBRARIES += watchdog.la +watchdog_la_SOURCES = qpid/cluster/WatchDogPlugin.cpp +watchdog_la_LIBADD = libqpidbroker.la +watchdog_la_LDFLAGS = $(PLUGINLDFLAGS) + +qpidexec_PROGRAMS += qpidd_watchdog +qpidd_watchdog_SOURCES = qpid/cluster/qpidd_watchdog.cpp + endif # HAVE_LIBCPG Added: qpid/trunk/qpid/cpp/src/qpid/cluster/WatchDogPlugin.cpp URL: http://svn.apache.org/viewvc/qpid/trunk/qpid/cpp/src/qpid/cluster/WatchDogPlugin.cpp?rev=802927&view=auto ============================================================================== --- qpid/trunk/qpid/cpp/src/qpid/cluster/WatchDogPlugin.cpp (added) +++ qpid/trunk/qpid/cpp/src/qpid/cluster/WatchDogPlugin.cpp Mon Aug 10 21:10:53 2009 @@ -0,0 +1,114 @@ +/* + * + * Copyright (c) 2006 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ + +#include "qpid/Plugin.h" +#include "qpid/Options.h" +#include "qpid/log/Statement.h" +#include "qpid/broker/Broker.h" +#include "qpid/sys/Timer.h" +#include "qpid/sys/Fork.h" +#include <sys/types.h> +#include <sys/wait.h> +#include <signal.h> + +namespace qpid { +namespace cluster { + +using broker::Broker; + +struct Settings { + Settings() : interval(0) {} + int interval; +}; + +struct WatchDogOptions : public qpid::Options { + Settings& settings; + + WatchDogOptions(Settings& s) : settings(s) { + addOptions() + ("watchdog-interval", optValue(settings.interval, "N"), + "broker is automatically killed if it is hung for more than \ + N seconds. 0 disables watchdog."); + } +}; + +struct WatchDogTask : public sys::TimerTask { + int pid; + sys::Timer& timer; + int interval; + + WatchDogTask(int pid_, sys::Timer& t, int _interval) + : TimerTask(_interval*sys::TIME_SEC/2), pid(pid_), timer(t), interval(_interval) {} + + void fire() { + timer.add (new WatchDogTask(pid, timer, interval)); + QPID_LOG(debug, "Sending keepalive signal to watchdog"); + ::kill(pid, SIGUSR1); + } +}; + +struct WatchDogPlugin : public qpid::Plugin, public qpid::sys::Fork { + Settings settings; + WatchDogOptions options; + Broker* broker; + int watchdogPid; + + WatchDogPlugin() : options(settings), broker(0), watchdogPid(0) {} + + ~WatchDogPlugin() { + if (watchdogPid) ::kill(watchdogPid, SIGTERM); + ::waitpid(watchdogPid, 0, 0); + } + + Options* getOptions() { return &options; } + + void earlyInitialize(qpid::Plugin::Target& target) { + broker = dynamic_cast<Broker*>(&target); + if (broker && settings.interval) { + QPID_LOG(notice, "Starting watchdog process with interval of " << + settings.interval << " seconds"); + fork(); + } + } + + void initialize(Target&) {} + + protected: + + void child() { // Child of fork + const char* watchdog = ::getenv("QPID_WATCHDOG_EXE"); // For use in tests + if (!watchdog) watchdog=QPID_EXEC_DIR "/qpidd_watchdog"; + std::string interval = 
boost::lexical_cast<std::string>(settings.interval); + ::execl(watchdog, watchdog, interval.c_str(), NULL); + QPID_LOG(critical, "Failed to exec watchdog program " << watchdog ); + ::kill(::getppid(), SIGKILL); + exit(1); + } + + void parent(int pid) { // Parent of fork + watchdogPid = pid; + broker->getTimer().add( + new WatchDogTask(watchdogPid, broker->getTimer(), settings.interval)); + // TODO aconway 2009-08-10: to be extra safe, we could monitor + // the watchdog child and re-start it if it exits. + } +}; + +static WatchDogPlugin instance; // Static initialization. + +}} // namespace qpid::cluster Added: qpid/trunk/qpid/cpp/src/qpid/cluster/qpidd_watchdog.cpp URL: http://svn.apache.org/viewvc/qpid/trunk/qpid/cpp/src/qpid/cluster/qpidd_watchdog.cpp?rev=802927&view=auto ============================================================================== --- qpid/trunk/qpid/cpp/src/qpid/cluster/qpidd_watchdog.cpp (added) +++ qpid/trunk/qpid/cpp/src/qpid/cluster/qpidd_watchdog.cpp Mon Aug 10 21:10:53 2009 @@ -0,0 +1,60 @@ +/* + * + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ * + */ +#include <sys/types.h> +#include <sys/time.h> +#include <signal.h> +#include <unistd.h> +#include <stdlib.h> +#include <stdio.h> +#include <limits.h> + +long timeout; + +void killParent(int) { + ::kill(getppid(), SIGKILL); + ::fprintf(stderr, "Watchdog killed unresponsive broker, pid=%d\n", ::getppid()); + ::exit(1); +} + +void resetTimer(int) { + struct ::itimerval itval = { { 0, 0 }, { timeout, 0 } }; + if (::setitimer(ITIMER_REAL, &itval, 0) !=0) { + ::perror("Watchdog failed to set timer"); + killParent(0); + ::exit(1); + } +} + +/** Simple watchdog program: kill parent process if timeout + * expires without a SIGUSR1. + * Will be killed with SIGHUP when parent shuts down. + * Args: timeout in seconds. + */ +int main(int argc, char** argv) { + if(argc != 2 || (timeout = atoi(argv[1])) == 0) { + ::fprintf(stderr, "Usage: %s <timeout_seconds>\n", argv[0]); + ::exit(1); + } + ::signal(SIGUSR1, resetTimer); + ::signal(SIGALRM, killParent); + resetTimer(0); + while (true) { sleep(INT_MAX); } +} Modified: qpid/trunk/qpid/cpp/src/tests/cluster.mk URL: http://svn.apache.org/viewvc/qpid/trunk/qpid/cpp/src/tests/cluster.mk?rev=802927&r1=802926&r2=802927&view=diff ============================================================================== --- qpid/trunk/qpid/cpp/src/tests/cluster.mk (original) +++ qpid/trunk/qpid/cpp/src/tests/cluster.mk Mon Aug 10 21:10:53 2009 @@ -29,44 +29,46 @@ # ais_check checks pre-requisites for cluster tests and runs them if ok. 
-TESTS += \ - ais_check \ - run_cluster_tests \ - federated_cluster_test \ +TESTS += \ + ais_check \ + test_watchdog \ + run_cluster_tests \ + federated_cluster_test \ clustered_replication_test - -EXTRA_DIST += \ - ais_check \ - start_cluster \ - stop_cluster \ - restart_cluster \ - cluster_python_tests \ - cluster_python_tests_failing.txt \ - federated_cluster_test \ - clustered_replication_test \ - run_cluster_tests \ - run_long_cluster_tests \ - testlib.py \ - cluster_tests.py \ - long_cluster_tests.py - - -LONG_TESTS += \ - run_long_cluster_tests \ - start_cluster \ - cluster_python_tests \ + +EXTRA_DIST += \ + ais_check \ + start_cluster \ + stop_cluster \ + restart_cluster \ + cluster_python_tests \ + cluster_python_tests_failing.txt \ + federated_cluster_test \ + clustered_replication_test \ + run_cluster_tests \ + run_long_cluster_tests \ + testlib.py \ + cluster_tests.py \ + long_cluster_tests.py + +LONG_TESTS += \ + run_long_cluster_tests \ + start_cluster \ + cluster_python_tests \ stop_cluster qpidtest_PROGRAMS += cluster_test -cluster_test_SOURCES = \ - cluster_test.cpp \ - unit_test.cpp \ - ClusterFixture.cpp \ - ClusterFixture.h \ - ForkedBroker.h \ - ForkedBroker.cpp \ - PartialFailure.cpp \ - ClusterFailover.cpp + +cluster_test_SOURCES = \ + cluster_test.cpp \ + unit_test.cpp \ + ClusterFixture.cpp \ + ClusterFixture.h \ + ForkedBroker.h \ + ForkedBroker.cpp \ + PartialFailure.cpp \ + ClusterFailover.cpp + cluster_test_LDADD=$(lib_client) $(lib_broker) -lboost_unit_test_framework qpidtest_SCRIPTS += run_cluster_tests cluster_tests.py run_long_cluster_tests long_cluster_tests.py testlib.py Added: qpid/trunk/qpid/cpp/src/tests/test_watchdog URL: http://svn.apache.org/viewvc/qpid/trunk/qpid/cpp/src/tests/test_watchdog?rev=802927&view=auto ============================================================================== --- qpid/trunk/qpid/cpp/src/tests/test_watchdog (added) +++ qpid/trunk/qpid/cpp/src/tests/test_watchdog Mon Aug 10 21:10:53 2009 @@ -0,0 
+1,16 @@ +#!/bin/sh +# Tests for the watchdog plug-in + +# Start a broker with watchdog, freeze it with kill -STOP, verify that it is killed. +export QPID_WATCHDOG_EXE=$PWD/../qpidd_watchdog +PORT=`../qpidd -dp0 --no-data-dir --auth=no --no-module-dir --load-module $PWD/../.libs/watchdog.so --log-to-file=qpidd_watchdog.log --watchdog-interval 1` +PID=`../qpidd -cp $PORT` +kill -STOP $PID +sleep 2 + +if kill -0 $PID 2>/dev/null; then + echo "Hung process did not die." + kill $PID +else + true +fi Propchange: qpid/trunk/qpid/cpp/src/tests/test_watchdog ------------------------------------------------------------------------------ svn:executable = * --------------------------------------------------------------------- Apache Qpid - AMQP Messaging Implementation Project: http://qpid.apache.org Use/Interact: mailto:commits-subscr...@qpid.apache.org