Hi tech,

recently we had a short outage in our network. A script started an additional
ospfd instance because the -n flag for config test was missing.

What then happend was not nice:
- The new ospfd unlinked the control socket of the first ospfd
- The new ospfd removed all routes from the first ospfd
- The new ospfd was not able to build up an adjacency and therefore could
  not install the routes needed for a recovery.
- Both ospfd instances were running but non-functional.

Of course the faulty script is fixed by now. ;-)

It would be nice if ospfd could prevent such a situation.

Below diff does these things:
- Detect a running ospfd by first doing a connect on the control socket.
- Do not delete the control socket on exit.
  - This could delete the socket of another instance.
  - Unlinking the socket on shutdown will be in the way once we add pledge
    to the main process. It was removed recently from various daemons.
- Do not delete routes added by another process even if they have
  prio RTP_OSPF. Without this the new ospfd will remove all the routes
  of the first one.

A side effect of this is that alien OSPF routes are now only logged but
not removed anymore. Should a crashed ospfd leave some routes behind the
next ospfd does not clean them up anymore. The admin would need to check
the logs and remove them manually with the route command.

Does this make sense?

Comments? OK?

Remi


Index: control.c
===================================================================
RCS file: /cvs/src/usr.sbin/ospfd/control.c,v
retrieving revision 1.44
diff -u -p -r1.44 control.c
--- control.c   24 Jan 2017 04:24:25 -0000      1.44
+++ control.c   17 Aug 2018 22:41:43 -0000
@@ -42,19 +42,29 @@ int
 control_init(char *path)
 {
        struct sockaddr_un       sun;
-       int                      fd;
+       int                      fd, fd_check;
        mode_t                   old_umask;
 
+       bzero(&sun, sizeof(sun));
+       sun.sun_family = AF_UNIX;
+       strlcpy(sun.sun_path, path, sizeof(sun.sun_path));
+
+       if ((fd_check = socket(AF_UNIX, SOCK_STREAM, 0)) == -1) {
+               log_warn("control_init: socket check");
+               return (-1);
+       }
+       if (connect(fd_check, (struct sockaddr *)&sun, sizeof(sun)) == 0) {
+               log_warnx("control_init: socket in use");
+               return (-1);
+       }
+       close(fd_check);
+
        if ((fd = socket(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC | SOCK_NONBLOCK,
            0)) == -1) {
                log_warn("control_init: socket");
                return (-1);
        }
 
-       bzero(&sun, sizeof(sun));
-       sun.sun_family = AF_UNIX;
-       strlcpy(sun.sun_path, path, sizeof(sun.sun_path));
-
        if (unlink(path) == -1)
                if (errno != ENOENT) {
                        log_warn("control_init: unlink %s", path);
@@ -98,16 +108,6 @@ control_listen(void)
        evtimer_set(&control_state.evt, control_accept, NULL);
 
        return (0);
-}
-
-void
-control_cleanup(char *path)
-{
-       if (path == NULL)
-               return;
-       event_del(&control_state.ev);
-       event_del(&control_state.evt);
-       unlink(path);
 }
 
 /* ARGSUSED */
Index: control.h
===================================================================
RCS file: /cvs/src/usr.sbin/ospfd/control.h,v
retrieving revision 1.6
diff -u -p -r1.6 control.h
--- control.h   10 Feb 2015 05:24:48 -0000      1.6
+++ control.h   17 Aug 2018 21:02:36 -0000
@@ -39,6 +39,5 @@ int   control_listen(void);
 void   control_accept(int, short, void *);
 void   control_dispatch_imsg(int, short, void *);
 int    control_imsg_relay(struct imsg *);
-void   control_cleanup(char *);
 
 #endif /* _CONTROL_H_ */
Index: kroute.c
===================================================================
RCS file: /cvs/src/usr.sbin/ospfd/kroute.c,v
retrieving revision 1.111
diff -u -p -r1.111 kroute.c
--- kroute.c    10 Jul 2018 11:49:04 -0000      1.111
+++ kroute.c    21 Aug 2018 14:13:27 -0000
@@ -263,6 +263,7 @@ kr_change_fib(struct kroute_node *kr, st
                kn->r.nexthop.s_addr = kroute[i].nexthop.s_addr;
                kn->r.flags = kroute[i].flags | F_OSPFD_INSERTED;
                kn->r.priority = RTP_OSPF;
+               kn->r.pid = kr_state.pid;
                kn->r.ext_tag = kroute[i].ext_tag;
                rtlabel_unref(kn->r.rtlabel);   /* for RTM_CHANGE */
                kn->r.rtlabel = kroute[i].rtlabel;
@@ -365,7 +366,7 @@ kr_fib_decouple(void)
                return;
 
        RB_FOREACH(kr, kroute_tree, &krt)
-               if (kr->r.priority == RTP_OSPF)
+               if (kr->r.priority == RTP_OSPF && kr->r.pid == kr_state.pid)
                        for (kn = kr; kn != NULL; kn = kn->next)
                                send_rtmsg(kr_state.fd, RTM_DELETE, &kn->r);
 
@@ -1365,7 +1366,7 @@ rtmsg_process(char *buf, size_t len)
        u_int8_t                 prefixlen, prio;
        int                      flags, mpath;
        u_short                  ifindex = 0;
-       int                      rv, delay;
+       int                      delay;
 
        size_t                   offset;
        char                    *next;
@@ -1529,14 +1530,15 @@ add:
                                kr->r.ifindex = ifindex;
                                kr->r.priority = prio;
 
+                               if (rtm->rtm_pid == kr_state.pid)
+                                       kr->r.pid = 0;
+                               else
+                                       kr->r.pid = rtm->rtm_pid;
+
                                if (rtm->rtm_priority == RTP_OSPF) {
-                                       log_warnx("alien OSPF route %s/%d",
+                                       log_warnx("alien OSPF route %s/%d "
+                                           "- ignoring it",
                                            inet_ntoa(prefix), prefixlen);
-                                       rv = send_rtmsg(kr_state.fd,
-                                           RTM_DELETE, &kr->r);
-                                       free(kr);
-                                       if (rv == -1)
-                                               return (-1);
                                } else {
                                        if ((label = (struct sockaddr_rtlabel *)
                                            rti_info[RTAX_LABEL]) != NULL) {
Index: ospfd.c
===================================================================
RCS file: /cvs/src/usr.sbin/ospfd/ospfd.c,v
retrieving revision 1.99
diff -u -p -r1.99 ospfd.c
--- ospfd.c     11 Jul 2018 12:09:34 -0000      1.99
+++ ospfd.c     17 Aug 2018 21:02:20 -0000
@@ -300,7 +300,6 @@ ospfd_shutdown(void)
        msgbuf_clear(&iev_rde->ibuf.w);
        close(iev_rde->ibuf.fd);
 
-       control_cleanup(ospfd_conf->csock);
        while ((r = SIMPLEQ_FIRST(&ospfd_conf->redist_list)) != NULL) {
                SIMPLEQ_REMOVE_HEAD(&ospfd_conf->redist_list, entry);
                free(r);
Index: ospfd.h
===================================================================
RCS file: /cvs/src/usr.sbin/ospfd/ospfd.h,v
retrieving revision 1.101
diff -u -p -r1.101 ospfd.h
--- ospfd.h     25 Jun 2018 22:16:53 -0000      1.101
+++ ospfd.h     17 Aug 2018 22:10:59 -0000
@@ -416,6 +416,7 @@ struct kroute {
        u_short         ifindex;
        u_int8_t        prefixlen;
        u_int8_t        priority;
+       pid_t           pid;
 };
 
 struct kif_addr {


Reply via email to