On Wed, Aug 22, 2018 at 12:12:10AM +0200, Remi Locherer wrote:
> On Tue, Aug 21, 2018 at 05:54:18PM +0100, Stuart Henderson wrote:
> > On 2018/08/21 17:16, Remi Locherer wrote:
> > > Hi tech,
> > > 
> > > recently we had a short outage in our network. A script started an 
> > > additional
> > > ospfd instance because the -n flag for config test was missing.
> > 
> > This is a problem with bgpd as well, last time I did this it killed one of 
> > the
> > *other* routers on the network (i.e. not just the one where I accidentally 
> > ran
> > 2x bgpd...).
> > 
> > > What then happend was not nice:
> > > - The new ospfd unlinked the control socket of the first ospfd
> > > - The new ospfd removed all routes from the first ospfd
> > > - The new ospfd was not able to build up an adjacency and therefore could
> > >   not install the routes needed for a recovery.
> > > - Both ospfd instances were running but non-functional.
> > > 
> > > Of course the faulty script is fixed by now. ;-)
> > > 
> > > It would be nice if ospfd could prevent such a situation.
> > > 
> > > Below diff does these things:
> > > - Detect a running ospfd by first doing a connect on the control socket.
> > > - Do not delete the control socket on exit.
> > >   - This could delete the socket of another instance.
> > >   - Unlinking the socket on shutdown will be in the way once we add pledge
> > >     to the main process. It was removed recently from various daemons.
> > 
> > This all sounds very sensible.
> > 
> > > - Do not delete routes added by another process even if they have
> > >   prio RTP_OSPF. Without this the new ospfd will remove all the routes
> > >   of the first one.
> > 
> > I'm unsure about this, the above changes stop the new ospfd from running
> > don't they, so that shouldn't be a problem?
> 
> It stops to late. kr_init happens before and kill all existing routes with
> priority 32. And again in the shutdown function of ospfd.
> > 
> > If an ospfd blows up for whatever reason, it would be quite inconvenient
> > if it needs manual route tweaks rather than just 'rcctl start ospfd' to fix 
> > it ..
> 
> Yes, this is not optimal.
> 
> The new diff below defers kr_init until the ospf engine notifies the parent
> that the control socket is ready. In case the ospf engine exits because the
> control socket is already in use no routes are known that could be removed.
> 
> With this ospfd keeps the behaviour of removing foreign routes with
> priority 32.
> 
> Better?
> 

Why are we not checking the control socket in the parent?
Also it may be better to create the control socket in the parent and pass
it to the ospfe. This is what bgpd is doing and allows to change the path
during runtime with a config reload.

Could there be a case where this causes ospfd to hang on start in the
connect? Not sure if we can sleep doing a connect() to a AF_UNIX socket.

> Index: control.c
> ===================================================================
> RCS file: /cvs/src/usr.sbin/ospfd/control.c,v
> retrieving revision 1.44
> diff -u -p -r1.44 control.c
> --- control.c 24 Jan 2017 04:24:25 -0000      1.44
> +++ control.c 17 Aug 2018 22:41:43 -0000
> @@ -42,19 +42,29 @@ int
>  control_init(char *path)
>  {
>       struct sockaddr_un       sun;
> -     int                      fd;
> +     int                      fd, fd_check;
>       mode_t                   old_umask;
>  
> +     bzero(&sun, sizeof(sun));
> +     sun.sun_family = AF_UNIX;
> +     strlcpy(sun.sun_path, path, sizeof(sun.sun_path));
> +
> +     if ((fd_check = socket(AF_UNIX, SOCK_STREAM, 0)) == -1) {
> +             log_warn("control_init: socket check");
> +             return (-1);
> +     }
> +     if (connect(fd_check, (struct sockaddr *)&sun, sizeof(sun)) == 0) {
> +             log_warnx("control_init: socket in use");
> +             return (-1);
> +     }
> +     close(fd_check);
> +
>       if ((fd = socket(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC | SOCK_NONBLOCK,
>           0)) == -1) {
>               log_warn("control_init: socket");
>               return (-1);
>       }
>  
> -     bzero(&sun, sizeof(sun));
> -     sun.sun_family = AF_UNIX;
> -     strlcpy(sun.sun_path, path, sizeof(sun.sun_path));
> -
>       if (unlink(path) == -1)
>               if (errno != ENOENT) {
>                       log_warn("control_init: unlink %s", path);
> @@ -98,16 +108,6 @@ control_listen(void)
>       evtimer_set(&control_state.evt, control_accept, NULL);
>  
>       return (0);
> -}
> -
> -void
> -control_cleanup(char *path)
> -{
> -     if (path == NULL)
> -             return;
> -     event_del(&control_state.ev);
> -     event_del(&control_state.evt);
> -     unlink(path);
>  }
>  
>  /* ARGSUSED */
> Index: control.h
> ===================================================================
> RCS file: /cvs/src/usr.sbin/ospfd/control.h,v
> retrieving revision 1.6
> diff -u -p -r1.6 control.h
> --- control.h 10 Feb 2015 05:24:48 -0000      1.6
> +++ control.h 17 Aug 2018 21:02:36 -0000
> @@ -39,6 +39,5 @@ int control_listen(void);
>  void control_accept(int, short, void *);
>  void control_dispatch_imsg(int, short, void *);
>  int  control_imsg_relay(struct imsg *);
> -void control_cleanup(char *);
>  
>  #endif       /* _CONTROL_H_ */
> Index: ospfd.c
> ===================================================================
> RCS file: /cvs/src/usr.sbin/ospfd/ospfd.c,v
> retrieving revision 1.99
> diff -u -p -r1.99 ospfd.c
> --- ospfd.c   11 Jul 2018 12:09:34 -0000      1.99
> +++ ospfd.c   21 Aug 2018 21:39:23 -0000
> @@ -270,10 +270,6 @@ main(int argc, char *argv[])
>           iev_rde->handler, iev_rde);
>       event_add(&iev_rde->ev, NULL);
>  
> -     if (kr_init(!(ospfd_conf->flags & OSPFD_FLAG_NO_FIB_UPDATE),
> -         ospfd_conf->rdomain, ospfd_conf->redist_label_or_prefix) == -1)
> -             fatalx("kr_init failed");
> -
>       /* remove unneeded stuff from config */
>       while ((a = LIST_FIRST(&ospfd_conf->area_list)) != NULL) {
>               LIST_REMOVE(a, entry);
> @@ -300,7 +296,6 @@ ospfd_shutdown(void)
>       msgbuf_clear(&iev_rde->ibuf.w);
>       close(iev_rde->ibuf.fd);
>  
> -     control_cleanup(ospfd_conf->csock);
>       while ((r = SIMPLEQ_FIRST(&ospfd_conf->redist_list)) != NULL) {
>               SIMPLEQ_REMOVE_HEAD(&ospfd_conf->redist_list, entry);
>               free(r);
> @@ -389,6 +384,13 @@ main_dispatch_ospfe(int fd, short event,
>                               kr_ifinfo(imsg.data, imsg.hdr.pid);
>                       else
>                               log_warnx("IFINFO request with wrong len");
> +                     break;
> +             case IMSG_CTLSOCK_READY:
> +                     if (kr_init(
> +                         !(ospfd_conf->flags & OSPFD_FLAG_NO_FIB_UPDATE),
> +                         ospfd_conf->rdomain,
> +                         ospfd_conf->redist_label_or_prefix) == -1)
> +                             fatalx("kr_init failed");
>                       break;
>               case IMSG_DEMOTE:
>                       if (imsg.hdr.len - IMSG_HEADER_SIZE != sizeof(dmsg))
> Index: ospfd.h
> ===================================================================
> RCS file: /cvs/src/usr.sbin/ospfd/ospfd.h,v
> retrieving revision 1.101
> diff -u -p -r1.101 ospfd.h
> --- ospfd.h   25 Jun 2018 22:16:53 -0000      1.101
> +++ ospfd.h   21 Aug 2018 21:28:27 -0000
> @@ -101,6 +101,7 @@ enum imsg_type {
>       IMSG_CTL_IFINFO,
>       IMSG_CTL_END,
>       IMSG_CTL_LOG_VERBOSE,
> +     IMSG_CTLSOCK_READY,
>       IMSG_KROUTE_CHANGE,
>       IMSG_KROUTE_DELETE,
>       IMSG_IFINFO,
> Index: ospfe.c
> ===================================================================
> RCS file: /cvs/src/usr.sbin/ospfd/ospfe.c,v
> retrieving revision 1.101
> diff -u -p -r1.101 ospfe.c
> --- ospfe.c   25 Jun 2018 22:16:53 -0000      1.101
> +++ ospfe.c   21 Aug 2018 21:30:37 -0000
> @@ -192,6 +192,8 @@ ospfe(struct ospfd_conf *xconf, int pipe
>       /* listen on ospfd control socket */
>       TAILQ_INIT(&ctl_conns);
>       control_listen();
> +     if (ospfe_imsg_compose_parent(IMSG_CTLSOCK_READY, 0, NULL, 0) == -1)
> +             return (-1);
>  
>       if ((pkt_ptr = calloc(1, READ_BUF_SIZE)) == NULL)
>               fatal("ospfe");
> 

-- 
:wq Claudio

Reply via email to