Dave Voutila writes:

> vmd(8) users of tech@,
>
> NOTE: I have no intention to try to commit this prior to 6.9's release
> due to its complexity, but I didn't want to "wait" to solicit testers or
> potential feedback.

Freeze is over, so bumping this thread with an updated diff below.

>
> I noticed recently that I could not have two vmctl(8) clients "wait" for
> the same vm to shutdown as one would cancel the other. Worse yet, if you
> cancel a wait (^C) you can effectively corrupt the state being used for
> tracking the waiting client preventing future clients from waiting on
> the vm.
>
> It turns out the socket fd of the vmctl(8) client is being sent by the
> control process as the peerid in the imsg. This fd is being stored on
> the vmd_vm structure in the vm_peerid member, but this fd only has
> meaning in the scope of the control process. Consequently:
>
> - only 1 value can be stored at a time, meaning only 1 waiting client
>   can exist at a time
> - since vm_peerid is used for storing if another vmd(8) process is
>   waiting on the vm, vm_peerid can be corrupted by vmctl(8)
> - the control process cannot update waiting state on vmctl disconnects
>   and since fd's are reused it's possible the message could be sent to a
>   vmctl(8) client performing an operation other than a "wait"
>
> The below diff:
>
> 1. enables support for multiple vmctl(8) clients to wait on the same vm
>    to terminate
> 2. keeps the wait state in the control process and out of the parent's
>    global vm state, tracking waiting parties in a TAILQ
> 3. removes the waiting client state on client disconnect/cancellation
> 4. simplifies vmd(8) by removing IMSG_VMDOP_WAIT_VM_REQUEST handling
>    from the vmm process, which isn't needed (and was partially
>    responsible for the corruption)
>

Above design still stands, but I've fixed some messaging issues related
to the fact the parent process was forwarding
IMSG_VMDOP_TERMINATE_VM_RESPONSE messages directly to the control
process resulting in duplicate messages. This broke doing a `vmctl stop`
for all vms (-a) and waiting (-w). It now only forwards errors.

> There are some subsequent tweaks that may follow this diff, specifically
> one related to the fact I've switched the logic to send
> IMSG_VMDOP_TERMINATE_VM_EVENT messages to the control process (which
> makes sense to me) but control relays a IMSG_VMDOP_TERMINATE_VM_RESPONSE
> message to the waiting vmctl(8) client. I'd need to update vmctl(8) to
> look for the other event and don't want to complicate the diff further.
>
> If any testers out there can try to break this for me it would be much
> appreciated. :-)
>

Testers? I'd like to give people a few days to kick the tires before
asking for OK to commit.

-dv


Index: control.c
===================================================================
RCS file: /cvs/src/usr.sbin/vmd/control.c,v
retrieving revision 1.34
diff -u -p -r1.34 control.c
--- control.c   20 Apr 2021 21:11:56 -0000      1.34
+++ control.c   21 Apr 2021 17:17:04 -0000
@@ -41,6 +41,13 @@

 struct ctl_connlist ctl_conns = TAILQ_HEAD_INITIALIZER(ctl_conns);

+struct ctl_notify {
+       int                     ctl_fd;
+       uint32_t                ctl_vmid;
+       TAILQ_ENTRY(ctl_notify) entry;
+};
+TAILQ_HEAD(ctl_notify_q, ctl_notify) ctl_notify_q =
+       TAILQ_HEAD_INITIALIZER(ctl_notify_q);
 void
         control_accept(int, short, void *);
 struct ctl_conn
@@ -78,7 +85,10 @@ int
 control_dispatch_vmd(int fd, struct privsep_proc *p, struct imsg *imsg)
 {
        struct ctl_conn         *c;
+       struct ctl_notify       *notify = NULL, *notify_next;
        struct privsep          *ps = p->p_ps;
+       struct vmop_result       vmr;
+       int                      waiting = 0;

        switch (imsg->hdr.type) {
        case IMSG_VMDOP_START_VM_RESPONSE:
@@ -86,11 +96,11 @@ control_dispatch_vmd(int fd, struct priv
        case IMSG_VMDOP_SEND_VM_RESPONSE:
        case IMSG_VMDOP_RECEIVE_VM_RESPONSE:
        case IMSG_VMDOP_UNPAUSE_VM_RESPONSE:
-       case IMSG_VMDOP_TERMINATE_VM_RESPONSE:
        case IMSG_VMDOP_GET_INFO_VM_DATA:
        case IMSG_VMDOP_GET_INFO_VM_END_DATA:
        case IMSG_CTL_FAIL:
        case IMSG_CTL_OK:
+               /* Provide basic response back to a specific control client */
                if ((c = control_connbyfd(imsg->hdr.peerid)) == NULL) {
                        log_warnx("%s: lost control connection: fd %d",
                            __func__, imsg->hdr.peerid);
@@ -99,6 +109,61 @@ control_dispatch_vmd(int fd, struct priv
                imsg_compose_event(&c->iev, imsg->hdr.type,
                    0, 0, imsg->fd, imsg->data, IMSG_DATA_SIZE(imsg));
                break;
+       case IMSG_VMDOP_TERMINATE_VM_RESPONSE:
+               IMSG_SIZE_CHECK(imsg, &vmr);
+               memcpy(&vmr, imsg->data, sizeof(vmr));
+
+               if ((c = control_connbyfd(imsg->hdr.peerid)) == NULL) {
+                       log_warnx("%s: lost control connection: fd %d",
+                           __func__, imsg->hdr.peerid);
+                       return (0);
+               }
+
+               TAILQ_FOREACH(notify, &ctl_notify_q, entry) {
+                       if (notify->ctl_fd == (int) imsg->hdr.peerid) {
+                               /*
+                                * Update if waiting by vm name. This is only
+                                * supported when stopping a single vm. If
+                                * stopping all vms, vmctl(8) sends the request
+                                * using the vmid.
+                                */
+                               if (notify->ctl_vmid < 1)
+                                       notify->ctl_vmid = vmr.vmr_id;
+                               waiting = 1;
+                               break;
+                       }
+               }
+
+               /* An error needs to be relayed to the client immediately */
+               if (!waiting || vmr.vmr_result) {
+                       imsg_compose_event(&c->iev, imsg->hdr.type,
+                           0, 0, imsg->fd, imsg->data, IMSG_DATA_SIZE(imsg));
+
+                       if (notify) {
+                               TAILQ_REMOVE(&ctl_notify_q, notify, entry);
+                               free(notify);
+                       }
+               }
+               break;
+       case IMSG_VMDOP_TERMINATE_VM_EVENT:
+               /* Notify any waiting clients that a VM terminated */
+               IMSG_SIZE_CHECK(imsg, &vmr);
+               memcpy(&vmr, imsg->data, sizeof(vmr));
+
+               TAILQ_FOREACH_SAFE(notify, &ctl_notify_q, entry, notify_next) {
+                       if (notify->ctl_vmid != vmr.vmr_id)
+                               continue;
+                       if ((c = control_connbyfd(notify->ctl_fd)) != NULL) {
+                               /* XXX vmctl expects *_RESPONSE, not *_EVENT */
+                               imsg_compose_event(&c->iev,
+                                   IMSG_VMDOP_TERMINATE_VM_RESPONSE,
+                                   0, 0, imsg->fd, imsg->data,
+                                   IMSG_DATA_SIZE(imsg));
+                               TAILQ_REMOVE(&ctl_notify_q, notify, entry);
+                               free(notify);
+                       }
+               }
+               break;
        case IMSG_VMDOP_CONFIG:
                config_getconfig(ps->ps_env, imsg);
                proc_compose(ps, PROC_PARENT, IMSG_VMDOP_DONE, NULL, 0);
@@ -276,7 +341,8 @@ control_connbyfd(int fd)
 void
 control_close(int fd, struct control_sock *cs)
 {
-       struct ctl_conn *c;
+       struct ctl_conn         *c;
+       struct ctl_notify       *notify, *notify_next;

        if ((c = control_connbyfd(fd)) == NULL) {
                log_warn("%s: fd %d: not found", __func__, fd);
@@ -286,6 +352,14 @@ control_close(int fd, struct control_soc
        msgbuf_clear(&c->iev.ibuf.w);
        TAILQ_REMOVE(&ctl_conns, c, entry);

+       TAILQ_FOREACH_SAFE(notify, &ctl_notify_q, entry, notify_next) {
+               if (notify->ctl_fd == fd) {
+                       TAILQ_REMOVE(&ctl_notify_q, notify, entry);
+                       free(notify);
+                       break;
+               }
+       }
+
        event_del(&c->iev.ev);
        close(c->iev.ibuf.fd);

@@ -308,7 +382,8 @@ control_dispatch_imsg(int fd, short even
        struct imsg                      imsg;
        struct vmop_create_params        vmc;
        struct vmop_id                   vid;
-       int                              n, v, ret = 0;
+       struct ctl_notify               *notify;
+       int                              n, v, wait = 0, ret = 0;

        if ((c = control_connbyfd(fd)) == NULL) {
                log_warn("%s: fd %d: not found", __func__, fd);
@@ -388,11 +463,25 @@ control_dispatch_imsg(int fd, short even
                        }
                        break;
                case IMSG_VMDOP_WAIT_VM_REQUEST:
+                       wait = 1;
+                       /* FALLTHROUGH */
                case IMSG_VMDOP_TERMINATE_VM_REQUEST:
                        if (IMSG_DATA_SIZE(&imsg) < sizeof(vid))
                                goto fail;
                        memcpy(&vid, imsg.data, sizeof(vid));
                        vid.vid_uid = c->peercred.uid;
+
+                       if (wait || vid.vid_flags & VMOP_WAIT) {
+                               vid.vid_flags |= VMOP_WAIT;
+                               notify = calloc(1, sizeof(struct ctl_notify));
+                               if (notify == NULL)
+                                       fatal("%s: calloc", __func__);
+                               notify->ctl_vmid = vid.vid_id;
+                               notify->ctl_fd = fd;
+                               TAILQ_INSERT_TAIL(&ctl_notify_q, notify, entry);
+                               log_debug("%s: registered wait for peer %d",
+                                   __func__, fd);
+                       }

                        if (proc_compose_imsg(ps, PROC_PARENT, -1,
                            imsg.hdr.type, fd, -1, &vid, sizeof(vid)) == -1) {
Index: vmd.c
===================================================================
RCS file: /cvs/src/usr.sbin/vmd/vmd.c,v
retrieving revision 1.122
diff -u -p -r1.122 vmd.c
--- vmd.c       5 Apr 2021 11:35:26 -0000       1.122
+++ vmd.c       21 Apr 2021 17:17:04 -0000
@@ -128,42 +128,41 @@ vmd_dispatch_control(int fd, struct priv
                IMSG_SIZE_CHECK(imsg, &vid);
                memcpy(&vid, imsg->data, sizeof(vid));
                flags = vid.vid_flags;
+               cmd = IMSG_VMDOP_TERMINATE_VM_RESPONSE;

                if ((id = vid.vid_id) == 0) {
                        /* Lookup vm (id) by name */
                        if ((vm = vm_getbyname(vid.vid_name)) == NULL) {
                                res = ENOENT;
-                               cmd = IMSG_VMDOP_TERMINATE_VM_RESPONSE;
                                break;
                        } else if ((vm->vm_state & VM_STATE_SHUTDOWN) &&
                            (flags & VMOP_FORCE) == 0) {
                                res = EALREADY;
-                               cmd = IMSG_VMDOP_TERMINATE_VM_RESPONSE;
                                break;
                        } else if (!(vm->vm_state & VM_STATE_RUNNING)) {
                                res = EINVAL;
-                               cmd = IMSG_VMDOP_TERMINATE_VM_RESPONSE;
                                break;
                        }
                        id = vm->vm_vmid;
                } else if ((vm = vm_getbyvmid(id)) == NULL) {
                        res = ENOENT;
-                       cmd = IMSG_VMDOP_TERMINATE_VM_RESPONSE;
                        break;
                }
-               if (vm_checkperm(vm, &vm->vm_params.vmc_owner,
-                   vid.vid_uid) != 0) {
+               if (vm_checkperm(vm, &vm->vm_params.vmc_owner, vid.vid_uid)) {
                        res = EPERM;
-                       cmd = IMSG_VMDOP_TERMINATE_VM_RESPONSE;
                        break;
                }

-               memset(&vid, 0, sizeof(vid));
-               vid.vid_id = id;
-               vid.vid_flags = flags;
-               if (proc_compose_imsg(ps, PROC_VMM, -1, imsg->hdr.type,
-                   imsg->hdr.peerid, -1, &vid, sizeof(vid)) == -1)
-                       return (-1);
+               /* Only relay TERMINATION requests, not WAIT requests */
+               if (imsg->hdr.type == IMSG_VMDOP_TERMINATE_VM_REQUEST) {
+                       memset(&vid, 0, sizeof(vid));
+                       vid.vid_id = id;
+                       vid.vid_flags = flags;
+
+                       if (proc_compose_imsg(ps, PROC_VMM, -1, imsg->hdr.type,
+                               imsg->hdr.peerid, -1, &vid, sizeof(vid)) == -1)
+                               return (-1);
+               }
                break;
        case IMSG_VMDOP_GET_INFO_VM_REQUEST:
                proc_forward_imsg(ps, imsg, PROC_VMM, -1);
@@ -420,12 +419,14 @@ vmd_dispatch_vmm(int fd, struct privsep_
        case IMSG_VMDOP_TERMINATE_VM_RESPONSE:
                IMSG_SIZE_CHECK(imsg, &vmr);
                memcpy(&vmr, imsg->data, sizeof(vmr));
-               DPRINTF("%s: forwarding TERMINATE VM for vm id %d",
-                   __func__, vmr.vmr_id);
-               proc_forward_imsg(ps, imsg, PROC_CONTROL, -1);
-               if ((vm = vm_getbyvmid(vmr.vmr_id)) == NULL)
-                       break;
-               if (vmr.vmr_result == 0) {
+
+               if (vmr.vmr_result) {
+                       DPRINTF("%s: forwarding TERMINATE VM for vm id %d",
+                           __func__, vmr.vmr_id);
+                       proc_forward_imsg(ps, imsg, PROC_CONTROL, -1);
+               } else {
+                       if ((vm = vm_getbyvmid(vmr.vmr_id)) == NULL)
+                               break;
                        /* Mark VM as shutting down */
                        vm->vm_state |= VM_STATE_SHUTDOWN;
                }
@@ -478,16 +479,13 @@ vmd_dispatch_vmm(int fd, struct privsep_
                        config_setvm(ps, vm, (uint32_t)-1, vm->vm_uid);
                }

-               /* Send a response if a control client is waiting for it */
-               if (imsg->hdr.peerid != (uint32_t)-1) {
-                       /* the error is meaningless for deferred responses */
-                       vmr.vmr_result = 0;
+               /* The error is meaningless for deferred responses */
+               vmr.vmr_result = 0;

-                       if (proc_compose_imsg(ps, PROC_CONTROL, -1,
-                           IMSG_VMDOP_TERMINATE_VM_RESPONSE,
-                           imsg->hdr.peerid, -1, &vmr, sizeof(vmr)) == -1)
-                               return (-1);
-               }
+               if (proc_compose_imsg(ps, PROC_CONTROL, -1,
+                       IMSG_VMDOP_TERMINATE_VM_EVENT,
+                       imsg->hdr.peerid, -1, &vmr, sizeof(vmr)) == -1)
+                       return (-1);
                break;
        case IMSG_VMDOP_GET_INFO_VM_DATA:
                IMSG_SIZE_CHECK(imsg, &vir);
Index: vmm.c
===================================================================
RCS file: /cvs/src/usr.sbin/vmd/vmm.c,v
retrieving revision 1.100
diff -u -p -r1.100 vmm.c
--- vmm.c       11 Apr 2021 21:02:40 -0000      1.100
+++ vmm.c       21 Apr 2021 17:17:04 -0000
@@ -150,30 +150,6 @@ vmm_dispatch_parent(int fd, struct privs
                        res = ENOENT;
                cmd = IMSG_VMDOP_START_VM_RESPONSE;
                break;
-       case IMSG_VMDOP_WAIT_VM_REQUEST:
-               IMSG_SIZE_CHECK(imsg, &vid);
-               memcpy(&vid, imsg->data, sizeof(vid));
-               id = vid.vid_id;
-
-               DPRINTF("%s: recv'ed WAIT_VM for %d", __func__, id);
-
-               cmd = IMSG_VMDOP_TERMINATE_VM_RESPONSE;
-               if (id == 0) {
-                       res = ENOENT;
-               } else if ((vm = vm_getbyvmid(id)) != NULL) {
-                       if (vm->vm_peerid != (uint32_t)-1) {
-                               peerid = vm->vm_peerid;
-                               res = EINTR;
-                       } else
-                               cmd = 0;
-                       vm->vm_peerid = imsg->hdr.peerid;
-               } else {
-                       /* vm doesn't exist, cannot stop vm */
-                       log_debug("%s: cannot stop vm that is not running",
-                           __func__);
-                       res = VMD_VM_STOP_INVALID;
-               }
-               break;
        case IMSG_VMDOP_TERMINATE_VM_REQUEST:
                IMSG_SIZE_CHECK(imsg, &vid);
                memcpy(&vid, imsg->data, sizeof(vid));
@@ -221,15 +197,6 @@ vmm_dispatch_parent(int fd, struct privs
                                            __func__);
                                        res = VMD_VM_STOP_INVALID;
                                }
-                       }
-                       if ((flags & VMOP_WAIT) &&
-                           res == 0 && (vm->vm_state & VM_STATE_SHUTDOWN)) {
-                               if (vm->vm_peerid != (uint32_t)-1) {
-                                       peerid = vm->vm_peerid;
-                                       res = EINTR;
-                               } else
-                                       cmd = 0;
-                               vm->vm_peerid = imsg->hdr.peerid;
                        }
                } else {
                        /* VM doesn't exist, cannot stop vm */

Reply via email to