On 9/30/19 11:32 AM, Richard W.M. Jones wrote:
This adds new APIs for running a local NBD server and connecting to it
using systemd socket activation (instead of stdin/stdout).

This includes interop tests against nbdkit and qemu-nbd which I
believe are the only NBD servers supporting socket activation.  (If we
find others then we can add more interop tests in future.)

The upstream spec for systemd socket activation is here:
http://0pointer.de/blog/projects/socket-activation.html
---

+++ b/generator/states-connect-socket-activation.c

+/* This is baked into the systemd socket activation API. */
+#define FIRST_SOCKET_ACTIVATION_FD 3
+
+/* Prepare environment for calling execvpe when doing systemd socket
+ * activation.  Takes the current environment and copies it.  Removes
+ * any existing LISTEN_PID or LISTEN_FDS and replaces them with new
+ * variables.  env[0] is "LISTEN_PID=..." which is filled in by
+ * CONNECT_SA.START, and env[1] is "LISTEN_FDS=1".
+ */

I know that getenv()/setenv()/putenv() tend to prefer sorted environ, but I also think that exec HAS to handle a hand-built environ that is not sorted, so you should be okay with this shortcut.

+static char **
+prepare_socket_activation_environment (void)
+{
+  char **env = NULL;
+  char *p0 = NULL, *p1 = NULL;
+  size_t i, len;
+  void *vp;
+
+  p0 = strdup ("LISTEN_PID=xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx");
+  if (p0 == NULL)
+    goto err;
+  p1 = strdup ("LISTEN_FDS=1");
+  if (p1 == NULL)
+    goto err;
+
+  /* Copy the current environment. */
+  env = nbd_internal_copy_string_list (environ);

POSIX says the external symbol 'environ' has to exist for linking purposes, but also states that no standard header is required to declare it. You may want to add an 'extern char **environ;' line before this function for portability. On the other hand, gnulib documents that on newer Mac OS, even that doesn't work, where the solution is '#define environ (*_NSGetEnviron())'. I guess we'll deal with it when somebody actually reports compilation failure.

+
+  /* Remove any existing LISTEN_PID or LISTEN_FDS instances. */
+  for (i = 2; env[i] != NULL; ++i) {
+    if (strncmp (env[i], "LISTEN_PID=", 11) == 0 ||
+        strncmp (env[i], "LISTEN_FDS=", 11) == 0) {
+      memmove (&env[i], &env[i+1],
+               sizeof (char *) * (nbd_internal_string_list_length (&env[i])));
+      i--;
+    }
+  }

Lots of O(N) traversals of the list, but this probably isn't our hot spot, and so probably not worth optimizing further.


+STATE_MACHINE {
+ CONNECT_SA.START:
+#ifdef HAVE_EXECVPE
+  size_t len;
+  int s;
+  struct sockaddr_un addr;
+  char **env;
+  pid_t pid;
+  int flags;
+
+  assert (!h->sock);
+  assert (h->argv);
+  assert (h->argv[0]);
+
+  /* Use /tmp instead of TMPDIR because we must ensure the path is
+   * short enough to store in the sockaddr_un.  On some platforms this
+   * may cause problems so we may need to revisit it.  XXX
+   */
+  h->sa_tmpdir = strdup ("/tmp/libnbdXXXXXX");
+  if (h->sa_tmpdir == NULL) {
+    SET_NEXT_STATE (%.DEAD);
+    set_error (errno, "strdup");
+    return 0;
+  }
+  if (mkdtemp (h->sa_tmpdir) == NULL) {
+    SET_NEXT_STATE (%.DEAD);
+    set_error (errno, "mkdtemp");
+    /* Avoid cleanup in nbd_close. */
+    free (h->sa_tmpdir);
+    h->sa_tmpdir = NULL;
+    return 0;
+  }
+
+  h->sa_sockpath = strdup ("/tmp/libnbdXXXXXX/sock");
+  if (h->sa_sockpath == NULL) {
+    SET_NEXT_STATE (%.DEAD);
+    set_error (errno, "strdup");
+    return 0;
+  }
+
+  len = strlen (h->sa_tmpdir);
+  memcpy (h->sa_sockpath, h->sa_tmpdir, len);

Is it worth using:

asprintf (&h->sa_sockpath, "%s/sock", h->sa_tmpdir);

for less code? asprintf might not be standard, but we already require execvpe, which probably means asprintf is available. But your open-coded variant works, too.

+
+  s = socket (AF_UNIX, SOCK_STREAM|SOCK_CLOEXEC, 0);
+  if (s == -1) {
+    SET_NEXT_STATE (%.DEAD);
+    set_error (errno, "socket");
+    return 0;
+  }

I guess the child process can add O_NONBLOCK if they want it.

+
+  addr.sun_family = AF_UNIX;
+  memcpy (addr.sun_path, h->sa_sockpath, strlen (h->sa_sockpath) + 1);
+  if (bind (s, (struct sockaddr *) &addr, sizeof addr) == -1) {
+    SET_NEXT_STATE (%.DEAD);
+    set_error (errno, "bind: %s", h->sa_sockpath);
+    close (s);
+    return 0;
+  }
+
+  if (listen (s, 1) == -1) {
+    SET_NEXT_STATE (%.DEAD);
+    set_error (errno, "listen");
+    close (s);
+    return 0;
+  }
+
+  env = prepare_socket_activation_environment ();
+  if (!env) {
+    SET_NEXT_STATE (%.DEAD);
+    close (s);
+    return 0;
+  }
+
+  pid = fork ();
+  if (pid == -1) {
+    SET_NEXT_STATE (%.DEAD);
+    set_error (errno, "fork");
+    close (s);
+    nbd_internal_free_string_list (env);
+    return 0;
+  }
+  if (pid == 0) {         /* child - run command */
+    if (s != FIRST_SOCKET_ACTIVATION_FD) {
+      dup2 (s, FIRST_SOCKET_ACTIVATION_FD);
+      close (s);
+    }
+    else {
+      /* We must unset CLOEXEC on the fd.  (dup2 above does this
+       * implicitly because CLOEXEC is set on the fd, not on the
+       * socket).
+       */
+      flags = fcntl (s, F_GETFD, 0);
+      if (flags == -1) {
+        nbd_internal_fork_safe_perror ("fcntl: F_GETFD");
+        _exit (126);
+      }
+      if (fcntl (s, F_SETFD, flags & ~FD_CLOEXEC) == -1) {
+        nbd_internal_fork_safe_perror ("fcntl: F_SETFD");
+        _exit (126);
+      }
+    }
+

Looks correct.

+    char buf[32];
+    const char *v =
+      nbd_internal_fork_safe_itoa ((long) getpid (), buf, sizeof buf);
+    strcpy (&env[0][11], v);

We're using the magic '11' in several places, maybe it's worth a #define to make it obvious it is strlen("LISTEN_PID=") ?

+
+    /* Restore SIGPIPE back to SIG_DFL. */
+    signal (SIGPIPE, SIG_DFL);
+
+    execvpe (h->argv[0], h->argv, env);
+    nbd_internal_fork_safe_perror (h->argv[0]);
+    if (errno == ENOENT)
+      _exit (127);
+    else
+      _exit (126);
+  }
+
+  /* Parent. */
+  close (s);
+  nbd_internal_free_string_list (env);
+  h->pid = pid;
+
+  h->connaddrlen = sizeof addr;
+  memcpy (&h->connaddr, &addr, h->connaddrlen);
+  SET_NEXT_STATE (%^CONNECT.START);
+  return 0;
+
+#else /* !HAVE_EXECVPE */
+  SET_NEXT_STATE (%.DEAD)
+  set_error (ENOTSUP, "platform does not support socket activation");
+  return 0;
+#endif

We probably ought to add a matching nbd_supports_socket_activation() feature function.

Or, it would be possible to create a fallback for execvpe() on platforms that lack it by using execlpe() and our own path-walker utility function. Can be done as a followup patch. If we do that, then the mere presence of LIBNBD_HAVE_NBD_CONNECT_SA is witness enough of the functionality, rather than needing a runtime probe.


+++ b/lib/connect.c

+
+int
+nbd_unlocked_aio_connect_socket_activation (struct nbd_handle *h, char **argv)
+{
+  char **copy;
+
+  copy = nbd_internal_copy_string_list (argv);
+  if (!copy) {
+    set_error (errno, "copy_string_list");
+    return -1;
+  }
+
+  if (h->argv)
+    nbd_internal_free_string_list (h->argv);

How can h->argv ever be previously set?

+  h->argv = copy;
+
+  return nbd_internal_run (h, cmd_connect_sa);
+}
diff --git a/lib/handle.c b/lib/handle.c
index 2af25fe..a7f2c79 100644
--- a/lib/handle.c
+++ b/lib/handle.c
@@ -129,6 +129,16 @@ nbd_close (struct nbd_handle *h)
    free_cmd_list (h->cmds_in_flight);
    free_cmd_list (h->cmds_done);
    nbd_internal_free_string_list (h->argv);
+  if (h->sa_sockpath) {
+    if (h->pid > 0)
+      kill (h->pid, SIGTERM);
+    unlink (h->sa_sockpath);
+    free (h->sa_sockpath);
+  }
+  if (h->sa_tmpdir) {
+    rmdir (h->sa_tmpdir);
+    free (h->sa_tmpdir);
+  }
    free (h->unixsocket);
    free (h->hostname);
    free (h->port);

Somewhat pre-existing: we have a waitpid() here (good, so we don't hang on to a zombie process), but we are relying on the child process to gracefully go away (whether for connect_command when stdin closes, or for connect_sa on receipt of SIGTERM). Do we need a retry loop that escalates to SIGKILL if the child process does not quickly respond to the initial condition? On the other hand, the fact that our waitpid() blocks until the child changes status means that if a child ever wedges, the fact that we wedge too gives some visibility to the client that it's not libnbd's fault and that they need to get the bug fixed in their child process.

I think it is ready to push. We may still need further tweaks, but that's often the case.

--
Eric Blake, Principal Software Engineer
Red Hat, Inc.           +1-919-301-3226
Virtualization:  qemu.org | libvirt.org

_______________________________________________
Libguestfs mailing list
[email protected]
https://www.redhat.com/mailman/listinfo/libguestfs

Reply via email to