The current NBD connection logic has the following issues: 1.It first tries netlink (forking a child), then falls back to ioctl (forking another), causing redundant process overhead and double-opening of erofs_nbd_source on fallback. 2.Child processes fail silently, hiding the error cause from the parent and confusing users. 3.erofsmount_startnbd() doesn't ignore SIGPIPE, causing nbd_loopfn to be killed abruptly without clean up during disconnect. 4.During disconnect, -EPIPE from NBD socket I/O is expected, but erofsmount_nbd_loopfn() does not suppress it, leading to uncessary "NBD worker failed with EPIPE" message printed in erofsmount_startnbd().
This patch consolidates the netlink and ioctl fallback logic into a single child process, eliminating redundant erofs_nbd_source opens. It also ensure SIGPIPE and -EPIPE are properly suppressed during disconnect in erofsmount_nbd_loopfn(), enabling cleanup and graceful exit. Additionally, the child process now reports error code via exit() for better user visibility. Signed-off-by: Yifan Zhao <[email protected]> --- diff against v1: - fix incorrectly configured commit author name and email Also, I believe corresponding tests are needed to safeguard this code change and I am working on them. lib/backends/nbd.c | 1 + mount/main.c | 346 +++++++++++++++++++++++++-------------------- 2 files changed, 194 insertions(+), 153 deletions(-) diff --git a/lib/backends/nbd.c b/lib/backends/nbd.c index da27334..46e75cd 100644 --- a/lib/backends/nbd.c +++ b/lib/backends/nbd.c @@ -108,6 +108,7 @@ int erofs_nbd_devscan(void) errno = 0; dp = readdir(_dir); if (!dp) { + erofs_err("no available nbd device found in /sys/block"); if (errno) err = -errno; else diff --git a/mount/main.c b/mount/main.c index 02a7962..2a21979 100644 --- a/mount/main.c +++ b/mount/main.c @@ -610,6 +610,32 @@ struct erofsmount_nbd_ctx { struct erofs_vfile sk; /* socket file */ }; +static int erofsmount_open_nbd_source(struct erofs_vfile *vd, + struct erofs_nbd_source *source) +{ + int err; + + if (source->type == EROFSNBD_SOURCE_OCI) { + if (source->ocicfg.tarindex_path || source->ocicfg.zinfo_path) { + err = erofsmount_tarindex_open(vd, &source->ocicfg, + source->ocicfg.tarindex_path, + source->ocicfg.zinfo_path); + if (err) + return err; + } else { + err = ocierofs_io_open(vd, &source->ocicfg); + if (err) + return err; + } + } else { + err = open(source->device_path, O_RDONLY); + if (err < 0) + return -errno; + vd->fd = err; + } + return 0; +} + static void *erofsmount_nbd_loopfn(void *arg) { struct erofsmount_nbd_ctx *ctx = arg; @@ -621,11 +647,8 @@ static void *erofsmount_nbd_loopfn(void *arg) off_t pos; err = erofs_nbd_get_request(ctx->sk.fd, &rq); - if (err < 0) { - if (err == -EPIPE) - err = 0; + if (err < 0) break; - } if (rq.type != EROFS_NBD_CMD_READ) { err = erofs_nbd_send_reply_header(ctx->sk.fd, @@ -653,64 +676,11 @@ static void *erofsmount_nbd_loopfn(void *arg) out: erofs_io_close(&ctx->vd); erofs_io_close(&ctx->sk); + if (err == -EPIPE) + err = 0; return (void *)(uintptr_t)err; } -static int erofsmount_startnbd(int nbdfd, struct erofs_nbd_source *source) -{ - struct erofsmount_nbd_ctx ctx = {}; - uintptr_t retcode; - pthread_t th; - int err, err2; - - if (source->type == EROFSNBD_SOURCE_OCI) { - if (source->ocicfg.tarindex_path || source->ocicfg.zinfo_path) { - err = erofsmount_tarindex_open(&ctx.vd, &source->ocicfg, - source->ocicfg.tarindex_path, - source->ocicfg.zinfo_path); - if (err) - goto out_closefd; - } else { - err = ocierofs_io_open(&ctx.vd, &source->ocicfg); - if (err) - goto out_closefd; - } - } else { - err = open(source->device_path, O_RDONLY); - if (err < 0) { - err = -errno; - goto out_closefd; - } - ctx.vd.fd = err; - } - - err = erofs_nbd_connect(nbdfd, 9, EROFSMOUNT_NBD_DISK_SIZE); - if (err < 0) { - erofs_io_close(&ctx.vd); - goto out_closefd; - } - ctx.sk.fd = err; - - err = -pthread_create(&th, NULL, erofsmount_nbd_loopfn, &ctx); - if (err) { - erofs_io_close(&ctx.vd); - erofs_io_close(&ctx.sk); - goto out_closefd; - } - - err = erofs_nbd_do_it(nbdfd); - err2 = -pthread_join(th, (void **)&retcode); - if (!err2 && retcode) { - erofs_err("NBD worker failed with %s", - erofs_strerror(retcode)); - err2 = retcode; - } - return err ?: err2; -out_closefd: - close(nbdfd); - return err; -} - #ifdef OCIEROFS_ENABLED static int erofsmount_write_recovery_oci(FILE *f, struct erofs_nbd_source *source) { @@ -1013,77 +983,136 @@ static int erofsmount_nbd_fix_backend_linkage(int num, char **recp) return 0; } -static int erofsmount_startnbd_nl(pid_t *pid, struct erofs_nbd_source *source) +struct erofsmount_nbd_msg { + int nbdnum; + bool is_netlink; +}; + +static void erofsmount_startnbd_ioctl(struct erofsmount_nbd_ctx *ctx, struct erofsmount_nbd_msg *msg, int pipefd) { - int pipefd[2], err, num; + uintptr_t retcode; + pthread_t th; + char nbddev[32]; + int err = 0, err2 = 0, nbdfd; - err = pipe(pipefd); - if (err < 0) - return -errno; + msg->nbdnum = erofs_nbd_devscan(); + if (msg->nbdnum < 0) { + close(pipefd); + err = msg->nbdnum; + goto err_exit; + } - if ((*pid = fork()) == 0) { - struct erofsmount_nbd_ctx ctx = {}; - char *recp; - - /* Otherwise, NBD disconnect sends SIGPIPE, skipping cleanup */ - if (signal(SIGPIPE, SIG_IGN) == SIG_ERR) - exit(EXIT_FAILURE); - - if (source->type == EROFSNBD_SOURCE_OCI) { - if (source->ocicfg.tarindex_path || source->ocicfg.zinfo_path) { - err = erofsmount_tarindex_open(&ctx.vd, &source->ocicfg, - source->ocicfg.tarindex_path, - source->ocicfg.zinfo_path); - if (err) - exit(EXIT_FAILURE); - } else { - err = ocierofs_io_open(&ctx.vd, &source->ocicfg); - if (err) - exit(EXIT_FAILURE); - } - } else { - err = open(source->device_path, O_RDONLY); - if (err < 0) - exit(EXIT_FAILURE); - ctx.vd.fd = err; - } + (void)snprintf(nbddev, sizeof(nbddev), "/dev/nbd%d", msg->nbdnum); + nbdfd = open(nbddev, O_RDWR); + if (nbdfd < 0) { + close(pipefd); + err = -errno; + goto err_exit; + } + + err = erofs_nbd_connect(nbdfd, 9, EROFSMOUNT_NBD_DISK_SIZE); + if (err < 0) { + close(pipefd); + goto err_nbdfd; + } + ctx->sk.fd = err; + + /* Send device number to parent */ + err = write(pipefd, msg, sizeof(*msg)); + close(pipefd); + if (err != sizeof(*msg)) { + err = err < 0 ? -errno : -EIO; + goto err_nbdfd; + } + + err = -pthread_create(&th, NULL, erofsmount_nbd_loopfn, ctx); + if (err) { + erofs_io_close(&ctx->sk); + goto err_nbdfd; + } + + err = erofs_nbd_do_it(nbdfd); + err2 = -pthread_join(th, (void **)&retcode); + if (!err2 && retcode) { + erofs_err("NBD worker failed with %s", + erofs_strerror(retcode)); + err2 = retcode; + } + close(nbdfd); + exit(-(err ?: err2)); + +err_nbdfd: + close(nbdfd); +err_exit: + erofs_io_close(&ctx->vd); + exit(-err); +} + +/* Try to start NBD server with netlink first; fall back to ioctl if failed */ +static void erofsmount_startnbd(struct erofs_nbd_source *source, int pipefd) +{ + struct erofsmount_nbd_ctx ctx = {}; + int err; + char *recp; + struct erofsmount_nbd_msg msg = { .is_netlink = false }; + + /* Otherwise, NBD disconnect sends SIGPIPE, skipping cleanup */ + if (signal(SIGPIPE, SIG_IGN) == SIG_ERR) { + close(pipefd); + exit(errno); + } + + err = erofsmount_open_nbd_source(&ctx.vd, source); + if (err) { + close(pipefd); + exit(-err); + } + + { + /* Try netlink-based NBD first */ recp = erofsmount_write_recovery_info(source); - if (IS_ERR(recp)) { - erofs_io_close(&ctx.vd); - exit(EXIT_FAILURE); + if (IS_ERR(recp)) + goto fallback_ioctl; + + msg.nbdnum = -1; + err = erofs_nbd_nl_connect(&msg.nbdnum, 9, EROFSMOUNT_NBD_DISK_SIZE, recp); + if (err < 0) + goto err_recp; + + ctx.sk.fd = err; + err = erofsmount_nbd_fix_backend_linkage(msg.nbdnum, &recp); + if (err) { + erofs_io_close(&ctx.sk); + goto err_recp; } - num = -1; - err = erofs_nbd_nl_connect(&num, 9, EROFSMOUNT_NBD_DISK_SIZE, recp); - if (err >= 0) { - ctx.sk.fd = err; - err = erofsmount_nbd_fix_backend_linkage(num, &recp); - if (err) { - erofs_io_close(&ctx.sk); - } else { - err = write(pipefd[1], &num, sizeof(int)); - if (err < 0) - err = -errno; - close(pipefd[1]); - close(pipefd[0]); - if (err >= sizeof(int)) { - err = (int)(uintptr_t)erofsmount_nbd_loopfn(&ctx); - goto out_fork; - } - } + /* Succeed in starting netlink-based NBD */ + msg.is_netlink = true; + + /* Send device number to parent */ + err = write(pipefd, &msg, sizeof(msg)); + close(pipefd); + if (err != sizeof(msg)) { + /* will not fall back if encounter pipe write error */ + err = err < 0 ? -errno : -EIO; + erofs_io_close(&ctx.sk); + goto err_recp; } - erofs_io_close(&ctx.vd); -out_fork: - (void)unlink(recp); - free(recp); - exit(err ? EXIT_FAILURE : EXIT_SUCCESS); + + err = (int)(uintptr_t)erofsmount_nbd_loopfn(&ctx); } - close(pipefd[1]); - err = read(pipefd[0], &num, sizeof(int)); - close(pipefd[0]); - if (err < sizeof(int)) - return -EPIPE; - return num; + +err_recp: + (void)unlink(recp); + free(recp); + +fallback_ioctl: + if (!msg.is_netlink) { + erofs_info("Fall back to ioctl-based NBD; failover is unsupported"); + erofsmount_startnbd_ioctl(&ctx, &msg, pipefd); + /* never returns */ + } + exit(-err); } static int erofsmount_reattach(const char *target) @@ -1192,10 +1221,11 @@ static int erofsmount_nbd(struct erofs_nbd_source *source, const char *mountpoint, const char *fstype, int flags, const char *options) { - bool is_netlink = false; char nbdpath[32], *id; - int num, nbdfd; - pid_t pid = 0; + struct erofsmount_nbd_msg msg; + int pipefd[2]; + pid_t child; + int child_status; long err; if (strcmp(fstype, "erofs")) { @@ -1205,59 +1235,69 @@ static int erofsmount_nbd(struct erofs_nbd_source *source, } flags |= MS_RDONLY; - err = erofsmount_startnbd_nl(&pid, source); - if (err < 0) { - erofs_info("Fall back to ioctl-based NBD; failover is unsupported"); - num = erofs_nbd_devscan(); - if (num < 0) - return num; + err = pipe(pipefd); + if (err < 0) + return -errno; - (void)snprintf(nbdpath, sizeof(nbdpath), "/dev/nbd%d", num); - nbdfd = open(nbdpath, O_RDWR); - if (nbdfd < 0) - return -errno; + if ((child = fork()) == 0) { + close(pipefd[0]); + erofsmount_startnbd(source, pipefd[1]); + /* Never returns */ + } - if ((pid = fork()) == 0) - return erofsmount_startnbd(nbdfd, source) ? - EXIT_FAILURE : EXIT_SUCCESS; - close(nbdfd); - } else { - num = err; - (void)snprintf(nbdpath, sizeof(nbdpath), "/dev/nbd%d", num); - is_netlink = true; + close(pipefd[1]); + err = read(pipefd[0], &msg, sizeof(msg)); + close(pipefd[0]); + + if (err != sizeof(msg)) { + int retries = 5; + + while (retries-- > 0) { + if (waitpid(child, &child_status, WNOHANG) > 0) { + /* child exited, return its exit status */ + return WIFEXITED(child_status) ? -WEXITSTATUS(child_status) : -EIO; + } + usleep(10000); /* Wait 10ms */ + } + + /* child still running, kill it before returning */ + kill(child, SIGTERM); + waitpid(child, &child_status, 0); + return err < 0 ? -errno : -EIO; } + (void)snprintf(nbdpath, sizeof(nbdpath), "/dev/nbd%d", msg.nbdnum); + while (1) { - err = erofs_nbd_in_service(num); + err = erofs_nbd_in_service(msg.nbdnum); if (err == -ENOENT || err == -ENOTCONN) { - int status; - - err = waitpid(pid, &status, WNOHANG); + err = waitpid(child, &child_status, WNOHANG); if (err < 0) return -errno; else if (err > 0) - return status ? -EIO : 0; + return WIFEXITED(child_status) ? -WEXITSTATUS(child_status) : -EIO; usleep(50000); continue; } if (err >= 0) - err = (err != pid ? -EBUSY : 0); + err = (err != child ? -EBUSY : 0); break; } + if (!err) { err = mount(nbdpath, mountpoint, fstype, flags, options); if (err < 0) err = -errno; - if (!err && is_netlink) { - id = erofs_nbd_get_identifier(num); + if (!err && msg.is_netlink) { + id = erofs_nbd_get_identifier(msg.nbdnum); err = IS_ERR(id) ? PTR_ERR(id) : - erofs_nbd_nl_reconfigure(num, id, true); + erofs_nbd_nl_reconfigure(msg.nbdnum, id, true); if (err) erofs_warn("failed to turn on autoclear for nbd%d: %s", - num, erofs_strerror(err)); + msg.nbdnum, erofs_strerror(err)); if (!IS_ERR(id)) free(id); } -- 2.43.0
