On Sat, 10/22 01:40, Max Reitz wrote: > On 30.09.2016 14:09, Fam Zheng wrote: > > virtlockd in libvirt locks the first byte, we lock byte 1 to avoid > > the intervene. > > s/the intervene/a conflict/?
OK. > > > > > Both file and host device protocols are covered. > > > > The complication is with reopen. We have three different locking states, > > namely "unlocked", "shared locked" and "exclusively locked". > > > > When we reopen, the new fd may need a new locking mode. Moving away to or > > from > > exclusive is a bit tricky because we cannot do it atomically. This patch > > solves > > it by dup() s->fd to s->lock_fd and avoid close(), so that there isn't a > > racy > > window where we drop the lock on one fd before acquiring the exclusive lock > > on > > the other. > > > > To make the logic easier to manage, and allow better reuse, the code is > > internally organized by state transition table (old_lock -> new_lock). > > > > Signed-off-by: Fam Zheng <f...@redhat.com> > > --- > > block/raw-posix.c | 318 > > +++++++++++++++++++++++++++++++++++++++++++++++++++++- > > 1 file changed, 317 insertions(+), 1 deletion(-) > > > > diff --git a/block/raw-posix.c b/block/raw-posix.c > > index 6ed7547..22de242 100644 > > --- a/block/raw-posix.c > > +++ b/block/raw-posix.c > > @@ -133,6 +133,7 @@ do { \ > > > > typedef struct BDRVRawState { > > int fd; > > + int lock_fd; > > I think it would be a good idea to put a comment about the semantics of > this lock_fd here. Will do. > > > int type; > > int open_flags; > > size_t buf_align; > > @@ -149,6 +150,7 @@ typedef struct BDRVRawState { > > > > typedef struct BDRVRawReopenState { > > int fd; > > + int lock_fd; > > int open_flags; > > } BDRVRawReopenState; > > > > @@ -367,6 +369,43 @@ static void raw_parse_flags(int bdrv_flags, int > > *open_flags) > > } > > } > > > > +static int raw_lock_fd(int fd, ImageLockMode mode) > > +{ > > + assert(fd >= 0); > > + /* Locking byte 1 avoids interfereing with virtlockd. */ > > + switch (mode) { > > + case IMAGE_LOCK_MODE_EXCLUSIVE: > > + return qemu_lock_fd(fd, 1, 1, true); > > + case IMAGE_LOCK_MODE_SHARED: > > + return qemu_lock_fd(fd, 1, 1, false); > > + case IMAGE_LOCK_MODE_NOLOCK: > > + return qemu_unlock_fd(fd, 1, 1); > > + default: > > + abort(); > > + } > > +} > > + > > +static int raw_lockf(BlockDriverState *bs, ImageLockMode mode) > > +{ > > + BDRVRawState *s = bs->opaque; > > + > > + if (s->lock_fd < 0) { > > + if (mode == IMAGE_LOCK_MODE_NOLOCK) { > > + return 0; > > + } > > + s->lock_fd = qemu_dup(s->fd); > > + if (s->lock_fd < 0) { > > + return s->lock_fd; > > You should probably return -errno instead (qemu_dup apparently doesn't > return that but just -1). Yes. > > (I'm getting further and further from a high-level review, am I not?) > > > + } > > + } > > + if (mode == IMAGE_LOCK_MODE_AUTO) { > > + mode = bdrv_get_flags(bs) & BDRV_O_RDWR ? > > + IMAGE_LOCK_MODE_EXCLUSIVE : > > + IMAGE_LOCK_MODE_SHARED; > > + } > > + return raw_lock_fd(s->lock_fd, mode); > > Without a comment for how lock_fd is supposed to work, this (for > example) looks a bit weird. After all, the user is trying to lock the > fd, and I don't find it immediately clear that lock_fd always points to > the same file as fd, and you're using it only for locking (for reasons > the comment should explain as well). > > > +} > > + > > #ifdef CONFIG_LINUX_AIO > > static bool raw_use_aio(int bdrv_flags) > > { > > @@ -433,6 +472,7 @@ static int raw_open_common(BlockDriverState *bs, QDict > > *options, > > raw_parse_flags(bdrv_flags, &s->open_flags); > > > > s->fd = -1; > > + s->lock_fd = -1; > > fd = qemu_open(filename, s->open_flags, 0644); > > if (fd < 0) { > > ret = -errno; > > @@ -529,6 +569,268 @@ static int raw_open(BlockDriverState *bs, QDict > > *options, int flags, > > return raw_open_common(bs, options, flags, 0, errp); > > } > > > > +typedef enum { > > + RAW_REOPEN_PREPARE, > > + RAW_REOPEN_COMMIT, > > + RAW_REOPEN_ABORT > > +} RawReopenOperation; > > + > > +typedef int (*RawReopenFunc)(BDRVReopenState *state, > > + RawReopenOperation op, > > + ImageLockMode old_lock, > > + ImageLockMode new_lock, > > + Error **errp); > > + > > +static int > > +raw_reopen_identical(BDRVReopenState *state, > > + RawReopenOperation op, > > + ImageLockMode old_lock, > > + ImageLockMode new_lock, > > + Error **errp) > > +{ > > + assert(old_lock == new_lock); > > + return 0; > > +} > > + > > +static int > > +raw_reopen_from_unlock(BDRVReopenState *state, > > + RawReopenOperation op, > > + ImageLockMode old_lock, > > + ImageLockMode new_lock, > > + Error **errp) > > +{ > > + BDRVRawReopenState *raw_s = state->opaque; > > + int ret = 0; > > + > > + assert(old_lock != new_lock); > > + assert(old_lock == IMAGE_LOCK_MODE_NOLOCK); > > + switch (op) { > > + case RAW_REOPEN_PREPARE: > > + ret = raw_lock_fd(raw_s->lock_fd, new_lock); > > + if (ret) { > > + error_setg_errno(errp, -ret, "Failed to lock new fd %d", > > raw_s->lock_fd); > > + } > > + break; > > + case RAW_REOPEN_COMMIT: > > + case RAW_REOPEN_ABORT: > > + break; > > + } > > + > > + return ret; > > +} > > + > > +static int > > +raw_reopen_to_unlock(BDRVReopenState *state, > > + RawReopenOperation op, > > + ImageLockMode old_lock, > > + ImageLockMode new_lock, > > + Error **errp) > > +{ > > + BDRVRawState *s = state->bs->opaque; > > + int ret = 0; > > + > > + assert(old_lock != new_lock); > > + assert(new_lock == IMAGE_LOCK_MODE_NOLOCK); > > + switch (op) { > > + case RAW_REOPEN_PREPARE: > > + break; > > + case RAW_REOPEN_COMMIT: > > + if (s->lock_fd >= 0) { > > + qemu_close(s->lock_fd); > > + s->lock_fd = -1; > > + } > > + break; > > + case RAW_REOPEN_ABORT: > > + break; > > + } > > + > > + return ret; > > +} > > + > > +static int > > +raw_reopen_upgrade(BDRVReopenState *state, > > + RawReopenOperation op, > > + ImageLockMode old_lock, > > + ImageLockMode new_lock, > > + Error **errp) > > +{ > > + BDRVRawReopenState *raw_s = state->opaque; > > + BDRVRawState *s = state->bs->opaque; > > + int ret = 0, ret2; > > + > > + assert(old_lock == IMAGE_LOCK_MODE_SHARED); > > + assert(new_lock == IMAGE_LOCK_MODE_EXCLUSIVE); > > + switch (op) { > > + case RAW_REOPEN_PREPARE: > > + ret = raw_lock_fd(raw_s->lock_fd, IMAGE_LOCK_MODE_SHARED); [1] > > + if (ret) { > > + error_setg_errno(errp, -ret, "Failed to lock new fd (shared)"); > > + break; > > + } > > + ret = raw_lock_fd(s->lock_fd, IMAGE_LOCK_MODE_NOLOCK); [2] > > + if (ret) { > > + error_setg_errno(errp, -ret, "Failed to unlock old fd"); > > + goto restore; > > + } > > + ret = raw_lock_fd(raw_s->lock_fd, IMAGE_LOCK_MODE_EXCLUSIVE); [3] > > + if (ret) { > > + error_setg_errno(errp, -ret, "Failed to lock new fd > > (exclusive)"); > > + goto restore; > > + } > > + break; > > + case RAW_REOPEN_COMMIT: > > + break; > > + case RAW_REOPEN_ABORT: > > + raw_lock_fd(raw_s->lock_fd, IMAGE_LOCK_MODE_SHARED); > > + ret = raw_lock_fd(s->lock_fd, IMAGE_LOCK_MODE_SHARED); [4] > > + if (ret) { > > + error_report("Failed to restore lock on old fd"); > > If we get here, s->lock_fd is still locked exclusively. The following is > a very personal opinion, but anyway: I think it would be be better for > it to be unlocked. If it's locked too strictly, this can really break > something; but if it's just not locked (while it should be locked in > shared mode), everything's going to be fine unless the user makes a > mistake. I think the latter is less bad. There are four lock states when we land on this "abort" branch: a) Lock "prepare" was not run, some other error happened earlier, so the lock aren't changed compared to before the transaction starts: raw_s->lock_fd is unlocked, s->lock_fd is shared locked. In this case raw_lock_fd [4] cannot fail, and even if it does, s->lock_fd is in the correct state. b) The raw_lock_fd [1] failed. This is similar to 1), s->lock_fd is intact, so we are good. c) The raw_lock_fd [2] failed. Again, similar to above. d) The raw_lock_fd [3] failed. Here raw_s->lock_fd is shared locked, and s->lock_fd is unlocked. The correct "abort" sequence is downgrade raw_s->lock_fd and "shared lock" s->lock_fd again. If the "abort" sequence failed, s->lock_fd is unlocked. > > And we can always achieve unlocking the FD by closing s->lock_fd, so I > think that is what we should do here. So I think this already does what you want. > > > + } > > + break; > > + } > > + > > + return ret; > > +restore: > > + ret2 = raw_lock_fd(s->lock_fd, IMAGE_LOCK_MODE_SHARED); > > + if (ret2) { > > + error_report("Failed to restore old lock"); > > + } > > + return ret; > > +} > > + > > +static int > > +raw_reopen_downgrade(BDRVReopenState *state, > > + RawReopenOperation op, > > + ImageLockMode old_lock, > > + ImageLockMode new_lock, > > + Error **errp) > > +{ > > + BDRVRawReopenState *raw_s = state->opaque; > > + BDRVRawState *s = state->bs->opaque; > > + int ret = 0; > > + > > + assert(old_lock == IMAGE_LOCK_MODE_EXCLUSIVE); > > + assert(new_lock == IMAGE_LOCK_MODE_SHARED); > > + switch (op) { > > + case RAW_REOPEN_PREPARE: > > + break; > > + case RAW_REOPEN_COMMIT: > > + ret = raw_lock_fd(s->lock_fd, IMAGE_LOCK_MODE_SHARED); > > + if (ret) { > > + error_report("Failed to downgrade old lock"); > > + break; > > + } > > + ret = raw_lock_fd(raw_s->lock_fd, IMAGE_LOCK_MODE_SHARED); > > + if (ret) { > > + error_report("Failed to lock new fd (shared)"); > > + break; > > + } > > + break; > > It's really unfortunate that we cannot do anything in prepare and have > to hope for the best in commit. But I can't come up with anything better. > > And if something does fail, the FD will be unlocked after reopening. > That's good enough for me. > > > + case RAW_REOPEN_ABORT: > > + break; > > + } > > + > > + return ret; > > +} > > + > > +/** > > + * Transactionally moving between three possible locking states is tricky > > and > > + * must be done carefully. That is mostly because downgrading an exclusive > > lock > > + * to shared or unlocked is not guaranteed to be revertable. As a result, > > in > > + * such cases we have to defer the downgraing to "commit", given that no > > revert > > + * will happen after that point, and that downgrading a lock should never > > fail. > > + * > > + * On the other hand, upgrading a lock (e.g. from unlocked or shared to > > + * exclusive lock) must happen in "prepare" because it may fail. > > + * > > + * Manage the operation matrix with this state transition table to make > > + * fulfulling above conditions easier. > > + */ > > +static const struct RawReopenFuncRecord { > > + ImageLockMode old_lock; > > + ImageLockMode new_lock; > > + RawReopenFunc func; > > + bool need_lock_fd; > > +} reopen_functions[] = { > > + {IMAGE_LOCK_MODE_NOLOCK, IMAGE_LOCK_MODE_NOLOCK, raw_reopen_identical, > > false}, > > + {IMAGE_LOCK_MODE_NOLOCK, IMAGE_LOCK_MODE_SHARED, > > raw_reopen_from_unlock, true}, > > + {IMAGE_LOCK_MODE_NOLOCK, IMAGE_LOCK_MODE_EXCLUSIVE, > > raw_reopen_from_unlock, true}, > > + {IMAGE_LOCK_MODE_SHARED, IMAGE_LOCK_MODE_NOLOCK, raw_reopen_to_unlock, > > false}, > > + {IMAGE_LOCK_MODE_SHARED, IMAGE_LOCK_MODE_SHARED, raw_reopen_identical, > > false}, > > + {IMAGE_LOCK_MODE_SHARED, IMAGE_LOCK_MODE_EXCLUSIVE, > > raw_reopen_upgrade, true}, > > + {IMAGE_LOCK_MODE_EXCLUSIVE, IMAGE_LOCK_MODE_NOLOCK, > > raw_reopen_to_unlock, false}, > > + {IMAGE_LOCK_MODE_EXCLUSIVE, IMAGE_LOCK_MODE_SHARED, > > raw_reopen_downgrade, true}, > > + {IMAGE_LOCK_MODE_EXCLUSIVE, IMAGE_LOCK_MODE_EXCLUSIVE, > > raw_reopen_identical, false}, > > +}; > > + > > +static int raw_reopen_handle_lock(BDRVReopenState *state, > > + RawReopenOperation op, > > + Error **errp) > > +{ > > + BDRVRawReopenState *raw_s = state->opaque; > > Please choose another name, it's hard not to confuse this with the > BDRVRawState all the time. (e.g. raw_rs or just rs would be enough.) Sorry I can't change the name in this patch, it will cause more inconsistency because raw_reopen_* already use the name broadly. If you really insist it's a bad name, I can append or prepend a patch in the next version. > > (Same in the places above, I got confused more than once...) > > > + BDRVRawState *s = state->bs->opaque; > > + ImageLockMode old_lock, new_lock; > > + const struct RawReopenFuncRecord *rec; > > + int ret; > > + > > + old_lock = bdrv_get_lock_mode(state->bs); > > + new_lock = bdrv_lock_mode_from_flags(state->flags); > > + > > + if (old_lock == IMAGE_LOCK_MODE__MAX) { > > + /* bs was not locked, leave it unlocked. */ > > + old_lock = new_lock = IMAGE_LOCK_MODE_NOLOCK; > > + } > > + > > + if (old_lock == IMAGE_LOCK_MODE_AUTO) { > > + old_lock = bdrv_get_flags(state->bs) & BDRV_O_RDWR ? > > + IMAGE_LOCK_MODE_EXCLUSIVE : IMAGE_LOCK_MODE_SHARED; > > + } > > + > > + if (new_lock == IMAGE_LOCK_MODE_AUTO) { > > + new_lock = state->flags & BDRV_O_RDWR ? > > + IMAGE_LOCK_MODE_EXCLUSIVE : IMAGE_LOCK_MODE_SHARED; > > + } > > + > > + for (rec = &reopen_functions[0]; > > + rec < &reopen_functions[ARRAY_SIZE(reopen_functions)]; > > + rec++) { > > + if (rec->old_lock == old_lock && rec->new_lock == new_lock) { > > + break; > > + } > > + } > > + assert(rec != &reopen_functions[ARRAY_SIZE(reopen_functions)]); > > + > > + switch (op) { > > + case RAW_REOPEN_PREPARE: > > + if (rec->need_lock_fd) { > > + ret = qemu_dup(raw_s->fd); > > + if (ret < 0) { > > + error_setg_errno(errp, -ret, "Failed to dup new fd"); > > + return ret; > > + } > > + raw_s->lock_fd = ret; > > + } > > + return rec->func(state, op, old_lock, new_lock, errp); > > + case RAW_REOPEN_COMMIT: > > + rec->func(state, op, old_lock, new_lock, errp); > > + if (rec->need_lock_fd) { > > + if (s->lock_fd >= 0) { > > + qemu_close(s->lock_fd); > > + } > > + s->lock_fd = raw_s->lock_fd; > > + } > > + break; > > + case RAW_REOPEN_ABORT: > > + rec->func(state, op, old_lock, new_lock, errp); > > + if (rec->need_lock_fd && raw_s->lock_fd >= 0) { > > + qemu_close(raw_s->lock_fd); > > + raw_s->lock_fd = -1; > > + } > > + break; > > + } > > + return 0; > > +} > > + > > static int raw_reopen_prepare(BDRVReopenState *state, > > BlockReopenQueue *queue, Error **errp) > > { > > @@ -607,6 +909,10 @@ static int raw_reopen_prepare(BDRVReopenState *state, > > } > > } > > > > + if (!ret) { > > + ret = raw_reopen_handle_lock(state, RAW_REOPEN_PREPARE, errp); > > + } > > + > > return ret; > > } > > > > @@ -617,6 +923,8 @@ static void raw_reopen_commit(BDRVReopenState *state) > > > > s->open_flags = raw_s->open_flags; > > > > + raw_reopen_handle_lock(state, RAW_REOPEN_COMMIT, NULL); > > I'd prefer &error_abort instead of NULL. OK. > > > + > > qemu_close(s->fd); > > s->fd = raw_s->fd; > > > > @@ -634,6 +942,8 @@ static void raw_reopen_abort(BDRVReopenState *state) > > return; > > } > > > > + raw_reopen_handle_lock(state, RAW_REOPEN_ABORT, NULL); > > Same here. OK. > > > + > > if (raw_s->fd >= 0) { > > qemu_close(raw_s->fd); > > raw_s->fd = -1; > > @@ -1321,6 +1631,10 @@ static void raw_close(BlockDriverState *bs) > > qemu_close(s->fd); > > s->fd = -1; > > } > > + if (s->lock_fd >= 0) { > > + qemu_close(s->lock_fd); > > + s->lock_fd = -1; > > + } > > } > > > > static int raw_truncate(BlockDriverState *bs, int64_t offset) > > @@ -1874,7 +2188,7 @@ BlockDriver bdrv_file = { > > .bdrv_get_info = raw_get_info, > > .bdrv_get_allocated_file_size > > = raw_get_allocated_file_size, > > - > > + .bdrv_lockf = raw_lockf, > > .create_opts = &raw_create_opts, > > }; > > > > @@ -2324,6 +2638,8 @@ static BlockDriver bdrv_host_device = { > > #ifdef __linux__ > > .bdrv_aio_ioctl = hdev_aio_ioctl, > > #endif > > + > > + .bdrv_lockf = raw_lockf, > > }; > > > > #if defined(__linux__) || defined(__FreeBSD__) || > > defined(__FreeBSD_kernel__) > > > > Overall design looks good to me. Unfortunately we can't strictly follow > the transaction principle for downgrades and upgrades, but I guess it'll > work most of the time (fingers crossed). Thanks for not yelling at this scary patch. Fam