Am 03.06.2016 um 10:49 hat Fam Zheng geschrieben: > virtlockd in libvirt locks the first byte, we lock byte 1 to avoid > the intervene. > > Both file and host device protocols are covered. > > The complication is with reopen. We have three different locking states, > namely "unlocked", "shared locked" and "exclusively locked". > > There have three different states, "unlocked", "shared locked" and > "exclusively > locked".
This seems to be a corrupted copy of the previous sentence. :-) > When we reopen, the new fd may need a new locking mode. Moving away to > or from exclusive is a bit tricky because we cannot do it atomically. This > patch solves it by dup() s->fd to s->lock_fd and avoid close(), so that there > isn't a racy window where we drop the lock on one fd before acquiring the > exclusive lock on the other. > > To make the logic easier to manage, and allow better reuse, the code is > internally organized by state transition table (old_lock -> new_lock). > > Signed-off-by: Fam Zheng <f...@redhat.com> I must admit that I don't fully understand yet why we can't change the lock atomincally and how s->lock_fd helps. In any case, I think it deserves comments in the code and not only in the commit message. So I'm not giving a full review here, but I think I have one important point to make at least. > block/raw-posix.c | 285 > ++++++++++++++++++++++++++++++++++++++++++++++++++++++ > 1 file changed, 285 insertions(+) > > diff --git a/block/raw-posix.c b/block/raw-posix.c > index bb8669f..6347350 100644 > --- a/block/raw-posix.c > +++ b/block/raw-posix.c > @@ -133,6 +133,7 @@ do { \ > > typedef struct BDRVRawState { > int fd; > + int lock_fd; > int type; > int open_flags; > size_t buf_align; > @@ -153,6 +154,7 @@ typedef struct BDRVRawState { > > typedef struct BDRVRawReopenState { > int fd; > + int lock_fd; > int open_flags; > #ifdef CONFIG_LINUX_AIO > int use_aio; > @@ -397,6 +399,37 @@ static void raw_attach_aio_context(BlockDriverState *bs, > #endif > } > > +static int raw_lockf_fd(int fd, BdrvLockfCmd cmd) > +{ > + assert(fd >= 0); > + /* Locking byte 1 avoids interfereing with virtlockd. */ > + switch (cmd) { > + case BDRV_LOCKF_EXCLUSIVE: > + return qemu_lock_fd(fd, 1, 1, true); > + case BDRV_LOCKF_SHARED: > + return qemu_lock_fd(fd, 1, 1, false); > + case BDRV_LOCKF_UNLOCK: > + return qemu_unlock_fd(fd, 1, 1); > + default: > + abort(); > + } > +} > + > +static int raw_lockf(BlockDriverState *bs, BdrvLockfCmd cmd) > +{ > + > + BDRVRawState *s = bs->opaque; > + > + if (s->lock_fd < 0) { > + s->lock_fd = qemu_dup(s->fd); > + if (s->lock_fd < 0) { > + return s->lock_fd; > + } > + } > + > + return raw_lockf_fd(s->lock_fd, cmd); > +} > + > #ifdef CONFIG_LINUX_AIO > static int raw_set_aio(LinuxAioState **aio_ctx, int *use_aio, int bdrv_flags) > { > @@ -483,6 +516,7 @@ static int raw_open_common(BlockDriverState *bs, QDict > *options, > raw_parse_flags(bdrv_flags, &s->open_flags); > > s->fd = -1; > + s->lock_fd = -1; > fd = qemu_open(filename, s->open_flags, 0644); > if (fd < 0) { > ret = -errno; > @@ -593,6 +627,241 @@ static int raw_open(BlockDriverState *bs, QDict > *options, int flags, > return ret; > } > > +typedef enum { > + RAW_REOPEN_PREPARE, > + RAW_REOPEN_COMMIT, > + RAW_REOPEN_ABORT > +} RawReopenOperation; > + > +typedef int (*RawReopenFunc)(BDRVReopenState *state, > + RawReopenOperation op, > + BdrvLockfCmd old_lock, > + BdrvLockfCmd new_lock, > + Error **errp); > + > +static > +int raw_reopen_identical(BDRVReopenState *state, This is unusual formatting. I'm used to having everything on a single line or "static int" on its own line, but breaking between "static" and "int" feels odd. > + RawReopenOperation op, > + BdrvLockfCmd old_lock, > + BdrvLockfCmd new_lock, > + Error **errp) > +{ > + assert(old_lock == new_lock); > + return 0; > +} > + > +static > +int raw_reopen_from_unlock(BDRVReopenState *state, > + RawReopenOperation op, > + BdrvLockfCmd old_lock, > + BdrvLockfCmd new_lock, > + Error **errp) > +{ > + BDRVRawReopenState *raw_s = state->opaque; > + int ret = 0; > + > + assert(old_lock != new_lock); > + assert(old_lock == BDRV_LOCKF_UNLOCK); > + switch (op) { > + case RAW_REOPEN_PREPARE: > + ret = raw_lockf_fd(raw_s->lock_fd, new_lock); > + if (ret) { > + error_setg_errno(errp, -ret, "Failed to lock new fd"); > + } > + break; > + case RAW_REOPEN_COMMIT: > + case RAW_REOPEN_ABORT: > + break; > + } > + > + return ret; > +} > + > +static > +int raw_reopen_to_unlock(BDRVReopenState *state, > + RawReopenOperation op, > + BdrvLockfCmd old_lock, > + BdrvLockfCmd new_lock, > + Error **errp) > +{ > + BDRVRawState *s = state->bs->opaque; > + int ret = 0; > + > + assert(old_lock != new_lock); > + assert(old_lock == BDRV_LOCKF_UNLOCK); > + switch (op) { > + case RAW_REOPEN_PREPARE: > + break; > + case RAW_REOPEN_COMMIT: > + if (s->lock_fd >= 0) { > + qemu_close(s->lock_fd); > + s->lock_fd = -1; > + } > + break; > + case RAW_REOPEN_ABORT: > + break; > + } > + > + return ret; > +} > + > +static > +int raw_reopen_upgrade(BDRVReopenState *state, > + RawReopenOperation op, > + BdrvLockfCmd old_lock, > + BdrvLockfCmd new_lock, > + Error **errp) > +{ > + BDRVRawReopenState *raw_s = state->opaque; > + BDRVRawState *s = state->bs->opaque; > + int ret = 0, ret2; > + > + assert(old_lock == BDRV_LOCKF_SHARED); > + assert(new_lock == BDRV_LOCKF_EXCLUSIVE); > + switch (op) { > + case RAW_REOPEN_PREPARE: > + ret = raw_lockf_fd(raw_s->lock_fd, BDRV_LOCKF_SHARED); > + if (ret) { > + error_setg_errno(errp, -ret, "Failed to lock new fd (shared)"); > + break; > + } > + ret = raw_lockf_fd(s->lock_fd, BDRV_LOCKF_UNLOCK); > + if (ret) { > + error_setg_errno(errp, -ret, "Failed to unlock old fd"); > + goto restore; > + } > + ret = raw_lockf_fd(raw_s->lock_fd, BDRV_LOCKF_EXCLUSIVE); > + if (ret) { > + error_setg_errno(errp, -ret, "Failed to lock new fd > (exclusive)"); > + goto restore; > + } > + break; > + case RAW_REOPEN_COMMIT: > + break; > + case RAW_REOPEN_ABORT: > + raw_lockf_fd(raw_s->lock_fd, BDRV_LOCKF_SHARED); > + ret = raw_lockf_fd(s->lock_fd, BDRV_LOCKF_SHARED); > + if (ret) { > + error_report("Failed to restore lock on old fd"); > + } > + break; > + } > + > + return ret; > +restore: > + ret2 = raw_lockf_fd(s->lock_fd, BDRV_LOCKF_SHARED); > + if (ret2) { > + error_report("Failed to restore old lock"); > + } > + return ret; > + > +} That final empty line doesn't look intentional. > + > +static > +int raw_reopen_downgrade(BDRVReopenState *state, > + RawReopenOperation op, > + BdrvLockfCmd old_lock, > + BdrvLockfCmd new_lock, > + Error **errp) > +{ > + BDRVRawReopenState *raw_s = state->opaque; > + BDRVRawState *s = state->bs->opaque; > + int ret; > + > + assert(old_lock == BDRV_LOCKF_EXCLUSIVE); > + assert(new_lock == BDRV_LOCKF_SHARED); > + switch (op) { > + case RAW_REOPEN_PREPARE: > + break; > + case RAW_REOPEN_COMMIT: > + ret = raw_lockf_fd(s->lock_fd, BDRV_LOCKF_SHARED); > + if (ret) { > + error_report("Failed to downgrade old lock"); > + break; > + } > + ret = raw_lockf_fd(raw_s->lock_fd, BDRV_LOCKF_SHARED); > + if (ret) { > + error_report("Failed to lock new fd"); > + break; > + } > + break; > + case RAW_REOPEN_ABORT: > + break; > + } > + > + return ret; > +} > + > +static const struct RawReopenFuncRecord { > + BdrvLockfCmd old_lock; > + BdrvLockfCmd new_lock; > + RawReopenFunc func; > + bool need_lock_fd; > +} reopen_functions[] = { > + {BDRV_LOCKF_UNLOCK, BDRV_LOCKF_UNLOCK, raw_reopen_identical, false}, > + {BDRV_LOCKF_UNLOCK, BDRV_LOCKF_SHARED, raw_reopen_from_unlock, true}, > + {BDRV_LOCKF_UNLOCK, BDRV_LOCKF_EXCLUSIVE, raw_reopen_from_unlock, true}, > + {BDRV_LOCKF_SHARED, BDRV_LOCKF_UNLOCK, raw_reopen_to_unlock, false}, > + {BDRV_LOCKF_SHARED, BDRV_LOCKF_SHARED, raw_reopen_identical, false}, > + {BDRV_LOCKF_SHARED, BDRV_LOCKF_EXCLUSIVE, raw_reopen_upgrade, true}, > + {BDRV_LOCKF_EXCLUSIVE, BDRV_LOCKF_UNLOCK, raw_reopen_to_unlock, false}, > + {BDRV_LOCKF_EXCLUSIVE, BDRV_LOCKF_SHARED, raw_reopen_downgrade, true}, > + {BDRV_LOCKF_EXCLUSIVE, BDRV_LOCKF_EXCLUSIVE, raw_reopen_identical, > false}, > +}; > + > +static int raw_reopen_handle_lock(BDRVReopenState *state, > + RawReopenOperation op, > + Error **errp) I think we have one big problem here: We don't know whether raw_s->fd is already locked or not. If dup() and setting the new flags with fcntl() succeeded, it is, but if we had to fall back on qemu_open(), it isn't. This means that doing nothing in the raw_reopen_identical case isn't right because reopening without intending to change anything about the locking could end up unlocking the image. Kevin