Re: [Qemu-devel] [PATCH v6 08/22] raw-posix: Add image locking support

Kevin Wolf Fri, 17 Jun 2016 06:22:19 -0700

Am 03.06.2016 um 10:49 hat Fam Zheng geschrieben:
> virtlockd in libvirt locks the first byte, we lock byte 1 to avoid
> the intervene.
> 
> Both file and host device protocols are covered.
> 
> The complication is with reopen. We have three different locking states,
> namely "unlocked", "shared locked" and "exclusively locked".
> 
> There have three different states, "unlocked", "shared locked" and 
> "exclusively
> locked".


This seems to be a corrupted copy of the previous sentence. :-)

> When we reopen, the new fd may need a new locking mode. Moving away to
> or from exclusive is a bit tricky because we cannot do it atomically. This
> patch solves it by dup() s->fd to s->lock_fd and avoid close(), so that there
> isn't a racy window where we drop the lock on one fd before acquiring the
> exclusive lock on the other.
> 
> To make the logic easier to manage, and allow better reuse, the code is
> internally organized by state transition table (old_lock -> new_lock).
> 
> Signed-off-by: Fam Zheng <f...@redhat.com>

I must admit that I don't fully understand yet why we can't change the
lock atomincally and how s->lock_fd helps. In any case, I think it
deserves comments in the code and not only in the commit message.

So I'm not giving a full review here, but I think I have one important
point to make at least.

>  block/raw-posix.c | 285 
> ++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 285 insertions(+)
> 
> diff --git a/block/raw-posix.c b/block/raw-posix.c
> index bb8669f..6347350 100644
> --- a/block/raw-posix.c
> +++ b/block/raw-posix.c
> @@ -133,6 +133,7 @@ do { \
>  
>  typedef struct BDRVRawState {
>      int fd;
> +    int lock_fd;
>      int type;
>      int open_flags;
>      size_t buf_align;
> @@ -153,6 +154,7 @@ typedef struct BDRVRawState {
>  
>  typedef struct BDRVRawReopenState {
>      int fd;
> +    int lock_fd;
>      int open_flags;
>  #ifdef CONFIG_LINUX_AIO
>      int use_aio;
> @@ -397,6 +399,37 @@ static void raw_attach_aio_context(BlockDriverState *bs,
>  #endif
>  }
>  
> +static int raw_lockf_fd(int fd, BdrvLockfCmd cmd)
> +{
> +    assert(fd >= 0);
> +    /* Locking byte 1 avoids interfereing with virtlockd. */
> +    switch (cmd) {
> +    case BDRV_LOCKF_EXCLUSIVE:
> +        return qemu_lock_fd(fd, 1, 1, true);
> +    case BDRV_LOCKF_SHARED:
> +        return qemu_lock_fd(fd, 1, 1, false);
> +    case BDRV_LOCKF_UNLOCK:
> +        return qemu_unlock_fd(fd, 1, 1);
> +    default:
> +        abort();
> +    }
> +}
> +
> +static int raw_lockf(BlockDriverState *bs, BdrvLockfCmd cmd)
> +{
> +
> +    BDRVRawState *s = bs->opaque;
> +
> +    if (s->lock_fd < 0) {
> +        s->lock_fd = qemu_dup(s->fd);
> +        if (s->lock_fd < 0) {
> +            return s->lock_fd;
> +        }
> +    }
> +
> +    return raw_lockf_fd(s->lock_fd, cmd);
> +}
> +
>  #ifdef CONFIG_LINUX_AIO
>  static int raw_set_aio(LinuxAioState **aio_ctx, int *use_aio, int bdrv_flags)
>  {
> @@ -483,6 +516,7 @@ static int raw_open_common(BlockDriverState *bs, QDict 
> *options,
>      raw_parse_flags(bdrv_flags, &s->open_flags);
>  
>      s->fd = -1;
> +    s->lock_fd = -1;
>      fd = qemu_open(filename, s->open_flags, 0644);
>      if (fd < 0) {
>          ret = -errno;
> @@ -593,6 +627,241 @@ static int raw_open(BlockDriverState *bs, QDict 
> *options, int flags,
>      return ret;
>  }
>  
> +typedef enum {
> +    RAW_REOPEN_PREPARE,
> +    RAW_REOPEN_COMMIT,
> +    RAW_REOPEN_ABORT
> +} RawReopenOperation;
> +
> +typedef int (*RawReopenFunc)(BDRVReopenState *state,
> +                             RawReopenOperation op,
> +                             BdrvLockfCmd old_lock,
> +                             BdrvLockfCmd new_lock,
> +                             Error **errp);
> +
> +static
> +int raw_reopen_identical(BDRVReopenState *state,

This is unusual formatting. I'm used to having everything on a single
line or "static int" on its own line, but breaking between "static" and
"int" feels odd.

> +                         RawReopenOperation op,
> +                         BdrvLockfCmd old_lock,
> +                         BdrvLockfCmd new_lock,
> +                         Error **errp)
> +{
> +    assert(old_lock == new_lock);
> +    return 0;
> +}
> +
> +static
> +int raw_reopen_from_unlock(BDRVReopenState *state,
> +                           RawReopenOperation op,
> +                           BdrvLockfCmd old_lock,
> +                           BdrvLockfCmd new_lock,
> +                           Error **errp)
> +{
> +    BDRVRawReopenState *raw_s = state->opaque;
> +    int ret = 0;
> +
> +    assert(old_lock != new_lock);
> +    assert(old_lock == BDRV_LOCKF_UNLOCK);
> +    switch (op) {
> +    case RAW_REOPEN_PREPARE:
> +        ret = raw_lockf_fd(raw_s->lock_fd, new_lock);
> +        if (ret) {
> +            error_setg_errno(errp, -ret, "Failed to lock new fd");
> +        }
> +        break;
> +    case RAW_REOPEN_COMMIT:
> +    case RAW_REOPEN_ABORT:
> +        break;
> +    }
> +
> +    return ret;
> +}
> +
> +static
> +int raw_reopen_to_unlock(BDRVReopenState *state,
> +                         RawReopenOperation op,
> +                         BdrvLockfCmd old_lock,
> +                         BdrvLockfCmd new_lock,
> +                         Error **errp)
> +{
> +    BDRVRawState *s = state->bs->opaque;
> +    int ret = 0;
> +
> +    assert(old_lock != new_lock);
> +    assert(old_lock == BDRV_LOCKF_UNLOCK);
> +    switch (op) {
> +    case RAW_REOPEN_PREPARE:
> +        break;
> +    case RAW_REOPEN_COMMIT:
> +        if (s->lock_fd >= 0) {
> +            qemu_close(s->lock_fd);
> +            s->lock_fd = -1;
> +        }
> +        break;
> +    case RAW_REOPEN_ABORT:
> +        break;
> +    }
> +
> +    return ret;
> +}
> +
> +static
> +int raw_reopen_upgrade(BDRVReopenState *state,
> +                       RawReopenOperation op,
> +                       BdrvLockfCmd old_lock,
> +                       BdrvLockfCmd new_lock,
> +                       Error **errp)
> +{
> +    BDRVRawReopenState *raw_s = state->opaque;
> +    BDRVRawState *s = state->bs->opaque;
> +    int ret = 0, ret2;
> +
> +    assert(old_lock == BDRV_LOCKF_SHARED);
> +    assert(new_lock == BDRV_LOCKF_EXCLUSIVE);
> +    switch (op) {
> +    case RAW_REOPEN_PREPARE:
> +        ret = raw_lockf_fd(raw_s->lock_fd, BDRV_LOCKF_SHARED);
> +        if (ret) {
> +            error_setg_errno(errp, -ret, "Failed to lock new fd (shared)");
> +            break;
> +        }
> +        ret = raw_lockf_fd(s->lock_fd, BDRV_LOCKF_UNLOCK);
> +        if (ret) {
> +            error_setg_errno(errp, -ret, "Failed to unlock old fd");
> +            goto restore;
> +        }
> +        ret = raw_lockf_fd(raw_s->lock_fd, BDRV_LOCKF_EXCLUSIVE);
> +        if (ret) {
> +            error_setg_errno(errp, -ret, "Failed to lock new fd 
> (exclusive)");
> +            goto restore;
> +        }
> +        break;
> +    case RAW_REOPEN_COMMIT:
> +        break;
> +    case RAW_REOPEN_ABORT:
> +        raw_lockf_fd(raw_s->lock_fd, BDRV_LOCKF_SHARED);
> +        ret = raw_lockf_fd(s->lock_fd, BDRV_LOCKF_SHARED);
> +        if (ret) {
> +            error_report("Failed to restore lock on old fd");
> +        }
> +        break;
> +    }
> +
> +    return ret;
> +restore:
> +    ret2 = raw_lockf_fd(s->lock_fd, BDRV_LOCKF_SHARED);
> +    if (ret2) {
> +        error_report("Failed to restore old lock");
> +    }
> +    return ret;
> +
> +}

That final empty line doesn't look intentional.

> +
> +static
> +int raw_reopen_downgrade(BDRVReopenState *state,
> +                         RawReopenOperation op,
> +                         BdrvLockfCmd old_lock,
> +                         BdrvLockfCmd new_lock,
> +                         Error **errp)
> +{
> +    BDRVRawReopenState *raw_s = state->opaque;
> +    BDRVRawState *s = state->bs->opaque;
> +    int ret;
> +
> +    assert(old_lock == BDRV_LOCKF_EXCLUSIVE);
> +    assert(new_lock == BDRV_LOCKF_SHARED);
> +    switch (op) {
> +    case RAW_REOPEN_PREPARE:
> +        break;
> +    case RAW_REOPEN_COMMIT:
> +        ret = raw_lockf_fd(s->lock_fd, BDRV_LOCKF_SHARED);
> +        if (ret) {
> +            error_report("Failed to downgrade old lock");
> +            break;
> +        }
> +        ret = raw_lockf_fd(raw_s->lock_fd, BDRV_LOCKF_SHARED);
> +        if (ret) {
> +            error_report("Failed to lock new fd");
> +            break;
> +        }
> +        break;
> +    case RAW_REOPEN_ABORT:
> +        break;
> +    }
> +
> +    return ret;
> +}
> +
> +static const struct RawReopenFuncRecord {
> +    BdrvLockfCmd old_lock;
> +    BdrvLockfCmd new_lock;
> +    RawReopenFunc func;
> +    bool need_lock_fd;
> +} reopen_functions[] = {
> +    {BDRV_LOCKF_UNLOCK, BDRV_LOCKF_UNLOCK, raw_reopen_identical, false},
> +    {BDRV_LOCKF_UNLOCK, BDRV_LOCKF_SHARED, raw_reopen_from_unlock, true},
> +    {BDRV_LOCKF_UNLOCK, BDRV_LOCKF_EXCLUSIVE, raw_reopen_from_unlock, true},
> +    {BDRV_LOCKF_SHARED, BDRV_LOCKF_UNLOCK, raw_reopen_to_unlock, false},
> +    {BDRV_LOCKF_SHARED, BDRV_LOCKF_SHARED, raw_reopen_identical, false},
> +    {BDRV_LOCKF_SHARED, BDRV_LOCKF_EXCLUSIVE, raw_reopen_upgrade, true},
> +    {BDRV_LOCKF_EXCLUSIVE, BDRV_LOCKF_UNLOCK, raw_reopen_to_unlock, false},
> +    {BDRV_LOCKF_EXCLUSIVE, BDRV_LOCKF_SHARED, raw_reopen_downgrade, true},
> +    {BDRV_LOCKF_EXCLUSIVE, BDRV_LOCKF_EXCLUSIVE, raw_reopen_identical, 
> false},
> +};
> +
> +static int raw_reopen_handle_lock(BDRVReopenState *state,
> +                                  RawReopenOperation op,
> +                                  Error **errp)

I think we have one big problem here: We don't know whether raw_s->fd is
already locked or not. If dup() and setting the new flags with fcntl()
succeeded, it is, but if we had to fall back on qemu_open(), it isn't.

This means that doing nothing in the raw_reopen_identical case isn't
right because reopening without intending to change anything about the
locking could end up unlocking the image.

Kevin

Re: [Qemu-devel] [PATCH v6 08/22] raw-posix: Add image locking support

Reply via email to