Re: [Qemu-block] [PATCH v8 05/36] raw-posix: Add image locking support

Fam Zheng Mon, 24 Oct 2016 23:32:28 -0700

On Sat, 10/22 01:40, Max Reitz wrote:
> On 30.09.2016 14:09, Fam Zheng wrote:
> > virtlockd in libvirt locks the first byte, we lock byte 1 to avoid
> > the intervene.
> 
> s/the intervene/a conflict/?


OK.

> 
> > 
> > Both file and host device protocols are covered.
> > 
> > The complication is with reopen. We have three different locking states,
> > namely "unlocked", "shared locked" and "exclusively locked".
> > 
> > When we reopen, the new fd may need a new locking mode. Moving away to or 
> > from
> > exclusive is a bit tricky because we cannot do it atomically. This patch 
> > solves
> > it by dup() s->fd to s->lock_fd and avoid close(), so that there isn't a 
> > racy
> > window where we drop the lock on one fd before acquiring the exclusive lock 
> > on
> > the other.
> > 
> > To make the logic easier to manage, and allow better reuse, the code is
> > internally organized by state transition table (old_lock -> new_lock).
> > 
> > Signed-off-by: Fam Zheng <f...@redhat.com>
> > ---
> >  block/raw-posix.c | 318 
> > +++++++++++++++++++++++++++++++++++++++++++++++++++++-
> >  1 file changed, 317 insertions(+), 1 deletion(-)
> > 
> > diff --git a/block/raw-posix.c b/block/raw-posix.c
> > index 6ed7547..22de242 100644
> > --- a/block/raw-posix.c
> > +++ b/block/raw-posix.c
> > @@ -133,6 +133,7 @@ do { \
> >  
> >  typedef struct BDRVRawState {
> >      int fd;
> > +    int lock_fd;
> 
> I think it would be a good idea to put a comment about the semantics of
> this lock_fd here.

Will do.

> 
> >      int type;
> >      int open_flags;
> >      size_t buf_align;
> > @@ -149,6 +150,7 @@ typedef struct BDRVRawState {
> >  
> >  typedef struct BDRVRawReopenState {
> >      int fd;
> > +    int lock_fd;
> >      int open_flags;
> >  } BDRVRawReopenState;
> >  
> > @@ -367,6 +369,43 @@ static void raw_parse_flags(int bdrv_flags, int 
> > *open_flags)
> >      }
> >  }
> >  
> > +static int raw_lock_fd(int fd, ImageLockMode mode)
> > +{
> > +    assert(fd >= 0);
> > +    /* Locking byte 1 avoids interfereing with virtlockd. */
> > +    switch (mode) {
> > +    case IMAGE_LOCK_MODE_EXCLUSIVE:
> > +        return qemu_lock_fd(fd, 1, 1, true);
> > +    case IMAGE_LOCK_MODE_SHARED:
> > +        return qemu_lock_fd(fd, 1, 1, false);
> > +    case IMAGE_LOCK_MODE_NOLOCK:
> > +        return qemu_unlock_fd(fd, 1, 1);
> > +    default:
> > +        abort();
> > +    }
> > +}
> > +
> > +static int raw_lockf(BlockDriverState *bs, ImageLockMode mode)
> > +{
> > +    BDRVRawState *s = bs->opaque;
> > +
> > +    if (s->lock_fd < 0) {
> > +        if (mode == IMAGE_LOCK_MODE_NOLOCK) {
> > +            return 0;
> > +        }
> > +        s->lock_fd = qemu_dup(s->fd);
> > +        if (s->lock_fd < 0) {
> > +            return s->lock_fd;
> 
> You should probably return -errno instead (qemu_dup apparently doesn't
> return that but just -1).

Yes.

> 
> (I'm getting further and further from a high-level review, am I not?)
> 
> > +        }
> > +    }
> > +    if (mode == IMAGE_LOCK_MODE_AUTO) {
> > +        mode = bdrv_get_flags(bs) & BDRV_O_RDWR ?
> > +               IMAGE_LOCK_MODE_EXCLUSIVE :
> > +               IMAGE_LOCK_MODE_SHARED;
> > +    }
> > +    return raw_lock_fd(s->lock_fd, mode);
> 
> Without a comment for how lock_fd is supposed to work, this (for
> example) looks a bit weird. After all, the user is trying to lock the
> fd, and I don't find it immediately clear that lock_fd always points to
> the same file as fd, and you're using it only for locking (for reasons
> the comment should explain as well).
> 
> > +}
> > +
> >  #ifdef CONFIG_LINUX_AIO
> >  static bool raw_use_aio(int bdrv_flags)
> >  {
> > @@ -433,6 +472,7 @@ static int raw_open_common(BlockDriverState *bs, QDict 
> > *options,
> >      raw_parse_flags(bdrv_flags, &s->open_flags);
> >  
> >      s->fd = -1;
> > +    s->lock_fd = -1;
> >      fd = qemu_open(filename, s->open_flags, 0644);
> >      if (fd < 0) {
> >          ret = -errno;
> > @@ -529,6 +569,268 @@ static int raw_open(BlockDriverState *bs, QDict 
> > *options, int flags,
> >      return raw_open_common(bs, options, flags, 0, errp);
> >  }
> >  
> > +typedef enum {
> > +    RAW_REOPEN_PREPARE,
> > +    RAW_REOPEN_COMMIT,
> > +    RAW_REOPEN_ABORT
> > +} RawReopenOperation;
> > +
> > +typedef int (*RawReopenFunc)(BDRVReopenState *state,
> > +                             RawReopenOperation op,
> > +                             ImageLockMode old_lock,
> > +                             ImageLockMode new_lock,
> > +                             Error **errp);
> > +
> > +static int
> > +raw_reopen_identical(BDRVReopenState *state,
> > +                     RawReopenOperation op,
> > +                     ImageLockMode old_lock,
> > +                     ImageLockMode new_lock,
> > +                     Error **errp)
> > +{
> > +    assert(old_lock == new_lock);
> > +    return 0;
> > +}
> > +
> > +static int
> > +raw_reopen_from_unlock(BDRVReopenState *state,
> > +                       RawReopenOperation op,
> > +                       ImageLockMode old_lock,
> > +                       ImageLockMode new_lock,
> > +                       Error **errp)
> > +{
> > +    BDRVRawReopenState *raw_s = state->opaque;
> > +    int ret = 0;
> > +
> > +    assert(old_lock != new_lock);
> > +    assert(old_lock == IMAGE_LOCK_MODE_NOLOCK);
> > +    switch (op) {
> > +    case RAW_REOPEN_PREPARE:
> > +        ret = raw_lock_fd(raw_s->lock_fd, new_lock);
> > +        if (ret) {
> > +            error_setg_errno(errp, -ret, "Failed to lock new fd %d", 
> > raw_s->lock_fd);
> > +        }
> > +        break;
> > +    case RAW_REOPEN_COMMIT:
> > +    case RAW_REOPEN_ABORT:
> > +        break;
> > +    }
> > +
> > +    return ret;
> > +}
> > +
> > +static int
> > +raw_reopen_to_unlock(BDRVReopenState *state,
> > +                     RawReopenOperation op,
> > +                     ImageLockMode old_lock,
> > +                     ImageLockMode new_lock,
> > +                     Error **errp)
> > +{
> > +    BDRVRawState *s = state->bs->opaque;
> > +    int ret = 0;
> > +
> > +    assert(old_lock != new_lock);
> > +    assert(new_lock == IMAGE_LOCK_MODE_NOLOCK);
> > +    switch (op) {
> > +    case RAW_REOPEN_PREPARE:
> > +        break;
> > +    case RAW_REOPEN_COMMIT:
> > +        if (s->lock_fd >= 0) {
> > +            qemu_close(s->lock_fd);
> > +            s->lock_fd = -1;
> > +        }
> > +        break;
> > +    case RAW_REOPEN_ABORT:
> > +        break;
> > +    }
> > +
> > +    return ret;
> > +}
> > +
> > +static int
> > +raw_reopen_upgrade(BDRVReopenState *state,
> > +                   RawReopenOperation op,
> > +                   ImageLockMode old_lock,
> > +                   ImageLockMode new_lock,
> > +                   Error **errp)
> > +{
> > +    BDRVRawReopenState *raw_s = state->opaque;
> > +    BDRVRawState *s = state->bs->opaque;
> > +    int ret = 0, ret2;
> > +
> > +    assert(old_lock == IMAGE_LOCK_MODE_SHARED);
> > +    assert(new_lock == IMAGE_LOCK_MODE_EXCLUSIVE);
> > +    switch (op) {
> > +    case RAW_REOPEN_PREPARE:
> > +        ret = raw_lock_fd(raw_s->lock_fd, IMAGE_LOCK_MODE_SHARED);

[1]

> > +        if (ret) {
> > +            error_setg_errno(errp, -ret, "Failed to lock new fd (shared)");
> > +            break;
> > +        }
> > +        ret = raw_lock_fd(s->lock_fd, IMAGE_LOCK_MODE_NOLOCK);

[2]

> > +        if (ret) {
> > +            error_setg_errno(errp, -ret, "Failed to unlock old fd");
> > +            goto restore;
> > +        }
> > +        ret = raw_lock_fd(raw_s->lock_fd, IMAGE_LOCK_MODE_EXCLUSIVE);

[3]

> > +        if (ret) {
> > +            error_setg_errno(errp, -ret, "Failed to lock new fd 
> > (exclusive)");
> > +            goto restore;
> > +        }
> > +        break;
> > +    case RAW_REOPEN_COMMIT:
> > +        break;
> > +    case RAW_REOPEN_ABORT:
> > +        raw_lock_fd(raw_s->lock_fd, IMAGE_LOCK_MODE_SHARED);
> > +        ret = raw_lock_fd(s->lock_fd, IMAGE_LOCK_MODE_SHARED);

[4]

> > +        if (ret) {
> > +            error_report("Failed to restore lock on old fd");
> 
> If we get here, s->lock_fd is still locked exclusively. The following is
> a very personal opinion, but anyway: I think it would be be better for
> it to be unlocked. If it's locked too strictly, this can really break
> something; but if it's just not locked (while it should be locked in
> shared mode), everything's going to be fine unless the user makes a
> mistake. I think the latter is less bad.

There are four lock states when we land on this "abort" branch:

  a) Lock "prepare" was not run, some other error happened earlier, so the lock
     aren't changed compared to before the transaction starts: raw_s->lock_fd is
     unlocked, s->lock_fd is shared locked. In this case raw_lock_fd [4] cannot
     fail, and even if it does, s->lock_fd is in the correct state.

  b) The raw_lock_fd [1] failed. This is similar to 1), s->lock_fd is intact, so
     we are good.

  c) The raw_lock_fd [2] failed. Again, similar to above.

  d) The raw_lock_fd [3] failed. Here raw_s->lock_fd is shared locked, and
     s->lock_fd is unlocked. The correct "abort" sequence is downgrade
     raw_s->lock_fd and "shared lock" s->lock_fd again. If the "abort" sequence
     failed, s->lock_fd is unlocked.

> 
> And we can always achieve unlocking the FD by closing s->lock_fd, so I
> think that is what we should do here.

So I think this already does what you want.

> 
> > +        }
> > +        break;
> > +    }
> > +
> > +    return ret;
> > +restore:
> > +    ret2 = raw_lock_fd(s->lock_fd, IMAGE_LOCK_MODE_SHARED);
> > +    if (ret2) {
> > +        error_report("Failed to restore old lock");
> > +    }
> > +    return ret;
> > +}
> > +
> > +static int
> > +raw_reopen_downgrade(BDRVReopenState *state,
> > +                     RawReopenOperation op,
> > +                     ImageLockMode old_lock,
> > +                     ImageLockMode new_lock,
> > +                     Error **errp)
> > +{
> > +    BDRVRawReopenState *raw_s = state->opaque;
> > +    BDRVRawState *s = state->bs->opaque;
> > +    int ret = 0;
> > +
> > +    assert(old_lock == IMAGE_LOCK_MODE_EXCLUSIVE);
> > +    assert(new_lock == IMAGE_LOCK_MODE_SHARED);
> > +    switch (op) {
> > +    case RAW_REOPEN_PREPARE:
> > +        break;
> > +    case RAW_REOPEN_COMMIT:
> > +        ret = raw_lock_fd(s->lock_fd, IMAGE_LOCK_MODE_SHARED);
> > +        if (ret) {
> > +            error_report("Failed to downgrade old lock");
> > +            break;
> > +        }
> > +        ret = raw_lock_fd(raw_s->lock_fd, IMAGE_LOCK_MODE_SHARED);
> > +        if (ret) {
> > +            error_report("Failed to lock new fd (shared)");
> > +            break;
> > +        }
> > +        break;
> 
> It's really unfortunate that we cannot do anything in prepare and have
> to hope for the best in commit. But I can't come up with anything better.
> 
> And if something does fail, the FD will be unlocked after reopening.
> That's good enough for me.
> 
> > +    case RAW_REOPEN_ABORT:
> > +        break;
> > +    }
> > +
> > +    return ret;
> > +}
> > +
> > +/**
> > + * Transactionally moving between three possible locking states is tricky 
> > and
> > + * must be done carefully. That is mostly because downgrading an exclusive 
> > lock
> > + * to shared or unlocked is not guaranteed to be revertable. As a result, 
> > in
> > + * such cases we have to defer the downgraing to "commit", given that no 
> > revert
> > + * will happen after that point, and that downgrading a lock should never 
> > fail.
> > + *
> > + * On the other hand, upgrading a lock (e.g. from unlocked or shared to
> > + * exclusive lock) must happen in "prepare" because it may fail.
> > + *
> > + * Manage the operation matrix with this state transition table to make
> > + * fulfulling above conditions easier.
> > + */
> > +static const struct RawReopenFuncRecord {
> > +    ImageLockMode old_lock;
> > +    ImageLockMode new_lock;
> > +    RawReopenFunc func;
> > +    bool need_lock_fd;
> > +} reopen_functions[] = {
> > +    {IMAGE_LOCK_MODE_NOLOCK, IMAGE_LOCK_MODE_NOLOCK, raw_reopen_identical, 
> > false},
> > +    {IMAGE_LOCK_MODE_NOLOCK, IMAGE_LOCK_MODE_SHARED, 
> > raw_reopen_from_unlock, true},
> > +    {IMAGE_LOCK_MODE_NOLOCK, IMAGE_LOCK_MODE_EXCLUSIVE, 
> > raw_reopen_from_unlock, true},
> > +    {IMAGE_LOCK_MODE_SHARED, IMAGE_LOCK_MODE_NOLOCK, raw_reopen_to_unlock, 
> > false},
> > +    {IMAGE_LOCK_MODE_SHARED, IMAGE_LOCK_MODE_SHARED, raw_reopen_identical, 
> > false},
> > +    {IMAGE_LOCK_MODE_SHARED, IMAGE_LOCK_MODE_EXCLUSIVE, 
> > raw_reopen_upgrade, true},
> > +    {IMAGE_LOCK_MODE_EXCLUSIVE, IMAGE_LOCK_MODE_NOLOCK, 
> > raw_reopen_to_unlock, false},
> > +    {IMAGE_LOCK_MODE_EXCLUSIVE, IMAGE_LOCK_MODE_SHARED, 
> > raw_reopen_downgrade, true},
> > +    {IMAGE_LOCK_MODE_EXCLUSIVE, IMAGE_LOCK_MODE_EXCLUSIVE, 
> > raw_reopen_identical, false},
> > +};
> > +
> > +static int raw_reopen_handle_lock(BDRVReopenState *state,
> > +                                  RawReopenOperation op,
> > +                                  Error **errp)
> > +{
> > +    BDRVRawReopenState *raw_s = state->opaque;
> 
> Please choose another name, it's hard not to confuse this with the
> BDRVRawState all the time. (e.g. raw_rs or just rs would be enough.)

Sorry I can't change the name in this patch, it will cause more inconsistency
because raw_reopen_* already use the name broadly. If you really insist it's a
bad name, I can append or prepend a patch in the next version.

> 
> (Same in the places above, I got confused more than once...)
> 
> > +    BDRVRawState *s = state->bs->opaque;
> > +    ImageLockMode old_lock, new_lock;
> > +    const struct RawReopenFuncRecord *rec;
> > +    int ret;
> > +
> > +    old_lock = bdrv_get_lock_mode(state->bs);
> > +    new_lock = bdrv_lock_mode_from_flags(state->flags);
> > +
> > +    if (old_lock == IMAGE_LOCK_MODE__MAX) {
> > +        /* bs was not locked, leave it unlocked. */
> > +        old_lock = new_lock = IMAGE_LOCK_MODE_NOLOCK;
> > +    }
> > +
> > +    if (old_lock == IMAGE_LOCK_MODE_AUTO) {
> > +        old_lock = bdrv_get_flags(state->bs) & BDRV_O_RDWR ?
> > +                   IMAGE_LOCK_MODE_EXCLUSIVE : IMAGE_LOCK_MODE_SHARED;
> > +    }
> > +
> > +    if (new_lock == IMAGE_LOCK_MODE_AUTO) {
> > +        new_lock = state->flags & BDRV_O_RDWR ?
> > +                   IMAGE_LOCK_MODE_EXCLUSIVE : IMAGE_LOCK_MODE_SHARED;
> > +    }
> > +
> > +    for (rec = &reopen_functions[0];
> > +         rec < &reopen_functions[ARRAY_SIZE(reopen_functions)];
> > +         rec++) {
> > +        if (rec->old_lock == old_lock && rec->new_lock == new_lock) {
> > +            break;
> > +        }
> > +    }
> > +    assert(rec != &reopen_functions[ARRAY_SIZE(reopen_functions)]);
> > +
> > +    switch (op) {
> > +    case RAW_REOPEN_PREPARE:
> > +        if (rec->need_lock_fd) {
> > +            ret = qemu_dup(raw_s->fd);
> > +            if (ret < 0) {
> > +                error_setg_errno(errp, -ret, "Failed to dup new fd");
> > +                return ret;
> > +            }
> > +            raw_s->lock_fd = ret;
> > +        }
> > +        return rec->func(state, op, old_lock, new_lock, errp);
> > +    case RAW_REOPEN_COMMIT:
> > +        rec->func(state, op, old_lock, new_lock, errp);
> > +        if (rec->need_lock_fd) {
> > +            if (s->lock_fd >= 0) {
> > +                qemu_close(s->lock_fd);
> > +            }
> > +            s->lock_fd = raw_s->lock_fd;
> > +        }
> > +        break;
> > +    case RAW_REOPEN_ABORT:
> > +        rec->func(state, op, old_lock, new_lock, errp);
> > +        if (rec->need_lock_fd && raw_s->lock_fd >= 0) {
> > +            qemu_close(raw_s->lock_fd);
> > +            raw_s->lock_fd = -1;
> > +        }
> > +        break;
> > +    }
> > +    return 0;
> > +}
> > +
> >  static int raw_reopen_prepare(BDRVReopenState *state,
> >                                BlockReopenQueue *queue, Error **errp)
> >  {
> > @@ -607,6 +909,10 @@ static int raw_reopen_prepare(BDRVReopenState *state,
> >          }
> >      }
> >  
> > +    if (!ret) {
> > +        ret = raw_reopen_handle_lock(state, RAW_REOPEN_PREPARE, errp);
> > +    }
> > +
> >      return ret;
> >  }
> >  
> > @@ -617,6 +923,8 @@ static void raw_reopen_commit(BDRVReopenState *state)
> >  
> >      s->open_flags = raw_s->open_flags;
> >  
> > +    raw_reopen_handle_lock(state, RAW_REOPEN_COMMIT, NULL);
> 
> I'd prefer &error_abort instead of NULL.

OK.

> 
> > +
> >      qemu_close(s->fd);
> >      s->fd = raw_s->fd;
> >  
> > @@ -634,6 +942,8 @@ static void raw_reopen_abort(BDRVReopenState *state)
> >          return;
> >      }
> >  
> > +    raw_reopen_handle_lock(state, RAW_REOPEN_ABORT, NULL);
> 
> Same here.

OK.

> 
> > +
> >      if (raw_s->fd >= 0) {
> >          qemu_close(raw_s->fd);
> >          raw_s->fd = -1;
> > @@ -1321,6 +1631,10 @@ static void raw_close(BlockDriverState *bs)
> >          qemu_close(s->fd);
> >          s->fd = -1;
> >      }
> > +    if (s->lock_fd >= 0) {
> > +        qemu_close(s->lock_fd);
> > +        s->lock_fd = -1;
> > +    }
> >  }
> >  
> >  static int raw_truncate(BlockDriverState *bs, int64_t offset)
> > @@ -1874,7 +2188,7 @@ BlockDriver bdrv_file = {
> >      .bdrv_get_info = raw_get_info,
> >      .bdrv_get_allocated_file_size
> >                          = raw_get_allocated_file_size,
> > -
> > +    .bdrv_lockf = raw_lockf,
> >      .create_opts = &raw_create_opts,
> >  };
> >  
> > @@ -2324,6 +2638,8 @@ static BlockDriver bdrv_host_device = {
> >  #ifdef __linux__
> >      .bdrv_aio_ioctl     = hdev_aio_ioctl,
> >  #endif
> > +
> > +    .bdrv_lockf = raw_lockf,
> >  };
> >  
> >  #if defined(__linux__) || defined(__FreeBSD__) || 
> > defined(__FreeBSD_kernel__)
> > 
> 
> Overall design looks good to me. Unfortunately we can't strictly follow
> the transaction principle for downgrades and upgrades, but I guess it'll
> work most of the time (fingers crossed).

Thanks for not yelling at this scary patch.

Fam

Re: [Qemu-block] [PATCH v8 05/36] raw-posix: Add image locking support

Reply via email to