> On Feb 6, 2023, at 11:07 AM, Alex Williamson <alex.william...@redhat.com> > wrote: > > On Wed, 1 Feb 2023 21:55:48 -0800 > John Johnson <john.g.john...@oracle.com> wrote: > >> Add support for posted writes on remote devices >> >> Signed-off-by: Elena Ufimtseva <elena.ufimts...@oracle.com> >> Signed-off-by: John G Johnson <john.g.john...@oracle.com> >> Signed-off-by: Jagannathan Raman <jag.ra...@oracle.com> >> --- >> hw/vfio/user-protocol.h | 12 +++++ >> hw/vfio/user.h | 1 + >> include/hw/vfio/vfio-common.h | 3 +- >> hw/vfio/common.c | 23 ++++++--- >> hw/vfio/pci.c | 5 +- >> hw/vfio/user-pci.c | 5 ++ >> hw/vfio/user.c | 112 >> ++++++++++++++++++++++++++++++++++++++++++ >> hw/vfio/trace-events | 1 + >> 8 files changed, 154 insertions(+), 8 deletions(-) >> >> diff --git a/hw/vfio/user-protocol.h b/hw/vfio/user-protocol.h >> index 6f70a48..6987435 100644 >> --- a/hw/vfio/user-protocol.h >> +++ b/hw/vfio/user-protocol.h >> @@ -139,4 +139,16 @@ typedef struct { >> uint64_t offset; >> } VFIOUserRegionInfo; >> >> +/* >> + * VFIO_USER_REGION_READ >> + * VFIO_USER_REGION_WRITE >> + */ >> +typedef struct { >> + VFIOUserHdr hdr; >> + uint64_t offset; >> + uint32_t region; >> + uint32_t count; >> + char data[]; >> +} VFIOUserRegionRW; >> + >> #endif /* VFIO_USER_PROTOCOL_H */ >> diff --git a/hw/vfio/user.h b/hw/vfio/user.h >> index e6485dc..3012a86 100644 >> --- a/hw/vfio/user.h >> +++ b/hw/vfio/user.h >> @@ -84,6 +84,7 @@ typedef struct VFIOUserProxy { >> /* VFIOProxy flags */ >> #define VFIO_PROXY_CLIENT 0x1 >> #define VFIO_PROXY_FORCE_QUEUED 0x4 >> +#define VFIO_PROXY_NO_POST 0x8 >> >> VFIOUserProxy *vfio_user_connect_dev(SocketAddress *addr, Error **errp); >> void vfio_user_disconnect(VFIOUserProxy *proxy); >> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h >> index 9fb4c80..bbc4b15 100644 >> --- a/include/hw/vfio/vfio-common.h >> +++ b/include/hw/vfio/vfio-common.h >> @@ -57,6 +57,7 @@ typedef struct VFIORegion { >> VFIOMmap *mmaps; >> uint8_t nr; /* cache the region number for debug */ >> int fd; /* fd to mmap() region */ >> + bool post_wr; /* writes can be posted */ >> } VFIORegion; >> >> typedef struct VFIOMigration { >> @@ -180,7 +181,7 @@ struct VFIODeviceIO { >> int (*region_read)(VFIODevice *vdev, uint8_t nr, off_t off, uint32_t >> size, >> void *data); >> int (*region_write)(VFIODevice *vdev, uint8_t nr, off_t off, uint32_t >> size, >> - void *data); >> + void *data, bool post); >> }; >> >> struct VFIOContainerIO { >> diff --git a/hw/vfio/common.c b/hw/vfio/common.c >> index d26b325..de64e53 100644 >> --- a/hw/vfio/common.c >> +++ b/hw/vfio/common.c >> @@ -215,6 +215,7 @@ void vfio_region_write(void *opaque, hwaddr addr, >> uint32_t dword; >> uint64_t qword; >> } buf; >> + bool post = region->post_wr; >> int ret; >> >> switch (size) { >> @@ -235,12 +236,19 @@ void vfio_region_write(void *opaque, hwaddr addr, >> break; >> } >> >> - ret = vbasedev->io->region_write(vbasedev, region->nr, addr, size, >> &buf); >> + /* read-after-write hazard if guest can directly access region */ >> + if (region->nr_mmaps) { >> + post = false; >> + } >> + ret = vbasedev->io->region_write(vbasedev, region->nr, addr, size, &buf, >> + post); >> if (ret != size) { >> + const char *err = ret < 0 ? strerror(-ret) : "short write"; >> + >> error_report("%s(%s:region%d+0x%"HWADDR_PRIx", 0x%"PRIx64 >> - ",%d) failed: %m", >> + ",%d) failed: %s", >> __func__, vbasedev->name, region->nr, >> - addr, data, size); >> + addr, data, size, err); >> } >> trace_vfio_region_write(vbasedev->name, region->nr, addr, data, size); >> >> @@ -271,9 +279,11 @@ uint64_t vfio_region_read(void *opaque, >> >> ret = vbasedev->io->region_read(vbasedev, region->nr, addr, size, &buf); >> if (ret != size) { >> - error_report("%s(%s:region%d+0x%"HWADDR_PRIx", %d) failed: %m", >> + const char *err = ret < 0 ? strerror(-ret) : "short read"; >> + >> + error_report("%s(%s:region%d+0x%"HWADDR_PRIx", %d) failed: %s", >> __func__, vbasedev->name, region->nr, >> - addr, size); >> + addr, size, err); >> return (uint64_t)-1; >> } >> >> @@ -1584,6 +1594,7 @@ int vfio_region_setup(Object *obj, VFIODevice >> *vbasedev, VFIORegion *region, >> region->size = info->size; >> region->fd_offset = info->offset; >> region->nr = index; >> + region->post_wr = false; >> if (vbasedev->regfds != NULL) { >> region->fd = vbasedev->regfds[index]; >> } else { >> @@ -2711,7 +2722,7 @@ static int vfio_io_region_read(VFIODevice *vbasedev, >> uint8_t index, off_t off, >> } >> >> static int vfio_io_region_write(VFIODevice *vbasedev, uint8_t index, off_t >> off, >> - uint32_t size, void *data) >> + uint32_t size, void *data, bool post) >> { > > This is all a bit confusing as a non-posted write on bare metal would > require a follow-up read to flush the write, but in the kernel case we > rely on both the bus protocols and the userspace driver to perform such > actions to enforce coherency. The read-after-write hazard comment above > suggests the issue is split access between mmap and read/write across > the region, where the mmap access bypasses the socket protocol. But > isn't this actually across the whole device? For example, PCI doesn't > care which BAR a write targets, reads to another BAR cannot bypass the > write, aiui. IOW, couldn't a write to a BAR that doesn't support mmap > affect the behavior of a BAR that does support mmap, and therefore > there should be no posted writes for the entire device if any region > supports mmap access? >
The protocol has sequential ordering, so issues arise only when reads to mapped regions pass writes that were sent asynchronously. The code is designed to handle common driver ordering operations: reading config space (which can’t be mapped, so they’re sequentially ordered by the protocol) and reading a nearby register (in the same region). There is a ’no-post’ vfio-user option in case the driver relies on ordering reads to other (non-config) regions I made this the default since of the the primary customers is an NVME device server that polls a mapped doorbell region and wants posted writes. > I wonder if a better approach wouldn't be to make all writes operations > synchronous and avoid read-after-write hazard by performing writes > through the mmap when available, ie. make use of the same bypass to > avoid the hazard. > I can flip the default to be strict PCI, and force the NVME folks to use an option to enable performance. > In any case, I think this deserves more comments, both why vfio-user > needs it and why vfio-kernel doesn't, and ideally the initial > read/write implementation would be entirely synchronous with posted > write support added as a separate patch with full explanation and > justification. Thanks, > Sure. Ordering is always a pain to reason about. JJ