[PATCH v1] block/nvme: introduce PMR support from NVMe 1.4 spec
This patch introduces support for PMR that has been defined as part of NVMe 1.4 spec. User can now specify a pmr_file which will be mmap'ed into qemu address space and subsequently in PCI BAR 2. Guest OS can perform MMIO reads and writes to the PMR region that will stay persistent across system reboot. Signed-off-by: Andrzej Jakowski --- hw/block/nvme.c | 145 ++- hw/block/nvme.h | 5 ++ hw/block/trace-events | 5 ++ include/block/nvme.h | 172 ++ 4 files changed, 326 insertions(+), 1 deletion(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index d28335cbf3..836cf8d426 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -19,10 +19,14 @@ * -drive file=,if=none,id= * -device nvme,drive=,serial=,id=, \ * cmb_size_mb=, \ + * [pmr_file=,] \ * num_queues= * * Note cmb_size_mb denotes size of CMB in MB. CMB is assumed to be at * offset 0 in BAR2 and supports only WDS, RDS and SQS for now. + * + * Either cmb or pmr - due to limitation in available BAR indexes. + * pmr_file file needs to be preallocated and be multiple of MiB in size. 
*/ #include "qemu/osdep.h" @@ -1141,6 +1145,26 @@ static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, uint64_t data, NVME_GUEST_ERR(nvme_ub_mmiowr_cmbsz_readonly, "invalid write to read only CMBSZ, ignored"); return; +case 0xE00: /* PMRCAP */ +NVME_GUEST_ERR(nvme_ub_mmiowr_pmrcap_readonly, + "invalid write to PMRCAP register, ignored"); +return; +case 0xE04: /* TODO PMRCTL */ +break; +case 0xE08: /* PMRSTS */ +NVME_GUEST_ERR(nvme_ub_mmiowr_pmrsts_readonly, + "invalid write to PMRSTS register, ignored"); +return; +case 0xE0C: /* PMREBS */ +NVME_GUEST_ERR(nvme_ub_mmiowr_pmrebs_readonly, + "invalid write to PMREBS register, ignored"); +return; +case 0xE10: /* PMRSWTP */ +NVME_GUEST_ERR(nvme_ub_mmiowr_pmrswtp_readonly, + "invalid write to PMRSWTP register, ignored"); +return; +case 0xE14: /* TODO PMRMSC */ + break; default: NVME_GUEST_ERR(nvme_ub_mmiowr_invalid, "invalid MMIO write," @@ -1303,6 +1327,38 @@ static const MemoryRegionOps nvme_cmb_ops = { }, }; +static void nvme_pmr_write(void *opaque, hwaddr addr, uint64_t data, +unsigned size) +{ +NvmeCtrl *n = (NvmeCtrl *)opaque; +stn_le_p(&n->pmrbuf[addr], size, data); +} + +static uint64_t nvme_pmr_read(void *opaque, hwaddr addr, unsigned size) +{ +NvmeCtrl *n = (NvmeCtrl *)opaque; +if (!NVME_PMRCAP_PMRWBM(n->bar.pmrcap)) { +int ret; +ret = msync(n->pmrbuf, n->f_pmr_size, MS_SYNC); +if (!ret) { +NVME_GUEST_ERR(nvme_ub_mmiowr_pmrread_barrier, + "error while persisting data"); +} +} +return ldn_le_p(&n->pmrbuf[addr], size); +} + +static const MemoryRegionOps nvme_pmr_ops = { +.read = nvme_pmr_read, +.write = nvme_pmr_write, +.endianness = DEVICE_LITTLE_ENDIAN, +.impl = { +.min_access_size = 1, +.max_access_size = 8, +}, +}; + + static void nvme_realize(PCIDevice *pci_dev, Error **errp) { NvmeCtrl *n = NVME(pci_dev); @@ -1332,6 +1388,37 @@ static void nvme_realize(PCIDevice *pci_dev, Error **errp) error_setg(errp, "serial property not set"); return; } + +if (!n->cmb_size_mb && n->pmr_file) { +int fd; + +n->f_pmr = 
fopen(n->pmr_file, "r+b"); +if (!n->f_pmr) { +error_setg(errp, "pmr backend file open error"); +return; +} + +fseek(n->f_pmr, 0L, SEEK_END); +n->f_pmr_size = ftell(n->f_pmr); +fseek(n->f_pmr, 0L, SEEK_SET); + +/* pmr file size needs to be multiple of MiB in size */ +if (!n->f_pmr_size || n->f_pmr_size % (1 << 20)) { +error_setg(errp, "pmr backend file size needs to be greater than 0" + "and multiple of MiB in size"); +return; +} + +fd = fileno(n->f_pmr); +n->pmrbuf = mmap(NULL, n->f_pmr_size, + (PROT_READ | PROT_WRITE), MAP_SHARED, fd, 0); + +if (n->pmrbuf == MAP_FAILED) { +error_setg(errp, "pmr backend file mmap error"); +return; +} +} + blkconf_blocksizes(&n->conf); if (!blkconf_apply_backend_options(&n->conf, blk_is_read_only(n->conf.blk), false, errp)) { @@ -1393,7 +1480,6 @@ static void nvme_realize(PCIDevice *pci_dev, Error **errp) n->bar.intmc = n->bar.intms = 0; if (n->cmb_size_mb) { - NVME_CMBLOC_SET_BIR(n->bar.cmbloc, 2); NVME_CMBLOC_SET_OFST(n->bar.cmbloc, 0); @@ -1415,6 +1501,52 @@ static void nvme_rea
Re: [PATCH v1] block/nvme: introduce PMR support from NVMe 1.4 spec
Patchew URL: https://patchew.org/QEMU/20200218224811.30050-1-andrzej.jakow...@linux.intel.com/ Hi, This series failed the docker-mingw@fedora build test. Please find the testing commands and their output below. If you have Docker installed, you can probably reproduce it locally. === TEST SCRIPT BEGIN === #! /bin/bash export ARCH=x86_64 make docker-image-fedora V=1 NETWORK=1 time make docker-test-mingw@fedora J=14 NETWORK=1 === TEST SCRIPT END === CC hw/display/sii9022.o CC hw/display/ssd0303.o /tmp/qemu-test/src/hw/block/nvme.c: In function 'nvme_pmr_read': /tmp/qemu-test/src/hw/block/nvme.c:1342:15: error: implicit declaration of function 'msync'; did you mean 'fsync'? [-Werror=implicit-function-declaration] ret = msync(n->pmrbuf, n->f_pmr_size, MS_SYNC); ^ fsync /tmp/qemu-test/src/hw/block/nvme.c:1342:15: error: nested extern declaration of 'msync' [-Werror=nested-externs] /tmp/qemu-test/src/hw/block/nvme.c:1342:47: error: 'MS_SYNC' undeclared (first use in this function) ret = msync(n->pmrbuf, n->f_pmr_size, MS_SYNC); ^~~ /tmp/qemu-test/src/hw/block/nvme.c:1342:47: note: each undeclared identifier is reported only once for each function it appears in /tmp/qemu-test/src/hw/block/nvme.c: In function 'nvme_realize': /tmp/qemu-test/src/hw/block/nvme.c:1413:21: error: implicit declaration of function 'mmap'; did you mean 'max'? [-Werror=implicit-function-declaration] n->pmrbuf = mmap(NULL, n->f_pmr_size, ^~~~ max /tmp/qemu-test/src/hw/block/nvme.c:1413:21: error: nested extern declaration of 'mmap' [-Werror=nested-externs] /tmp/qemu-test/src/hw/block/nvme.c:1414:27: error: 'PROT_READ' undeclared (first use in this function); did you mean 'OF_READ'? (PROT_READ | PROT_WRITE), MAP_SHARED, fd, 0); ^ OF_READ /tmp/qemu-test/src/hw/block/nvme.c:1414:39: error: 'PROT_WRITE' undeclared (first use in this function); did you mean 'OF_WRITE'? 
(PROT_READ | PROT_WRITE), MAP_SHARED, fd, 0); ^~ OF_WRITE /tmp/qemu-test/src/hw/block/nvme.c:1414:52: error: 'MAP_SHARED' undeclared (first use in this function); did you mean 'RAM_SHARED'? (PROT_READ | PROT_WRITE), MAP_SHARED, fd, 0); ^~ RAM_SHARED /tmp/qemu-test/src/hw/block/nvme.c:1416:26: error: 'MAP_FAILED' undeclared (first use in this function); did you mean 'WAIT_FAILED'? if (n->pmrbuf == MAP_FAILED) { ^~ WAIT_FAILED /tmp/qemu-test/src/hw/block/nvme.c: In function 'nvme_exit': /tmp/qemu-test/src/hw/block/nvme.c:1583:13: error: implicit declaration of function 'munmap' [-Werror=implicit-function-declaration] munmap(n->pmrbuf, n->f_pmr_size); ^~ /tmp/qemu-test/src/hw/block/nvme.c:1583:13: error: nested extern declaration of 'munmap' [-Werror=nested-externs] cc1: all warnings being treated as errors make: *** [/tmp/qemu-test/src/rules.mak:69: hw/block/nvme.o] Error 1 make: *** Waiting for unfinished jobs Traceback (most recent call last): File "./tests/docker/docker.py", line 664, in --- raise CalledProcessError(retcode, cmd) subprocess.CalledProcessError: Command '['sudo', '-n', 'docker', 'run', '--label', 'com.qemu.instance.uuid=058833b8d96e4c67a646a099f4118351', '-u', '1003', '--security-opt', 'seccomp=unconfined', '--rm', '-e', 'TARGET_LIST=', '-e', 'EXTRA_CONFIGURE_OPTS=', '-e', 'V=', '-e', 'J=14', '-e', 'DEBUG=', '-e', 'SHOW_ENV=', '-e', 'CCACHE_DIR=/var/tmp/ccache', '-v', '/home/patchew2/.cache/qemu-docker-ccache:/var/tmp/ccache:z', '-v', '/var/tmp/patchew-tester-tmp-0kstbie3/src/docker-src.2020-02-18-20.04.28.2259:/var/tmp/qemu:z,ro', 'qemu:fedora', '/var/tmp/qemu/run', 'test-mingw']' returned non-zero exit status 2. 
filter=--filter=label=com.qemu.instance.uuid=058833b8d96e4c67a646a099f4118351 make[1]: *** [docker-run] Error 1 make[1]: Leaving directory `/var/tmp/patchew-tester-tmp-0kstbie3/src' make: *** [docker-run-test-mingw@fedora] Error 2 real2m39.403s user0m8.170s The full log is available at http://patchew.org/logs/20200218224811.30050-1-andrzej.jakow...@linux.intel.com/testing.docker-mingw@fedora/?type=message. --- Email generated automatically by Patchew [https://patchew.org/]. Please send your feedback to patchew-de...@redhat.com
Re: [PATCH v1] block/nvme: introduce PMR support from NVMe 1.4 spec
On 2/18/20 6:07 PM, no-re...@patchew.org wrote: > === TEST SCRIPT BEGIN === > #! /bin/bash > export ARCH=x86_64 > make docker-image-fedora V=1 NETWORK=1 > time make docker-test-mingw@fedora J=14 NETWORK=1 > === TEST SCRIPT END === > > CC hw/display/sii9022.o > CC hw/display/ssd0303.o > /tmp/qemu-test/src/hw/block/nvme.c: In function 'nvme_pmr_read': > /tmp/qemu-test/src/hw/block/nvme.c:1342:15: error: implicit declaration of > function 'msync'; did you mean 'fsync'? > [-Werror=implicit-function-declaration] > ret = msync(n->pmrbuf, n->f_pmr_size, MS_SYNC); >^ >fsync > /tmp/qemu-test/src/hw/block/nvme.c:1342:15: error: nested extern declaration > of 'msync' [-Werror=nested-externs] > /tmp/qemu-test/src/hw/block/nvme.c:1342:47: error: 'MS_SYNC' undeclared > (first use in this function) > ret = msync(n->pmrbuf, n->f_pmr_size, MS_SYNC); >^~~ > /tmp/qemu-test/src/hw/block/nvme.c:1342:47: note: each undeclared identifier > is reported only once for each function it appears in > /tmp/qemu-test/src/hw/block/nvme.c: In function 'nvme_realize': > /tmp/qemu-test/src/hw/block/nvme.c:1413:21: error: implicit declaration of > function 'mmap'; did you mean 'max'? [-Werror=implicit-function-declaration] > n->pmrbuf = mmap(NULL, n->f_pmr_size, > ^~~~ This patch seems to fail on cross-compilation for Windows env. I plan to submit second version of this patch which will conditionally support PMR for Linux environment only. It should take care of this problem. Do you see any better fix for that?
Re: [PATCH v1] block/nvme: introduce PMR support from NVMe 1.4 spec
On Tue, Feb 18, 2020 at 03:48:11PM -0700, Andrzej Jakowski wrote: > This patch introduces support for PMR that has been defined as part of NVMe > 1.4 > spec. User can now specify a pmr_file which will be mmap'ed into qemu address > space and subsequently in PCI BAR 2. Guest OS can perform mmio read and writes > to the PMR region that will stay persistent accross system reboot. > > Signed-off-by: Andrzej Jakowski > --- > hw/block/nvme.c | 145 ++- > hw/block/nvme.h | 5 ++ > hw/block/trace-events | 5 ++ > include/block/nvme.h | 172 ++ > 4 files changed, 326 insertions(+), 1 deletion(-) NVDIMM folks, please take a look. There seems to be commonality here. Can this use -object memory-backend-file instead of manually opening and mapping a file? Also CCing David Gilbert because there is some similarity with the vhost-user-fs's DAX Window feature where QEMU mmaps regions of files into a BAR. > @@ -1303,6 +1327,38 @@ static const MemoryRegionOps nvme_cmb_ops = { > }, > }; > > +static void nvme_pmr_write(void *opaque, hwaddr addr, uint64_t data, > +unsigned size) > +{ > +NvmeCtrl *n = (NvmeCtrl *)opaque; > +stn_le_p(&n->pmrbuf[addr], size, data); > +} > + > +static uint64_t nvme_pmr_read(void *opaque, hwaddr addr, unsigned size) > +{ > +NvmeCtrl *n = (NvmeCtrl *)opaque; > +if (!NVME_PMRCAP_PMRWBM(n->bar.pmrcap)) { > +int ret; > +ret = msync(n->pmrbuf, n->f_pmr_size, MS_SYNC); > +if (!ret) { > +NVME_GUEST_ERR(nvme_ub_mmiowr_pmrread_barrier, > + "error while persisting data"); > +} > +} Why is msync(2) done on memory loads instead of stores? signature.asc Description: PGP signature
Re: [PATCH v1] block/nvme: introduce PMR support from NVMe 1.4 spec
On 2/21/20 6:45 AM, Stefan Hajnoczi wrote: > Why is msync(2) done on memory loads instead of stores? This is my interpretation of NVMe spec wording with regards to PMRWBM field which says: "The completion of a memory read from any Persistent Memory Region address ensures that all prior writes to the Persistent Memory Region have completed and are persistent."
Re: [PATCH v1] block/nvme: introduce PMR support from NVMe 1.4 spec
On Fri, Feb 21, 2020 at 3:36 PM Andrzej Jakowski wrote: > On 2/21/20 6:45 AM, Stefan Hajnoczi wrote: > > Why is msync(2) done on memory loads instead of stores? > > This is my interpretation of NVMe spec wording with regards to PMRWBM field > which says: > > "The completion of a memory read from any Persistent > Memory Region address ensures that all prior writes to the > Persistent Memory Region have completed and are > persistent." Thanks, I haven't read the PMR section of the spec :). A synchronous operation is bad for virtualization performance. While the sync may be a cheap operation in hardware, it can be arbitrarily expensive with msync(2). The vCPU will be stuck until msync(2) completes on the host. It's also a strange design choice since performance will suffer when an unrelated read has to wait for writes to complete. This is especially problematic for multi-threaded applications or multi-core systems where I guess this case is hit frequently. Maybe it's so cheap in hardware that it doesn't matter? But then why didn't NVDIMM use this mechanism? If anyone knows the answer I'd be interested in learning. But this isn't a criticism of the patch - of course it needs to implement the hardware spec and we can't change it. Stefan
Re: [PATCH v1] block/nvme: introduce PMR support from NVMe 1.4 spec
* Stefan Hajnoczi (stefa...@gmail.com) wrote: > On Fri, Feb 21, 2020 at 3:36 PM Andrzej Jakowski > wrote: > > On 2/21/20 6:45 AM, Stefan Hajnoczi wrote: > > > Why is msync(2) done on memory loads instead of stores? > > > > This is my interpretation of NVMe spec wording with regards to PMRWBM field > > which says: > > > > "The completion of a memory read from any Persistent > > Memory Region address ensures that all prior writes to the > > Persistent Memory Region have completed and are > > persistent." > > Thanks, I haven't read the PMR section of the spec :). > > A synchronous operation is bad for virtualization performance. While > the sync may be a cheap operation in hardware, it can be arbitrarily > expensive with msync(2). The vCPU will be stuck until msync(2) > completes on the host. > > It's also a strange design choice since performance will suffer when > an unrelated read has to wait for writes to complete. This is > especially problematic for multi-threaded applications or multi-core > systems where I guess this case is hit frequently. Maybe it's so > cheap in hardware that it doesn't matter? But then why didn't NVDIMM > use this mechanism? > > If anyone knows the answer I'd be interested in learning. But this > isn't a criticism of the patch - of course it needs to implement the > hardware spec and we can't change it. Is this coming from the underlying PCIe spec ? In PCIe Base 4.0 Rev 0.3 Feb19-2014, section 2.4.1 Transaction ordering, there's a Table 2-39 and entry B2a in that table is: 'A Read Request must not pass a Posted Request unless B2b applies.' and a posted request is defined as a 'Memory Write Request or a Message Request'. ??? Dave > Stefan > -- Dr. David Alan Gilbert / dgilb...@redhat.com / Manchester, UK
Re: [PATCH v1] block/nvme: introduce PMR support from NVMe 1.4 spec
On Fri, Feb 21, 2020, 17:50 Dr. David Alan Gilbert wrote: > * Stefan Hajnoczi (stefa...@gmail.com) wrote: > > On Fri, Feb 21, 2020 at 3:36 PM Andrzej Jakowski > > wrote: > > > On 2/21/20 6:45 AM, Stefan Hajnoczi wrote: > > > > Why is msync(2) done on memory loads instead of stores? > > > > > > This is my interpretation of NVMe spec wording with regards to PMRWBM > field > > > which says: > > > > > > "The completion of a memory read from any Persistent > > > Memory Region address ensures that all prior writes to the > > > Persistent Memory Region have completed and are > > > persistent." > > > > Thanks, I haven't read the PMR section of the spec :). > > > > A synchronous operation is bad for virtualization performance. While > > the sync may be a cheap operation in hardware, it can be arbitrarily > > expensive with msync(2). The vCPU will be stuck until msync(2) > > completes on the host. > > > > It's also a strange design choice since performance will suffer when > > an unrelated read has to wait for writes to complete. This is > > especially problematic for multi-threaded applications or multi-core > > systems where I guess this case is hit frequently. Maybe it's so > > cheap in hardware that it doesn't matter? But then why didn't NVDIMM > > use this mechanism? > > > > If anyone knows the answer I'd be interested in learning. But this > > isn't a criticism of the patch - of course it needs to implement the > > hardware spec and we can't change it. > > Is this coming from the underlying PCIe spec ? > In PCIe Base 4.0 Rev 0.3 Feb19-2014, section 2.4.1 Transaction ordering, > there's a Table 2-39 and entry B2a in that table is: > > > 'A Read Request must not pass a Posted Request unless B2b applies.' > > and a posted request is defined as a 'Memory Write Request or a Message > Request'. > > ??? > No, that relates to transaction ordering in PCI, not data persistence in PMR. PMR can define whatever persistence semantics it wants. 
The completion of the write transaction at the PCI level does not mean data has to be persistent at the PMR level. Stefan >
Re: [PATCH v1] block/nvme: introduce PMR support from NVMe 1.4 spec
* Stefan Hajnoczi (stefa...@gmail.com) wrote: > On Tue, Feb 18, 2020 at 03:48:11PM -0700, Andrzej Jakowski wrote: > > This patch introduces support for PMR that has been defined as part of NVMe > > 1.4 > > spec. User can now specify a pmr_file which will be mmap'ed into qemu > > address > > space and subsequently in PCI BAR 2. Guest OS can perform mmio read and > > writes > > to the PMR region that will stay persistent accross system reboot. > > > > Signed-off-by: Andrzej Jakowski > > --- > > hw/block/nvme.c | 145 ++- > > hw/block/nvme.h | 5 ++ > > hw/block/trace-events | 5 ++ > > include/block/nvme.h | 172 ++ > > 4 files changed, 326 insertions(+), 1 deletion(-) > > NVDIMM folks, please take a look. There seems to be commonality here. > > Can this use -object memory-backend-file instead of manually opening and > mapping a file? > > Also CCing David Gilbert because there is some similarity with the > vhost-user-fs's DAX Window feature where QEMU mmaps regions of files > into a BAR. I guess the biggest difference here is that the read can have the side effect; in my world I don't have to set read/write/endian ops - I just map a chunk of memory and use memory_region_add_subregion, so all the read/writes are native read/writes - I assume that would be a lot faster - I guess it depends if NVME_PMRCAP_PMRWBM is something constant you know early on; if you know that you don't need to do side effects in the read you could do the same trick and avoid the IO ops altogether. Isn't there also a requirement that BARs are powers of two? Wouldn't you need to ensure the PMR file is a power of 2 in size? 
Dave > > @@ -1303,6 +1327,38 @@ static const MemoryRegionOps nvme_cmb_ops = { > > }, > > }; > > > > +static void nvme_pmr_write(void *opaque, hwaddr addr, uint64_t data, > > +unsigned size) > > +{ > > +NvmeCtrl *n = (NvmeCtrl *)opaque; > > +stn_le_p(&n->pmrbuf[addr], size, data); > > +} > > + > > +static uint64_t nvme_pmr_read(void *opaque, hwaddr addr, unsigned size) > > +{ > > +NvmeCtrl *n = (NvmeCtrl *)opaque; > > +if (!NVME_PMRCAP_PMRWBM(n->bar.pmrcap)) { > > +int ret; > > +ret = msync(n->pmrbuf, n->f_pmr_size, MS_SYNC); > > +if (!ret) { > > +NVME_GUEST_ERR(nvme_ub_mmiowr_pmrread_barrier, > > + "error while persisting data"); > > +} > > +} > > Why is msync(2) done on memory loads instead of stores? -- Dr. David Alan Gilbert / dgilb...@redhat.com / Manchester, UK
Re: [PATCH v1] block/nvme: introduce PMR support from NVMe 1.4 spec
Hi Andrzej, After having looked at the PMRWBM part of the spec, I think that the Bit 1 mode should be implemented for slightly better performance. Bit 0 mode is not well-suited to virtualization for the reasons I mentioned in the previous email. The spec describes Bit 1 mode as "The completion of a read to the PMRSTS register shall ensure that all prior writes to the Persistent Memory Region have completed and are persistent". Stefan
Re: [PATCH v1] block/nvme: introduce PMR support from NVMe 1.4 spec
On 2/21/20 11:45 AM, Dr. David Alan Gilbert wrote: > Isn't there also a requirement that BARs are powers of two? Wouldn't > you need to ensure the PMR file is a power of 2 in size? > > Dave Agree, need to make sure it is power of 2.
Re: [PATCH v1] block/nvme: introduce PMR support from NVMe 1.4 spec
On 2/21/20 12:32 PM, Stefan Hajnoczi wrote: > Hi Andrzej, > After having looked at the PMRWBM part of the spec, I think that the > Bit 1 mode should be implemented for slightly better performance. Bit > 0 mode is not well-suited to virtualization for the reasons I > mentioned in the previous email. > > The spec describes Bit 1 mode as "The completion of a read to the > PMRSTS register shall ensure that all prior writes to the Persistent > Memory Region have completed and are persistent". > > Stefan Makes sense -- will incorporate that feedback in the second version of the patch.