On 06/06/17 10:59 -0700, Dan Williams wrote: > On Tue, Jun 6, 2017 at 12:22 AM, Haozhong Zhang > <haozhong.zh...@intel.com> wrote: > > Applications in Linux guest that use device-dax never trigger flush > > that can be trapped by KVM/QEMU. Meanwhile, if the host backend is not > > device-dax, QEMU cannot guarantee the persistence of guest writes. > > Before solving this flushing problem, QEMU should warn users if the > > host backend is not device-dax. > > > > Signed-off-by: Haozhong Zhang <haozhong.zh...@intel.com> > > Message-id: > > capcyv4hv2-zw8smcrtd0p_86kgr3dhovne+6t5sy2u7wxg3...@mail.gmail.com > > --- > > hw/mem/nvdimm.c | 6 ++++++ > > include/qemu/osdep.h | 9 ++++++++ > > util/osdep.c | 61 > > ++++++++++++++++++++++++++++++++++++++++++++++++++++ > > 3 files changed, 76 insertions(+) > > > > diff --git a/hw/mem/nvdimm.c b/hw/mem/nvdimm.c > > index a9b0863f20..b23542fbdf 100644 > > --- a/hw/mem/nvdimm.c > > +++ b/hw/mem/nvdimm.c > > @@ -26,6 +26,7 @@ > > #include "qapi/error.h" > > #include "qapi/visitor.h" > > #include "hw/mem/nvdimm.h" > > +#include "qemu/error-report.h" > > > > static void nvdimm_get_label_size(Object *obj, Visitor *v, const char > > *name, > > void *opaque, Error **errp) > > @@ -84,6 +85,11 @@ static void nvdimm_realize(PCDIMMDevice *dimm, Error > > **errp) > > NVDIMMDevice *nvdimm = NVDIMM(dimm); > > uint64_t align, pmem_size, size = memory_region_size(mr); > > > > + if (!qemu_fd_is_dev_dax(memory_region_get_fd(mr))) { > > + error_report("warning: nvdimm backend does not look like a DAX > > device, " > > + "unable to guarantee persistence of guest writes"); > > + } > > + > > align = memory_region_get_alignment(mr); > > > > pmem_size = size - nvdimm->label_size; > > diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h > > index 1c9f5e260c..7f26af371e 100644 > > --- a/include/qemu/osdep.h > > +++ b/include/qemu/osdep.h > > @@ -470,4 +470,13 @@ char *qemu_get_pid_name(pid_t pid); > > */ > > pid_t qemu_fork(Error **errp); > > > > +/** > > + * qemu_fd_is_dev_dax: > > + * > > + * Check whether @fd describes a DAX device. > > + * > > + * Returns true if it is; otherwise, return false. > > + */ > > +bool qemu_fd_is_dev_dax(int fd); > > + > > #endif > > diff --git a/util/osdep.c b/util/osdep.c > > index a2863c8e53..02881f96bc 100644 > > --- a/util/osdep.c > > +++ b/util/osdep.c > > @@ -471,3 +471,64 @@ writev(int fd, const struct iovec *iov, int iov_cnt) > > return readv_writev(fd, iov, iov_cnt, true); > > } > > #endif > > + > > +#ifdef __linux__ > > +static ssize_t qemu_dev_dax_sysfs_read(int fd, const char *entry, > > + char *buf, size_t len) > > +{ > > + ssize_t read_bytes; > > + struct stat st; > > + unsigned int major, minor; > > + char *path, *pos; > > + int sysfs_fd; > > + > > + if (fstat(fd, &st)) { > > + return 0; > > + } > > + > > + major = major(st.st_rdev); > > + minor = minor(st.st_rdev); > > + path = g_strdup_printf("/sys/dev/char/%u:%u/%s", major, minor, entry); > > + > > + sysfs_fd = open(path, O_RDONLY); > > + g_free(path); > > + if (sysfs_fd == -1) { > > + return 0; > > + } > > + > > + read_bytes = read(sysfs_fd, buf, len - 1); > > + close(sysfs_fd); > > + if (read_bytes > 0) { > > + buf[read_bytes] = '\0'; > > + pos = g_strstr_len(buf, read_bytes, "\n"); > > + if (pos) { > > + *pos = '\0'; > > + } > > + } > > + > > + return read_bytes; > > +} > > +#endif /* __linux__ */ > > + > > +bool qemu_fd_is_dev_dax(int fd) > > +{ > > + bool is_dax = false; > > + > > +#ifdef __linux__ > > + char devtype[7]; > > + ssize_t len; > > + > > + if (fd == -1) { > > + return false; > > + } > > + > > + len = qemu_dev_dax_sysfs_read(fd, "device/devtype", > > + devtype, sizeof(devtype)); > > + if (len <= 0) { > > + return false; > > + } > > + is_dax = !strncmp(devtype, "nd_dax", len); > > There's no guarantee that device-dax instances are always parented by > an "nd_dax" device-type. A more reliable check is to see if > "/sys/dev/char/%u:%u/subsystem" points to "/sys/class/dax".
Thanks for pointing out this, will change in the next version. Haozhong