This patch adds a virtio-pmem driver for KVM guests.
The guest reads the persistent memory range information from
Qemu over VIRTIO and registers it on the nvdimm_bus. It also
creates an nd_region object with the persistent memory range
information so that the existing 'nvdimm/pmem' driver can
reserve this range in the system memory map. In this way the
'virtio-pmem' driver reuses the existing functionality of the
pmem driver to register persistent memory compatible with
DAX-capable filesystems.

This patch also provides a function to perform a guest flush
over VIRTIO from the 'pmem' driver when userspace performs a
flush on a DAX memory range.

Signed-off-by: Pankaj Gupta
Reviewed-by: Yuval Shaia
Acked-by: Michael S. Tsirkin
Acked-by: Jakub Staron
Tested-by: Jakub Staron
---
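Note for context (not part of this patch): a minimal sketch of how the probe
path described above is expected to hand the host-provided range to libnvdimm
and wire in the flush hook. nvdimm_bus_register(), nvdimm_pmem_region_create()
and nvdimm_bus_unregister() are existing libnvdimm entry points; the 'start',
'size', 'nd_desc' and 'nvdimm_bus' members of struct virtio_pmem, the helper
name, and the exact flush-callback plumbing (added by the companion libnvdimm
patch in this series) are assumptions for illustration only.

#include <linux/libnvdimm.h>
#include <linux/module.h>
#include "virtio_pmem.h"

/* Hypothetical helper, for illustration only (not in this patch). */
static int virtio_pmem_register_region(struct virtio_device *vdev,
				       struct virtio_pmem *vpmem)
{
	struct nd_region_desc ndr_desc;
	struct resource res;

	/* Range advertised by the host over virtio config space */
	res.start = vpmem->start;
	res.end   = vpmem->start + vpmem->size - 1;

	/* Register a bus so the existing nvdimm/pmem driver can attach */
	vpmem->nd_desc.provider_name = "virtio-pmem";
	vpmem->nd_desc.module = THIS_MODULE;
	vpmem->nvdimm_bus = nvdimm_bus_register(&vdev->dev, &vpmem->nd_desc);
	if (!vpmem->nvdimm_bus)
		return -ENXIO;

	/* Describe the region; the flush hook routes into virtio_pmem_flush() */
	memset(&ndr_desc, 0, sizeof(ndr_desc));
	ndr_desc.res = &res;
	ndr_desc.numa_node = NUMA_NO_NODE;
	ndr_desc.provider_data = vdev;
	ndr_desc.flush = virtio_pmem_flush;

	if (!nvdimm_pmem_region_create(vpmem->nvdimm_bus, &ndr_desc)) {
		nvdimm_bus_unregister(vpmem->nvdimm_bus);
		return -ENXIO;
	}
	return 0;
}
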
drivers/nvdimm/Makefile | 1 +
drivers/nvdimm/nd_virtio.c | 124 +++
drivers/nvdimm/virtio_pmem.c | 122 ++
drivers/nvdimm/virtio_pmem.h | 55 ++
drivers/virtio/Kconfig | 11 +++
include/uapi/linux/virtio_ids.h | 1 +
include/uapi/linux/virtio_pmem.h | 35 +
7 files changed, 349 insertions(+)
create mode 100644 drivers/nvdimm/nd_virtio.c
create mode 100644 drivers/nvdimm/virtio_pmem.c
create mode 100644 drivers/nvdimm/virtio_pmem.h
create mode 100644 include/uapi/linux/virtio_pmem.h
diff --git a/drivers/nvdimm/Makefile b/drivers/nvdimm/Makefile
index 6f2a088afad6..cefe233e0b52 100644
--- a/drivers/nvdimm/Makefile
+++ b/drivers/nvdimm/Makefile
@@ -5,6 +5,7 @@ obj-$(CONFIG_ND_BTT) += nd_btt.o
obj-$(CONFIG_ND_BLK) += nd_blk.o
obj-$(CONFIG_X86_PMEM_LEGACY) += nd_e820.o
obj-$(CONFIG_OF_PMEM) += of_pmem.o
+obj-$(CONFIG_VIRTIO_PMEM) += virtio_pmem.o nd_virtio.o
nd_pmem-y := pmem.o
diff --git a/drivers/nvdimm/nd_virtio.c b/drivers/nvdimm/nd_virtio.c
new file mode 100644
index 000000000000..efc535723517
--- /dev/null
+++ b/drivers/nvdimm/nd_virtio.c
@@ -0,0 +1,124 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * virtio_pmem.c: Virtio pmem Driver
+ *
+ * Discovers persistent memory range information
+ * from host and provides a virtio based flushing
+ * interface.
+ */
+#include "virtio_pmem.h"
+#include "nd.h"
+
+ /* The interrupt handler */
+void host_ack(struct virtqueue *vq)
+{
+	struct virtio_pmem *vpmem = vq->vdev->priv;
+	struct virtio_pmem_request *req_data, *req_buf;
+	unsigned long flags;
+	unsigned int len;
+
+	spin_lock_irqsave(&vpmem->pmem_lock, flags);
+	while ((req_data = virtqueue_get_buf(vq, &len)) != NULL) {
+		req_data->done = true;
+		wake_up(&req_data->host_acked);
+
+		if (!list_empty(&vpmem->req_list)) {
+			req_buf = list_first_entry(&vpmem->req_list,
+					struct virtio_pmem_request, list);
+			req_buf->wq_buf_avail = true;
+			wake_up(&req_buf->wq_buf);
+			list_del(&req_buf->list);
+		}
+	}
+	spin_unlock_irqrestore(&vpmem->pmem_lock, flags);
+}
+EXPORT_SYMBOL_GPL(host_ack);
+
+ /* The request submission function */
+int virtio_pmem_flush(struct nd_region *nd_region)
+{
+	struct virtio_device *vdev = nd_region->provider_data;
+	struct virtio_pmem *vpmem = vdev->priv;
+	struct virtio_pmem_request *req_data;
+	struct scatterlist *sgs[2], sg, ret;
+	unsigned long flags;
+	int err, err1;
+
+	might_sleep();
+	req_data = kmalloc(sizeof(*req_data), GFP_KERNEL);
+	if (!req_data)
+		return -ENOMEM;
+
+	req_data->done = false;
+	init_waitqueue_head(&req_data->host_acked);
+	init_waitqueue_head(&req_data->wq_buf);
+	INIT_LIST_HEAD(&req_data->list);
+	req_data->req.type = cpu_to_virtio32(vdev, VIRTIO_PMEM_REQ_TYPE_FLUSH);
+	sg_init_one(&sg, &req_data->req, sizeof(req_data->req));
+	sgs[0] = &sg;
+	sg_init_one(&ret, &req_data->resp.ret, sizeof(req_data->resp));
+	sgs[1] = &ret;
+
+	spin_lock_irqsave(&vpmem->pmem_lock, flags);
+	/*
+	 * If virtqueue_add_sgs returns -ENOSPC then the req_vq virtual
+	 * queue does not have a free descriptor. We add the request
+	 * to req_list and wait for host_ack to wake us up when free
+	 * slots are available.
+	 */
+	while ((err = virtqueue_add_sgs(vpmem->req_vq, sgs, 1, 1, req_data,
+					GFP_ATOMIC)) == -ENOSPC) {
+
+		dev_err(&vdev->dev, "failed to send command to virtio pmem device, no free slots in the virtqueue\n");
+		req_data->wq_buf_avail = false;
+		list_add_tail(&req_data->list, &vpmem->req_list);
+		spin_unlock_irqrestore(&vpmem->pmem_lock, flags);
+
+		/* A host response results in "host_ack" getting called */
+		wait_event(req_data->wq_buf, req_data->wq_buf_avail);
+		spin_lock_irqsave(&vpmem->pmem_lock, flags);
+	}
+	err1 = virtqueue_kick(vpmem->req_vq);
+	spin_unlock_irqrestore(&vpmem->pmem_lock, flags);
+