KVM device-assignment implementation doesn't provide any mechanism to 
report PCI errors (related to the assigned device) to the guest VM. 
Similarly, events like suspend and resume aren't reported. This is an
obstacle to achieving high availability in a system where VMs are
controlling devices directly.

From previous discussion, it's understood that VFIO is a great solution
for KVM device assignment and ideally this work should be part of it.
Unfortunately, a solution is needed until it gets more mature.

The first step at reporting events and errors all the way up to the guest 
kernel is to provide a mechanism for the host kernel to notify userspace. 
These patches propose a solution based on pci-stub and UIO. Other solutions
exist but this one was chosen for its simplicity and compatibility with
the current model.

All comments are welcome.
Warning: Minimal testing.

thanks,
-Etienne

Signed-off-by: Etienne Martineau <etmar...@cisco.com>
---
 drivers/uio/Kconfig          |   11 ++
 drivers/uio/Makefile         |    1 +
 drivers/uio/uio_pci_stub.c   |  359 ++++++++++++++++++++++++++++++++++++++++++
 include/linux/Kbuild         |    1 +
 include/linux/uio_pci_stub.h |   31 ++++
 5 files changed, 403 insertions(+), 0 deletions(-)
 create mode 100644 drivers/uio/uio_pci_stub.c
 create mode 100644 include/linux/uio_pci_stub.h

diff --git a/drivers/uio/Kconfig b/drivers/uio/Kconfig
index bb44079..e4af9d4 100644
--- a/drivers/uio/Kconfig
+++ b/drivers/uio/Kconfig
@@ -94,4 +94,15 @@ config UIO_NETX
          To compile this driver as a module, choose M here; the module
          will be called uio_netx.
 
+config UIO_PCI_STUB
+       tristate "Simple stub driver with AER capabilities"
+       depends on PCI
+       help
+         Say Y or M here if you want to be able to reserve a PCI device
+         when it is going to be assigned to a guest operating system.
+         Also, this driver gives you the option to notify the guest
+         operating system when the device reports a PCI error.
+
+         When in doubt, say N.
+
 endif
diff --git a/drivers/uio/Makefile b/drivers/uio/Makefile
index 18fd818..c1eeedc 100644
--- a/drivers/uio/Makefile
+++ b/drivers/uio/Makefile
@@ -6,3 +6,4 @@ obj-$(CONFIG_UIO_AEC)   += uio_aec.o
 obj-$(CONFIG_UIO_SERCOS3)      += uio_sercos3.o
 obj-$(CONFIG_UIO_PCI_GENERIC)  += uio_pci_generic.o
 obj-$(CONFIG_UIO_NETX) += uio_netx.o
+obj-$(CONFIG_UIO_PCI_STUB)     += uio_pci_stub.o
diff --git a/drivers/uio/uio_pci_stub.c b/drivers/uio/uio_pci_stub.c
new file mode 100644
index 0000000..18fadcb
--- /dev/null
+++ b/drivers/uio/uio_pci_stub.c
@@ -0,0 +1,359 @@
+/*
+ * uio_pci_stub.c - Simple stub driver with AER capabilities
+ *
+ * Copyright (C) 2010 Cisco Systems
+ * Author: Etienne Martineau <etmar...@cisco.com>
+ *
+ * Based on drivers/pci/pci-stub.c by Chris Wright,
+ * Copyright (C) 2008 Red Hat, Inc.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ *
+ * Usage is simple: since the driver does not declare any device ids, you
+ * must allocate a new id to the uio_pci_stub driver and bind the device to
+ * it yourself.  For example:
+ *
+ * # echo "8086 10f5" > /sys/bus/pci/drivers/uio_pci_stub/new_id
+ * # echo -n 0000:00:19.0 > /sys/bus/pci/drivers/e1000e/unbind
+ * # echo -n 0000:00:19.0 > /sys/bus/pci/drivers/uio_pci_stub/bind
+ * # ls -l /sys/bus/pci/devices/0000:00:19.0/driver
+ * .../0000:00:19.0/driver -> ../../../bus/pci/drivers/uio_pci_stub
+ *
+ * uio_pci_stub is equivalent to pci-stub when no extra parameter is 
+ * given to the module at load time. 'aer=1' will turn on PCIe AER error 
+ * reporting. 
+ *
+ * NOTE: There is no support for suspend and resume and current implementation
+ * is not based on eventfd.
+ */
+
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/uio_driver.h>
+#include <linux/uio_pci_stub.h>
+
+/* Verbose tracing through DPRINTK() when non-zero. */
+static int debug;
+/* When non-zero at load time, init() attaches the PCI error handlers. */
+static int aer;
+/* Raw "ids=" module parameter; __initdata, so only usable during init(). */
+static char ids[1024] __initdata;
+
+#define DRIVER_VERSION "0.01"
+#define DRIVER_AUTHOR  "Etienne Martineau <etmar...@cisco.com>"
+#define DRIVER_DESC    "Simple stub driver with AER capabilities"
+MODULE_PARM_DESC(ids, "Initial PCI IDs to add to the stub driver, format is "
+                "\"vendor:device[:subvendor[:subdevice[:class[:class_mask]]]]\""
+                " and multiple comma separated entries can be specified");
+module_param_string(ids, ids, sizeof(ids), 0);
+MODULE_PARM_DESC(debug, "Debugging mode enabled or not");
+module_param(debug, bool, 0644);
+MODULE_PARM_DESC(aer, "AER error reporting enabled or not");
+module_param(aer, bool, 0644);
+
+/*
+ * Debug trace helper: prints a KERN_DEBUG message prefixed with the name of
+ * the calling function, but only while the 'debug' module parameter is set.
+ */
+#define DPRINTK(fmt, args...)                                          \
+       do {                                                            \
+               if (debug)                                              \
+                       printk(KERN_DEBUG "%s: " fmt, __func__ , ## args); \
+       } while (0)
+
+/* Per-device driver state, stored in uio_info->priv by probe(). */
+struct uio_pci_stub_priv {
+       /*
+        * Handshake counter: notify_user() sets it to -1 before notifying
+        * userspace; the control hook and the timeout path both run
+        * atomic_inc_and_test() on it to decide who owns the exchange.
+        */
+       atomic_t sync;
+       /* Verdict stored by uio_pci_stub_control(), returned by notify_user() */
+       pci_ers_result_t result;
+       /* Initialised locked; upped by the control hook, downed by notify_user() */
+       struct semaphore sem;
+       /* Backing storage for uio_info->name */
+       char name[UIO_MAX_NAME_SIZE];
+};
+
+/*
+ * uio_pci_stub_control - UIO irqcontrol hook carrying userspace's verdict
+ * @info:  UIO device the reply belongs to
+ * @value: one of enum pci_error_result
+ *
+ * For every pci error handler invoked, userspace is notified. It has
+ * access to the pci error code through 'logical BAR0'.
+ *
+ * After each notification, the kernel waits for userspace to provide
+ * the pci error result. Upon timeout, the kernel takes the default action.
+ *
+ * Most if not all UIO drivers typically use 'value' to control the state of
+ * an interrupt in the interrupt controller. Here, 'value' transports
+ * the pci error result.
+ *
+ * Returns 0 when the reply was accepted, -EINVAL when @value is not a
+ * known result and -EPIPE when userspace is out of sync with the kernel.
+ */
+static int uio_pci_stub_control(struct uio_info *info, s32 value)
+{
+       static const pci_ers_result_t ers_map[] = {
+               [RESULT_NONE]           = PCI_ERS_RESULT_NONE,
+               [RESULT_CAN_RECOVER]    = PCI_ERS_RESULT_CAN_RECOVER,
+               [RESULT_NEED_RESET]     = PCI_ERS_RESULT_NEED_RESET,
+               [RESULT_DISCONNECT]     = PCI_ERS_RESULT_DISCONNECT,
+               [RESULT_RECOVERED]      = PCI_ERS_RESULT_RECOVERED,
+       };
+       struct uio_pci_stub_priv *state = info->priv;
+
+       /* Sanity check: only values of enum pci_error_result are accepted */
+       if (value < RESULT_NONE || value > RESULT_RECOVERED)
+               return -EINVAL;
+
+       /* Userspace is out of sync unless this increment lands on zero */
+       if (!atomic_inc_and_test(&state->sync))
+               return -EPIPE;
+
+       state->result = ers_map[value];
+       up(&state->sem);
+       return 0;
+}
+
+/*
+ * logical_bar_setup - back UIO mapping @n with one zeroed kernel page
+ * @info: UIO device description whose mem[@n] entry is filled in
+ * @n:    index into info->mem[]
+ *
+ * Returns 0 on success or -ENOMEM if no page could be allocated.
+ */
+static int logical_bar_setup(struct uio_info *info, int n)
+{
+       unsigned long page;
+
+       /*
+        * The page gets mapped into userspace, so it must not expose stale
+        * kernel data: allocate it zeroed.
+        */
+       page = get_zeroed_page(GFP_KERNEL);
+       if (!page)
+               return -ENOMEM;
+
+       /*
+        * For UIO_MEM_LOGICAL the UIO core looks the page up with
+        * virt_to_page(addr), so 'addr' has to be the kernel virtual
+        * address of the page, not its physical address.
+        */
+       info->mem[n].addr = page;
+       info->mem[n].size = PAGE_SIZE;
+       info->mem[n].memtype = UIO_MEM_LOGICAL;
+       info->mem[n].internal_addr = (void *)page;
+       return 0;
+}
+
+/* Give back the page backing UIO mapping @n, if one was allocated. */
+static void logical_bar_release(struct uio_info *info, int n)
+{
+       long unsigned int page = (long unsigned int)info->mem[n].internal_addr;
+
+       if (!page)
+               return;
+
+       free_pages(page, 0);
+}
+
+/*
+ * probe - claim @dev and expose it as a UIO device
+ * @dev: PCI device being bound
+ * @id:  id entry that matched the device
+ *
+ * Allocates the uio_info, the private state and the one-page logical BAR,
+ * then registers the result with the UIO core.
+ *
+ * Returns 0 on success or a negative errno. On failure everything set up
+ * here is undone, including the drvdata pointer.
+ */
+static int __devinit probe(struct pci_dev *dev,
+                          const struct pci_device_id *id)
+{
+       int ret;
+       struct uio_info *info;
+       struct uio_pci_stub_priv *priv;
+
+       info = kzalloc(sizeof(*info), GFP_KERNEL);
+       if (!info) {
+               ret = -ENOMEM;
+               goto bad;
+       }
+
+       priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+       if (!priv) {
+               ret = -ENOMEM;
+               goto bad1;
+       }
+
+       ret = logical_bar_setup(info, 0);
+       if (ret)
+               goto bad2;
+
+       info->priv = priv;
+       info->version = DRIVER_VERSION;
+       info->irqcontrol = uio_pci_stub_control;
+       info->irq = UIO_IRQ_CUSTOM;
+
+       /* UIO name: "<bus>:<slot>.<func>  <vendor>:<device>" */
+       snprintf(priv->name, UIO_MAX_NAME_SIZE,
+               FORMAT_UIO_DEV_NAME(dev->bus->number, PCI_SLOT(dev->devfn),
+               PCI_FUNC(dev->devfn), id->vendor, id->device));
+       info->name = priv->name;
+
+       /* The semaphore starts locked: nothing has been posted yet */
+       init_MUTEX_LOCKED(&priv->sem);
+       atomic_set(&priv->sync, 0);
+       /* notify_user() and remove() find the uio_info through drvdata */
+       pci_set_drvdata(dev, info);
+
+       ret = uio_register_device(&dev->dev, info);
+       if (ret)
+               goto bad3;
+
+       dev_printk(KERN_INFO, &dev->dev, "claimed by uio_pci_stub\n");
+       return 0;
+
+bad3:
+       /* Do not leave drvdata pointing at memory that is about to be freed */
+       pci_set_drvdata(dev, NULL);
+       logical_bar_release(info, 0);
+bad2:
+       kfree(priv);
+bad1:
+       kfree(info);
+bad:
+       return ret;
+}
+
+/*
+ * remove - undo probe()
+ * @dev: PCI device being unbound
+ *
+ * Unregisters from the UIO core first, then releases the logical BAR page
+ * and the driver state.
+ */
+static void remove(struct pci_dev *dev)
+{
+       struct uio_info *uio = pci_get_drvdata(dev);
+       struct uio_pci_stub_priv *state;
+
+       uio_unregister_device(uio);
+       pci_set_drvdata(dev, NULL);
+       logical_bar_release(uio, 0);
+
+       state = uio->priv;
+       kfree(state);
+       kfree(uio);
+}
+
+/* ------------------ PCI Error Recovery infrastructure  -------------- */
+/*
+ * notify_user - forward one PCI error-recovery step to userspace
+ * @err_code: step being reported; published through logical BAR0
+ * @pdev:     device the step applies to
+ *
+ * For every step except RESUME, waits up to 50 ms for userspace to answer
+ * through the control hook.
+ *
+ * Returns the result supplied by userspace, PCI_ERS_RESULT_NONE if no
+ * answer arrived in time, or 0 for RESUME (no reply expected).
+ */
+static int notify_user(enum pci_error_code err_code, struct pci_dev *pdev)
+{
+       int err;
+       struct uio_info *info = pci_get_drvdata(pdev);
+       struct uio_pci_stub_priv *priv = info->priv;
+       struct uio_pci_stub_logical_bar *bar = info->mem[0].internal_addr;
+
+       DPRINTK("AER error code %d\n", err_code);
+
+       if (err_code == RESUME) { /* No reply expected */
+               bar->err_code = err_code;
+               uio_event_notify(info);
+               return 0;
+       }
+
+       /* Arm the handshake (-1 == reply pending), then notify user space */
+       atomic_set(&priv->sync, -1);
+       bar->err_code = err_code;
+       uio_event_notify(info);
+
+       /* Wait till userspace posts on the semaphore. Arbitrary timeout... */
+       err = down_timeout(&priv->sem, msecs_to_jiffies(50));
+       if (!err) {
+               DPRINTK("AER result code %d\n", priv->result);
+               return priv->result;
+       }
+
+       /*
+        * Timed out. If this increment does not land on zero, userspace
+        * already took the exchange in the control hook and posts on the
+        * semaphore some time after the timeout: consume that post so the
+        * semaphore is locked again for the next notification.
+        */
+       if (!atomic_inc_and_test(&priv->sync))
+               down(&priv->sem);
+
+       printk(KERN_INFO "AER userspace not responding\n");
+       return PCI_ERS_RESULT_NONE;
+}
+
+/**
+ * error_detected - called when PCI error is detected.
+ * @pdev: Pointer to PCI device
+ * @state: The current pci connection state (not forwarded to userspace)
+ *
+ * Reports ERROR_DETECTED to userspace and returns the result obtained
+ * from notify_user().
+ */
+static pci_ers_result_t error_detected(struct pci_dev *pdev,
+                                      pci_channel_state_t state)
+{
+       return notify_user(ERROR_DETECTED, pdev);
+}
+
+/**
+ * mmio_enabled - MMIO has been re-enabled, but not DMA
+ * @pdev: Pointer to PCI device
+ *
+ * Reports MMIO_ENABLED to userspace and returns the result obtained
+ * from notify_user().
+ */
+static pci_ers_result_t mmio_enabled(struct pci_dev *pdev)
+{
+       return notify_user(MMIO_ENABLED, pdev);
+}
+
+/**
+ * link_reset - PCI Express link has been reset
+ * @pdev: Pointer to PCI device
+ *
+ * Reports LINK_RESET to userspace and returns the result obtained
+ * from notify_user().
+ */
+static pci_ers_result_t link_reset(struct pci_dev *pdev)
+{
+       return notify_user(LINK_RESET, pdev);
+}
+
+/**
+ * slot_reset - called after the pci bus has been reset.
+ * @pdev: Pointer to PCI device
+ *
+ * This driver does not touch the device itself here: it only reports
+ * SLOT_RESET to userspace and returns the result obtained from
+ * notify_user().
+ */
+static pci_ers_result_t slot_reset(struct pci_dev *pdev)
+{
+       return notify_user(SLOT_RESET, pdev);
+}
+
+/**
+ * resume - resume normal operations
+ * @pdev: Pointer to PCI device
+ *
+ * Called to resume normal operations after an error recovery
+ * sequence has been completed. Reports RESUME to userspace; notify_user()
+ * does not wait for a reply in that case and its return value is ignored.
+ */
+static void resume(struct pci_dev *pdev)
+{
+       notify_user(RESUME, pdev);
+}
+
+/*
+ * PCI error-recovery callbacks. Every one of them relays its step to
+ * userspace through notify_user(). Attached to 'driver' by init() only
+ * when the 'aer' module parameter is set.
+ */
+static struct pci_error_handlers err_handler = {
+       .error_detected = error_detected,
+       .mmio_enabled = mmio_enabled,
+       .link_reset = link_reset,
+       .slot_reset = slot_reset,
+       .resume = resume,
+};
+
+/*
+ * PCI driver description. There is no static id table, so devices are
+ * matched through dynamic ids only. .err_handler is not set here; init()
+ * fills it in when the 'aer' module parameter is set.
+ */
+static struct pci_driver driver = {
+       .name = "uio_pci_stub",
+       .id_table = NULL, /* only dynamic id's */
+       .probe = probe,
+       .remove = remove,
+};
+
+/*
+ * init - module entry point
+ *
+ * Registers the PCI driver (with the error handlers attached when 'aer' is
+ * set), then adds every entry of the comma separated 'ids' module parameter
+ * as a dynamic id. Malformed entries and ids that cannot be added are
+ * reported and skipped; they do not fail the module load.
+ *
+ * Returns 0 on success or the error from pci_register_driver().
+ */
+static int __init init(void)
+{
+       char *p, *id;
+       int rc;
+
+       pr_info(DRIVER_DESC " %s" " version: " DRIVER_VERSION "\n",
+               aer?"Turned on":"Turned off");
+
+       if (aer)
+               driver.err_handler = &err_handler;
+
+       rc = pci_register_driver(&driver);
+       if (rc)
+               return rc;
+
+       /* add ids specified in the module parameter */
+       p = ids;
+       while ((id = strsep(&p, ","))) {
+               unsigned int vendor, device, subvendor = PCI_ANY_ID,
+                       subdevice = PCI_ANY_ID, class = 0, class_mask = 0;
+               int fields;
+
+               /*
+                * An unset 'ids' parameter (or a stray comma) yields an
+                * empty token; that is not an error, so skip it silently.
+                */
+               if (*id == '\0')
+                       continue;
+
+               fields = sscanf(id, "%x:%x:%x:%x:%x:%x",
+                               &vendor, &device, &subvendor, &subdevice,
+                               &class, &class_mask);
+
+               /* vendor and device are mandatory, the rest is optional */
+               if (fields < 2) {
+                       printk(KERN_WARNING
+                              "uio_pci_stub: invalid id string \"%s\"\n", id);
+                       continue;
+               }
+
+               printk(KERN_INFO
+                      "uio_pci_stub: add %04X:%04X sub=%04X:%04X cls=%08X/%08X\n",
+                      vendor, device, subvendor, subdevice, class, class_mask);
+
+               rc = pci_add_dynid(&driver, vendor, device,
+                                  subvendor, subdevice, class, class_mask, 0);
+               if (rc)
+                       printk(KERN_WARNING
+                              "uio_pci_stub: failed to add dynamic id (%d)\n", rc);
+       }
+
+       return 0;
+}
+
+/* cleanup - module exit point: unregister the PCI driver. */
+static void __exit cleanup(void)
+{
+       pci_unregister_driver(&driver);
+}
+
+/* Module entry and exit points */
+module_init(init);
+module_exit(cleanup);
+
+/* Module metadata, built from the DRIVER_* strings defined in this file */
+MODULE_VERSION(DRIVER_VERSION);
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR(DRIVER_AUTHOR);
+MODULE_DESCRIPTION(DRIVER_DESC);
+
diff --git a/include/linux/Kbuild b/include/linux/Kbuild
index 831c463..045a5de 100644
--- a/include/linux/Kbuild
+++ b/include/linux/Kbuild
@@ -362,6 +362,7 @@ header-y += udf_fs_i.h
 header-y += udp.h
 header-y += uinput.h
 header-y += uio.h
+header-y += uio_pci_stub.h
 header-y += ultrasound.h
 header-y += un.h
 header-y += unistd.h
diff --git a/include/linux/uio_pci_stub.h b/include/linux/uio_pci_stub.h
new file mode 100644
index 0000000..873c407
--- /dev/null
+++ b/include/linux/uio_pci_stub.h
@@ -0,0 +1,31 @@
+#ifndef __LINUX_UIO_PCI_STUB_H
+#define __LINUX_UIO_PCI_STUB_H
+
+/* Size of the UIO device name buffer, unless already defined elsewhere. */
+#ifndef UIO_MAX_NAME_SIZE
+#define UIO_MAX_NAME_SIZE 64
+#endif
+
+/*
+ * printf-style format string plus arguments for the UIO device name:
+ * "<busnr>:<dev>.<fcn>  <vendorid>:<deviceid>", every field in hex.
+ * The parameters are listed in the order the format string consumes them.
+ */
+#define FORMAT_UIO_DEV_NAME(busnr,dev,fcn,vendorid,deviceid)\
+       "%x:%x.%x  %x:%x",busnr,dev,fcn,vendorid,deviceid
+
+/*
+ * Recovery step published in uio_pci_stub_logical_bar.err_code; one value
+ * per PCI error handler of the uio_pci_stub driver.
+ */
+enum pci_error_code{
+       ERROR_DETECTED, /* reported by the error_detected handler */
+       MMIO_ENABLED,   /* reported by the mmio_enabled handler */
+       LINK_RESET,     /* reported by the link_reset handler */
+       SLOT_RESET,     /* reported by the slot_reset handler */
+       RESUME,         /* reported by the resume handler; no reply is awaited */
+};
+
+/*
+ * Reply values accepted by the driver's irqcontrol hook. The driver maps
+ * each one to the PCI_ERS_RESULT_* constant of the same name and rejects
+ * any other value with -EINVAL.
+ */
+enum pci_error_result{
+       RESULT_NONE,
+       RESULT_CAN_RECOVER,
+       RESULT_NEED_RESET,
+       RESULT_DISCONNECT,
+       RESULT_RECOVERED,
+};
+
+/* Layout of the page the driver exposes as UIO mapping 0 ("logical BAR0"). */
+struct uio_pci_stub_logical_bar {
+       /* Recovery step most recently written by the driver */
+       /* NOTE(review): enum width in a kernel/userspace shared layout - confirm */
+       enum pci_error_code err_code;
+};
+
+#endif
-- 
1.7.0.4

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to