[patch 20/22] PCI/MSI: Make pci_msi_domain_check_cap() static

2021-11-26 Thread Thomas Gleixner
No users outside of that file.

Signed-off-by: Thomas Gleixner 
---
 drivers/pci/msi/irqdomain.c |5 +++--
 include/linux/msi.h |2 --
 2 files changed, 3 insertions(+), 4 deletions(-)

--- a/drivers/pci/msi/irqdomain.c
+++ b/drivers/pci/msi/irqdomain.c
@@ -79,8 +79,9 @@ static inline bool pci_msi_desc_is_multi
  *  1 if Multi MSI is requested, but the domain does not support it
  *  -ENOTSUPP otherwise
  */
-int pci_msi_domain_check_cap(struct irq_domain *domain,
-struct msi_domain_info *info, struct device *dev)
+static int pci_msi_domain_check_cap(struct irq_domain *domain,
+   struct msi_domain_info *info,
+   struct device *dev)
 {
struct msi_desc *desc = first_pci_msi_entry(to_pci_dev(dev));
 
--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -438,8 +438,6 @@ void *platform_msi_get_host_data(struct
 struct irq_domain *pci_msi_create_irq_domain(struct fwnode_handle *fwnode,
 struct msi_domain_info *info,
 struct irq_domain *parent);
-int pci_msi_domain_check_cap(struct irq_domain *domain,
-struct msi_domain_info *info, struct device *dev);
 u32 pci_msi_domain_get_msi_rid(struct irq_domain *domain, struct pci_dev 
*pdev);
 struct irq_domain *pci_msi_get_device_domain(struct pci_dev *pdev);
 bool pci_dev_has_special_msi_domain(struct pci_dev *pdev);




[patch 21/22] genirq/msi: Handle PCI/MSI allocation fail in core code

2021-11-26 Thread Thomas Gleixner
Get rid of yet another irqdomain callback and let the core code return the
already available information of how many descriptors could be allocated.

Signed-off-by: Thomas Gleixner 
---
 drivers/pci/msi/irqdomain.c |   13 -
 include/linux/msi.h |5 +
 kernel/irq/msi.c|   29 +
 3 files changed, 26 insertions(+), 21 deletions(-)

--- a/drivers/pci/msi/irqdomain.c
+++ b/drivers/pci/msi/irqdomain.c
@@ -95,16 +95,6 @@ static int pci_msi_domain_check_cap(stru
return 0;
 }
 
-static int pci_msi_domain_handle_error(struct irq_domain *domain,
-  struct msi_desc *desc, int error)
-{
-   /* Special handling to support __pci_enable_msi_range() */
-   if (pci_msi_desc_is_multi_msi(desc) && error == -ENOSPC)
-   return 1;
-
-   return error;
-}
-
 static void pci_msi_domain_set_desc(msi_alloc_info_t *arg,
struct msi_desc *desc)
 {
@@ -115,7 +105,6 @@ static void pci_msi_domain_set_desc(msi_
 static struct msi_domain_ops pci_msi_domain_ops_default = {
.set_desc   = pci_msi_domain_set_desc,
.msi_check  = pci_msi_domain_check_cap,
-   .handle_error   = pci_msi_domain_handle_error,
 };
 
 static void pci_msi_domain_update_dom_ops(struct msi_domain_info *info)
@@ -129,8 +118,6 @@ static void pci_msi_domain_update_dom_op
ops->set_desc = pci_msi_domain_set_desc;
if (ops->msi_check == NULL)
ops->msi_check = pci_msi_domain_check_cap;
-   if (ops->handle_error == NULL)
-   ops->handle_error = pci_msi_domain_handle_error;
}
 }
 
--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -285,7 +285,6 @@ struct msi_domain_info;
  * @msi_check: Callback for verification of the domain/info/dev data
  * @msi_prepare:   Prepare the allocation of the interrupts in the domain
  * @set_desc:  Set the msi descriptor for an interrupt
- * @handle_error:  Optional error handler if the allocation fails
  * @domain_alloc_irqs: Optional function to override the default allocation
  * function.
  * @domain_free_irqs:  Optional function to override the default free
@@ -294,7 +293,7 @@ struct msi_domain_info;
  * @get_hwirq, @msi_init and @msi_free are callbacks used by the underlying
  * irqdomain.
  *
- * @msi_check, @msi_prepare, @handle_error and @set_desc are callbacks used by
+ * @msi_check, @msi_prepare and @set_desc are callbacks used by
  * msi_domain_alloc/free_irqs().
  *
  * @domain_alloc_irqs, @domain_free_irqs can be used to override the
@@ -331,8 +330,6 @@ struct msi_domain_ops {
   msi_alloc_info_t *arg);
void(*set_desc)(msi_alloc_info_t *arg,
struct msi_desc *desc);
-   int (*handle_error)(struct irq_domain *domain,
-   struct msi_desc *desc, int error);
int (*domain_alloc_irqs)(struct irq_domain *domain,
 struct device *dev, int nvec);
void(*domain_free_irqs)(struct irq_domain *domain,
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -538,6 +538,27 @@ static bool msi_check_reservation_mode(s
return desc->pci.msi_attrib.is_msix || desc->pci.msi_attrib.can_mask;
 }
 
+static int msi_handle_pci_fail(struct irq_domain *domain, struct msi_desc 
*desc,
+  int allocated)
+{
+   switch(domain->bus_token) {
+   case DOMAIN_BUS_PCI_MSI:
+   case DOMAIN_BUS_VMD_MSI:
+   if (IS_ENABLED(CONFIG_PCI_MSI))
+   break;
+   fallthrough;
+   default:
+   return -ENOSPC;
+   }
+
+   /* Let a failed PCI multi MSI allocation retry */
+   if (desc->nvec_used > 1)
+   return 1;
+
+   /* If there was a successful allocation let the caller know */
+   return allocated ? allocated : -ENOSPC;
+}
+
 int __msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
int nvec)
 {
@@ -546,6 +567,7 @@ int __msi_domain_alloc_irqs(struct irq_d
struct irq_data *irq_data;
struct msi_desc *desc;
msi_alloc_info_t arg = { };
+   int allocated = 0;
int i, ret, virq;
bool can_reserve;
 
@@ -560,16 +582,15 @@ int __msi_domain_alloc_irqs(struct irq_d
   dev_to_node(dev), &arg, false,
   desc->affinity);
if (virq < 0) {
-   ret = -ENOSPC;
-   if (ops->handle_error)
-   ret = ops->handle_error(domain, desc, ret);
-   return ret;
+   ret = msi_handle_pci_fail(domain, desc, allocated);
+

[patch 15/22] PCI/MSI: Move code into a separate directory

2021-11-26 Thread Thomas Gleixner
msi.c is getting larger and really could do with a splitup. Move it into
its own directory to prepare for that.

Signed-off-by: Thomas Gleixner 
---
 Documentation/driver-api/pci/pci.rst |2 
 drivers/pci/Makefile |3 
 drivers/pci/msi.c| 1532 ---
 drivers/pci/msi/Makefile |4 
 drivers/pci/msi/msi.c| 1532 +++
 5 files changed, 1539 insertions(+), 1534 deletions(-)

--- a/Documentation/driver-api/pci/pci.rst
+++ b/Documentation/driver-api/pci/pci.rst
@@ -13,7 +13,7 @@ PCI Support Library
 .. kernel-doc:: drivers/pci/search.c
:export:
 
-.. kernel-doc:: drivers/pci/msi.c
+.. kernel-doc:: drivers/pci/msi/msi.c
:export:
 
 .. kernel-doc:: drivers/pci/bus.c
--- a/drivers/pci/Makefile
+++ b/drivers/pci/Makefile
@@ -5,8 +5,9 @@
 obj-$(CONFIG_PCI)  += access.o bus.o probe.o host-bridge.o \
   remove.o pci.o pci-driver.o search.o \
   pci-sysfs.o rom.o setup-res.o irq.o vpd.o \
-  setup-bus.o vc.o mmap.o setup-irq.o msi.o
+  setup-bus.o vc.o mmap.o setup-irq.o
 
+obj-$(CONFIG_PCI)  += msi/
 obj-$(CONFIG_PCI)  += pcie/
 
 ifdef CONFIG_PCI
--- a/drivers/pci/msi.c
+++ /dev/null
@@ -1,1532 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * PCI Message Signaled Interrupt (MSI)
- *
- * Copyright (C) 2003-2004 Intel
- * Copyright (C) Tom Long Nguyen (tom.l.ngu...@intel.com)
- * Copyright (C) 2016 Christoph Hellwig.
- */
-
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-
-#include "pci.h"
-
-#ifdef CONFIG_PCI_MSI
-
-static int pci_msi_enable = 1;
-int pci_msi_ignore_mask;
-
-#define msix_table_size(flags) ((flags & PCI_MSIX_FLAGS_QSIZE) + 1)
-
-#ifdef CONFIG_PCI_MSI_IRQ_DOMAIN
-static int pci_msi_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
-{
-   struct irq_domain *domain;
-
-   domain = dev_get_msi_domain(&dev->dev);
-   if (domain && irq_domain_is_hierarchy(domain))
-   return msi_domain_alloc_irqs(domain, &dev->dev, nvec);
-
-   return arch_setup_msi_irqs(dev, nvec, type);
-}
-
-static void pci_msi_teardown_msi_irqs(struct pci_dev *dev)
-{
-   struct irq_domain *domain;
-
-   domain = dev_get_msi_domain(&dev->dev);
-   if (domain && irq_domain_is_hierarchy(domain))
-   msi_domain_free_irqs(domain, &dev->dev);
-   else
-   arch_teardown_msi_irqs(dev);
-}
-#else
-#define pci_msi_setup_msi_irqs arch_setup_msi_irqs
-#define pci_msi_teardown_msi_irqs  arch_teardown_msi_irqs
-#endif
-
-#ifdef CONFIG_PCI_MSI_ARCH_FALLBACKS
-/* Arch hooks */
-int __weak arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc)
-{
-   return -EINVAL;
-}
-
-void __weak arch_teardown_msi_irq(unsigned int irq)
-{
-}
-
-int __weak arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
-{
-   struct msi_desc *entry;
-   int ret;
-
-   /*
-* If an architecture wants to support multiple MSI, it needs to
-* override arch_setup_msi_irqs()
-*/
-   if (type == PCI_CAP_ID_MSI && nvec > 1)
-   return 1;
-
-   for_each_pci_msi_entry(entry, dev) {
-   ret = arch_setup_msi_irq(dev, entry);
-   if (ret < 0)
-   return ret;
-   if (ret > 0)
-   return -ENOSPC;
-   }
-
-   return 0;
-}
-
-void __weak arch_teardown_msi_irqs(struct pci_dev *dev)
-{
-   int i;
-   struct msi_desc *entry;
-
-   for_each_pci_msi_entry(entry, dev)
-   if (entry->irq)
-   for (i = 0; i < entry->nvec_used; i++)
-   arch_teardown_msi_irq(entry->irq + i);
-}
-#endif /* CONFIG_PCI_MSI_ARCH_FALLBACKS */
-
-/*
- * PCI 2.3 does not specify mask bits for each MSI interrupt.  Attempting to
- * mask all MSI interrupts by clearing the MSI enable bit does not work
- * reliably as devices without an INTx disable bit will then generate a
- * level IRQ which will never be cleared.
- */
-static inline __attribute_const__ u32 msi_multi_mask(struct msi_desc *desc)
-{
-   /* Don't shift by >= width of type */
-   if (desc->pci.msi_attrib.multi_cap >= 5)
-   return 0xffffffff;
-   return (1 << (1 << desc->pci.msi_attrib.multi_cap)) - 1;
-}
-
-static noinline void pci_msi_update_mask(struct msi_desc *desc, u32 clear, u32 
set)
-{
-   raw_spinlock_t *lock = &desc->dev->msi_lock;
-   unsigned long flags;
-
-   if (!desc->pci.msi_attrib.can_mask)
-   return;
-
-   raw_spin_lock_irqsave(lock, flags);
-   desc->pci.msi_mask &= ~clear;
-   desc->pci.msi_mask |= set;
-   pci_write_config_dword(msi_desc_to_pci_dev(desc), desc->pci.mask_pos,
-  desc->pci.msi_mask);
-

[patch 17/22] PCI/MSI: Split out !IRQDOMAIN code

2021-11-26 Thread Thomas Gleixner
Split out the non irqdomain code into its own file.

Signed-off-by: Thomas Gleixner 
---
 drivers/pci/msi/Makefile |5 ++--
 drivers/pci/msi/legacy.c |   51 +++
 drivers/pci/msi/msi.c|   46 --
 3 files changed, 54 insertions(+), 48 deletions(-)

--- a/drivers/pci/msi/Makefile
+++ b/drivers/pci/msi/Makefile
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0
 #
 # Makefile for the PCI/MSI
-obj-$(CONFIG_PCI)  += pcidev_msi.o
-obj-$(CONFIG_PCI_MSI)  += msi.o
+obj-$(CONFIG_PCI)  += pcidev_msi.o
+obj-$(CONFIG_PCI_MSI)  += msi.o
+obj-$(CONFIG_PCI_MSI_ARCH_FALLBACKS)   += legacy.o
--- /dev/null
+++ b/drivers/pci/msi/legacy.c
@@ -0,0 +1,51 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * PCI Message Signaled Interrupt (MSI).
+ *
+ * Legacy architecture specific setup and teardown mechanism.
+ */
+#include "msi.h"
+
+/* Arch hooks */
+int __weak arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc)
+{
+   return -EINVAL;
+}
+
+void __weak arch_teardown_msi_irq(unsigned int irq)
+{
+}
+
+int __weak arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
+{
+   struct msi_desc *desc;
+   int ret;
+
+   /*
+* If an architecture wants to support multiple MSI, it needs to
+* override arch_setup_msi_irqs()
+*/
+   if (type == PCI_CAP_ID_MSI && nvec > 1)
+   return 1;
+
+   for_each_pci_msi_entry(desc, dev) {
+   ret = arch_setup_msi_irq(dev, desc);
+   if (ret)
+   return ret < 0 ? ret : -ENOSPC;
+   }
+
+   return 0;
+}
+
+void __weak arch_teardown_msi_irqs(struct pci_dev *dev)
+{
+   struct msi_desc *desc;
+   int i;
+
+   for_each_pci_msi_entry(desc, dev) {
+   if (desc->irq) {
+   for (i = 0; i < desc->nvec_used; i++)
+   arch_teardown_msi_irq(desc->irq + i);
+   }
+   }
+}
--- a/drivers/pci/msi/msi.c
+++ b/drivers/pci/msi/msi.c
@@ -50,52 +50,6 @@ static void pci_msi_teardown_msi_irqs(st
 #define pci_msi_teardown_msi_irqs  arch_teardown_msi_irqs
 #endif
 
-#ifdef CONFIG_PCI_MSI_ARCH_FALLBACKS
-/* Arch hooks */
-int __weak arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc)
-{
-   return -EINVAL;
-}
-
-void __weak arch_teardown_msi_irq(unsigned int irq)
-{
-}
-
-int __weak arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
-{
-   struct msi_desc *entry;
-   int ret;
-
-   /*
-* If an architecture wants to support multiple MSI, it needs to
-* override arch_setup_msi_irqs()
-*/
-   if (type == PCI_CAP_ID_MSI && nvec > 1)
-   return 1;
-
-   for_each_pci_msi_entry(entry, dev) {
-   ret = arch_setup_msi_irq(dev, entry);
-   if (ret < 0)
-   return ret;
-   if (ret > 0)
-   return -ENOSPC;
-   }
-
-   return 0;
-}
-
-void __weak arch_teardown_msi_irqs(struct pci_dev *dev)
-{
-   int i;
-   struct msi_desc *entry;
-
-   for_each_pci_msi_entry(entry, dev)
-   if (entry->irq)
-   for (i = 0; i < entry->nvec_used; i++)
-   arch_teardown_msi_irq(entry->irq + i);
-}
-#endif /* CONFIG_PCI_MSI_ARCH_FALLBACKS */
-
 /*
  * PCI 2.3 does not specify mask bits for each MSI interrupt.  Attempting to
  * mask all MSI interrupts by clearing the MSI enable bit does not work




[patch 19/22] PCI/MSI: Sanitize MSIX table map handling

2021-11-26 Thread Thomas Gleixner
Unmapping the MSIX base mapping in the loops which allocate/free MSI
descriptors is daft and in the way of allowing runtime expansion of MSI-X
descriptors.

Store the mapping in struct pci_dev and free it after freeing the MSI-X
descriptors.

Signed-off-by: Thomas Gleixner 
---
 drivers/pci/msi/msi.c |   18 --
 include/linux/pci.h   |1 +
 2 files changed, 9 insertions(+), 10 deletions(-)

--- a/drivers/pci/msi/msi.c
+++ b/drivers/pci/msi/msi.c
@@ -241,14 +241,14 @@ static void free_msi_irqs(struct pci_dev
pci_msi_teardown_msi_irqs(dev);
 
list_for_each_entry_safe(entry, tmp, msi_list, list) {
-   if (entry->pci.msi_attrib.is_msix) {
-   if (list_is_last(&entry->list, msi_list))
-   iounmap(entry->pci.mask_base);
-   }
-
list_del(&entry->list);
free_msi_entry(entry);
}
+
+   if (dev->msix_base) {
+   iounmap(dev->msix_base);
+   dev->msix_base = NULL;
+   }
 }
 
 static void pci_intx_for_msi(struct pci_dev *dev, int enable)
@@ -501,10 +501,6 @@ static int msix_setup_entries(struct pci
for (i = 0, curmsk = masks; i < nvec; i++) {
entry = alloc_msi_entry(&dev->dev, 1, curmsk);
if (!entry) {
-   if (!i)
-   iounmap(base);
-   else
-   free_msi_irqs(dev);
/* No enough memory. Don't try again */
ret = -ENOMEM;
goto out;
@@ -602,12 +598,14 @@ static int msix_capability_init(struct p
goto out_disable;
}
 
+   dev->msix_base = base;
+
/* Ensure that all table entries are masked. */
msix_mask_all(base, tsize);
 
ret = msix_setup_entries(dev, base, entries, nvec, affd);
if (ret)
-   goto out_disable;
+   goto out_free;
 
ret = pci_msi_setup_msi_irqs(dev, nvec, PCI_CAP_ID_MSIX);
if (ret)
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -473,6 +473,7 @@ struct pci_dev {
u8  ptm_granularity;
 #endif
 #ifdef CONFIG_PCI_MSI
+   void __iomem*msix_base;
const struct attribute_group **msi_irq_groups;
 #endif
struct pci_vpd  vpd;




[patch 18/22] PCI/MSI: Split out irqdomain code

2021-11-26 Thread Thomas Gleixner
Move the irqdomain specific code into its own file.

Signed-off-by: Thomas Gleixner 
---
 drivers/pci/msi/Makefile|1 
 drivers/pci/msi/irqdomain.c |  279 ++
 drivers/pci/msi/legacy.c|   10 +
 drivers/pci/msi/msi.c   |  319 +---
 drivers/pci/msi/msi.h   |   39 +
 include/linux/msi.h |   11 -
 6 files changed, 339 insertions(+), 320 deletions(-)

--- a/drivers/pci/msi/Makefile
+++ b/drivers/pci/msi/Makefile
@@ -3,4 +3,5 @@
 # Makefile for the PCI/MSI
 obj-$(CONFIG_PCI)  += pcidev_msi.o
 obj-$(CONFIG_PCI_MSI)  += msi.o
+obj-$(CONFIG_PCI_MSI_IRQ_DOMAIN)   += irqdomain.o
 obj-$(CONFIG_PCI_MSI_ARCH_FALLBACKS)   += legacy.o
--- /dev/null
+++ b/drivers/pci/msi/irqdomain.c
@@ -0,0 +1,279 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * PCI Message Signaled Interrupt (MSI) - irqdomain support
+ */
+#include 
+#include 
+#include 
+
+#include "msi.h"
+
+int pci_msi_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
+{
+   struct irq_domain *domain;
+
+   domain = dev_get_msi_domain(&dev->dev);
+   if (domain && irq_domain_is_hierarchy(domain))
+   return msi_domain_alloc_irqs(domain, &dev->dev, nvec);
+
+   return pci_msi_legacy_setup_msi_irqs(dev, nvec, type);
+}
+
+void pci_msi_teardown_msi_irqs(struct pci_dev *dev)
+{
+   struct irq_domain *domain;
+
+   domain = dev_get_msi_domain(&dev->dev);
+   if (domain && irq_domain_is_hierarchy(domain))
+   msi_domain_free_irqs(domain, &dev->dev);
+   else
+   pci_msi_legacy_teardown_msi_irqs(dev);
+}
+
+/**
+ * pci_msi_domain_write_msg - Helper to write MSI message to PCI config space
+ * @irq_data:  Pointer to interrupt data of the MSI interrupt
+ * @msg:   Pointer to the message
+ */
+static void pci_msi_domain_write_msg(struct irq_data *irq_data, struct msi_msg 
*msg)
+{
+   struct msi_desc *desc = irq_data_get_msi_desc(irq_data);
+
+   /*
+* For MSI-X desc->irq is always equal to irq_data->irq. For
+* MSI only the first interrupt of MULTI MSI passes the test.
+*/
+   if (desc->irq == irq_data->irq)
+   __pci_write_msi_msg(desc, msg);
+}
+
+/**
+ * pci_msi_domain_calc_hwirq - Generate a unique ID for an MSI source
+ * @desc:  Pointer to the MSI descriptor
+ *
+ * The ID number is only used within the irqdomain.
+ */
+static irq_hw_number_t pci_msi_domain_calc_hwirq(struct msi_desc *desc)
+{
+   struct pci_dev *dev = msi_desc_to_pci_dev(desc);
+
+   return (irq_hw_number_t)desc->pci.msi_attrib.entry_nr |
+   pci_dev_id(dev) << 11 |
+   (pci_domain_nr(dev->bus) & 0x) << 27;
+}
+
+static inline bool pci_msi_desc_is_multi_msi(struct msi_desc *desc)
+{
+   return !desc->pci.msi_attrib.is_msix && desc->nvec_used > 1;
+}
+
+/**
+ * pci_msi_domain_check_cap - Verify that @domain supports the capabilities
+ *   for @dev
+ * @domain:The interrupt domain to check
+ * @info:  The domain info for verification
+ * @dev:   The device to check
+ *
+ * Returns:
+ *  0 if the functionality is supported
+ *  1 if Multi MSI is requested, but the domain does not support it
+ *  -ENOTSUPP otherwise
+ */
+int pci_msi_domain_check_cap(struct irq_domain *domain,
+struct msi_domain_info *info, struct device *dev)
+{
+   struct msi_desc *desc = first_pci_msi_entry(to_pci_dev(dev));
+
+   /* Special handling to support __pci_enable_msi_range() */
+   if (pci_msi_desc_is_multi_msi(desc) &&
+   !(info->flags & MSI_FLAG_MULTI_PCI_MSI))
+   return 1;
+   else if (desc->pci.msi_attrib.is_msix && !(info->flags & 
MSI_FLAG_PCI_MSIX))
+   return -ENOTSUPP;
+
+   return 0;
+}
+
+static int pci_msi_domain_handle_error(struct irq_domain *domain,
+  struct msi_desc *desc, int error)
+{
+   /* Special handling to support __pci_enable_msi_range() */
+   if (pci_msi_desc_is_multi_msi(desc) && error == -ENOSPC)
+   return 1;
+
+   return error;
+}
+
+static void pci_msi_domain_set_desc(msi_alloc_info_t *arg,
+   struct msi_desc *desc)
+{
+   arg->desc = desc;
+   arg->hwirq = pci_msi_domain_calc_hwirq(desc);
+}
+
+static struct msi_domain_ops pci_msi_domain_ops_default = {
+   .set_desc   = pci_msi_domain_set_desc,
+   .msi_check  = pci_msi_domain_check_cap,
+   .handle_error   = pci_msi_domain_handle_error,
+};
+
+static void pci_msi_domain_update_dom_ops(struct msi_domain_info *info)
+{
+   struct msi_domain_ops *ops = info->ops;
+
+   if (ops == NULL) {
+   info->ops = &pci_msi_domain_ops_default;
+   } else {
+   if (ops->set_desc == NULL)
+   ops->set_desc = pci_msi_domain_set_desc;
+   if (ops-

[patch 22/22] PCI/MSI: Move descriptor counting on allocation fail to the legacy code

2021-11-26 Thread Thomas Gleixner
The irqdomain code already returns the information. Move the loop to the
legacy code.

Signed-off-by: Thomas Gleixner 
---
 drivers/pci/msi/legacy.c |   20 +++-
 drivers/pci/msi/msi.c|   19 +--
 2 files changed, 20 insertions(+), 19 deletions(-)

--- a/drivers/pci/msi/legacy.c
+++ b/drivers/pci/msi/legacy.c
@@ -50,9 +50,27 @@ void __weak arch_teardown_msi_irqs(struc
}
 }
 
+static int pci_msi_setup_check_result(struct pci_dev *dev, int type, int ret)
+{
+   struct msi_desc *entry;
+   int avail = 0;
+
+   if (type != PCI_CAP_ID_MSIX || ret >= 0)
+   return ret;
+
+   /* Scan the MSI descriptors for successfully allocated ones. */
+   for_each_pci_msi_entry(entry, dev) {
+   if (entry->irq != 0)
+   avail++;
+   }
+   return avail ? avail : ret;
+}
+
 int pci_msi_legacy_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
 {
-   return arch_setup_msi_irqs(dev, nvec, type);
+   int ret = arch_setup_msi_irqs(dev, nvec, type);
+
+   return pci_msi_setup_check_result(dev, type, ret);
 }
 
 void pci_msi_legacy_teardown_msi_irqs(struct pci_dev *dev)
--- a/drivers/pci/msi/msi.c
+++ b/drivers/pci/msi/msi.c
@@ -609,7 +609,7 @@ static int msix_capability_init(struct p
 
ret = pci_msi_setup_msi_irqs(dev, nvec, PCI_CAP_ID_MSIX);
if (ret)
-   goto out_avail;
+   goto out_free;
 
/* Check if all MSI entries honor device restrictions */
ret = msi_verify_entries(dev);
@@ -634,23 +634,6 @@ static int msix_capability_init(struct p
pcibios_free_irq(dev);
return 0;
 
-out_avail:
-   if (ret < 0) {
-   /*
-* If we had some success, report the number of IRQs
-* we succeeded in setting up.
-*/
-   struct msi_desc *entry;
-   int avail = 0;
-
-   for_each_pci_msi_entry(entry, dev) {
-   if (entry->irq != 0)
-   avail++;
-   }
-   if (avail != 0)
-   ret = avail;
-   }
-
 out_free:
free_msi_irqs(dev);
 




[patch 10/10] x86/apic/msi: Support MSI-X vector expansion

2021-11-26 Thread Thomas Gleixner
The X86 PCI/MSI irq domain implementation supports vector expansion out of
the box. Make it available.

Signed-off-by: Thomas Gleixner 
---
 arch/x86/kernel/apic/msi.c |4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

--- a/arch/x86/kernel/apic/msi.c
+++ b/arch/x86/kernel/apic/msi.c
@@ -178,7 +178,7 @@ static struct msi_domain_ops pci_msi_dom
 
 static struct msi_domain_info pci_msi_domain_info = {
.flags  = MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS |
- MSI_FLAG_PCI_MSIX,
+ MSI_FLAG_PCI_MSIX | MSI_FLAG_CAN_EXPAND,
.ops= &pci_msi_domain_ops,
.chip   = &pci_msi_controller,
.handler= handle_edge_irq,
@@ -226,7 +226,7 @@ static struct irq_chip pci_msi_ir_contro
 
 static struct msi_domain_info pci_msi_ir_domain_info = {
.flags  = MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS |
- MSI_FLAG_MULTI_PCI_MSI | MSI_FLAG_PCI_MSIX,
+ MSI_FLAG_MULTI_PCI_MSI | MSI_FLAG_PCI_MSIX | 
MSI_FLAG_CAN_EXPAND,
.ops= &pci_msi_domain_ops,
.chip   = &pci_msi_ir_controller,
.handler= handle_edge_irq,




[patch 09/10] PCI/MSI: Provide pci_msix_expand_vectors[_at]()

2021-11-26 Thread Thomas Gleixner
Provide a new interface which allows to expand the MSI-X vector space if
the underlying irq domain implementation supports it.

Signed-off-by: Thomas Gleixner 
---
 drivers/pci/msi/msi.c |   41 +
 include/linux/pci.h   |   13 +
 2 files changed, 54 insertions(+)

--- a/drivers/pci/msi/msi.c
+++ b/drivers/pci/msi/msi.c
@@ -1025,6 +1025,47 @@ int pci_alloc_irq_vectors_affinity(struc
 EXPORT_SYMBOL(pci_alloc_irq_vectors_affinity);
 
 /**
+ * pci_msix_expand_vectors_at - Expand MSI-X interrupts for a device
+ *
+ * @dev:   PCI device to operate on
+ * @at:Allocate at MSI-X index. If @at == PCI_MSIX_EXPAND_AUTO
+ * the function expands automatically after the last
+ * active index.
+ * @nvec:  Number of vectors to allocate
+ *
+ * Expand the MSI-X vectors of a device after an initial enablement and
+ * allocation.
+ *
+ * Return: 0 if the allocation was successful, an error code otherwise.
+ */
+int pci_msix_expand_vectors_at(struct pci_dev *dev, unsigned int at, unsigned 
int nvec)
+{
+   struct msi_device_data *md = dev->dev.msi.data;
+   struct msi_range range = { .ndesc = nvec, };
+   unsigned int max_vecs;
+   int ret;
+
+   if (!pci_msi_enable || !dev || !dev->msix_enabled || !md)
+   return -ENOTSUPP;
+
+   if (!pci_msi_domain_supports_expand(dev))
+   return -ENOTSUPP;
+
+   max_vecs = pci_msix_vec_count(dev);
+   if (!nvec || nvec > max_vecs)
+   return -EINVAL;
+
+   range.first = at == PCI_MSIX_EXPAND_AUTO ? md->num_descs : at;
+
+   if (range.first >= max_vecs || nvec > max_vecs - range.first)
+   return -ENOSPC;
+
+   ret = msix_setup_interrupts(dev, dev->msix_base, &range, NULL, NULL, 
true);
+   return ret <= 0 ? ret : -ENOSPC;
+}
+EXPORT_SYMBOL_GPL(pci_msix_expand_vectors_at);
+
+/**
  * pci_free_irq_vectors - free previously allocated IRQs for a device
  * @dev:   PCI device to operate on
  *
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1534,6 +1534,7 @@ static inline int pci_enable_msix_exact(
 int pci_alloc_irq_vectors_affinity(struct pci_dev *dev, unsigned int min_vecs,
   unsigned int max_vecs, unsigned int flags,
   struct irq_affinity *affd);
+int pci_msix_expand_vectors_at(struct pci_dev *dev, unsigned int at, unsigned 
int nvec);
 
 void pci_free_irq_vectors(struct pci_dev *dev);
 int pci_irq_vector(struct pci_dev *dev, unsigned int nr);
@@ -1565,6 +1566,11 @@ pci_alloc_irq_vectors_affinity(struct pc
return -ENOSPC;
 }
 
+static inline int pci_msix_expand_vectors_at(struct pci_dev *dev, unsigned int 
at, unsigned int nvec)
+{
+   return -ENOTSUPP;
+}
+
 static inline void pci_free_irq_vectors(struct pci_dev *dev)
 {
 }
@@ -1582,6 +1588,13 @@ static inline const struct cpumask *pci_
 }
 #endif
 
+#define PCI_MSIX_EXPAND_AUTO   (UINT_MAX)
+
+static inline int pci_msix_expand_vectors(struct pci_dev *dev, unsigned int 
nvec)
+{
+   return pci_msix_expand_vectors_at(dev, PCI_MSIX_EXPAND_AUTO, nvec);
+}
+
 /**
  * pci_irqd_intx_xlate() - Translate PCI INTx value to an IRQ domain hwirq
  * @d: the INTx IRQ domain




[patch 08/10] PCI/MSI: Provide pci_msi_domain_supports_expand()

2021-11-26 Thread Thomas Gleixner
Not all irq domain implementations can support runtime MSI-X vector
expansion as they assume zero based allocations or have other
restrictions.

The legacy PCI allocation functions are not suited for runtime vector
expansion either.

Add a function which allows to query whether runtime MSI-X vector expansion
is supported or not.

Signed-off-by: Thomas Gleixner 
---
 drivers/pci/msi/irqdomain.c |   29 +++--
 include/linux/msi.h |2 ++
 2 files changed, 25 insertions(+), 6 deletions(-)

--- a/drivers/pci/msi/irqdomain.c
+++ b/drivers/pci/msi/irqdomain.c
@@ -8,12 +8,18 @@
 
 #include "msi.h"
 
+static struct irq_domain *pci_get_msi_domain(struct pci_dev *dev)
+{
+   struct irq_domain *domain = dev_get_msi_domain(&dev->dev);
+
+   return domain && irq_domain_is_hierarchy(domain) ? domain : NULL;
+}
+
 int pci_msi_setup_msi_irqs(struct pci_dev *dev, struct msi_range *range, int 
type)
 {
-   struct irq_domain *domain;
+   struct irq_domain *domain = pci_get_msi_domain(dev);
 
-   domain = dev_get_msi_domain(&dev->dev);
-   if (domain && irq_domain_is_hierarchy(domain))
+   if (domain)
return msi_domain_alloc_irqs_descs_locked(domain, &dev->dev, 
range);
 
return pci_msi_legacy_setup_msi_irqs(dev, range->ndesc, type);
@@ -21,15 +27,26 @@ int pci_msi_setup_msi_irqs(struct pci_de
 
 void pci_msi_teardown_msi_irqs(struct pci_dev *dev, struct msi_range *range)
 {
-   struct irq_domain *domain;
+   struct irq_domain *domain = pci_get_msi_domain(dev);
 
-   domain = dev_get_msi_domain(&dev->dev);
-   if (domain && irq_domain_is_hierarchy(domain))
+   if (domain)
msi_domain_free_irqs_descs_locked(domain, &dev->dev, range);
else
pci_msi_legacy_teardown_msi_irqs(dev);
 }
 
+bool pci_msi_domain_supports_expand(struct pci_dev *dev)
+{
+   struct irq_domain *domain = pci_get_msi_domain(dev);
+   struct msi_domain_info *info;
+
+   if (!domain)
+   return false;
+
+   info = domain->host_data;
+   return info->flags & MSI_FLAG_CAN_EXPAND;
+}
+
 /**
  * pci_msi_domain_write_msg - Helper to write MSI message to PCI config space
  * @irq_data:  Pointer to interrupt data of the MSI interrupt
--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -552,11 +552,13 @@ struct irq_domain *pci_msi_create_irq_do
 u32 pci_msi_domain_get_msi_rid(struct irq_domain *domain, struct pci_dev 
*pdev);
 struct irq_domain *pci_msi_get_device_domain(struct pci_dev *pdev);
 bool pci_dev_has_special_msi_domain(struct pci_dev *pdev);
+bool pci_msi_domain_supports_expand(struct pci_dev *dev);
 #else
 static inline struct irq_domain *pci_msi_get_device_domain(struct pci_dev 
*pdev)
 {
return NULL;
 }
+static inline bool pci_msi_domain_supports_expand(struct pci_dev *dev) { 
return false; }
 #endif /* CONFIG_PCI_MSI_IRQ_DOMAIN */
 
 #endif /* LINUX_MSI_H */




[patch 07/10] PCI/MSI: Make free related functions range based

2021-11-26 Thread Thomas Gleixner
In preparation of runtime expandable PCI/MSI-X vectors convert the related
free functions to take ranges instead of assuming a zero based vector
space.

Signed-off-by: Thomas Gleixner 
---
 drivers/pci/msi/irqdomain.c |5 ++---
 drivers/pci/msi/msi.c   |   24 
 drivers/pci/msi/msi.h   |2 +-
 3 files changed, 19 insertions(+), 12 deletions(-)

--- a/drivers/pci/msi/irqdomain.c
+++ b/drivers/pci/msi/irqdomain.c
@@ -19,14 +19,13 @@ int pci_msi_setup_msi_irqs(struct pci_de
return pci_msi_legacy_setup_msi_irqs(dev, range->ndesc, type);
 }
 
-void pci_msi_teardown_msi_irqs(struct pci_dev *dev)
+void pci_msi_teardown_msi_irqs(struct pci_dev *dev, struct msi_range *range)
 {
-   struct msi_range range = { .first = 0, .last = UINT_MAX, };
struct irq_domain *domain;
 
domain = dev_get_msi_domain(&dev->dev);
if (domain && irq_domain_is_hierarchy(domain))
-   msi_domain_free_irqs_descs_locked(domain, &dev->dev, &range);
+   msi_domain_free_irqs_descs_locked(domain, &dev->dev, range);
else
pci_msi_legacy_teardown_msi_irqs(dev);
 }
--- a/drivers/pci/msi/msi.c
+++ b/drivers/pci/msi/msi.c
@@ -222,9 +222,12 @@ void pci_write_msi_msg(unsigned int irq,
 }
 EXPORT_SYMBOL_GPL(pci_write_msi_msg);
 
-static void free_msi_irqs(struct pci_dev *dev)
+static void free_msi_irqs(struct pci_dev *dev, struct msi_range *range, bool 
shutdown)
 {
-   pci_msi_teardown_msi_irqs(dev);
+   pci_msi_teardown_msi_irqs(dev, range);
+
+   if (!shutdown)
+   return;
 
if (dev->msix_base) {
iounmap(dev->msix_base);
@@ -443,7 +446,7 @@ static int msi_capability_init(struct pc
 
 err:
pci_msi_unmask(entry, msi_multi_mask(entry));
-   free_msi_irqs(dev);
+   free_msi_irqs(dev, &range, true);
 unlock:
msi_unlock_descs(&dev->dev);
kfree(masks);
@@ -538,7 +541,7 @@ static void msix_mask_all(void __iomem *
 
 static int msix_setup_interrupts(struct pci_dev *dev, void __iomem *base,
 struct msi_range *range, struct msix_entry 
*entries,
-struct irq_affinity *affd)
+struct irq_affinity *affd, bool expand)
 {
struct irq_affinity_desc *masks = NULL;
int ret;
@@ -566,7 +569,8 @@ static int msix_setup_interrupts(struct
goto out_unlock;
 
 out_free:
-   free_msi_irqs(dev);
+   free_msi_irqs(dev, range, !expand);
+
 out_unlock:
msi_unlock_descs(&dev->dev);
kfree(masks);
@@ -614,7 +618,7 @@ static int msix_capability_init(struct p
/* Ensure that all table entries are masked. */
msix_mask_all(base, tsize);
 
-   ret = msix_setup_interrupts(dev, base, &range, entries, affd);
+   ret = msix_setup_interrupts(dev, base, &range, entries, affd, false);
if (ret)
goto out_disable;
 
@@ -728,12 +732,14 @@ static void pci_msi_shutdown(struct pci_
 
 void pci_disable_msi(struct pci_dev *dev)
 {
+   struct msi_range range = { .first = 0, .last = 0, };
+
if (!pci_msi_enable || !dev || !dev->msi_enabled)
return;
 
msi_lock_descs(&dev->dev);
pci_msi_shutdown(dev);
-   free_msi_irqs(dev);
+   free_msi_irqs(dev, &range, true);
msi_unlock_descs(&dev->dev);
 }
 EXPORT_SYMBOL(pci_disable_msi);
@@ -817,12 +823,14 @@ static void pci_msix_shutdown(struct pci
 
 void pci_disable_msix(struct pci_dev *dev)
 {
+   struct msi_range range = { .first = 0, .last = UINT_MAX, };
+
if (!pci_msi_enable || !dev || !dev->msix_enabled)
return;
 
msi_lock_descs(&dev->dev);
pci_msix_shutdown(dev);
-   free_msi_irqs(dev);
+   free_msi_irqs(dev, &range, true);
msi_unlock_descs(&dev->dev);
 }
 EXPORT_SYMBOL(pci_disable_msix);
--- a/drivers/pci/msi/msi.h
+++ b/drivers/pci/msi/msi.h
@@ -6,7 +6,7 @@
 #define msix_table_size(flags) ((flags & PCI_MSIX_FLAGS_QSIZE) + 1)
 
 extern int pci_msi_setup_msi_irqs(struct pci_dev *dev, struct msi_range 
*range, int type);
-extern void pci_msi_teardown_msi_irqs(struct pci_dev *dev);
+extern void pci_msi_teardown_msi_irqs(struct pci_dev *dev, struct msi_range 
*range);
 
 #ifdef CONFIG_PCI_MSI_ARCH_FALLBACKS
 extern int pci_msi_legacy_setup_msi_irqs(struct pci_dev *dev, int nvec, int 
type);




[patch 06/10] PCI/MSI: Use range in allocation path

2021-11-26 Thread Thomas Gleixner
Make the allocation path range based to prepare for runtime expansion of
MSI-X vectors.

Signed-off-by: Thomas Gleixner 
---
 drivers/pci/msi/irqdomain.c |7 +++
 drivers/pci/msi/msi.c   |   34 +-
 drivers/pci/msi/msi.h   |2 +-
 3 files changed, 25 insertions(+), 18 deletions(-)

--- a/drivers/pci/msi/irqdomain.c
+++ b/drivers/pci/msi/irqdomain.c
@@ -8,16 +8,15 @@
 
 #include "msi.h"
 
-int pci_msi_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
+int pci_msi_setup_msi_irqs(struct pci_dev *dev, struct msi_range *range, int 
type)
 {
-   struct msi_range range = { .first = 0, .last = UINT_MAX, .ndesc = nvec};
struct irq_domain *domain;
 
domain = dev_get_msi_domain(&dev->dev);
if (domain && irq_domain_is_hierarchy(domain))
-   return msi_domain_alloc_irqs_descs_locked(domain, &dev->dev, 
&range);
+   return msi_domain_alloc_irqs_descs_locked(domain, &dev->dev, 
range);
 
-   return pci_msi_legacy_setup_msi_irqs(dev, nvec, type);
+   return pci_msi_legacy_setup_msi_irqs(dev, range->ndesc, type);
 }
 
 void pci_msi_teardown_msi_irqs(struct pci_dev *dev)
--- a/drivers/pci/msi/msi.c
+++ b/drivers/pci/msi/msi.c
@@ -370,14 +370,16 @@ static int msi_setup_msi_desc(struct pci
return ret;
 }
 
-static int msi_verify_entries(struct pci_dev *dev)
+static int msi_verify_entries(struct pci_dev *dev, struct msi_range *range)
 {
struct msi_desc *entry;
 
if (!dev->no_64bit_msi)
return 0;
 
-   msi_for_each_desc(entry, &dev->dev, MSI_DESC_ALL) {
+   msi_for_each_desc_from(entry, &dev->dev, MSI_DESC_ALL, range->first) {
+   if (entry->msi_index > range->last)
+   return 0;
if (entry->msg.address_hi) {
pci_err(dev, "arch assigned 64-bit MSI address %#x%08x 
but device only supports 32 bits\n",
entry->msg.address_hi, entry->msg.address_lo);
@@ -402,6 +404,7 @@ static int msi_verify_entries(struct pci
 static int msi_capability_init(struct pci_dev *dev, int nvec,
   struct irq_affinity *affd)
 {
+   struct msi_range range = { .first = 0, .last = 0, .ndesc = nvec, };
struct irq_affinity_desc *masks = NULL;
struct msi_desc *entry;
int ret;
@@ -421,11 +424,11 @@ static int msi_capability_init(struct pc
pci_msi_mask(entry, msi_multi_mask(entry));
 
/* Configure MSI capability structure */
-   ret = pci_msi_setup_msi_irqs(dev, nvec, PCI_CAP_ID_MSI);
+   ret = pci_msi_setup_msi_irqs(dev, &range, PCI_CAP_ID_MSI);
if (ret)
goto err;
 
-   ret = msi_verify_entries(dev);
+   ret = msi_verify_entries(dev, &range);
if (ret)
goto err;
 
@@ -469,7 +472,8 @@ static void __iomem *msix_map_region(str
 }
 
 static int msix_setup_msi_descs(struct pci_dev *dev, void __iomem *base,
-   struct msix_entry *entries, int nvec,
+   struct msi_range *range,
+   struct msix_entry *entries,
struct irq_affinity_desc *masks)
 {
int ret, i, vec_count = pci_msix_vec_count(dev);
@@ -485,8 +489,8 @@ static int msix_setup_msi_descs(struct p
desc.pci.msi_attrib.default_irq = dev->irq;
desc.pci.mask_base  = base;
 
-   for (i = 0, curmsk = masks; i < nvec; i++, curmsk++) {
-   desc.msi_index = entries ? entries[i].entry : i;
+   for (i = 0, curmsk = masks; i < range->ndesc; i++, curmsk++) {
+   desc.msi_index = entries ? entries[i].entry : range->first + i;
desc.affinity = masks ? curmsk : NULL;
desc.pci.msi_attrib.is_virtual = desc.msi_index >= vec_count;
desc.pci.msi_attrib.can_mask = !pci_msi_ignore_mask &&
@@ -500,6 +504,9 @@ static int msix_setup_msi_descs(struct p
ret = msi_add_msi_desc(&dev->dev, &desc);
if (ret)
break;
+
+   if (desc.msi_index > range->last)
+   range->last = desc.msi_index;
}
 
return ret;
@@ -530,28 +537,28 @@ static void msix_mask_all(void __iomem *
 }
 
 static int msix_setup_interrupts(struct pci_dev *dev, void __iomem *base,
-struct msix_entry *entries, int nvec,
+struct msi_range *range, struct msix_entry 
*entries,
 struct irq_affinity *affd)
 {
struct irq_affinity_desc *masks = NULL;
int ret;
 
if (affd)
-   masks = irq_create_affinity_masks(nvec, affd);
+   masks = irq_create_affinity_masks(range->ndesc, affd);
 
msi_lock_descs(&dev->dev);
-   ret = msix_setup_msi_descs(dev, base, entries, nvec, masks);
+   ret = msix_setup_msi_descs(dev, base, 

[patch 05/10] genirq/msi: Add domain info flag MSI_FLAG_CAN_EXPAND

2021-11-26 Thread Thomas Gleixner
Not all MSI domains support runtime expansions of PCI/MSI-X vectors. Add a
domain flag so implementations can opt in.

Signed-off-by: Thomas Gleixner 
---
 include/linux/msi.h |2 ++
 1 file changed, 2 insertions(+)

--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -494,6 +494,8 @@ enum {
MSI_FLAG_ALLOC_SIMPLE_MSI_DESCS = (1 << 9),
/* Free MSI descriptors */
MSI_FLAG_FREE_MSI_DESCS = (1 << 10),
+   /* MSI vectors can be expanded after initial setup */
+   MSI_FLAG_CAN_EXPAND = (1 << 11),
 };
 
 int msi_domain_set_affinity(struct irq_data *data, const struct cpumask *mask,




[patch 04/10] genirq/msi: Prepare MSI domain alloc/free for range irq allocation

2021-11-26 Thread Thomas Gleixner
Make the iterators in the allocation and free functions range based.

Signed-off-by: Thomas Gleixner 
---
 kernel/irq/msi.c |   12 ++--
 1 file changed, 10 insertions(+), 2 deletions(-)

--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -877,6 +877,7 @@ int __msi_domain_alloc_irqs(struct irq_d
msi_alloc_info_t arg = { };
unsigned int vflags = 0;
struct msi_desc *desc;
+   unsigned long idx;
int allocated = 0;
int i, ret, virq;
 
@@ -906,7 +907,10 @@ int __msi_domain_alloc_irqs(struct irq_d
vflags |= VIRQ_NOMASK_QUIRK;
}
 
-   msi_for_each_desc(desc, dev, MSI_DESC_NOTASSOCIATED) {
+   xa_for_each_range(&dev->msi.data->store, idx, desc, range->first, 
range->last) {
+   if (!msi_desc_match(desc, MSI_DESC_NOTASSOCIATED))
+   continue;
+
ops->set_desc(&arg, desc);
 
virq = __irq_domain_alloc_irqs(domain, -1, desc->nvec_used,
@@ -999,10 +1003,14 @@ void __msi_domain_free_irqs(struct irq_d
struct msi_domain_info *info = domain->host_data;
struct irq_data *irqd;
struct msi_desc *desc;
+   unsigned long idx;
int i;
 
/* Only handle MSI entries which have an interrupt associated */
-   msi_for_each_desc(desc, dev, MSI_DESC_ASSOCIATED) {
+   xa_for_each_range(&dev->msi.data->store, idx, desc, range->first, 
range->last) {
+   if (!msi_desc_match(desc, MSI_DESC_ASSOCIATED))
+   continue;
+
/* Make sure all interrupts are deactivated */
for (i = 0; i < desc->nvec_used; i++) {
irqd = irq_domain_get_irq_data(domain, desc->irq + i);




[patch 03/10] genirq/msi: Make MSI descriptor alloc/free ready for range allocations

2021-11-26 Thread Thomas Gleixner
Convert the MSI descriptor related functions to ranges and fixup the call
sites.

Signed-off-by: Thomas Gleixner 
---
 drivers/base/platform-msi.c |3 ++-
 include/linux/msi.h |7 ---
 kernel/irq/msi.c|   38 +++---
 3 files changed, 25 insertions(+), 23 deletions(-)

--- a/drivers/base/platform-msi.c
+++ b/drivers/base/platform-msi.c
@@ -320,11 +320,12 @@ struct irq_domain *
 void platform_msi_device_domain_free(struct irq_domain *domain, unsigned int 
virq,
 unsigned int nr_irqs)
 {
+   struct msi_range range = { .first = virq, .last = virq + nr_irqs - 1, };
struct platform_msi_priv_data *data = domain->host_data;
 
msi_lock_descs(data->dev);
irq_domain_free_irqs_common(domain, virq, nr_irqs);
-   msi_free_msi_descs_range(data->dev, MSI_DESC_ALL, virq, nr_irqs);
+   msi_free_msi_descs_range(data->dev, MSI_DESC_ALL, &range);
msi_unlock_descs(data->dev);
 }
 
--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -321,8 +321,7 @@ static inline void pci_write_msi_msg(uns
 #endif /* CONFIG_PCI_MSI */
 
 int msi_add_msi_desc(struct device *dev, struct msi_desc *init_desc);
-void msi_free_msi_descs_range(struct device *dev, enum msi_desc_filter filter,
- unsigned int base_index, unsigned int ndesc);
+void msi_free_msi_descs_range(struct device *dev, enum msi_desc_filter filter, 
struct msi_range *range);
 
 /**
  * msi_free_msi_descs - Free MSI descriptors of a device
@@ -330,7 +329,9 @@ void msi_free_msi_descs_range(struct dev
  */
 static inline void msi_free_msi_descs(struct device *dev)
 {
-   msi_free_msi_descs_range(dev, MSI_DESC_ALL, 0, UINT_MAX);
+   struct msi_range range = { .first = 0, .last = UINT_MAX, };
+
+   msi_free_msi_descs_range(dev, MSI_DESC_ALL, &range);
 }
 
 void __pci_read_msi_msg(struct msi_desc *entry, struct msi_msg *msg);
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -101,19 +101,19 @@ int msi_add_msi_desc(struct device *dev,
  *
  * Return: 0 on success or an appropriate failure code.
  */
-static int msi_add_simple_msi_descs(struct device *dev, unsigned int index, 
unsigned int ndesc)
+static int msi_add_simple_msi_descs(struct device *dev, struct msi_range 
*range)
 {
struct msi_desc *desc;
-   unsigned long i;
+   unsigned long idx;
int ret;
 
lockdep_assert_held(&dev->msi.data->mutex);
 
-   for (i = 0; i < ndesc; i++) {
+   for (idx = range->first; idx <= range->last; idx++) {
desc = msi_alloc_desc(dev, 1, NULL);
if (!desc)
goto fail_mem;
-   ret = msi_insert_desc(dev->msi.data, desc, index + i);
+   ret = msi_insert_desc(dev->msi.data, desc, idx);
if (ret)
goto fail;
}
@@ -122,7 +122,7 @@ static int msi_add_simple_msi_descs(stru
 fail_mem:
ret = -ENOMEM;
 fail:
-   msi_free_msi_descs_range(dev, MSI_DESC_NOTASSOCIATED, index, ndesc);
+   msi_free_msi_descs_range(dev, MSI_DESC_NOTASSOCIATED, range);
return ret;
 }
 
@@ -148,14 +148,14 @@ static bool msi_desc_match(struct msi_de
  * @ndesc: Number of descriptors to free
  */
 void msi_free_msi_descs_range(struct device *dev, enum msi_desc_filter filter,
- unsigned int base_index, unsigned int ndesc)
+ struct msi_range *range)
 {
struct msi_desc *desc;
unsigned long idx;
 
lockdep_assert_held(&dev->msi.data->mutex);
 
-   xa_for_each_range(&dev->msi.data->store, idx, desc, base_index, 
base_index + ndesc - 1) {
+   xa_for_each_range(&dev->msi.data->store, idx, desc, range->first, 
range->last) {
if (msi_desc_match(desc, filter)) {
xa_erase(&dev->msi.data->store, idx);
msi_free_desc(desc);
@@ -746,17 +746,18 @@ int msi_domain_prepare_irqs(struct irq_d
 int msi_domain_populate_irqs(struct irq_domain *domain, struct device *dev,
 int virq_base, int nvec, msi_alloc_info_t *arg)
 {
+   struct msi_range range = { .first = virq_base, .last = virq_base + nvec 
- 1 };
struct msi_domain_info *info = domain->host_data;
struct msi_domain_ops *ops = info->ops;
struct msi_desc *desc;
int ret, virq;
 
msi_lock_descs(dev);
-   ret = msi_add_simple_msi_descs(dev, virq_base, nvec);
+   ret = msi_add_simple_msi_descs(dev, &range);
if (ret)
goto unlock;
 
-   for (virq = virq_base; virq < virq_base + nvec; virq++) {
+   for (virq = range.first; virq <= range.last; virq++) {
desc = xa_load(&dev->msi.data->store, virq);
desc->irq = virq;
 
@@ -773,7 +774,7 @@ int msi_domain_populate_irqs(struct irq_
 fail:
for (--virq; virq >= virq_base; virq--)
irq_domain_free_irq

[patch 01/10] genirq/msi: Add range argument to alloc/free MSI domain ops

2021-11-26 Thread Thomas Gleixner
In preparation for supporting range allocations for MSI-X, add a range
argument to the MSI domain alloc/free function pointers and fixup all
affected places.

The range is supplied via a pointer to a struct msi_range which contains
the first and last MSI index and the number of vectors to allocate/free.

To support the sparse MSI-X allocations via pci_enable_msix_range() and
pci_enable_msix_exact() the number of vectors can be smaller than the range
defined by the first and last MSI index. This can be cleaned up later once
the code is converted by converting these sparse allocations to an initial
allocation on enable and expansion of the vector space at the required
indices.

Signed-off-by: Thomas Gleixner 
---
 arch/powerpc/platforms/pseries/msi.c |6 +++---
 arch/x86/pci/xen.c   |   10 +-
 include/linux/msi.h  |   30 +++---
 kernel/irq/msi.c |   12 ++--
 4 files changed, 37 insertions(+), 21 deletions(-)

--- a/arch/powerpc/platforms/pseries/msi.c
+++ b/arch/powerpc/platforms/pseries/msi.c
@@ -450,13 +450,13 @@ static void pseries_msi_ops_msi_free(str
  * RTAS can not disable one MSI at a time. It's all or nothing. Do it
  * at the end after all IRQs have been freed.
  */
-static void pseries_msi_domain_free_irqs(struct irq_domain *domain,
-struct device *dev)
+static void pseries_msi_domain_free_irqs(struct irq_domain *domain, struct 
device *dev,
+struct msi_range *range)
 {
if (WARN_ON_ONCE(!dev_is_pci(dev)))
return;
 
-   __msi_domain_free_irqs(domain, dev);
+   __msi_domain_free_irqs(domain, dev, range);
 
rtas_disable_msi(to_pci_dev(dev));
 }
--- a/arch/x86/pci/xen.c
+++ b/arch/x86/pci/xen.c
@@ -407,8 +407,8 @@ static void xen_pv_teardown_msi_irqs(str
xen_teardown_msi_irqs(dev);
 }
 
-static int xen_msi_domain_alloc_irqs(struct irq_domain *domain,
-struct device *dev,  int nvec)
+static int xen_msi_domain_alloc_irqs(struct irq_domain *domain, struct device 
*dev,
+struct msi_range *range)
 {
int type;
 
@@ -420,11 +420,11 @@ static int xen_msi_domain_alloc_irqs(str
else
type = PCI_CAP_ID_MSI;
 
-   return xen_msi_ops.setup_msi_irqs(to_pci_dev(dev), nvec, type);
+   return xen_msi_ops.setup_msi_irqs(to_pci_dev(dev), range->ndesc, type);
 }
 
-static void xen_msi_domain_free_irqs(struct irq_domain *domain,
-struct device *dev)
+static void xen_msi_domain_free_irqs(struct irq_domain *domain, struct device 
*dev,
+struct msi_range *range)
 {
if (WARN_ON_ONCE(!dev_is_pci(dev)))
return;
--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -191,6 +191,23 @@ struct msi_device_data {
enum msi_desc_filter__iter_filter;
 };
 
+/**
+ * msi_range - Descriptor for a MSI index range
+ * @first: First index
+ * @last:  Last index (inclusive)
+ * @ndesc: Number of descriptors for allocations
+ *
+ * @first = 0 and @last = UINT_MAX is the full range for an operation.
+ *
+ * Note: @ndesc can be less than the range defined by @first and @last to
+ * support sparse allocations from PCI/MSI-X.
+ */
+struct msi_range {
+   unsigned intfirst;
+   unsigned intlast;
+   unsigned intndesc;
+};
+
 int msi_setup_device_data(struct device *dev);
 
 /* MSI device properties */
@@ -415,10 +432,10 @@ struct msi_domain_ops {
   msi_alloc_info_t *arg);
void(*set_desc)(msi_alloc_info_t *arg,
struct msi_desc *desc);
-   int (*domain_alloc_irqs)(struct irq_domain *domain,
-struct device *dev, int nvec);
-   void(*domain_free_irqs)(struct irq_domain *domain,
-   struct device *dev);
+   int (*domain_alloc_irqs)(struct irq_domain *domain, struct 
device *dev,
+struct msi_range *range);
+   void(*domain_free_irqs)(struct irq_domain *domain, struct 
device *dev,
+   struct msi_range *range);
 };
 
 /**
@@ -484,13 +501,12 @@ int msi_domain_set_affinity(struct irq_d
 struct irq_domain *msi_create_irq_domain(struct fwnode_handle *fwnode,
 struct msi_domain_info *info,
 struct irq_domain *parent);
-int __msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
-   int nvec);
+int __msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, 
struct msi_range *range);
 int msi_domain_alloc_irqs_descs_locked(struct irq_domai

[patch 02/10] genirq/msi: Add range argument to msi_domain_alloc/free_descs_locked()

2021-11-26 Thread Thomas Gleixner
In preparation for supporting range allocations for MSI-X, add a range
argument to the msi_domain_alloc/free_descs_locked() functions and fixup
all affected places.

Hand in ranges which are covering the current use case. They will be
refined in later steps.

Signed-off-by: Thomas Gleixner 
---
 drivers/pci/msi/irqdomain.c |6 --
 include/linux/msi.h |5 ++---
 kernel/irq/msi.c|   21 -
 3 files changed, 18 insertions(+), 14 deletions(-)

--- a/drivers/pci/msi/irqdomain.c
+++ b/drivers/pci/msi/irqdomain.c
@@ -10,22 +10,24 @@
 
 int pci_msi_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
 {
+   struct msi_range range = { .first = 0, .last = UINT_MAX, .ndesc = nvec};
struct irq_domain *domain;
 
domain = dev_get_msi_domain(&dev->dev);
if (domain && irq_domain_is_hierarchy(domain))
-   return msi_domain_alloc_irqs_descs_locked(domain, &dev->dev, 
nvec);
+   return msi_domain_alloc_irqs_descs_locked(domain, &dev->dev, 
&range);
 
return pci_msi_legacy_setup_msi_irqs(dev, nvec, type);
 }
 
 void pci_msi_teardown_msi_irqs(struct pci_dev *dev)
 {
+   struct msi_range range = { .first = 0, .last = UINT_MAX, };
struct irq_domain *domain;
 
domain = dev_get_msi_domain(&dev->dev);
if (domain && irq_domain_is_hierarchy(domain))
-   msi_domain_free_irqs_descs_locked(domain, &dev->dev);
+   msi_domain_free_irqs_descs_locked(domain, &dev->dev, &range);
else
pci_msi_legacy_teardown_msi_irqs(dev);
 }
--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -502,12 +502,11 @@ struct irq_domain *msi_create_irq_domain
 struct msi_domain_info *info,
 struct irq_domain *parent);
 int __msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, 
struct msi_range *range);
-int msi_domain_alloc_irqs_descs_locked(struct irq_domain *domain, struct 
device *dev,
-  int nvec);
+int msi_domain_alloc_irqs_descs_locked(struct irq_domain *domain, struct 
device *dev,  struct msi_range *range);
 int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
  int nvec);
 void __msi_domain_free_irqs(struct irq_domain *domain, struct device *dev, 
struct msi_range *range);
-void msi_domain_free_irqs_descs_locked(struct irq_domain *domain, struct 
device *dev);
+void msi_domain_free_irqs_descs_locked(struct irq_domain *domain, struct 
device *dev, struct msi_range *range);
 void msi_domain_free_irqs(struct irq_domain *domain, struct device *dev);
 struct msi_domain_info *msi_get_domain_info(struct irq_domain *domain);
 
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -956,22 +956,21 @@ static int msi_domain_add_simple_msi_des
  * Return: %0 on success or an error code.
  */
 int msi_domain_alloc_irqs_descs_locked(struct irq_domain *domain, struct 
device *dev,
-  int nvec)
+  struct msi_range *range)
 {
struct msi_domain_info *info = domain->host_data;
-   struct msi_range range = { .ndesc = nvec };
struct msi_domain_ops *ops = info->ops;
int ret;
 
lockdep_assert_held(&dev->msi.data->mutex);
 
-   ret = msi_domain_add_simple_msi_descs(info, dev, nvec);
+   ret = msi_domain_add_simple_msi_descs(info, dev, range->ndesc);
if (ret)
return ret;
 
-   ret = ops->domain_alloc_irqs(domain, dev, &range);
+   ret = ops->domain_alloc_irqs(domain, dev, range);
if (ret)
-   msi_domain_free_irqs_descs_locked(domain, dev);
+   msi_domain_free_irqs_descs_locked(domain, dev, range);
return ret;
 }
 
@@ -986,10 +985,11 @@ int msi_domain_alloc_irqs_descs_locked(s
  */
 int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, int 
nvec)
 {
+   struct msi_range range = { .first = 0, .last = UINT_MAX, .ndesc = nvec, 
};
int ret;
 
msi_lock_descs(dev);
-   ret = msi_domain_alloc_irqs_descs_locked(domain, dev, nvec);
+   ret = msi_domain_alloc_irqs_descs_locked(domain, dev, &range);
msi_unlock_descs(dev);
return ret;
 }
@@ -1034,14 +1034,15 @@ static void msi_domain_free_msi_descs(st
  * pair. Use this for MSI irqdomains which implement their own vector
  * allocation.
  */
-void msi_domain_free_irqs_descs_locked(struct irq_domain *domain, struct 
device *dev)
+void msi_domain_free_irqs_descs_locked(struct irq_domain *domain, struct 
device *dev,
+  struct msi_range *range)
 {
struct msi_domain_info *info = domain->host_data;
struct msi_domain_ops *ops = info->ops;
 
lockdep_assert_held(&dev->msi.data->mutex);
 
-   ops->domain_free_irqs(domain, dev, NULL);
+   ops->domain_free_irqs(domain, dev, range);
  

[patch 00/10] genirq/msi, PCI/MSI: Support for dynamic MSI-X vector expansion - Part 4

2021-11-26 Thread Thomas Gleixner
This is finally the point where dynamically expanding MSI-X vectors after
enabling MSI-X is implemented.

The first three parts of this work can be found here:

https://lore.kernel.org/r/20211126222700.862407...@linutronix.de
https://lore.kernel.org/r/20211126224100.303046...@linutronix.de
https://lore.kernel.org/r/20211126230957.239391...@linutronix.de

This last and smallest part of the overall series contains the following
changes:

   1) Prepare the core MSI irq domain code to handle range based allocation
  and free

   2) Prepare the PCI/MSI code to handle range based allocation and free
  
   3) Implement a new interface which allows to expand the MSI-X vector
  space after initialization

   4) Enable support for the X86 PCI/MSI irq domains

  This is unfortunate, but some PCI/MSI irq domain implementations,
  e.g. powerpc and the x86/XEN irqdomain wrappers are not really ready
  to support this out of the box.

  I looked at the 30 places which implement PCI/MSI irq domains and
  many of them look like they could support it out of the box, but as
  we have two which definitely don't, making this opt-in is the only
  safe option.

I've tested this by hacking up the XHCI driver and it works like a charm.

There is certainly some more room for consolidating the PCI/MSI-X usage in
drivers, i.e. getting rid of pci_enable_msix*(), but this would have made
this overall series even larger and is an orthogonal issue.

This fourth series is based on:

 git://git.kernel.org/pub/scm/linux/kernel/git/tglx/devel.git msi-v1-part-3

and also available from git:

 git://git.kernel.org/pub/scm/linux/kernel/git/tglx/devel.git msi-v1-part-4

Thanks,

tglx
---
 arch/powerpc/platforms/pseries/msi.c |6 +-
 arch/x86/kernel/apic/msi.c   |4 -
 arch/x86/pci/xen.c   |   10 +--
 drivers/base/platform-msi.c  |3 -
 drivers/pci/msi/irqdomain.c  |   39 ++
 drivers/pci/msi/msi.c|   97 +++
 drivers/pci/msi/msi.h|4 -
 include/linux/msi.h  |   46 +++-
 include/linux/pci.h  |   13 
 kernel/irq/msi.c |   75 +++
 10 files changed, 208 insertions(+), 89 deletions(-)


[patch 16/22] PCI/MSI: Split out CONFIG_PCI_MSI independent part

2021-11-26 Thread Thomas Gleixner
These functions are required even when CONFIG_PCI_MSI is not set. Move them
to their own file.

Signed-off-by: Thomas Gleixner 
---
 drivers/pci/msi/Makefile |3 ++-
 drivers/pci/msi/msi.c|   39 ---
 drivers/pci/msi/pcidev_msi.c |   43 +++
 3 files changed, 45 insertions(+), 40 deletions(-)

--- a/drivers/pci/msi/Makefile
+++ b/drivers/pci/msi/Makefile
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: GPL-2.0
 #
 # Makefile for the PCI/MSI
-obj-$(CONFIG_PCI)  += msi.o
+obj-$(CONFIG_PCI)  += pcidev_msi.o
+obj-$(CONFIG_PCI_MSI)  += msi.o
--- a/drivers/pci/msi/msi.c
+++ b/drivers/pci/msi/msi.c
@@ -18,8 +18,6 @@
 
 #include "../pci.h"
 
-#ifdef CONFIG_PCI_MSI
-
 static int pci_msi_enable = 1;
 int pci_msi_ignore_mask;
 
@@ -1479,40 +1477,3 @@ bool pci_dev_has_special_msi_domain(stru
 }
 
 #endif /* CONFIG_PCI_MSI_IRQ_DOMAIN */
-#endif /* CONFIG_PCI_MSI */
-
-void pci_msi_init(struct pci_dev *dev)
-{
-   u16 ctrl;
-
-   /*
-* Disable the MSI hardware to avoid screaming interrupts
-* during boot.  This is the power on reset default so
-* usually this should be a noop.
-*/
-   dev->msi_cap = pci_find_capability(dev, PCI_CAP_ID_MSI);
-   if (!dev->msi_cap)
-   return;
-
-   pci_read_config_word(dev, dev->msi_cap + PCI_MSI_FLAGS, &ctrl);
-   if (ctrl & PCI_MSI_FLAGS_ENABLE)
-   pci_write_config_word(dev, dev->msi_cap + PCI_MSI_FLAGS,
- ctrl & ~PCI_MSI_FLAGS_ENABLE);
-
-   if (!(ctrl & PCI_MSI_FLAGS_64BIT))
-   dev->no_64bit_msi = 1;
-}
-
-void pci_msix_init(struct pci_dev *dev)
-{
-   u16 ctrl;
-
-   dev->msix_cap = pci_find_capability(dev, PCI_CAP_ID_MSIX);
-   if (!dev->msix_cap)
-   return;
-
-   pci_read_config_word(dev, dev->msix_cap + PCI_MSIX_FLAGS, &ctrl);
-   if (ctrl & PCI_MSIX_FLAGS_ENABLE)
-   pci_write_config_word(dev, dev->msix_cap + PCI_MSIX_FLAGS,
- ctrl & ~PCI_MSIX_FLAGS_ENABLE);
-}
--- /dev/null
+++ b/drivers/pci/msi/pcidev_msi.c
@@ -0,0 +1,43 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * MSI[X] related functions which are available unconditionally.
+ */
+#include "../pci.h"
+
+/*
+ * Disable the MSI[X] hardware to avoid screaming interrupts during boot.
+ * This is the power on reset default so usually this should be a noop.
+ */
+
+void pci_msi_init(struct pci_dev *dev)
+{
+   u16 ctrl;
+
+   dev->msi_cap = pci_find_capability(dev, PCI_CAP_ID_MSI);
+   if (!dev->msi_cap)
+   return;
+
+   pci_read_config_word(dev, dev->msi_cap + PCI_MSI_FLAGS, &ctrl);
+   if (ctrl & PCI_MSI_FLAGS_ENABLE) {
+   pci_write_config_word(dev, dev->msi_cap + PCI_MSI_FLAGS,
+ ctrl & ~PCI_MSI_FLAGS_ENABLE);
+   }
+
+   if (!(ctrl & PCI_MSI_FLAGS_64BIT))
+   dev->no_64bit_msi = 1;
+}
+
+void pci_msix_init(struct pci_dev *dev)
+{
+   u16 ctrl;
+
+   dev->msix_cap = pci_find_capability(dev, PCI_CAP_ID_MSIX);
+   if (!dev->msix_cap)
+   return;
+
+   pci_read_config_word(dev, dev->msix_cap + PCI_MSIX_FLAGS, &ctrl);
+   if (ctrl & PCI_MSIX_FLAGS_ENABLE) {
+   pci_write_config_word(dev, dev->msix_cap + PCI_MSIX_FLAGS,
+ ctrl & ~PCI_MSIX_FLAGS_ENABLE);
+   }
+}




[patch 14/22] PCI/MSI: Make msix_update_entries() smarter

2021-11-26 Thread Thomas Gleixner
No need to walk the descriptors and check for each one whether the entries
pointer function argument is NULL. Do it once.

Signed-off-by: Thomas Gleixner 
---
 drivers/pci/msi.c |4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -642,8 +642,8 @@ static void msix_update_entries(struct p
 {
struct msi_desc *entry;
 
-   for_each_pci_msi_entry(entry, dev) {
-   if (entries) {
+   if (entries) {
+   for_each_pci_msi_entry(entry, dev) {
entries->vector = entry->irq;
entries++;
}




[patch 13/22] PCI/MSI: Cleanup include zoo

2021-11-26 Thread Thomas Gleixner
Get rid of the pile of unneeded includes which accumulated over time.

Signed-off-by: Thomas Gleixner 
---
 drivers/pci/msi.c |   16 
 1 file changed, 4 insertions(+), 12 deletions(-)

--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -7,22 +7,14 @@
  * Copyright (C) 2016 Christoph Hellwig.
  */
 
+#include 
 #include 
-#include 
-#include 
-#include 
 #include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
+#include 
 #include 
+#include 
 #include 
+#include 
 
 #include "pci.h"
 




[patch 12/22] PCI/MSI: Make arch_restore_msi_irqs() less horrible.

2021-11-26 Thread Thomas Gleixner
Make arch_restore_msi_irqs() return a boolean which indicates whether the
core code should restore the MSI message or not. Get rid of the indirection
in x86.

Signed-off-by: Thomas Gleixner 
Cc: Juergen Gross 
Cc: x...@kernel.org
Cc: xen-devel@lists.xenproject.org
Cc: Christian Borntraeger 
Cc: Heiko Carstens 
---
 arch/s390/pci/pci_irq.c   |4 +-
 arch/x86/include/asm/x86_init.h   |6 ---
 arch/x86/include/asm/xen/hypervisor.h |8 +
 arch/x86/kernel/apic/msi.c|6 +++
 arch/x86/kernel/x86_init.c|   12 ---
 arch/x86/pci/xen.c|   13 
 drivers/pci/msi.c |   54 +++---
 include/linux/msi.h   |7 +---
 8 files changed, 45 insertions(+), 65 deletions(-)

--- a/arch/s390/pci/pci_irq.c
+++ b/arch/s390/pci/pci_irq.c
@@ -387,13 +387,13 @@ void arch_teardown_msi_irqs(struct pci_d
airq_iv_free(zpci_ibv[0], zdev->msi_first_bit, 
zdev->msi_nr_irqs);
 }
 
-void arch_restore_msi_irqs(struct pci_dev *pdev)
+bool arch_restore_msi_irqs(struct pci_dev *pdev)
 {
struct zpci_dev *zdev = to_zpci(pdev);
 
if (!zdev->irqs_registered)
zpci_set_irq(zdev);
-   default_restore_msi_irqs(pdev);
+   return true;
 }
 
 static struct airq_struct zpci_airq = {
--- a/arch/x86/include/asm/x86_init.h
+++ b/arch/x86/include/asm/x86_init.h
@@ -289,12 +289,6 @@ struct x86_platform_ops {
struct x86_hyper_runtime hyper;
 };
 
-struct pci_dev;
-
-struct x86_msi_ops {
-   void (*restore_msi_irqs)(struct pci_dev *dev);
-};
-
 struct x86_apic_ops {
unsigned int(*io_apic_read)   (unsigned int apic, unsigned int reg);
void(*restore)(void);
--- a/arch/x86/include/asm/xen/hypervisor.h
+++ b/arch/x86/include/asm/xen/hypervisor.h
@@ -57,6 +57,14 @@ static inline bool __init xen_x2apic_par
 }
 #endif
 
+struct pci_dev;
+
+#ifdef CONFIG_XEN_DOM0
+bool xen_initdom_restore_msi(struct pci_dev *dev);
+#else
+static inline bool xen_initdom_restore_msi(struct pci_dev *dev) { return true; 
}
+#endif
+
 #ifdef CONFIG_HOTPLUG_CPU
 void xen_arch_register_cpu(int num);
 void xen_arch_unregister_cpu(int num);
--- a/arch/x86/kernel/apic/msi.c
+++ b/arch/x86/kernel/apic/msi.c
@@ -17,6 +17,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 
@@ -345,3 +346,8 @@ void dmar_free_hwirq(int irq)
irq_domain_free_irqs(irq, 1);
 }
 #endif
+
+bool arch_restore_msi_irqs(struct pci_dev *dev)
+{
+   return xen_initdom_restore_msi(dev);
+}
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -145,18 +145,6 @@ struct x86_platform_ops x86_platform __r
 
 EXPORT_SYMBOL_GPL(x86_platform);
 
-#if defined(CONFIG_PCI_MSI)
-struct x86_msi_ops x86_msi __ro_after_init = {
-   .restore_msi_irqs   = default_restore_msi_irqs,
-};
-
-/* MSI arch specific hooks */
-void arch_restore_msi_irqs(struct pci_dev *dev)
-{
-   x86_msi.restore_msi_irqs(dev);
-}
-#endif
-
 struct x86_apic_ops x86_apic_ops __ro_after_init = {
.io_apic_read   = native_io_apic_read,
.restore= native_restore_boot_irq_mode,
--- a/arch/x86/pci/xen.c
+++ b/arch/x86/pci/xen.c
@@ -351,10 +351,13 @@ static int xen_initdom_setup_msi_irqs(st
return ret;
 }
 
-static void xen_initdom_restore_msi_irqs(struct pci_dev *dev)
+bool xen_initdom_restore_msi(struct pci_dev *dev)
 {
int ret = 0;
 
+   if (!xen_initial_domain())
+   return true;
+
if (pci_seg_supported) {
struct physdev_pci_device restore_ext;
 
@@ -375,10 +378,10 @@ static void xen_initdom_restore_msi_irqs
ret = HYPERVISOR_physdev_op(PHYSDEVOP_restore_msi, &restore);
WARN(ret && ret != -ENOSYS, "restore_msi -> %d\n", ret);
}
+   return false;
 }
 #else /* CONFIG_XEN_PV_DOM0 */
 #define xen_initdom_setup_msi_irqs NULL
-#define xen_initdom_restore_msi_irqs   NULL
 #endif /* !CONFIG_XEN_PV_DOM0 */
 
 static void xen_teardown_msi_irqs(struct pci_dev *dev)
@@ -466,12 +469,10 @@ static __init struct irq_domain *xen_cre
 static __init void xen_setup_pci_msi(void)
 {
if (xen_pv_domain()) {
-   if (xen_initial_domain()) {
+   if (xen_initial_domain())
xen_msi_ops.setup_msi_irqs = xen_initdom_setup_msi_irqs;
-   x86_msi.restore_msi_irqs = xen_initdom_restore_msi_irqs;
-   } else {
+   else
xen_msi_ops.setup_msi_irqs = xen_setup_msi_irqs;
-   }
xen_msi_ops.teardown_msi_irqs = xen_pv_teardown_msi_irqs;
pci_msi_ignore_mask = 1;
} else if (xen_hvm_domain()) {
--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -106,29 +106,6 @@ void __weak arch_teardown_msi_irqs(struc
 }
 #endif /* CONFIG_PCI_MSI_ARCH_FALLBACKS */
 
-static void default_restore_msi_irq(struct pci_dev *dev, int irq)
-{
-  

[patch 11/22] x86/hyperv: Refactor hv_msi_domain_free_irqs()

2021-11-26 Thread Thomas Gleixner
No point in looking up things over and over. Just look up the associated
irq data and work from there.

No functional change.

Signed-off-by: Thomas Gleixner 
Cc: Wei Liu 
Cc: x...@kernel.org
Cc: linux-hyp...@vger.kernel.org
---
 arch/x86/hyperv/irqdomain.c |   55 +---
 1 file changed, 17 insertions(+), 38 deletions(-)

--- a/arch/x86/hyperv/irqdomain.c
+++ b/arch/x86/hyperv/irqdomain.c
@@ -253,64 +253,43 @@ static int hv_unmap_msi_interrupt(struct
return hv_unmap_interrupt(hv_build_pci_dev_id(dev).as_uint64, 
old_entry);
 }
 
-static void hv_teardown_msi_irq_common(struct pci_dev *dev, struct msi_desc 
*msidesc, int irq)
+static void hv_teardown_msi_irq(struct pci_dev *dev, struct irq_data *irqd)
 {
-   u64 status;
struct hv_interrupt_entry old_entry;
-   struct irq_desc *desc;
-   struct irq_data *data;
struct msi_msg msg;
+   u64 status;
 
-   desc = irq_to_desc(irq);
-   if (!desc) {
-   pr_debug("%s: no irq desc\n", __func__);
-   return;
-   }
-
-   data = &desc->irq_data;
-   if (!data) {
-   pr_debug("%s: no irq data\n", __func__);
-   return;
-   }
-
-   if (!data->chip_data) {
+   if (!irqd->chip_data) {
pr_debug("%s: no chip data\n!", __func__);
return;
}
 
-   old_entry = *(struct hv_interrupt_entry *)data->chip_data;
+   old_entry = *(struct hv_interrupt_entry *)irqd->chip_data;
entry_to_msi_msg(&old_entry, &msg);
 
-   kfree(data->chip_data);
-   data->chip_data = NULL;
+   kfree(irqd->chip_data);
+   irqd->chip_data = NULL;
 
status = hv_unmap_msi_interrupt(dev, &old_entry);
 
-   if (status != HV_STATUS_SUCCESS) {
+   if (status != HV_STATUS_SUCCESS)
pr_err("%s: hypercall failed, status %lld\n", __func__, status);
-   return;
-   }
 }
 
-static void hv_msi_domain_free_irqs(struct irq_domain *domain, struct device 
*dev)
+static void hv_msi_free_irq(struct irq_domain *domain,
+   struct msi_domain_info *info, unsigned int virq)
 {
-   int i;
-   struct msi_desc *entry;
-   struct pci_dev *pdev;
+   struct irq_data *irqd = irq_get_irq_data(virq);
+   struct msi_desc *desc;
 
-   if (WARN_ON_ONCE(!dev_is_pci(dev)))
+   if (!irqd)
return;
 
-   pdev = to_pci_dev(dev);
+   desc = irq_data_get_msi_desc(irqd);
+   if (!desc || !desc->irq || WARN_ON_ONCE(!dev_is_pci(desc->dev)))
+   return;
 
-   for_each_pci_msi_entry(entry, pdev) {
-   if (entry->irq) {
-   for (i = 0; i < entry->nvec_used; i++) {
-   hv_teardown_msi_irq_common(pdev, entry, 
entry->irq + i);
-   irq_domain_free_irqs(entry->irq + i, 1);
-   }
-   }
-   }
+   hv_teardown_msi_irq(to_pci_dev(desc->dev), irqd);
 }
 
 /*
@@ -329,7 +308,7 @@ static struct irq_chip hv_pci_msi_contro
 };
 
 static struct msi_domain_ops pci_msi_domain_ops = {
-   .domain_free_irqs   = hv_msi_domain_free_irqs,
+   .msi_free   = hv_msi_free_irq,
.msi_prepare= pci_msi_prepare,
 };
 




[patch 10/22] genirq/msi, treewide: Use a named struct for PCI/MSI attributes

2021-11-26 Thread Thomas Gleixner
The unnamed struct sucks and is in the way of further cleanups. Stick the
PCI related MSI data into a real data structure and cleanup all users.

No functional change.

Signed-off-by: Thomas Gleixner 
Cc: Greg Kroah-Hartman 
Cc: sparcli...@vger.kernel.org
Cc: x...@kernel.org
Cc: xen-devel@lists.xenproject.org
Cc: ath...@lists.infradead.org
---
 arch/powerpc/platforms/cell/axon_msi.c|2 
 arch/powerpc/platforms/powernv/pci-ioda.c |4 -
 arch/powerpc/platforms/pseries/msi.c  |6 -
 arch/sparc/kernel/pci_msi.c   |4 -
 arch/x86/kernel/apic/msi.c|2 
 arch/x86/pci/xen.c|6 -
 drivers/net/wireless/ath/ath11k/pci.c |2 
 drivers/pci/msi.c |  116 +++---
 drivers/pci/xen-pcifront.c|2 
 include/linux/msi.h   |   84 ++---
 kernel/irq/msi.c  |4 -
 11 files changed, 115 insertions(+), 117 deletions(-)

--- a/arch/powerpc/platforms/cell/axon_msi.c
+++ b/arch/powerpc/platforms/cell/axon_msi.c
@@ -212,7 +212,7 @@ static int setup_msi_msg_address(struct
entry = first_pci_msi_entry(dev);
 
for (; dn; dn = of_get_next_parent(dn)) {
-   if (entry->msi_attrib.is_64) {
+   if (entry->pci.msi_attrib.is_64) {
prop = of_get_property(dn, "msi-address-64", &len);
if (prop)
break;
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -2154,10 +2154,10 @@ static void pnv_msi_compose_msg(struct i
int rc;
 
rc = __pnv_pci_ioda_msi_setup(phb, pdev, d->hwirq,
- entry->msi_attrib.is_64, msg);
+ entry->pci.msi_attrib.is_64, msg);
if (rc)
dev_err(&pdev->dev, "Failed to setup %s-bit MSI #%ld : %d\n",
-   entry->msi_attrib.is_64 ? "64" : "32", d->hwirq, rc);
+   entry->pci.msi_attrib.is_64 ? "64" : "32", d->hwirq, 
rc);
 }
 
 /*
--- a/arch/powerpc/platforms/pseries/msi.c
+++ b/arch/powerpc/platforms/pseries/msi.c
@@ -332,7 +332,7 @@ static int check_msix_entries(struct pci
 
expected = 0;
for_each_pci_msi_entry(entry, pdev) {
-   if (entry->msi_attrib.entry_nr != expected) {
+   if (entry->pci.msi_attrib.entry_nr != expected) {
pr_debug("rtas_msi: bad MSI-X entries.\n");
return -EINVAL;
}
@@ -449,7 +449,7 @@ static int pseries_msi_ops_prepare(struc
 {
struct pci_dev *pdev = to_pci_dev(dev);
struct msi_desc *desc = first_pci_msi_entry(pdev);
-   int type = desc->msi_attrib.is_msix ? PCI_CAP_ID_MSIX : PCI_CAP_ID_MSI;
+   int type = desc->pci.msi_attrib.is_msix ? PCI_CAP_ID_MSIX : 
PCI_CAP_ID_MSI;
 
return rtas_prepare_msi_irqs(pdev, nvec, type, arg);
 }
@@ -580,7 +580,7 @@ static int pseries_irq_domain_alloc(stru
int hwirq;
int i, ret;
 
-   hwirq = rtas_query_irq_number(pci_get_pdn(pdev), 
desc->msi_attrib.entry_nr);
+   hwirq = rtas_query_irq_number(pci_get_pdn(pdev), 
desc->pci.msi_attrib.entry_nr);
if (hwirq < 0) {
dev_err(&pdev->dev, "Failed to query HW IRQ: %d\n", hwirq);
return hwirq;
--- a/arch/sparc/kernel/pci_msi.c
+++ b/arch/sparc/kernel/pci_msi.c
@@ -146,13 +146,13 @@ static int sparc64_setup_msi_irq(unsigne
msiqid = pick_msiq(pbm);
 
err = ops->msi_setup(pbm, msiqid, msi,
-(entry->msi_attrib.is_64 ? 1 : 0));
+(entry->pci.msi_attrib.is_64 ? 1 : 0));
if (err)
goto out_msi_free;
 
pbm->msi_irq_table[msi - pbm->msi_first] = *irq_p;
 
-   if (entry->msi_attrib.is_64) {
+   if (entry->pci.msi_attrib.is_64) {
msg.address_hi = pbm->msi64_start >> 32;
msg.address_lo = pbm->msi64_start & 0x;
} else {
--- a/arch/x86/kernel/apic/msi.c
+++ b/arch/x86/kernel/apic/msi.c
@@ -163,7 +163,7 @@ int pci_msi_prepare(struct irq_domain *d
struct msi_desc *desc = first_pci_msi_entry(pdev);
 
init_irq_alloc_info(arg, NULL);
-   if (desc->msi_attrib.is_msix) {
+   if (desc->pci.msi_attrib.is_msix) {
arg->type = X86_IRQ_ALLOC_TYPE_PCI_MSIX;
} else {
arg->type = X86_IRQ_ALLOC_TYPE_PCI_MSI;
--- a/arch/x86/pci/xen.c
+++ b/arch/x86/pci/xen.c
@@ -306,7 +306,7 @@ static int xen_initdom_setup_msi_irqs(st
return -EINVAL;
 
map_irq.table_base = pci_resource_start(dev, bir);
-   map_irq.entry_nr = msidesc->msi_attrib.entry_nr;
+   map_irq.entry_nr = msidesc->pci.msi_attrib.entry_nr;
}
 
ret = -EINVAL;
@@

[patch 09/22] MIPS: Octeon: Use arch_setup_msi_irq()

2021-11-26 Thread Thomas Gleixner
The core code provides the same loop code except for the MSI-X reject. Move
that to arch_setup_msi_irq() and remove the duplicated code.

No functional change.

Signed-off-by: Thomas Gleixner 
Cc: Thomas Bogendoerfer 
Cc: linux-m...@vger.kernel.org
---
 arch/mips/pci/msi-octeon.c |   32 +++-
 1 file changed, 3 insertions(+), 29 deletions(-)

--- a/arch/mips/pci/msi-octeon.c
+++ b/arch/mips/pci/msi-octeon.c
@@ -68,6 +68,9 @@ int arch_setup_msi_irq(struct pci_dev *d
u64 search_mask;
int index;
 
+   if (desc->pci.msi_attrib.is_msix)
+   return -EINVAL;
+
/*
 * Read the MSI config to figure out how many IRQs this device
 * wants.  Most devices only want 1, which will give
@@ -182,35 +185,6 @@ int arch_setup_msi_irq(struct pci_dev *d
return 0;
 }
 
-int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
-{
-   struct msi_desc *entry;
-   int ret;
-
-   /*
-* MSI-X is not supported.
-*/
-   if (type == PCI_CAP_ID_MSIX)
-   return -EINVAL;
-
-   /*
-* If an architecture wants to support multiple MSI, it needs to
-* override arch_setup_msi_irqs()
-*/
-   if (type == PCI_CAP_ID_MSI && nvec > 1)
-   return 1;
-
-   for_each_pci_msi_entry(entry, dev) {
-   ret = arch_setup_msi_irq(dev, entry);
-   if (ret < 0)
-   return ret;
-   if (ret > 0)
-   return -ENOSPC;
-   }
-
-   return 0;
-}
-
 /**
  * Called when a device no longer needs its MSI interrupts. All
  * MSI interrupts for the device are freed.




[patch 08/22] PCI/sysfs: Use pci_irq_vector()

2021-11-26 Thread Thomas Gleixner
instead of fiddling with msi descriptors.

Signed-off-by: Thomas Gleixner 
---
 drivers/pci/pci-sysfs.c |7 ++-
 1 file changed, 2 insertions(+), 5 deletions(-)

--- a/drivers/pci/pci-sysfs.c
+++ b/drivers/pci/pci-sysfs.c
@@ -62,11 +62,8 @@ static ssize_t irq_show(struct device *d
 * For MSI, show the first MSI IRQ; for all other cases including
 * MSI-X, show the legacy INTx IRQ.
 */
-   if (pdev->msi_enabled) {
-   struct msi_desc *desc = first_pci_msi_entry(pdev);
-
-   return sysfs_emit(buf, "%u\n", desc->irq);
-   }
+   if (pdev->msi_enabled)
+   return sysfs_emit(buf, "%u\n", pci_irq_vector(pdev, 0));
 #endif
 
return sysfs_emit(buf, "%u\n", pdev->irq);




[patch 07/22] PCI/MSI: Remove msi_desc_to_pci_sysdata()

2021-11-26 Thread Thomas Gleixner
Last user is gone long ago.

Signed-off-by: Thomas Gleixner 
---
 drivers/pci/msi.c   |8 
 include/linux/msi.h |5 -
 2 files changed, 13 deletions(-)

--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -1253,14 +1253,6 @@ struct pci_dev *msi_desc_to_pci_dev(stru
 }
 EXPORT_SYMBOL(msi_desc_to_pci_dev);
 
-void *msi_desc_to_pci_sysdata(struct msi_desc *desc)
-{
-   struct pci_dev *dev = msi_desc_to_pci_dev(desc);
-
-   return dev->bus->sysdata;
-}
-EXPORT_SYMBOL_GPL(msi_desc_to_pci_sysdata);
-
 #ifdef CONFIG_PCI_MSI_IRQ_DOMAIN
 /**
  * pci_msi_domain_write_msg - Helper to write MSI message to PCI config space
--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -217,13 +217,8 @@ static inline void msi_desc_set_iommu_co
for_each_msi_entry((desc), &(pdev)->dev)
 
 struct pci_dev *msi_desc_to_pci_dev(struct msi_desc *desc);
-void *msi_desc_to_pci_sysdata(struct msi_desc *desc);
 void pci_write_msi_msg(unsigned int irq, struct msi_msg *msg);
 #else /* CONFIG_PCI_MSI */
-static inline void *msi_desc_to_pci_sysdata(struct msi_desc *desc)
-{
-   return NULL;
-}
 static inline void pci_write_msi_msg(unsigned int irq, struct msi_msg *msg)
 {
 }




[patch 05/22] genirq/msi: Fixup includes

2021-11-26 Thread Thomas Gleixner
Remove the kobject.h include from msi.h as it's not required and add a
sysfs.h include to the core code instead.

Signed-off-by: Thomas Gleixner 
---
 include/linux/msi.h |1 -
 kernel/irq/msi.c|1 +
 2 files changed, 1 insertion(+), 1 deletion(-)

--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -2,7 +2,6 @@
 #ifndef LINUX_MSI_H
 #define LINUX_MSI_H
 
-#include 
 #include 
 #include 
 
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -14,6 +14,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 
 #include "internals.h"




[patch 06/22] PCI/MSI: Make pci_msi_domain_write_msg() static

2021-11-26 Thread Thomas Gleixner
There is no point to have this function public as it is set by the PCI core
anyway when a PCI/MSI irqdomain is created.

Signed-off-by: Thomas Gleixner 
---
 drivers/irqchip/irq-gic-v2m.c|1 -
 drivers/irqchip/irq-gic-v3-its-pci-msi.c |1 -
 drivers/irqchip/irq-gic-v3-mbi.c |1 -
 drivers/pci/msi.c|2 +-
 include/linux/msi.h  |1 -
 5 files changed, 1 insertion(+), 5 deletions(-)

--- a/drivers/irqchip/irq-gic-v2m.c
+++ b/drivers/irqchip/irq-gic-v2m.c
@@ -88,7 +88,6 @@ static struct irq_chip gicv2m_msi_irq_ch
.irq_mask   = gicv2m_mask_msi_irq,
.irq_unmask = gicv2m_unmask_msi_irq,
.irq_eoi= irq_chip_eoi_parent,
-   .irq_write_msi_msg  = pci_msi_domain_write_msg,
 };
 
 static struct msi_domain_info gicv2m_msi_domain_info = {
--- a/drivers/irqchip/irq-gic-v3-its-pci-msi.c
+++ b/drivers/irqchip/irq-gic-v3-its-pci-msi.c
@@ -28,7 +28,6 @@ static struct irq_chip its_msi_irq_chip
.irq_unmask = its_unmask_msi_irq,
.irq_mask   = its_mask_msi_irq,
.irq_eoi= irq_chip_eoi_parent,
-   .irq_write_msi_msg  = pci_msi_domain_write_msg,
 };
 
 static int its_pci_msi_vec_count(struct pci_dev *pdev, void *data)
--- a/drivers/irqchip/irq-gic-v3-mbi.c
+++ b/drivers/irqchip/irq-gic-v3-mbi.c
@@ -171,7 +171,6 @@ static struct irq_chip mbi_msi_irq_chip
.irq_unmask = mbi_unmask_msi_irq,
.irq_eoi= irq_chip_eoi_parent,
.irq_compose_msi_msg= mbi_compose_msi_msg,
-   .irq_write_msi_msg  = pci_msi_domain_write_msg,
 };
 
 static struct msi_domain_info mbi_msi_domain_info = {
--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -1267,7 +1267,7 @@ EXPORT_SYMBOL_GPL(msi_desc_to_pci_sysdat
  * @irq_data:  Pointer to interrupt data of the MSI interrupt
  * @msg:   Pointer to the message
  */
-void pci_msi_domain_write_msg(struct irq_data *irq_data, struct msi_msg *msg)
+static void pci_msi_domain_write_msg(struct irq_data *irq_data, struct msi_msg 
*msg)
 {
struct msi_desc *desc = irq_data_get_msi_desc(irq_data);
 
--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -454,7 +454,6 @@ void *platform_msi_get_host_data(struct
 #endif /* CONFIG_GENERIC_MSI_IRQ_DOMAIN */
 
 #ifdef CONFIG_PCI_MSI_IRQ_DOMAIN
-void pci_msi_domain_write_msg(struct irq_data *irq_data, struct msi_msg *msg);
 struct irq_domain *pci_msi_create_irq_domain(struct fwnode_handle *fwnode,
 struct msi_domain_info *info,
 struct irq_domain *parent);




[patch 04/22] genirq/msi: Remove unused domain callbacks

2021-11-26 Thread Thomas Gleixner
No users and there is no need to grow them.

Signed-off-by: Thomas Gleixner 
---
 include/linux/msi.h |   11 ---
 kernel/irq/msi.c|5 -
 2 files changed, 4 insertions(+), 12 deletions(-)

--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -304,7 +304,6 @@ struct msi_domain_info;
  * @msi_free:  Domain specific function to free a MSI interrupts
  * @msi_check: Callback for verification of the domain/info/dev data
  * @msi_prepare:   Prepare the allocation of the interrupts in the domain
- * @msi_finish:Optional callback to finalize the allocation
  * @set_desc:  Set the msi descriptor for an interrupt
  * @handle_error:  Optional error handler if the allocation fails
  * @domain_alloc_irqs: Optional function to override the default allocation
@@ -312,12 +311,11 @@ struct msi_domain_info;
  * @domain_free_irqs:  Optional function to override the default free
  * function.
  *
- * @get_hwirq, @msi_init and @msi_free are callbacks used by
- * msi_create_irq_domain() and related interfaces
+ * @get_hwirq, @msi_init and @msi_free are callbacks used by the underlying
+ * irqdomain.
  *
- * @msi_check, @msi_prepare, @msi_finish, @set_desc and @handle_error
- * are callbacks used by msi_domain_alloc_irqs() and related
- * interfaces which are based on msi_desc.
+ * @msi_check, @msi_prepare, @handle_error and @set_desc are callbacks used by
+ * msi_domain_alloc/free_irqs().
  *
  * @domain_alloc_irqs, @domain_free_irqs can be used to override the
  * default allocation/free functions (__msi_domain_alloc/free_irqs). This
@@ -351,7 +349,6 @@ struct msi_domain_ops {
int (*msi_prepare)(struct irq_domain *domain,
   struct device *dev, int nvec,
   msi_alloc_info_t *arg);
-   void(*msi_finish)(msi_alloc_info_t *arg, int retval);
void(*set_desc)(msi_alloc_info_t *arg,
struct msi_desc *desc);
int (*handle_error)(struct irq_domain *domain,
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -562,8 +562,6 @@ int __msi_domain_alloc_irqs(struct irq_d
ret = -ENOSPC;
if (ops->handle_error)
ret = ops->handle_error(domain, desc, ret);
-   if (ops->msi_finish)
-   ops->msi_finish(&arg, ret);
return ret;
}
 
@@ -573,9 +571,6 @@ int __msi_domain_alloc_irqs(struct irq_d
}
}
 
-   if (ops->msi_finish)
-   ops->msi_finish(&arg, 0);
-
can_reserve = msi_check_reservation_mode(domain, info, dev);
 
/*




[patch 03/22] genirq/msi: Guard sysfs code

2021-11-26 Thread Thomas Gleixner
No point in building unused code when CONFIG_SYSFS=n.

Signed-off-by: Thomas Gleixner 
---
 include/linux/msi.h |   10 ++
 kernel/irq/msi.c|2 ++
 2 files changed, 12 insertions(+)

--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -239,9 +239,19 @@ void __pci_write_msi_msg(struct msi_desc
 void pci_msi_mask_irq(struct irq_data *data);
 void pci_msi_unmask_irq(struct irq_data *data);
 
+#ifdef CONFIG_SYSFS
 const struct attribute_group **msi_populate_sysfs(struct device *dev);
 void msi_destroy_sysfs(struct device *dev,
   const struct attribute_group **msi_irq_groups);
+#else
+static inline const struct attribute_group **msi_populate_sysfs(struct device 
*dev)
+{
+   return NULL;
+}
+static inline void msi_destroy_sysfs(struct device *dev, const struct 
attribute_group **msi_irq_groups)
+{
+}
+#endif
 
 /*
  * The arch hooks to setup up msi irqs. Default functions are implemented
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -72,6 +72,7 @@ void get_cached_msi_msg(unsigned int irq
 }
 EXPORT_SYMBOL_GPL(get_cached_msi_msg);
 
+#ifdef CONFIG_SYSFS
 static ssize_t msi_mode_show(struct device *dev, struct device_attribute *attr,
 char *buf)
 {
@@ -204,6 +205,7 @@ void msi_destroy_sysfs(struct device *de
kfree(msi_irq_groups);
}
 }
+#endif
 
 #ifdef CONFIG_GENERIC_MSI_IRQ_DOMAIN
 static inline void irq_chip_write_msi_msg(struct irq_data *data,




[patch 02/22] PCI/MSI: Fix pci_irq_vector()/pci_irq_get_affinity()

2021-11-26 Thread Thomas Gleixner
pci_irq_vector() and pci_irq_get_affinity() use the list position to find the
MSI-X descriptor at a given index. That's correct for the normal case where
the entry number is the same as the list position.

But it's wrong for cases where MSI-X was allocated with an entries array
describing sparse entry numbers into the hardware message descriptor
table. That's inconsistent at best.

Make it always check the entry number because that's what the zero base
index really means. This change won't break existing users which use a
sparse entries array for allocation because these users retrieve the Linux
interrupt number from the entries array after allocation and none of them
uses pci_irq_vector() or pci_irq_get_affinity().

Fixes: aff171641d18 ("PCI: Provide sensible IRQ vector alloc/free routines")
Signed-off-by: Thomas Gleixner 
---
 drivers/pci/msi.c |   26 ++
 1 file changed, 18 insertions(+), 8 deletions(-)

--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -1187,19 +1187,24 @@ EXPORT_SYMBOL(pci_free_irq_vectors);
 
 /**
  * pci_irq_vector - return Linux IRQ number of a device vector
- * @dev: PCI device to operate on
- * @nr: device-relative interrupt vector index (0-based).
+ * @dev:   PCI device to operate on
+ * @nr:Interrupt vector index (0-based)
+ *
+ * @nr has the following meanings depending on the interrupt mode:
+ *   MSI-X:The index in the MSI-X vector table
+ *   MSI:  The index of the enabled MSI vectors
+ *   INTx: Must be 0
+ *
+ * Return: The Linux interrupt number or -EINVAL if @nr is out of range.
  */
 int pci_irq_vector(struct pci_dev *dev, unsigned int nr)
 {
if (dev->msix_enabled) {
struct msi_desc *entry;
-   int i = 0;
 
for_each_pci_msi_entry(entry, dev) {
-   if (i == nr)
+   if (entry->msi_attrib.entry_nr == nr)
return entry->irq;
-   i++;
}
WARN_ON_ONCE(1);
return -EINVAL;
@@ -1223,17 +1228,22 @@ EXPORT_SYMBOL(pci_irq_vector);
  * pci_irq_get_affinity - return the affinity of a particular MSI vector
  * @dev:   PCI device to operate on
  * @nr:device-relative interrupt vector index (0-based).
+ *
+ * @nr has the following meanings depending on the interrupt mode:
+ *   MSI-X:The index in the MSI-X vector table
+ *   MSI:  The index of the enabled MSI vectors
+ *   INTx: Must be 0
+ *
+ * Return: A cpumask pointer or NULL if @nr is out of range
  */
 const struct cpumask *pci_irq_get_affinity(struct pci_dev *dev, int nr)
 {
if (dev->msix_enabled) {
struct msi_desc *entry;
-   int i = 0;
 
for_each_pci_msi_entry(entry, dev) {
-   if (i == nr)
+   if (entry->msi_attrib.entry_nr == nr)
return &entry->affinity->mask;
-   i++;
}
WARN_ON_ONCE(1);
return NULL;




[patch 00/22] genirq/msi, PCI/MSI: Spring cleaning - Part 1

2021-11-26 Thread Thomas Gleixner
The [PCI] MSI code has gained quite some warts over time. A recent
discussion unearthed a shortcoming: the lack of support for expanding
PCI/MSI-X vectors after initialization of MSI-X.

PCI/MSI-X has no requirement to setup all vectors when MSI-X is enabled in
the device. The non-used vectors have just to be masked in the vector
table. For PCI/MSI this is not possible because the number of vectors
cannot be changed after initialization.

The PCI/MSI code, but also the core MSI irq domain code are built around
the assumption that all required vectors are installed at initialization
time and freed when the device is shut down by the driver.

Supporting dynamic expansion at least for MSI-X is important for VFIO so
that the host side interrupts for passthrough devices can be installed on
demand.

This is the first part of a large (total 101 patches) series which
refactors the [PCI]MSI infrastructure to make runtime expansion of MSI-X
vectors possible. The last part (10 patches) provide this functionality.

The first part is mostly a cleanup which consolidates code, moves the PCI
MSI code into a separate directory and splits it up into several parts.

No functional change intended except for patch 2/N which changes the
behaviour of pci_get_vector()/affinity() to get rid of the assumption that
the provided index is the "index" into the descriptor list instead of using
it as the actual MSI[X] index as seen by the hardware. This would break
users of sparse allocated MSI-X entries, but none of them use these
functions.

This series is based on 5.16-rc2 and also available via git:

 git://git.kernel.org/pub/scm/linux/kernel/git/tglx/devel.git msi-v1-part-1

For the curious who can't wait for the next part to arrive the full series
is available via:

 git://git.kernel.org/pub/scm/linux/kernel/git/tglx/devel.git msi-v1-part-4

Thanks,

tglx
---
 arch/powerpc/platforms/4xx/msi.c|  281 
 b/Documentation/driver-api/pci/pci.rst  |2 
 b/arch/mips/pci/msi-octeon.c|   32 -
 b/arch/powerpc/platforms/4xx/Makefile   |1 
 b/arch/powerpc/platforms/cell/axon_msi.c|2 
 b/arch/powerpc/platforms/powernv/pci-ioda.c |4 
 b/arch/powerpc/platforms/pseries/msi.c  |6 
 b/arch/powerpc/sysdev/Kconfig   |6 
 b/arch/s390/pci/pci_irq.c   |4 
 b/arch/sparc/kernel/pci_msi.c   |4 
 b/arch/x86/hyperv/irqdomain.c   |   55 --
 b/arch/x86/include/asm/x86_init.h   |6 
 b/arch/x86/include/asm/xen/hypervisor.h |8 
 b/arch/x86/kernel/apic/msi.c|8 
 b/arch/x86/kernel/x86_init.c|   12 
 b/arch/x86/pci/xen.c|   19 
 b/drivers/irqchip/irq-gic-v2m.c |1 
 b/drivers/irqchip/irq-gic-v3-its-pci-msi.c  |1 
 b/drivers/irqchip/irq-gic-v3-mbi.c  |1 
 b/drivers/net/wireless/ath/ath11k/pci.c |2 
 b/drivers/pci/Makefile  |3 
 b/drivers/pci/msi/Makefile  |7 
 b/drivers/pci/msi/irqdomain.c   |  267 +++
 b/drivers/pci/msi/legacy.c  |   79 +++
 b/drivers/pci/msi/msi.c |  645 
 b/drivers/pci/msi/msi.h |   39 +
 b/drivers/pci/msi/pcidev_msi.c  |   43 +
 b/drivers/pci/pci-sysfs.c   |7 
 b/drivers/pci/xen-pcifront.c|2 
 b/include/linux/msi.h   |  135 ++---
 b/include/linux/pci.h   |1 
 b/kernel/irq/msi.c  |   41 +
 32 files changed, 696 insertions(+), 1028 deletions(-)


[patch 01/22] powerpc/4xx: Remove MSI support which never worked

2021-11-26 Thread Thomas Gleixner
This code is broken since day one. ppc4xx_setup_msi_irqs() has the
following gems:

 1) The handling of the result of msi_bitmap_alloc_hwirqs() is completely
broken:

When the result is greater than or equal 0 (bitmap allocation
successful) then the loop terminates and the function returns 0
(success) despite not having installed an interrupt.

When the result is less than 0 (bitmap allocation fails), it prints an
error message and continues to "work" with that error code which would
eventually end up in the MSI message data.

 2) On every invocation the file global ppc4xx_msi::msi_virqs bitmap is
allocated thereby leaking the previous one.

IOW, this has never worked and for more than 10 years nobody cared. Remove
the gunk.

Fixes: 3fb7933850fa ("powerpc/4xx: Adding PCIe MSI support")
Fixes: 247540b03bfc ("powerpc/44x: Fix PCI MSI support for Maui APM821xx SoC 
and Bluestone board")
Signed-off-by: Thomas Gleixner 
Cc: Michael Ellerman 
Cc: Paul Mackerras 
Cc: Benjamin Herrenschmidt 
Cc: linuxppc-...@lists.ozlabs.org
---
 arch/powerpc/platforms/4xx/Makefile |1 
 arch/powerpc/platforms/4xx/msi.c|  281 
 arch/powerpc/sysdev/Kconfig |6 
 3 files changed, 288 deletions(-)

--- a/arch/powerpc/platforms/4xx/Makefile
+++ b/arch/powerpc/platforms/4xx/Makefile
@@ -3,6 +3,5 @@ obj-y   += uic.o machine_check.o
 obj-$(CONFIG_4xx_SOC)  += soc.o
 obj-$(CONFIG_PCI)  += pci.o
 obj-$(CONFIG_PPC4xx_HSTA_MSI)  += hsta_msi.o
-obj-$(CONFIG_PPC4xx_MSI)   += msi.o
 obj-$(CONFIG_PPC4xx_CPM)   += cpm.o
 obj-$(CONFIG_PPC4xx_GPIO)  += gpio.o
--- a/arch/powerpc/platforms/4xx/msi.c
+++ /dev/null
@@ -1,281 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * Adding PCI-E MSI support for PPC4XX SoCs.
- *
- * Copyright (c) 2010, Applied Micro Circuits Corporation
- * Authors:Tirumala R Marri 
- * Feng Kan 
- */
-
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-
-#define PEIH_TERMADH   0x00
-#define PEIH_TERMADL   0x08
-#define PEIH_MSIED 0x10
-#define PEIH_MSIMK 0x18
-#define PEIH_MSIASS0x20
-#define PEIH_FLUSH00x30
-#define PEIH_FLUSH10x38
-#define PEIH_CNTRST0x48
-
-static int msi_irqs;
-
-struct ppc4xx_msi {
-   u32 msi_addr_lo;
-   u32 msi_addr_hi;
-   void __iomem *msi_regs;
-   int *msi_virqs;
-   struct msi_bitmap bitmap;
-   struct device_node *msi_dev;
-};
-
-static struct ppc4xx_msi ppc4xx_msi;
-
-static int ppc4xx_msi_init_allocator(struct platform_device *dev,
-   struct ppc4xx_msi *msi_data)
-{
-   int err;
-
-   err = msi_bitmap_alloc(&msi_data->bitmap, msi_irqs,
- dev->dev.of_node);
-   if (err)
-   return err;
-
-   err = msi_bitmap_reserve_dt_hwirqs(&msi_data->bitmap);
-   if (err < 0) {
-   msi_bitmap_free(&msi_data->bitmap);
-   return err;
-   }
-
-   return 0;
-}
-
-static int ppc4xx_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
-{
-   int int_no = -ENOMEM;
-   unsigned int virq;
-   struct msi_msg msg;
-   struct msi_desc *entry;
-   struct ppc4xx_msi *msi_data = &ppc4xx_msi;
-
-   dev_dbg(&dev->dev, "PCIE-MSI:%s called. vec %x type %d\n",
-   __func__, nvec, type);
-   if (type == PCI_CAP_ID_MSIX)
-   pr_debug("ppc4xx msi: MSI-X untested, trying anyway.\n");
-
-   msi_data->msi_virqs = kmalloc_array(msi_irqs, sizeof(int), GFP_KERNEL);
-   if (!msi_data->msi_virqs)
-   return -ENOMEM;
-
-   for_each_pci_msi_entry(entry, dev) {
-   int_no = msi_bitmap_alloc_hwirqs(&msi_data->bitmap, 1);
-   if (int_no >= 0)
-   break;
-   if (int_no < 0) {
-   pr_debug("%s: fail allocating msi interrupt\n",
-   __func__);
-   }
-   virq = irq_of_parse_and_map(msi_data->msi_dev, int_no);
-   if (!virq) {
-   dev_err(&dev->dev, "%s: fail mapping irq\n", __func__);
-   msi_bitmap_free_hwirqs(&msi_data->bitmap, int_no, 1);
-   return -ENOSPC;
-   }
-   dev_dbg(&dev->dev, "%s: virq = %d\n", __func__, virq);
-
-   /* Setup msi address space */
-   msg.address_hi = msi_data->msi_addr_hi;
-   msg.address_lo = msi_data->msi_addr_lo;
-
-   irq_set_msi_desc(virq, entry);
-   msg.data = int_no;
-   pci_write_msi_msg(virq, &msg);
-   }
-   return 0;
-}
-
-void ppc4xx_teardown_msi_irqs(struct pci_dev *dev)
-{
-   struct msi_desc *entry;
-   struct ppc4xx_msi *msi_data = &ppc4xx_msi;
-   irq_hw_number_t hwirq;
-
-   dev_dbg(&dev->

[ovmf test] 166826: all pass - PUSHED

2021-11-26 Thread osstest service owner
flight 166826 ovmf real [real]
http://logs.test-lab.xenproject.org/osstest/logs/166826/

Perfect :-)
All tests in this flight passed as required
version targeted for testing:
 ovmf bb1bba3d776733c41dbfa2d1dc0fe234819a79f2
baseline version:
 ovmf 4c7ce0d285bc7fd593718fd5dec02e136cbfad8e

Last test of basis   166360  2021-11-24 15:41:50 Z2 days
Testing same since   166826  2021-11-26 09:11:08 Z0 days1 attempts


People who touched revisions under test:
  Baraneedharan Anbazhagan 
  Liming Gao 

jobs:
 build-amd64-xsm  pass
 build-i386-xsm   pass
 build-amd64  pass
 build-i386   pass
 build-amd64-libvirt  pass
 build-i386-libvirt   pass
 build-amd64-pvopspass
 build-i386-pvops pass
 test-amd64-amd64-xl-qemuu-ovmf-amd64 pass
 test-amd64-i386-xl-qemuu-ovmf-amd64  pass



sg-report-flight on osstest.test-lab.xenproject.org
logs: /home/logs/logs
images: /home/logs/images

Logs, config files, etc. are available at
http://logs.test-lab.xenproject.org/osstest/logs

Explanation of these reports, and of osstest in general, is at
http://xenbits.xen.org/gitweb/?p=osstest.git;a=blob;f=README.email;hb=master
http://xenbits.xen.org/gitweb/?p=osstest.git;a=blob;f=README;hb=master

Test harness code can be found at
http://xenbits.xen.org/gitweb?p=osstest.git;a=summary


Pushing revision :

To xenbits.xen.org:/home/xen/git/osstest/ovmf.git
   4c7ce0d285..bb1bba3d77  bb1bba3d776733c41dbfa2d1dc0fe234819a79f2 -> 
xen-tested-master



Re: [PATCH 00/65] x86: Support for CET Indirect Branch Tracking

2021-11-26 Thread Andrew Cooper
On 26/11/2021 13:22, Jan Beulich wrote:
> On 26.11.2021 14:13, Andrew Cooper wrote:
>> On 26/11/2021 12:48, Jan Beulich wrote:
>>> On 26.11.2021 13:33, Andrew Cooper wrote:
   * I have not checked for misaligned endbr64's, and I'm not sure there is
 anything useful we could do upon discovering that there were any.
 Naively, there is a 1 in 2^32 chance (endbr64 being 4 bytes long), but
 this doesn't account for the structure of x86 code, which is most
 certainly not a uniform random distribution of bytes.
>>> Do you really mean "misaligned" here? The 2nd sentence rather might suggest
>>> that you mean byte sequences resembling ENDBR, despite actually being part
>>> of other insns. If so, checking might not allow to prove anything, as e.g.
>>> displacements change with about every build.
>> I do mean "any sequence of bytes resembling ENDBR", because that is
>> ultimately how the CPU instruction decode will behave.
>>
>> And yes - you certainly can hide it in a 4-byte disp/imm, but it's an
>> incredibly rare imm32 to find (except for tasks such as in patch 64).

[Answering out of order]
>> You can also hide it in an disp/imm8 followed by a specific nopl, but
>> I'm not sure if we'd ever emit 0F 1E FA as a nopl by default.
> We don't, and the tool chain doesn't either. Only canonical NOPs (opcode
> 0x1F) are to be used there, as all others may gain a meaning beyond
> plain NOP.

Good.  Presuming that this continues to be true, the "endbr64 bridging
two instructions" looks like:

F3 0F 1E FA - real endbr64
0F 1E FA - Not emitted by toolchains
1E xx - push %ds which is #UD in 64bit
FA - cli

So local_irq_{save,disable}() need to be a little wary, but this is far
more constrained than I was anticipating.

> A disp alone won't do in general, as the top byte will only ever be 0x00
> or 0xFF (as long as our binary image doesn't go beyond 16Mb).

Tangent... I thought I'd lifted all the 16M restrictions when I rewrote
the pagetable handling, but the linker assert is still present so
clearly something is still hanging around.

For a call/jump disp32, 0xF30F1EFA is nearly -2G so we're not in any
danger of encountering that, given the 1G upper limit on .text/.data/etc.

However, disp32s on memory operands are effectively arbitrary, and there
are tricks like:

    incl  ASM_PERFC_exceptions * 4(%rcx, %rax, 4)

where the disp32 field isn't even a "usual" offset.

> But a
> ModR/M or SIB byte could start such a sequence, with only two or three
> of the (lower) disp bytes used to complete the pattern.

Luckily, a ModRM of F3 is a reg/reg encoding (ebx and esi), with no SIB
byte, so there is no ModRM=F3, SIB=0F case to worry about.

That leaves:

1) ModRM=F3 with 0F 1E FA in imm32, or
2) ModRM=F3 with 0F 1E in imm16 and a trailing CLI instruction, or
3) SIB=F3, an (%rbx, %rsi, 8)-ish operand with 0F 1E FA coming from imm,
disp or the following instruction.

These look to have rather more wiggle room, but still don't look as if
they'd be common to encounter.

Perhaps the two most worrying areas are imm64 constants, and the
div-by-constant reciprocal that tends to yield a large imm32 for use
with mul.


Given that Marek has kindly hacked us up a check which should find any
arbitrary violations, and on a small sample of builds, there are no
violations, I suggest that we clean it up and put it as a check in the
real build and enable it by default seeing as we're right at the start
of the 4.17 dev window.

If it is seen to trip (and it might well not), we can judge at that
point whether to rearrange the code to avoid it, or drop the check. 
Until then however, it gives us a very strong security statement.

~Andrew



Re: Aarch64 stand-alone application for Xen

2021-11-26 Thread Mathieu Poirier
On Fri, 26 Nov 2021 at 03:32, Bertrand Marquis  wrote:
>
> Hi Mathieu,
>
> > On 25 Nov 2021, at 22:59, Mathieu Poirier  
> > wrote:
> >
> > Good day,
> >
> > I am in the process of adding support for aarch64 to the xen-sys
> > crate[1].  The crate currently supports x86_64 and includes a
> > stand-alone "oxerun" application that can be used to validate
> > hypercalls.  My goal is to provide the same functionality on arm64.  I
> > am looking for a stand-alone aarch64 example, something like an "hello
> > world" to help me with the assembler startup code.
>
> We are working on porting XTF to arm64 and already have something running.
> I think it could be a good starting point for you:
> https://github.com/orzelmichal/xtf/tree/arm-devel
>

I just ran the "test-arm-mmu64le-example" and things work as
advertised - this is really nice!  I will have to see how this thing
is put together but the outcome is exactly what I was looking for.

You're awesome - thanks,
Mathieu

> Regards
> Bertrand
>
> >
> > Many thanks for the consideration,
> > Mathieu
> >
> > [1]. https://crates.io/crates/xen-sys
> >
>



[xen-4.15-testing test] 166387: tolerable FAIL - PUSHED

2021-11-26 Thread osstest service owner
flight 166387 xen-4.15-testing real [real]
http://logs.test-lab.xenproject.org/osstest/logs/166387/

Failures :-/ but no regressions.

Tests which did not succeed, but are not blocking:
 test-amd64-amd64-xl-qemuu-win7-amd64 19 guest-stopfail like 166198
 test-amd64-amd64-xl-qemut-win7-amd64 19 guest-stopfail like 166198
 test-armhf-armhf-libvirt 16 saverestore-support-checkfail  like 166198
 test-amd64-amd64-qemuu-nested-amd 20 debian-hvm-install/l1/l2 fail like 166198
 test-amd64-amd64-xl-qemuu-ws16-amd64 19 guest-stopfail like 166198
 test-amd64-amd64-xl-qemut-ws16-amd64 19 guest-stopfail like 166198
 test-amd64-i386-xl-qemuu-ws16-amd64 19 guest-stop fail like 166198
 test-armhf-armhf-libvirt-raw 15 saverestore-support-checkfail  like 166198
 test-amd64-i386-xl-qemut-win7-amd64 19 guest-stop fail like 166198
 test-amd64-i386-xl-qemuu-win7-amd64 19 guest-stop fail like 166198
 test-armhf-armhf-libvirt-qcow2 15 saverestore-support-check   fail like 166198
 test-amd64-i386-xl-qemut-ws16-amd64 19 guest-stop fail like 166198
 test-amd64-i386-libvirt-xsm  15 migrate-support-checkfail   never pass
 test-amd64-amd64-libvirt 15 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-seattle  15 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-seattle  16 saverestore-support-checkfail   never pass
 test-amd64-i386-xl-pvshim14 guest-start  fail   never pass
 test-amd64-amd64-libvirt-xsm 15 migrate-support-checkfail   never pass
 test-amd64-i386-libvirt  15 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-credit1  15 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-credit1  16 saverestore-support-checkfail   never pass
 test-arm64-arm64-libvirt-xsm 15 migrate-support-checkfail   never pass
 test-arm64-arm64-libvirt-xsm 16 saverestore-support-checkfail   never pass
 test-arm64-arm64-xl-credit2  15 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-xsm  15 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-credit2  16 saverestore-support-checkfail   never pass
 test-arm64-arm64-xl-xsm  16 saverestore-support-checkfail   never pass
 test-arm64-arm64-xl  15 migrate-support-checkfail   never pass
 test-arm64-arm64-xl  16 saverestore-support-checkfail   never pass
 test-arm64-arm64-xl-thunderx 15 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-thunderx 16 saverestore-support-checkfail   never pass
 test-amd64-i386-libvirt-raw  14 migrate-support-checkfail   never pass
 test-armhf-armhf-xl-arndale  15 migrate-support-checkfail   never pass
 test-armhf-armhf-xl-arndale  16 saverestore-support-checkfail   never pass
 test-amd64-amd64-libvirt-qemuu-debianhvm-amd64-xsm 13 migrate-support-check 
fail never pass
 test-amd64-i386-libvirt-qemuu-debianhvm-amd64-xsm 13 migrate-support-check 
fail never pass
 test-arm64-arm64-libvirt-raw 14 migrate-support-checkfail   never pass
 test-arm64-arm64-libvirt-raw 15 saverestore-support-checkfail   never pass
 test-amd64-amd64-libvirt-vhd 14 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-vhd  14 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-vhd  15 saverestore-support-checkfail   never pass
 test-armhf-armhf-xl  15 migrate-support-checkfail   never pass
 test-armhf-armhf-xl  16 saverestore-support-checkfail   never pass
 test-armhf-armhf-xl-rtds 15 migrate-support-checkfail   never pass
 test-armhf-armhf-xl-rtds 16 saverestore-support-checkfail   never pass
 test-armhf-armhf-xl-credit2  15 migrate-support-checkfail   never pass
 test-armhf-armhf-xl-multivcpu 15 migrate-support-checkfail  never pass
 test-armhf-armhf-xl-credit2  16 saverestore-support-checkfail   never pass
 test-armhf-armhf-xl-multivcpu 16 saverestore-support-checkfail  never pass
 test-armhf-armhf-xl-credit1  15 migrate-support-checkfail   never pass
 test-armhf-armhf-xl-credit1  16 saverestore-support-checkfail   never pass
 test-armhf-armhf-xl-cubietruck 15 migrate-support-checkfail never pass
 test-armhf-armhf-xl-cubietruck 16 saverestore-support-checkfail never pass
 test-armhf-armhf-libvirt 15 migrate-support-checkfail   never pass
 test-armhf-armhf-libvirt-raw 14 migrate-support-checkfail   never pass
 test-armhf-armhf-xl-vhd  14 migrate-support-checkfail   never pass
 test-armhf-armhf-xl-vhd  15 saverestore-support-checkfail   never pass
 test-armhf-armhf-libvirt-qcow2 14 migrate-support-checkfail never pass

version targeted for testing:
 xen  544e547a63175ac6ef7cc29c4f5bda88da024f69
baseline version:
 xen  963ab606b122

[PATCH 3/4] xen/xsm: Use __init_data_cf_clobber for xsm_ops

2021-11-26 Thread Andrew Cooper
All calls through xsm_ops are fully altcall'd.  Harden all fnptr targets.

This yields:

  (XEN) altcall: Optimised away 197 endbr64 instructions

of 1655 on an everything-enabled build of Xen, which is ~12%.

Signed-off-by: Andrew Cooper 
---
CC: Daniel De Graaf 
CC: Daniel Smith 
CC: Jan Beulich 
CC: Roger Pau Monné 
CC: Wei Liu 
---
 xen/xsm/dummy.c   | 2 +-
 xen/xsm/flask/hooks.c | 2 +-
 xen/xsm/silo.c| 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/xen/xsm/dummy.c b/xen/xsm/dummy.c
index 4d29a9aa5b9f..4f1d352d5507 100644
--- a/xen/xsm/dummy.c
+++ b/xen/xsm/dummy.c
@@ -13,7 +13,7 @@
 #define XSM_NO_WRAPPERS
 #include 
 
-static const struct xsm_ops __initconstrel dummy_ops = {
+static struct xsm_ops __initdata_cf_clobber dummy_ops = {
 .security_domaininfo   = xsm_security_domaininfo,
 .domain_create = xsm_domain_create,
 .getdomaininfo = xsm_getdomaininfo,
diff --git a/xen/xsm/flask/hooks.c b/xen/xsm/flask/hooks.c
index 63484e323c09..b1c917113ec3 100644
--- a/xen/xsm/flask/hooks.c
+++ b/xen/xsm/flask/hooks.c
@@ -1765,7 +1765,7 @@ static int cf_check flask_argo_send(
 
 #endif
 
-static const struct xsm_ops __initconstrel flask_ops = {
+static struct xsm_ops __initdata_cf_clobber flask_ops = {
 .security_domaininfo = flask_security_domaininfo,
 .domain_create = flask_domain_create,
 .getdomaininfo = flask_getdomaininfo,
diff --git a/xen/xsm/silo.c b/xen/xsm/silo.c
index 4d5fc98e7e54..7a17595888bb 100644
--- a/xen/xsm/silo.c
+++ b/xen/xsm/silo.c
@@ -102,7 +102,7 @@ static int cf_check silo_argo_send(
 
 #endif
 
-static const struct xsm_ops __initconstrel silo_xsm_ops = {
+static struct xsm_ops __initdata_cf_clobber silo_xsm_ops = {
 .evtchn_unbound = silo_evtchn_unbound,
 .evtchn_interdomain = silo_evtchn_interdomain,
 .grant_mapref = silo_grant_mapref,
-- 
2.11.0




[PATCH 1/4] x86/altcall: Check and optimise altcall targets

2021-11-26 Thread Andrew Cooper
When converting indirect to direct calls, there is no need to execute endbr64
instructions.  Detect and optimise this case, leaving a warning in the case
that no endbr64 was found, as it likely indicates a build error.

Signed-off-by: Andrew Cooper 
---
CC: Jan Beulich 
CC: Roger Pau Monné 
CC: Wei Liu 
---
 xen/arch/x86/alternative.c | 22 ++
 1 file changed, 22 insertions(+)

diff --git a/xen/arch/x86/alternative.c b/xen/arch/x86/alternative.c
index ec24692e9595..5ae4c80d5119 100644
--- a/xen/arch/x86/alternative.c
+++ b/xen/arch/x86/alternative.c
@@ -18,6 +18,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -279,6 +280,27 @@ static void init_or_livepatch _apply_alternatives(struct 
alt_instr *start,
 
 if ( dest )
 {
+/*
+ * When building for CET-IBT, all function pointer targets
+ * should have an endbr64 instruction.
+ *
+ * If this is not the case, leave a warning because
+ * something is wrong with the build.
+ *
+ * Otherwise, skip the endbr64 instruction.  This is a
+ * marginal perf improvement which saves on instruction
+ * decode bandwidth.
+ */
+if ( IS_ENABLED(CONFIG_HAS_CC_CET_IBT) )
+{
+if ( is_endbr64(dest) )
+dest += 4;
+else
+printk(XENLOG_WARNING
+   "altcall %ps dest %ps has no endbr64\n",
+   orig, dest);
+}
+
 disp = dest - (orig + 5);
 ASSERT(disp == (int32_t)disp);
 *(int32_t *)(buf + 1) = disp;
-- 
2.11.0




[PATCH 2/4] x86/altcall: Optimise away endbr64 instruction where possible

2021-11-26 Thread Andrew Cooper
With altcall, we convert indirect branches into direct ones.  With that
complete, none of the potential targets need an endbr64 instruction.

Furthermore, removing the endbr64 instructions is a security defence-in-depth
improvement, because it limits the options available to an attacker who has
managed to hijack a function pointer.

Introduce a new .init.data.cf_clobber section.  Have _apply_alternatives()
walk over the entire section, looking for any pointers into .text, and clobber
an endbr64 instruction if found.  This is some minor structure (ab)use but it
works alarmingly well.

Signed-off-by: Andrew Cooper 
---
CC: Jan Beulich 
CC: Roger Pau Monné 
CC: Wei Liu 

It would be nice for the printk() to say "optimised away %u of %u", but the
latter number can only feasibly come from post-processing of xen-syms during
the build.
---
 xen/arch/x86/alternative.c | 38 ++
 xen/arch/x86/xen.lds.S |  5 +
 xen/include/xen/init.h |  2 ++
 3 files changed, 45 insertions(+)

diff --git a/xen/arch/x86/alternative.c b/xen/arch/x86/alternative.c
index 5ae4c80d5119..65fc8534b97f 100644
--- a/xen/arch/x86/alternative.c
+++ b/xen/arch/x86/alternative.c
@@ -173,6 +173,9 @@ text_poke(void *addr, const void *opcode, size_t len)
 return memcpy(addr, opcode, len);
 }
 
+extern unsigned long __initdata_cf_clobber_start[];
+extern unsigned long __initdata_cf_clobber_end[];
+
 /*
  * Replace instructions with better alternatives for this CPU type.
  * This runs before SMP is initialized to avoid SMP problems with
@@ -329,6 +332,41 @@ static void init_or_livepatch _apply_alternatives(struct 
alt_instr *start,
 add_nops(buf + a->repl_len, total_len - a->repl_len);
 text_poke(orig, buf, total_len);
 }
+
+/*
+ * Clobber endbr64 instructions now that altcall has finished optimised
+ * all indirect branches to direct ones.
+ */
+if ( force && cpu_has_xen_ibt )
+{
+unsigned long *val;
+unsigned int clobbered = 0;
+
+/*
+ * This is some minor structure (ab)use.  We walk the entire contents
+ * of .init.data.cf_clobber as if it were an array of pointers.
+ *
+ * If the pointer points into .text, and has an endbr64 instruction,
+ * nop out the endbr64.  This causes the pointer to no longer be a
+ * legal indirect branch target under CET-IBT.  This is a
+ * defence-in-depth measure, to reduce the options available to an
+ * adversary who has managed to hijack a function pointer.
+ */
+for ( val = __initdata_cf_clobber_start;
+  val < __initdata_cf_clobber_end;
+  val++ )
+{
+void *ptr = (void *)*val;
+
+if ( !is_kernel_text(ptr) || !is_endbr64(ptr) )
+continue;
+
+add_nops(ptr, 4);
+clobbered++;
+}
+
+printk("altcall: Optimised away %u endbr64 instructions\n", clobbered);
+}
 }
 
 void init_or_livepatch apply_alternatives(struct alt_instr *start,
diff --git a/xen/arch/x86/xen.lds.S b/xen/arch/x86/xen.lds.S
index 87e344d4dd97..5b16a98e4df1 100644
--- a/xen/arch/x86/xen.lds.S
+++ b/xen/arch/x86/xen.lds.S
@@ -214,6 +214,11 @@ SECTIONS
*(.initcall1.init)
__initcall_end = .;
 
+   . = ALIGN(POINTER_ALIGN);
+__initdata_cf_clobber_start = .;
+   *(.init.data.cf_clobber)
+__initdata_cf_clobber_end = .;
+
*(.init.data)
*(.init.data.rel)
*(.init.data.rel.*)
diff --git a/xen/include/xen/init.h b/xen/include/xen/init.h
index bfe789e93f6b..66b324892a52 100644
--- a/xen/include/xen/init.h
+++ b/xen/include/xen/init.h
@@ -18,6 +18,8 @@
 #define __init_call(lvl)  __used_section(".initcall" lvl ".init")
 #define __exit_call   __used_section(".exitcall.exit")
 
+#define __initdata_cf_clobber __section(".init.data.cf_clobber")
+
 /* These macros are used to mark some functions or 
  * initialized data (doesn't apply to uninitialized data)
  * as `initialization' functions. The kernel can take this
-- 
2.11.0




[PATCH 4/4] x86/ucode: Use altcall, and __initdata_cf_clobber

2021-11-26 Thread Andrew Cooper
Microcode loading is not a fastpath, but there are control flow security
benefits from using altcall()'s hardening side effect.

Convert the existing microcode_ops pointer into a __read_mostly structure, and
move {amd,intel}_ucode_ops into __initdata_cf_clobber.

Signed-off-by: Andrew Cooper 
---
CC: Jan Beulich 
CC: Roger Pau Monné 
CC: Wei Liu 
---
 xen/arch/x86/cpu/microcode/amd.c |  2 +-
 xen/arch/x86/cpu/microcode/core.c| 38 +++-
 xen/arch/x86/cpu/microcode/intel.c   |  2 +-
 xen/arch/x86/cpu/microcode/private.h |  2 +-
 4 files changed, 23 insertions(+), 21 deletions(-)

diff --git a/xen/arch/x86/cpu/microcode/amd.c b/xen/arch/x86/cpu/microcode/amd.c
index 0afa2192bf1d..27c8644ab8ba 100644
--- a/xen/arch/x86/cpu/microcode/amd.c
+++ b/xen/arch/x86/cpu/microcode/amd.c
@@ -422,7 +422,7 @@ static struct microcode_patch *cf_check 
cpu_request_microcode(
 return patch;
 }
 
-const struct microcode_ops amd_ucode_ops = {
+struct microcode_ops __initdata_cf_clobber amd_ucode_ops = {
 .cpu_request_microcode= cpu_request_microcode,
 .collect_cpu_info = collect_cpu_info,
 .apply_microcode  = apply_microcode,
diff --git a/xen/arch/x86/cpu/microcode/core.c 
b/xen/arch/x86/cpu/microcode/core.c
index f84dafa82693..755f2dc9a1e5 100644
--- a/xen/arch/x86/cpu/microcode/core.c
+++ b/xen/arch/x86/cpu/microcode/core.c
@@ -21,6 +21,7 @@
  * 2 of the License, or (at your option) any later version.
  */
 
+#include 
 #include 
 #include 
 #include 
@@ -214,7 +215,7 @@ void __init microcode_grab_module(
 microcode_scan_module(module_map, mbi);
 }
 
-static const struct microcode_ops __read_mostly *microcode_ops;
+static struct microcode_ops __read_mostly ucode_ops;
 
 static DEFINE_SPINLOCK(microcode_mutex);
 
@@ -241,9 +242,9 @@ static const struct microcode_patch *nmi_patch = 
ZERO_BLOCK_PTR;
  */
 static struct microcode_patch *parse_blob(const char *buf, size_t len)
 {
-microcode_ops->collect_cpu_info();
+alternative_vcall(ucode_ops.collect_cpu_info);
 
-return microcode_ops->cpu_request_microcode(buf, len);
+return alternative_call(ucode_ops.cpu_request_microcode, buf, len);
 }
 
 static void microcode_free_patch(struct microcode_patch *patch)
@@ -258,8 +259,8 @@ static bool microcode_update_cache(struct microcode_patch 
*patch)
 
 if ( !microcode_cache )
 microcode_cache = patch;
-else if ( microcode_ops->compare_patch(patch,
-   microcode_cache) == NEW_UCODE )
+else if ( alternative_call(ucode_ops.compare_patch,
+   patch, microcode_cache) == NEW_UCODE )
 {
 microcode_free_patch(microcode_cache);
 microcode_cache = patch;
@@ -311,14 +312,14 @@ static int microcode_update_cpu(const struct 
microcode_patch *patch)
 {
 int err;
 
-microcode_ops->collect_cpu_info();
+alternative_vcall(ucode_ops.collect_cpu_info);
 
 spin_lock(&microcode_mutex);
 if ( patch )
-err = microcode_ops->apply_microcode(patch);
+err = alternative_call(ucode_ops.apply_microcode, patch);
 else if ( microcode_cache )
 {
-err = microcode_ops->apply_microcode(microcode_cache);
+err = alternative_call(ucode_ops.apply_microcode, microcode_cache);
 if ( err == -EIO )
 {
 microcode_free_patch(microcode_cache);
@@ -368,7 +369,7 @@ static int primary_thread_work(const struct microcode_patch 
*patch)
 if ( !wait_for_state(LOADING_ENTER) )
 return -EBUSY;
 
-ret = microcode_ops->apply_microcode(patch);
+ret = alternative_call(ucode_ops.apply_microcode, patch);
 if ( !ret )
 atomic_inc(&cpu_updated);
 atomic_inc(&cpu_out);
@@ -481,7 +482,7 @@ static int control_thread_fn(const struct microcode_patch 
*patch)
 }
 
 /* Control thread loads ucode first while others are in NMI handler. */
-ret = microcode_ops->apply_microcode(patch);
+ret = alternative_call(ucode_ops.apply_microcode, patch);
 if ( !ret )
 atomic_inc(&cpu_updated);
 atomic_inc(&cpu_out);
@@ -610,7 +611,8 @@ static long cf_check microcode_update_helper(void *data)
  */
 spin_lock(&microcode_mutex);
 if ( microcode_cache &&
- microcode_ops->compare_patch(patch, microcode_cache) != NEW_UCODE )
+ alternative_call(ucode_ops.compare_patch,
+  patch, microcode_cache) != NEW_UCODE )
 {
 spin_unlock(&microcode_mutex);
 printk(XENLOG_WARNING "microcode: couldn't find any newer revision "
@@ -678,7 +680,7 @@ int microcode_update(XEN_GUEST_HANDLE(const_void) buf, 
unsigned long len)
 if ( len != (uint32_t)len )
 return -E2BIG;
 
-if ( microcode_ops == NULL )
+if ( !ucode_ops.apply_microcode )
 return -EINVAL;
 
 buffer = xmalloc_flex_struct(struct ucode_buf, buffer, len);
@@ -722,10 +724,10 @@ __initcall(microcode_init);
 /* Load a cached upda

[PATCH 0/4] x86: Further harden function pointers

2021-11-26 Thread Andrew Cooper
Slightly RFC, because patch 2 has some minor structure (ab)use, but the result
works alarmingly well.  So far, this demonstrates converting two subsystems.

hvm_funcs is the other area of especially low hanging fruit, but IOMMU, vPMU
also look like good candidates.  Anything which is partially altcall'd already
would benefit from being fully altcall'd.

Should we consider introducing __ro_after_init right now (as an alias to
__read_mostly) as this conversion is touching a lot of ares where true
post-init immutability ought to be enforced.

Andrew Cooper (4):
  x86/altcall: Check and optimise altcall targets
  x86/altcall: Optimise away endbr64 instruction where possible
  xen/xsm: Use __init_data_cf_clobber for xsm_ops
  x86/ucode: Use altcall, and __initdata_cf_clobber

 xen/arch/x86/alternative.c   | 60 
 xen/arch/x86/cpu/microcode/amd.c |  2 +-
 xen/arch/x86/cpu/microcode/core.c| 38 ---
 xen/arch/x86/cpu/microcode/intel.c   |  2 +-
 xen/arch/x86/cpu/microcode/private.h |  2 +-
 xen/arch/x86/xen.lds.S   |  5 +++
 xen/include/xen/init.h   |  2 ++
 xen/xsm/dummy.c  |  2 +-
 xen/xsm/flask/hooks.c|  2 +-
 xen/xsm/silo.c   |  2 +-
 10 files changed, 93 insertions(+), 24 deletions(-)

-- 
2.11.0




[linux-linus test] 166380: tolerable FAIL - PUSHED

2021-11-26 Thread osstest service owner
flight 166380 linux-linus real [real]
flight 166914 linux-linus real-retest [real]
http://logs.test-lab.xenproject.org/osstest/logs/166380/
http://logs.test-lab.xenproject.org/osstest/logs/166914/

Failures :-/ but no regressions.

Tests which are failing intermittently (not blocking):
 test-amd64-amd64-xl-qemut-debianhvm-i386-xsm 12 debian-hvm-install fail pass 
in 166914-retest

Regressions which are regarded as allowable (not blocking):
 test-armhf-armhf-xl-rtds18 guest-start/debian.repeat fail REGR. vs. 166266

Tests which did not succeed, but are not blocking:
 test-amd64-amd64-xl-qemut-win7-amd64 19 guest-stopfail like 166266
 test-armhf-armhf-libvirt 16 saverestore-support-checkfail  like 166266
 test-amd64-amd64-qemuu-nested-amd 20 debian-hvm-install/l1/l2 fail like 166266
 test-amd64-amd64-xl-qemuu-ws16-amd64 19 guest-stopfail like 166266
 test-amd64-amd64-xl-qemut-ws16-amd64 19 guest-stopfail like 166266
 test-amd64-amd64-xl-qemuu-win7-amd64 19 guest-stopfail like 166266
 test-armhf-armhf-libvirt-raw 15 saverestore-support-checkfail  like 166266
 test-armhf-armhf-libvirt-qcow2 15 saverestore-support-check   fail like 166266
 test-arm64-arm64-xl-seattle  15 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-seattle  16 saverestore-support-checkfail   never pass
 test-amd64-amd64-libvirt-xsm 15 migrate-support-checkfail   never pass
 test-amd64-amd64-libvirt 15 migrate-support-checkfail   never pass
 test-arm64-arm64-xl  15 migrate-support-checkfail   never pass
 test-arm64-arm64-xl  16 saverestore-support-checkfail   never pass
 test-arm64-arm64-xl-credit1  15 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-credit1  16 saverestore-support-checkfail   never pass
 test-arm64-arm64-xl-credit2  15 migrate-support-checkfail   never pass
 test-arm64-arm64-libvirt-xsm 15 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-credit2  16 saverestore-support-checkfail   never pass
 test-arm64-arm64-libvirt-xsm 16 saverestore-support-checkfail   never pass
 test-amd64-amd64-libvirt-raw 14 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-thunderx 15 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-thunderx 16 saverestore-support-checkfail   never pass
 test-amd64-amd64-libvirt-qcow2 14 migrate-support-checkfail never pass
 test-amd64-amd64-libvirt-qemuu-debianhvm-amd64-xsm 13 migrate-support-check 
fail never pass
 test-armhf-armhf-xl-arndale  15 migrate-support-checkfail   never pass
 test-armhf-armhf-xl-arndale  16 saverestore-support-checkfail   never pass
 test-arm64-arm64-libvirt-raw 14 migrate-support-checkfail   never pass
 test-arm64-arm64-libvirt-raw 15 saverestore-support-checkfail   never pass
 test-arm64-arm64-xl-vhd  14 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-vhd  15 saverestore-support-checkfail   never pass
 test-armhf-armhf-xl-credit1  15 migrate-support-checkfail   never pass
 test-armhf-armhf-xl-credit1  16 saverestore-support-checkfail   never pass
 test-armhf-armhf-xl  15 migrate-support-checkfail   never pass
 test-armhf-armhf-xl-credit2  15 migrate-support-checkfail   never pass
 test-armhf-armhf-xl-credit2  16 saverestore-support-checkfail   never pass
 test-armhf-armhf-xl  16 saverestore-support-checkfail   never pass
 test-armhf-armhf-xl-multivcpu 15 migrate-support-checkfail  never pass
 test-armhf-armhf-xl-multivcpu 16 saverestore-support-checkfail  never pass
 test-armhf-armhf-libvirt 15 migrate-support-checkfail   never pass
 test-armhf-armhf-xl-rtds 15 migrate-support-checkfail   never pass
 test-armhf-armhf-xl-rtds 16 saverestore-support-checkfail   never pass
 test-armhf-armhf-xl-cubietruck 15 migrate-support-checkfail never pass
 test-armhf-armhf-xl-cubietruck 16 saverestore-support-checkfail never pass
 test-armhf-armhf-xl-vhd  14 migrate-support-checkfail   never pass
 test-armhf-armhf-xl-vhd  15 saverestore-support-checkfail   never pass
 test-arm64-arm64-xl-xsm  15 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-xsm  16 saverestore-support-checkfail   never pass
 test-armhf-armhf-libvirt-raw 14 migrate-support-checkfail   never pass
 test-armhf-armhf-libvirt-qcow2 14 migrate-support-checkfail never pass

version targeted for testing:
 linux5f53fa508db098c9d372423a6dac31c8a5679cdf
baseline version:
 linux136057256686de39cc3a07c2e39ef6bc43003ff6

Last test of basis   166266  2021-11-22 06:26:56 Z4 days
Failing since166328  2021-11-23 22:12:35 Z2 days2 attempts
Testing same since   166380  2021-11-25 06:37:51 Z1 days1 attempts

-

Re: [xen-unstable test] 166378: regressions - FAIL

2021-11-26 Thread Ian Jackson
osstest service owner writes ("[xen-unstable test] 166378: regressions - FAIL"):
> flight 166378 xen-unstable real [real]
> http://logs.test-lab.xenproject.org/osstest/logs/166378/
> 
> Regressions :-(
> 
> Tests which did not succeed and are blocking,
> including tests which could not be run:
>  build-amd64-prev  6 xen-buildfail REGR. vs. 
> 166304

git submodules.  Horror.

Cloning into 
'/home/osstest/build.166378.build-amd64-prev/xen/tools/firmware/ovmf-dir-remote/UnitTestFrameworkPkg/Library/CmockaLib/cmocka'...
fatal: remote error: git-cache-proxy: git remote died with error exit code 1 // 
Fetching origin // fatal: unable to access 
'https://git.cryptomilk.org/projects/cmocka.git/': Failed to connect to 
git.cryptomilk.org port 443: Connection refused // error: Could not fetch origin
fatal: clone of 'https://git.cryptomilk.org/projects/cmocka.git' into submodule 
path 
'/home/osstest/build.166378.build-amd64-prev/xen/tools/firmware/ovmf-dir-remote/UnitTestFrameworkPkg/Library/CmockaLib/cmocka'
 failed
Failed to clone 'UnitTestFrameworkPkg/Library/CmockaLib/cmocka'. Retry scheduled
Cloning into 
'/home/osstest/build.166378.build-amd64-prev/xen/tools/firmware/ovmf-dir-remote/UnitTestFrameworkPkg/Library/CmockaLib/cmocka'...
fatal: remote error: git-cache-proxy: git remote died with error exit code 1 // 
Fetching origin // fatal: unable to access 
'https://git.cryptomilk.org/projects/cmocka.git/': Failed to connect to 
git.cryptomilk.org port 443: Connection refused // error: Could not fetch origin
fatal: clone of 'https://git.cryptomilk.org/projects/cmocka.git' into submodule 
path 
'/home/osstest/build.166378.build-amd64-prev/xen/tools/firmware/ovmf-dir-remote/UnitTestFrameworkPkg/Library/CmockaLib/cmocka'
 failed
Failed to clone 'UnitTestFrameworkPkg/Library/CmockaLib/cmocka' a second time, 
aborting

>  test-amd64-amd64-xl-pvshim   20 guest-localmigrate/x10   fail REGR. vs. 
> 166304

ssh to guest took >10s.

Ian.



Re: [GIT PULL] xen: branch for v5.16-rc3

2021-11-26 Thread pr-tracker-bot
The pull request you sent on Fri, 26 Nov 2021 16:31:52 +0100:

> git://git.kernel.org/pub/scm/linux/kernel/git/xen/tip.git 
> for-linus-5.16c-rc3-tag

has been merged into torvalds/linux.git:
https://git.kernel.org/torvalds/c/6b54698aec0b59943f7e8a88151bdf208de990d0

Thank you!

-- 
Deet-doot-dot, I am a bot.
https://korg.docs.kernel.org/prtracker.html



Re: [PATCH 59.5/65] x86: Introduce helpers/checks for endbr64 instructions

2021-11-26 Thread Andrew Cooper
On 26/11/2021 18:26, Marek Marczykowski-Górecki wrote:
> On Fri, Nov 26, 2021 at 04:33:40PM +, Andrew Cooper wrote:
>> ... to prevent the optimiser creating unsafe code.  See the code comment for
>> full details.
>>
>> Also add a build time check for endbr64 embedded in imm32 operands, which
>> catches the obvious cases where the optimiser has done an unsafe thing.
>>
>> Signed-off-by: Andrew Cooper 
>> ---
>> CC: Jan Beulich 
>> CC: Roger Pau Monné 
>> CC: Wei Liu 
>> ---
>>  xen/arch/x86/Makefile   |  4 
>>  xen/include/asm-x86/endbr.h | 55 
>> +
>>  2 files changed, 59 insertions(+)
>>  create mode 100644 xen/include/asm-x86/endbr.h
>>
>> diff --git a/xen/arch/x86/Makefile b/xen/arch/x86/Makefile
>> index 69b6cfaded25..64a5c0d20018 100644
>> --- a/xen/arch/x86/Makefile
>> +++ b/xen/arch/x86/Makefile
>> @@ -190,6 +190,10 @@ $(TARGET)-syms: prelink.o xen.lds
>>  $(MAKE) -f $(BASEDIR)/Rules.mk efi-y= $(@D)/.$(@F).1.o
>>  $(LD) $(XEN_LDFLAGS) -T xen.lds -N prelink.o $(build_id_linker) \
>>  $(@D)/.$(@F).1.o -o $@
>> +ifeq ($(CONFIG_XEN_IBT),y)
>> +$(OBJDUMP) -d $@ | grep 0xfa1e0ff3 >/dev/null && \
>> +{ echo "Found embedded endbr64 instructions" >&2; false; } || :
>> +endif
> Some more robust check can be done this way (warning, PoC quality bash):
>
> objcopy -j .text xen-syms xen-syms.text
> offset=$(objdump -h xen-syms -j .text | tail -2|head -1|awk '{printf 
> "%x\n", (strtonum("0x" $4) - strtonum("0x" $6))}')
> objdump --adjust-vma=-0x$offset -d xen-syms.text|grep endbr | cut -f 1 -d 
> ':' | tr -d ' ' > valid-addrs
> grep -aob $'\xf3\x0f\x1e\xfa' xen-syms.text|cut -f 1 -d :|xargs printf 
> '%x\n' > all-addrs
> join -v 2 <(sort valid-addrs) <(sort all-addrs) | awk '{ printf "%x\n", 
> 0x'$offset' + strtonum("0x" $1)}' | addr2line -e xen-syms
>
> Currently it finds just one match:
> xen/arch/x86/alternative.c:145

To be clear, this one match is on the xen-cet-ibt v1.1 branch, which
also includes the next task (runtime clobbering of unused ENDBR
instructions) which I'm currently cleaning up to post.

~Andrew



Re: [PATCH 59.5/65] x86: Introduce helpers/checks for endbr64 instructions

2021-11-26 Thread Marek Marczykowski-Górecki
On Fri, Nov 26, 2021 at 04:33:40PM +, Andrew Cooper wrote:
> ... to prevent the optimiser creating unsafe code.  See the code comment for
> full details.
> 
> Also add a build time check for endbr64 embedded in imm32 operands, which
> catches the obvious cases where the optimiser has done an unsafe thing.
> 
> Signed-off-by: Andrew Cooper 
> ---
> CC: Jan Beulich 
> CC: Roger Pau Monné 
> CC: Wei Liu 
> ---
>  xen/arch/x86/Makefile   |  4 
>  xen/include/asm-x86/endbr.h | 55 
> +
>  2 files changed, 59 insertions(+)
>  create mode 100644 xen/include/asm-x86/endbr.h
> 
> diff --git a/xen/arch/x86/Makefile b/xen/arch/x86/Makefile
> index 69b6cfaded25..64a5c0d20018 100644
> --- a/xen/arch/x86/Makefile
> +++ b/xen/arch/x86/Makefile
> @@ -190,6 +190,10 @@ $(TARGET)-syms: prelink.o xen.lds
>   $(MAKE) -f $(BASEDIR)/Rules.mk efi-y= $(@D)/.$(@F).1.o
>   $(LD) $(XEN_LDFLAGS) -T xen.lds -N prelink.o $(build_id_linker) \
>   $(@D)/.$(@F).1.o -o $@
> +ifeq ($(CONFIG_XEN_IBT),y)
> + $(OBJDUMP) -d $@ | grep 0xfa1e0ff3 >/dev/null && \
> + { echo "Found embedded endbr64 instructions" >&2; false; } || :
> +endif

Some more robust check can be done this way (warning, PoC quality bash):

objcopy -j .text xen-syms xen-syms.text
offset=$(objdump -h xen-syms -j .text | tail -2|head -1|awk '{printf 
"%x\n", (strtonum("0x" $4) - strtonum("0x" $6))}')
objdump --adjust-vma=-0x$offset -d xen-syms.text|grep endbr | cut -f 1 -d 
':' | tr -d ' ' > valid-addrs
grep -aob $'\xf3\x0f\x1e\xfa' xen-syms.text|cut -f 1 -d :|xargs printf 
'%x\n' > all-addrs
join -v 2 <(sort valid-addrs) <(sort all-addrs) | awk '{ printf "%x\n", 
0x'$offset' + strtonum("0x" $1)}' | addr2line -e xen-syms

Currently it finds just one match:
xen/arch/x86/alternative.c:145

-- 
Best Regards,
Marek Marczykowski-Górecki
Invisible Things Lab


signature.asc
Description: PGP signature


[PATCH v4 17/25] sh: Use do_kernel_power_off()

2021-11-26 Thread Dmitry Osipenko
Kernel now supports chained power-off handlers. Use do_kernel_power_off()
that invokes chained power-off handlers. It also invokes legacy
pm_power_off() for now, which will be removed once all drivers will
be converted to the new power-off API.

Signed-off-by: Dmitry Osipenko 
---
 arch/sh/kernel/reboot.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/arch/sh/kernel/reboot.c b/arch/sh/kernel/reboot.c
index 5c33f036418b..e8eeedc9b182 100644
--- a/arch/sh/kernel/reboot.c
+++ b/arch/sh/kernel/reboot.c
@@ -46,8 +46,7 @@ static void native_machine_shutdown(void)
 
 static void native_machine_power_off(void)
 {
-   if (pm_power_off)
-   pm_power_off();
+   do_kernel_power_off();
 }
 
 static void native_machine_halt(void)
-- 
2.33.1




[PATCH v4 18/25] x86: Use do_kernel_power_off()

2021-11-26 Thread Dmitry Osipenko
Kernel now supports chained power-off handlers. Use do_kernel_power_off()
that invokes chained power-off handlers. It also invokes legacy
pm_power_off() for now, which will be removed once all drivers will
be converted to the new power-off API.

Signed-off-by: Dmitry Osipenko 
---
 arch/x86/kernel/reboot.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 0a40df66a40d..cd7d9416d81a 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -747,10 +747,10 @@ static void native_machine_halt(void)
 
 static void native_machine_power_off(void)
 {
-   if (pm_power_off) {
+   if (kernel_can_power_off()) {
if (!reboot_force)
machine_shutdown();
-   pm_power_off();
+   do_kernel_power_off();
}
/* A fallback in case there is no PM info available */
tboot_shutdown(TB_SHUTDOWN_HALT);
-- 
2.33.1




[PATCH v4 21/25] nds32: Use do_kernel_power_off()

2021-11-26 Thread Dmitry Osipenko
Kernel now supports chained power-off handlers. Use do_kernel_power_off()
that invokes chained power-off handlers. It also invokes legacy
pm_power_off() for now, which will be removed once all drivers will
be converted to the new power-off API.

Signed-off-by: Dmitry Osipenko 
---
 arch/nds32/kernel/process.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/arch/nds32/kernel/process.c b/arch/nds32/kernel/process.c
index 49fab9e39cbf..0936dcd7db1b 100644
--- a/arch/nds32/kernel/process.c
+++ b/arch/nds32/kernel/process.c
@@ -54,8 +54,7 @@ EXPORT_SYMBOL(machine_halt);
 
 void machine_power_off(void)
 {
-   if (pm_power_off)
-   pm_power_off();
+   do_kernel_power_off();
 }
 
 EXPORT_SYMBOL(machine_power_off);
-- 
2.33.1




[PATCH v4 12/25] arm64: Use do_kernel_power_off()

2021-11-26 Thread Dmitry Osipenko
The kernel now supports chained power-off handlers. Use do_kernel_power_off()
to invoke the chained power-off handlers. It also invokes the legacy
pm_power_off() for now, which will be removed once all drivers are
converted to the new power-off API.

Acked-by: Catalin Marinas 
Signed-off-by: Dmitry Osipenko 
---
 arch/arm64/kernel/process.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
index aacf2f5559a8..f8db031afa7d 100644
--- a/arch/arm64/kernel/process.c
+++ b/arch/arm64/kernel/process.c
@@ -110,8 +110,7 @@ void machine_power_off(void)
 {
local_irq_disable();
smp_send_stop();
-   if (pm_power_off)
-   pm_power_off();
+   do_kernel_power_off();
 }
 
 /*
-- 
2.33.1




[PATCH v4 16/25] m68k: Switch to new sys-off handler API

2021-11-26 Thread Dmitry Osipenko
The kernel now supports chained power-off handlers. Use
register_power_off_handler() to register power-off handlers and
do_kernel_power_off() to invoke the chained power-off handlers. The legacy
pm_power_off() will be removed once all drivers are converted to
the new power-off API.

Normally arch code should adopt only the do_kernel_power_off() at first,
but m68k is a special case because it uses pm_power_off() "inside out",
i.e. pm_power_off() invokes machine_power_off() [in fact it does nothing],
while it's machine_power_off() that should invoke the pm_power_off(), and
thus, we can't convert platforms to the new API separately. There are only
two platforms changed here, so it's not a big deal.

Acked-by: Geert Uytterhoeven 
Signed-off-by: Dmitry Osipenko 
---
 arch/m68k/emu/natfeat.c | 3 ++-
 arch/m68k/include/asm/machdep.h | 1 -
 arch/m68k/kernel/process.c  | 5 ++---
 arch/m68k/kernel/setup_mm.c | 1 -
 arch/m68k/kernel/setup_no.c | 1 -
 arch/m68k/mac/config.c  | 4 +++-
 6 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/arch/m68k/emu/natfeat.c b/arch/m68k/emu/natfeat.c
index 71b78ecee75c..b19dc00026d9 100644
--- a/arch/m68k/emu/natfeat.c
+++ b/arch/m68k/emu/natfeat.c
@@ -15,6 +15,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -90,5 +91,5 @@ void __init nf_init(void)
pr_info("NatFeats found (%s, %lu.%lu)\n", buf, version >> 16,
version & 0x);
 
-   mach_power_off = nf_poweroff;
+   register_platform_power_off(nf_poweroff);
 }
diff --git a/arch/m68k/include/asm/machdep.h b/arch/m68k/include/asm/machdep.h
index 8fd80ef1b77e..8d8c3ee2069f 100644
--- a/arch/m68k/include/asm/machdep.h
+++ b/arch/m68k/include/asm/machdep.h
@@ -24,7 +24,6 @@ extern int (*mach_get_rtc_pll)(struct rtc_pll_info *);
 extern int (*mach_set_rtc_pll)(struct rtc_pll_info *);
 extern void (*mach_reset)( void );
 extern void (*mach_halt)( void );
-extern void (*mach_power_off)( void );
 extern unsigned long (*mach_hd_init) (unsigned long, unsigned long);
 extern void (*mach_hd_setup)(char *, int *);
 extern void (*mach_heartbeat) (int);
diff --git a/arch/m68k/kernel/process.c b/arch/m68k/kernel/process.c
index a6030dbaa089..e160a7c57bd3 100644
--- a/arch/m68k/kernel/process.c
+++ b/arch/m68k/kernel/process.c
@@ -67,12 +67,11 @@ void machine_halt(void)
 
 void machine_power_off(void)
 {
-   if (mach_power_off)
-   mach_power_off();
+   do_kernel_power_off();
for (;;);
 }
 
-void (*pm_power_off)(void) = machine_power_off;
+void (*pm_power_off)(void);
 EXPORT_SYMBOL(pm_power_off);
 
 void show_regs(struct pt_regs * regs)
diff --git a/arch/m68k/kernel/setup_mm.c b/arch/m68k/kernel/setup_mm.c
index 4b51bfd38e5f..50f4f120a4ff 100644
--- a/arch/m68k/kernel/setup_mm.c
+++ b/arch/m68k/kernel/setup_mm.c
@@ -98,7 +98,6 @@ EXPORT_SYMBOL(mach_get_rtc_pll);
 EXPORT_SYMBOL(mach_set_rtc_pll);
 void (*mach_reset)( void );
 void (*mach_halt)( void );
-void (*mach_power_off)( void );
 #ifdef CONFIG_HEARTBEAT
 void (*mach_heartbeat) (int);
 EXPORT_SYMBOL(mach_heartbeat);
diff --git a/arch/m68k/kernel/setup_no.c b/arch/m68k/kernel/setup_no.c
index 5e4104f07a44..00bf82258233 100644
--- a/arch/m68k/kernel/setup_no.c
+++ b/arch/m68k/kernel/setup_no.c
@@ -55,7 +55,6 @@ int (*mach_hwclk) (int, struct rtc_time*);
 /* machine dependent reboot functions */
 void (*mach_reset)(void);
 void (*mach_halt)(void);
-void (*mach_power_off)(void);
 
 #ifdef CONFIG_M68000
 #if defined(CONFIG_M68328)
diff --git a/arch/m68k/mac/config.c b/arch/m68k/mac/config.c
index 5d16f9b47aa9..727320dedf08 100644
--- a/arch/m68k/mac/config.c
+++ b/arch/m68k/mac/config.c
@@ -12,6 +12,7 @@
 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -139,7 +140,6 @@ void __init config_mac(void)
mach_hwclk = mac_hwclk;
mach_reset = mac_reset;
mach_halt = mac_poweroff;
-   mach_power_off = mac_poweroff;
 #if IS_ENABLED(CONFIG_INPUT_M68K_BEEP)
mach_beep = mac_mksound;
 #endif
@@ -159,6 +159,8 @@ void __init config_mac(void)
 
if (macintosh_config->ident == MAC_MODEL_IICI)
mach_l2_flush = via_l2_flush;
+
+   register_platform_power_off(mac_poweroff);
 }
 
 
-- 
2.33.1




[PATCH v4 23/25] ACPI: power: Switch to sys-off handler API

2021-11-26 Thread Dmitry Osipenko
Switch to sys-off API that replaces legacy pm_power_off callbacks.

Signed-off-by: Dmitry Osipenko 
---
 drivers/acpi/sleep.c | 25 +++--
 1 file changed, 11 insertions(+), 14 deletions(-)

diff --git a/drivers/acpi/sleep.c b/drivers/acpi/sleep.c
index eaa47753b758..2e613fddd614 100644
--- a/drivers/acpi/sleep.c
+++ b/drivers/acpi/sleep.c
@@ -47,19 +47,11 @@ static void acpi_sleep_tts_switch(u32 acpi_state)
}
 }
 
-static int tts_notify_reboot(struct notifier_block *this,
-   unsigned long code, void *x)
+static void tts_reboot_prepare(struct reboot_prep_data *data)
 {
acpi_sleep_tts_switch(ACPI_STATE_S5);
-   return NOTIFY_DONE;
 }
 
-static struct notifier_block tts_notifier = {
-   .notifier_call  = tts_notify_reboot,
-   .next   = NULL,
-   .priority   = 0,
-};
-
 static int acpi_sleep_prepare(u32 acpi_state)
 {
 #ifdef CONFIG_ACPI_SLEEP
@@ -1020,7 +1012,7 @@ static void acpi_sleep_hibernate_setup(void)
 static inline void acpi_sleep_hibernate_setup(void) {}
 #endif /* !CONFIG_HIBERNATION */
 
-static void acpi_power_off_prepare(void)
+static void acpi_power_off_prepare(struct power_off_prep_data *data)
 {
/* Prepare to power off the system */
acpi_sleep_prepare(ACPI_STATE_S5);
@@ -1028,7 +1020,7 @@ static void acpi_power_off_prepare(void)
acpi_os_wait_events_complete();
 }
 
-static void acpi_power_off(void)
+static void acpi_power_off(struct power_off_data *data)
 {
/* acpi_sleep_prepare(ACPI_STATE_S5) should have already been called */
pr_debug("%s called\n", __func__);
@@ -1036,6 +1028,11 @@ static void acpi_power_off(void)
acpi_enter_sleep_state(ACPI_STATE_S5);
 }
 
+static struct sys_off_handler acpi_sys_off_handler = {
+   .power_off_priority = POWEROFF_PRIO_FIRMWARE,
+   .reboot_prepare_cb = tts_reboot_prepare,
+};
+
 int __init acpi_sleep_init(void)
 {
char supported[ACPI_S_STATE_COUNT * 3 + 1];
@@ -1052,8 +1049,8 @@ int __init acpi_sleep_init(void)
 
if (acpi_sleep_state_supported(ACPI_STATE_S5)) {
sleep_states[ACPI_STATE_S5] = 1;
-   pm_power_off_prepare = acpi_power_off_prepare;
-   pm_power_off = acpi_power_off;
+   acpi_sys_off_handler.power_off_cb = acpi_power_off;
+   acpi_sys_off_handler.power_off_prepare_cb = 
acpi_power_off_prepare;
} else {
acpi_no_s5 = true;
}
@@ -1069,6 +1066,6 @@ int __init acpi_sleep_init(void)
 * Register the tts_notifier to reboot notifier list so that the _TTS
 * object can also be evaluated when the system enters S5.
 */
-   register_reboot_notifier(&tts_notifier);
+   register_sys_off_handler(&acpi_sys_off_handler);
return 0;
 }
-- 
2.33.1




[PATCH v4 13/25] parisc: Use do_kernel_power_off()

2021-11-26 Thread Dmitry Osipenko
The kernel now supports chained power-off handlers. Use do_kernel_power_off()
to invoke the chained power-off handlers. It also invokes the legacy
pm_power_off() for now, which will be removed once all drivers are
converted to the new power-off API.

Acked-by: Helge Deller  # parisc
Signed-off-by: Dmitry Osipenko 
---
 arch/parisc/kernel/process.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/parisc/kernel/process.c b/arch/parisc/kernel/process.c
index ea3d83b6fb62..928201b1f58f 100644
--- a/arch/parisc/kernel/process.c
+++ b/arch/parisc/kernel/process.c
@@ -26,6 +26,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -114,8 +115,7 @@ void machine_power_off(void)
pdc_chassis_send_status(PDC_CHASSIS_DIRECT_SHUTDOWN);
 
/* ipmi_poweroff may have been installed. */
-   if (pm_power_off)
-   pm_power_off();
+   do_kernel_power_off();

/* It seems we have no way to power the system off via
 * software. The user has to press the button himself. */
-- 
2.33.1




[PATCH v4 24/25] regulator: pfuze100: Use devm_register_sys_off_handler()

2021-11-26 Thread Dmitry Osipenko
Use devm_register_sys_off_handler() that replaces global
pm_power_off_prepare variable and allows to register multiple
power-off handlers.

Acked-by: Mark Brown 
Signed-off-by: Dmitry Osipenko 
---
 drivers/regulator/pfuze100-regulator.c | 38 ++
 1 file changed, 14 insertions(+), 24 deletions(-)

diff --git a/drivers/regulator/pfuze100-regulator.c 
b/drivers/regulator/pfuze100-regulator.c
index d60d7d1b7fa2..2eca8d43a097 100644
--- a/drivers/regulator/pfuze100-regulator.c
+++ b/drivers/regulator/pfuze100-regulator.c
@@ -10,6 +10,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -76,6 +77,7 @@ struct pfuze_chip {
struct pfuze_regulator regulator_descs[PFUZE100_MAX_REGULATOR];
struct regulator_dev *regulators[PFUZE100_MAX_REGULATOR];
struct pfuze_regulator *pfuze_regulators;
+   struct sys_off_handler sys_off;
 };
 
 static const int pfuze100_swbst[] = {
@@ -569,10 +571,10 @@ static inline struct device_node *match_of_node(int index)
return pfuze_matches[index].of_node;
 }
 
-static struct pfuze_chip *syspm_pfuze_chip;
-
-static void pfuze_power_off_prepare(void)
+static void pfuze_power_off_prepare(struct power_off_prep_data *data)
 {
+   struct pfuze_chip *syspm_pfuze_chip = data->cb_data;
+
dev_info(syspm_pfuze_chip->dev, "Configure standby mode for power off");
 
/* Switch from default mode: APS/APS to APS/Off */
@@ -611,24 +613,23 @@ static void pfuze_power_off_prepare(void)
 
 static int pfuze_power_off_prepare_init(struct pfuze_chip *pfuze_chip)
 {
+   int err;
+
if (pfuze_chip->chip_id != PFUZE100) {
dev_warn(pfuze_chip->dev, "Requested pm_power_off_prepare 
handler for not supported chip\n");
return -ENODEV;
}
 
-   if (pm_power_off_prepare) {
-   dev_warn(pfuze_chip->dev, "pm_power_off_prepare is already 
registered.\n");
-   return -EBUSY;
-   }
+   pfuze_chip->sys_off.power_off_prepare_cb = pfuze_power_off_prepare;
+   pfuze_chip->sys_off.cb_data = pfuze_chip;
 
-   if (syspm_pfuze_chip) {
-   dev_warn(pfuze_chip->dev, "syspm_pfuze_chip is already set.\n");
-   return -EBUSY;
+   err = devm_register_sys_off_handler(pfuze_chip->dev, 
&pfuze_chip->sys_off);
+   if (err) {
+   dev_err(pfuze_chip->dev,
+   "failed to register sys-off handler: %d\n", err);
+   return err;
}
 
-   syspm_pfuze_chip = pfuze_chip;
-   pm_power_off_prepare = pfuze_power_off_prepare;
-
return 0;
 }
 
@@ -837,23 +838,12 @@ static int pfuze100_regulator_probe(struct i2c_client 
*client,
return 0;
 }
 
-static int pfuze100_regulator_remove(struct i2c_client *client)
-{
-   if (syspm_pfuze_chip) {
-   syspm_pfuze_chip = NULL;
-   pm_power_off_prepare = NULL;
-   }
-
-   return 0;
-}
-
 static struct i2c_driver pfuze_driver = {
.driver = {
.name = "pfuze100-regulator",
.of_match_table = pfuze_dt_ids,
},
.probe = pfuze100_regulator_probe,
-   .remove = pfuze100_regulator_remove,
 };
 module_i2c_driver(pfuze_driver);
 
-- 
2.33.1




[PATCH v4 14/25] xen/x86: Use do_kernel_power_off()

2021-11-26 Thread Dmitry Osipenko
The kernel now supports chained power-off handlers. Use do_kernel_power_off()
to invoke the chained power-off handlers. It also invokes the legacy
pm_power_off() for now, which will be removed once all drivers are
converted to the new power-off API.

Acked-by: Juergen Gross 
Signed-off-by: Dmitry Osipenko 
---
 arch/x86/xen/enlighten_pv.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
index 5004feb16783..527fa545eb1f 100644
--- a/arch/x86/xen/enlighten_pv.c
+++ b/arch/x86/xen/enlighten_pv.c
@@ -31,6 +31,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -1068,8 +1069,7 @@ static void xen_machine_halt(void)
 
 static void xen_machine_power_off(void)
 {
-   if (pm_power_off)
-   pm_power_off();
+   do_kernel_power_off();
xen_reboot(SHUTDOWN_poweroff);
 }
 
-- 
2.33.1




[PATCH v4 20/25] mips: Use do_kernel_power_off()

2021-11-26 Thread Dmitry Osipenko
The kernel now supports chained power-off handlers. Use do_kernel_power_off()
to invoke the chained power-off handlers. It also invokes the legacy
pm_power_off() for now, which will be removed once all drivers are
converted to the new power-off API.

Signed-off-by: Dmitry Osipenko 
---
 arch/mips/kernel/reset.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/arch/mips/kernel/reset.c b/arch/mips/kernel/reset.c
index 6288780b779e..e7ce07b3e79b 100644
--- a/arch/mips/kernel/reset.c
+++ b/arch/mips/kernel/reset.c
@@ -114,8 +114,7 @@ void machine_halt(void)
 
 void machine_power_off(void)
 {
-   if (pm_power_off)
-   pm_power_off();
+   do_kernel_power_off();
 
 #ifdef CONFIG_SMP
preempt_disable();
-- 
2.33.1




[PATCH v4 22/25] memory: emif: Use kernel_can_power_off()

2021-11-26 Thread Dmitry Osipenko
Replace legacy pm_power_off with kernel_can_power_off() helper that
is aware about chained power-off handlers.

Signed-off-by: Dmitry Osipenko 
---
 drivers/memory/emif.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/memory/emif.c b/drivers/memory/emif.c
index 762d0c0f0716..cab10d5274a0 100644
--- a/drivers/memory/emif.c
+++ b/drivers/memory/emif.c
@@ -630,7 +630,7 @@ static irqreturn_t emif_threaded_isr(int irq, void *dev_id)
dev_emerg(emif->dev, "SDRAM temperature exceeds operating 
limit.. Needs shut down!!!\n");
 
/* If we have Power OFF ability, use it, else try restarting */
-   if (pm_power_off) {
+   if (kernel_can_power_off()) {
kernel_power_off();
} else {
WARN(1, "FIXME: NO pm_power_off!!! trying restart\n");
-- 
2.33.1




[PATCH v4 19/25] ia64: Use do_kernel_power_off()

2021-11-26 Thread Dmitry Osipenko
The kernel now supports chained power-off handlers. Use do_kernel_power_off()
to invoke the chained power-off handlers. It also invokes the legacy
pm_power_off() for now, which will be removed once all drivers are
converted to the new power-off API.

Signed-off-by: Dmitry Osipenko 
---
 arch/ia64/kernel/process.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c
index 834df24a88f1..cee4d7db2143 100644
--- a/arch/ia64/kernel/process.c
+++ b/arch/ia64/kernel/process.c
@@ -19,6 +19,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -599,8 +600,7 @@ machine_halt (void)
 void
 machine_power_off (void)
 {
-   if (pm_power_off)
-   pm_power_off();
+   do_kernel_power_off();
machine_halt();
 }
 
-- 
2.33.1




[PATCH v4 15/25] powerpc: Use do_kernel_power_off()

2021-11-26 Thread Dmitry Osipenko
The kernel now supports chained power-off handlers. Use do_kernel_power_off()
to invoke the chained power-off handlers. It also invokes the legacy
pm_power_off() for now, which will be removed once all drivers are
converted to the new power-off API.

Acked-by: Michael Ellerman 
Signed-off-by: Dmitry Osipenko 
---
 arch/powerpc/kernel/setup-common.c | 4 +---
 arch/powerpc/xmon/xmon.c   | 3 +--
 2 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/kernel/setup-common.c 
b/arch/powerpc/kernel/setup-common.c
index 4f1322b65760..71c4ccd9bbb1 100644
--- a/arch/powerpc/kernel/setup-common.c
+++ b/arch/powerpc/kernel/setup-common.c
@@ -161,9 +161,7 @@ void machine_restart(char *cmd)
 void machine_power_off(void)
 {
machine_shutdown();
-   if (pm_power_off)
-   pm_power_off();
-
+   do_kernel_power_off();
smp_send_stop();
machine_hang();
 }
diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
index 83100c6524cc..759e167704e6 100644
--- a/arch/powerpc/xmon/xmon.c
+++ b/arch/powerpc/xmon/xmon.c
@@ -1243,8 +1243,7 @@ static void bootcmds(void)
} else if (cmd == 'h') {
ppc_md.halt();
} else if (cmd == 'p') {
-   if (pm_power_off)
-   pm_power_off();
+   do_kernel_power_off();
}
 }
 
-- 
2.33.1




[PATCH v4 11/25] riscv: Use do_kernel_power_off()

2021-11-26 Thread Dmitry Osipenko
The kernel now supports chained power-off handlers. Use do_kernel_power_off()
to invoke the chained power-off handlers. It also invokes the legacy
pm_power_off() for now, which will be removed once all drivers are
converted to the new power-off API.

Acked-by: Palmer Dabbelt 
Signed-off-by: Dmitry Osipenko 
---
 arch/riscv/kernel/reset.c | 12 
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/arch/riscv/kernel/reset.c b/arch/riscv/kernel/reset.c
index 9c842c41684a..912288572226 100644
--- a/arch/riscv/kernel/reset.c
+++ b/arch/riscv/kernel/reset.c
@@ -23,16 +23,12 @@ void machine_restart(char *cmd)
 
 void machine_halt(void)
 {
-   if (pm_power_off != NULL)
-   pm_power_off();
-   else
-   default_power_off();
+   do_kernel_power_off();
+   default_power_off();
 }
 
 void machine_power_off(void)
 {
-   if (pm_power_off != NULL)
-   pm_power_off();
-   else
-   default_power_off();
+   do_kernel_power_off();
+   default_power_off();
 }
-- 
2.33.1




[PATCH v4 10/25] csky: Use do_kernel_power_off()

2021-11-26 Thread Dmitry Osipenko
The kernel now supports chained power-off handlers. Use do_kernel_power_off()
to invoke the chained power-off handlers. It also invokes the legacy
pm_power_off() for now, which will be removed once all drivers are
converted to the new power-off API.

Acked-by: Guo Ren 
Signed-off-by: Dmitry Osipenko 
---
 arch/csky/kernel/power.c | 6 ++
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/arch/csky/kernel/power.c b/arch/csky/kernel/power.c
index 923ee4e381b8..86ee202906f8 100644
--- a/arch/csky/kernel/power.c
+++ b/arch/csky/kernel/power.c
@@ -9,16 +9,14 @@ EXPORT_SYMBOL(pm_power_off);
 void machine_power_off(void)
 {
local_irq_disable();
-   if (pm_power_off)
-   pm_power_off();
+   do_kernel_power_off();
asm volatile ("bkpt");
 }
 
 void machine_halt(void)
 {
local_irq_disable();
-   if (pm_power_off)
-   pm_power_off();
+   do_kernel_power_off();
asm volatile ("bkpt");
 }
 
-- 
2.33.1




[PATCH v4 09/25] ARM: Use do_kernel_power_off()

2021-11-26 Thread Dmitry Osipenko
The kernel now supports chained power-off handlers. Use do_kernel_power_off()
to invoke the chained power-off handlers. It also invokes the legacy
pm_power_off() for now, which will be removed once all drivers are
converted to the new power-off API.

Reviewed-by: Russell King (Oracle) 
Signed-off-by: Dmitry Osipenko 
---
 arch/arm/kernel/reboot.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/arch/arm/kernel/reboot.c b/arch/arm/kernel/reboot.c
index 3044fcb8d073..2cb943422554 100644
--- a/arch/arm/kernel/reboot.c
+++ b/arch/arm/kernel/reboot.c
@@ -116,9 +116,7 @@ void machine_power_off(void)
 {
local_irq_disable();
smp_send_stop();
-
-   if (pm_power_off)
-   pm_power_off();
+   do_kernel_power_off();
 }
 
 /*
-- 
2.33.1




[PATCH v4 08/25] kernel: Add combined power-off+restart handler call chain API

2021-11-26 Thread Dmitry Osipenko
SoC platforms often have multiple ways of how to perform system's
power-off and restart operations. Meanwhile today's kernel is limited to
a single option. Add combined power-off+restart handler call chain API,
which is inspired by the restart API. The new API provides both power-off
and restart functionality.

The old pm_power_off method will be kept around till all users are
converted to the new API.

Current restart API will be replaced by the new unified API since
new API is its superset. The restart functionality of the sys-off handler
API is built upon the existing restart-notifier APIs.

In order to ease conversion to the new API, convenient helpers are added
for the common use-cases. They will reduce amount of boilerplate code and
remove global variables. These helpers preserve old behaviour for cases
where only one power-off handler is expected, this is what all existing
drivers want, and thus, they could be easily converted to the new API.
Users of the new API should explicitly enable power-off chaining by
setting corresponding flag of the power_handler structure.

Signed-off-by: Dmitry Osipenko 
---
 include/linux/reboot.h   | 265 ++-
 kernel/power/hibernate.c |   2 +-
 kernel/reboot.c  | 536 ++-
 3 files changed, 795 insertions(+), 8 deletions(-)

diff --git a/include/linux/reboot.h b/include/linux/reboot.h
index b7fa25726323..76799bb3a560 100644
--- a/include/linux/reboot.h
+++ b/include/linux/reboot.h
@@ -8,10 +8,35 @@
 
 struct device;
 
-#define SYS_DOWN   0x0001  /* Notify of system down */
-#define SYS_RESTARTSYS_DOWN
-#define SYS_HALT   0x0002  /* Notify of system halt */
-#define SYS_POWER_OFF  0x0003  /* Notify of system power off */
+enum reboot_prepare_mode {
+   SYS_DOWN = 1,   /* Notify of system down */
+   SYS_RESTART = SYS_DOWN,
+   SYS_HALT,   /* Notify of system halt */
+   SYS_POWER_OFF,  /* Notify of system power off */
+};
+
+/*
+ * Standard restart priority levels. Intended to be set in the
+ * sys_off_handler.restart_priority field.
+ *
+ * Use `RESTART_PRIO_ABC +- prio` style for additional levels.
+ *
+ * RESTART_PRIO_RESERVED:  Falls back to RESTART_PRIO_DEFAULT.
+ * Drivers may leave priority initialized
+ * to zero, to auto-set it to the default level.
+ *
+ * RESTART_PRIO_LOW:   Use this for handler of last resort.
+ *
+ * RESTART_PRIO_DEFAULT:   Use this for default/generic handler.
+ *
+ * RESTART_PRIO_HIGH:  Use this if you have multiple handlers and
+ * this handler has higher priority than the
+ * default handler.
+ */
+#define RESTART_PRIO_RESERVED  0
+#define RESTART_PRIO_LOW   8
+#define RESTART_PRIO_DEFAULT   128
+#define RESTART_PRIO_HIGH  192
 
 enum reboot_mode {
REBOOT_UNDEFINED = -1,
@@ -49,6 +74,237 @@ int register_restart_handler(struct notifier_block *);
 int unregister_restart_handler(struct notifier_block *);
 void do_kernel_restart(char *cmd);
 
+/*
+ * System power-off and restart API.
+ */
+
+/*
+ * Standard power-off priority levels. Intended to be set in the
+ * sys_off_handler.power_off_priority field.
+ *
+ * Use `POWEROFF_PRIO_ABC +- prio` style for additional levels.
+ *
+ * POWEROFF_PRIO_RESERVED: Falls back to POWEROFF_PRIO_DEFAULT.
+ * Drivers may leave priority initialized
+ * to zero, to auto-set it to the default level.
+ *
+ * POWEROFF_PRIO_PLATFORM: Intended to be used by platform-level handler.
+ * Has lowest priority since device drivers are
+ * expected to take over platform handler which
+ * doesn't allow further callback chaining.
+ *
+ * POWEROFF_PRIO_DEFAULT:  Use this for default/generic handler.
+ *
+ * POWEROFF_PRIO_FIRMWARE: Use this if handler uses firmware call.
+ * Has highest priority since firmware is expected
+ * to know best how to power-off hardware properly.
+ */
+#define POWEROFF_PRIO_RESERVED 0
+#define POWEROFF_PRIO_PLATFORM 1
+#define POWEROFF_PRIO_DEFAULT  128
+#define POWEROFF_PRIO_HIGH 192
+#define POWEROFF_PRIO_FIRMWARE 224
+
+enum poweroff_mode {
+   POWEROFF_NORMAL = 0,
+   POWEROFF_PREPARE,
+};
+
+/**
+ * struct power_off_data - Power-off callback argument
+ *
+ * @cb_data: Callback data.
+ */
+struct power_off_data {
+   void *cb_data;
+};
+
+/**
+ * struct power_off_prep_data - Power-off preparation callback argument
+ *
+ * @cb_data: Callback data.
+ */
+struct power_off_prep_data {
+   void *cb_data;
+};
+
+/**
+ * struct restart_data - Restart callback argument
+ *
+ * @cb_data: Callback data.
+ * @cmd: Restart command string.
+ * @st

[PATCH v4 07/25] reboot: Remove extern annotation from function prototypes

2021-11-26 Thread Dmitry Osipenko
There is no need to annotate function prototypes with 'extern'; it only makes
the code less readable. Remove the unnecessary annotations from .

Signed-off-by: Dmitry Osipenko 
---
 include/linux/reboot.h | 38 +++---
 1 file changed, 19 insertions(+), 19 deletions(-)

diff --git a/include/linux/reboot.h b/include/linux/reboot.h
index 7c288013a3ca..b7fa25726323 100644
--- a/include/linux/reboot.h
+++ b/include/linux/reboot.h
@@ -40,36 +40,36 @@ extern int reboot_cpu;
 extern int reboot_force;
 
 
-extern int register_reboot_notifier(struct notifier_block *);
-extern int unregister_reboot_notifier(struct notifier_block *);
+int register_reboot_notifier(struct notifier_block *);
+int unregister_reboot_notifier(struct notifier_block *);
 
-extern int devm_register_reboot_notifier(struct device *, struct 
notifier_block *);
+int devm_register_reboot_notifier(struct device *, struct notifier_block *);
 
-extern int register_restart_handler(struct notifier_block *);
-extern int unregister_restart_handler(struct notifier_block *);
-extern void do_kernel_restart(char *cmd);
+int register_restart_handler(struct notifier_block *);
+int unregister_restart_handler(struct notifier_block *);
+void do_kernel_restart(char *cmd);
 
 /*
  * Architecture-specific implementations of sys_reboot commands.
  */
 
-extern void migrate_to_reboot_cpu(void);
-extern void machine_restart(char *cmd);
-extern void machine_halt(void);
-extern void machine_power_off(void);
+void migrate_to_reboot_cpu(void);
+void machine_restart(char *cmd);
+void machine_halt(void);
+void machine_power_off(void);
 
-extern void machine_shutdown(void);
+void machine_shutdown(void);
 struct pt_regs;
-extern void machine_crash_shutdown(struct pt_regs *);
+void machine_crash_shutdown(struct pt_regs *);
 
 /*
  * Architecture independent implementations of sys_reboot commands.
  */
 
-extern void kernel_restart_prepare(char *cmd);
-extern void kernel_restart(char *cmd);
-extern void kernel_halt(void);
-extern void kernel_power_off(void);
+void kernel_restart_prepare(char *cmd);
+void kernel_restart(char *cmd);
+void kernel_halt(void);
+void kernel_power_off(void);
 
 extern int C_A_D; /* for sysctl */
 void ctrl_alt_del(void);
@@ -77,15 +77,15 @@ void ctrl_alt_del(void);
 #define POWEROFF_CMD_PATH_LEN  256
 extern char poweroff_cmd[POWEROFF_CMD_PATH_LEN];
 
-extern void orderly_poweroff(bool force);
-extern void orderly_reboot(void);
+void orderly_poweroff(bool force);
+void orderly_reboot(void);
 void hw_protection_shutdown(const char *reason, int ms_until_forced);
 
 /*
  * Emergency restart, callable from an interrupt handler.
  */
 
-extern void emergency_restart(void);
+void emergency_restart(void);
 #include 
 
 #endif /* _LINUX_REBOOT_H */
-- 
2.33.1




[PATCH v4 06/25] reboot: Warn if unregister_restart_handler() fails

2021-11-26 Thread Dmitry Osipenko
Emit a warning if unregister_restart_handler() fails, since it should never
fail. This will ease further API development by catching mistakes early.

Signed-off-by: Dmitry Osipenko 
---
 kernel/reboot.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/reboot.c b/kernel/reboot.c
index e6659ae329f1..f0e7b9c13f6b 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -210,7 +210,7 @@ EXPORT_SYMBOL(register_restart_handler);
  */
 int unregister_restart_handler(struct notifier_block *nb)
 {
-   return atomic_notifier_chain_unregister(&restart_handler_list, nb);
+   return WARN_ON(atomic_notifier_chain_unregister(&restart_handler_list, 
nb));
 }
 EXPORT_SYMBOL(unregister_restart_handler);
 
-- 
2.33.1




[PATCH v4 05/25] reboot: Warn if restart handler has duplicated priority

2021-11-26 Thread Dmitry Osipenko
Add sanity check which ensures that there are no two restart handlers
registered with the same priority. Normally it's a direct sign of a
problem if two handlers use the same priority.

Signed-off-by: Dmitry Osipenko 
---
 kernel/reboot.c | 15 ++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/kernel/reboot.c b/kernel/reboot.c
index 6bcc5d6a6572..e6659ae329f1 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -182,7 +182,20 @@ static ATOMIC_NOTIFIER_HEAD(restart_handler_list);
  */
 int register_restart_handler(struct notifier_block *nb)
 {
-   return atomic_notifier_chain_register(&restart_handler_list, nb);
+   int ret;
+
+   ret = atomic_notifier_chain_register(&restart_handler_list, nb);
+   if (ret)
+   return ret;
+
+   /*
+* Handler must have unique priority. Otherwise call order is
+* determined by registration order, which is unreliable.
+*/
+   WARN(!atomic_notifier_has_unique_priority(&restart_handler_list, nb),
+"restart handler must have unique priority\n");
+
+   return 0;
 }
 EXPORT_SYMBOL(register_restart_handler);
 
-- 
2.33.1




[PATCH v4 04/25] reboot: Correct typo in a comment

2021-11-26 Thread Dmitry Osipenko
Correct s/implemenations/implementations/ in .

Signed-off-by: Dmitry Osipenko 
---
 include/linux/reboot.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/reboot.h b/include/linux/reboot.h
index af907a3d68d1..7c288013a3ca 100644
--- a/include/linux/reboot.h
+++ b/include/linux/reboot.h
@@ -63,7 +63,7 @@ struct pt_regs;
 extern void machine_crash_shutdown(struct pt_regs *);
 
 /*
- * Architecture independent implemenations of sys_reboot commands.
+ * Architecture independent implementations of sys_reboot commands.
  */
 
 extern void kernel_restart_prepare(char *cmd);
-- 
2.33.1




[PATCH v4 03/25] notifier: Add atomic/blocking_notifier_has_unique_priority()

2021-11-26 Thread Dmitry Osipenko
Add atomic/blocking_notifier_has_unique_priority() helpers which return
true if given handler has unique priority.

Signed-off-by: Dmitry Osipenko 
---
 include/linux/notifier.h |  5 +++
 kernel/notifier.c| 69 
 2 files changed, 74 insertions(+)

diff --git a/include/linux/notifier.h b/include/linux/notifier.h
index 924c9d7c8e73..2c4036f225e1 100644
--- a/include/linux/notifier.h
+++ b/include/linux/notifier.h
@@ -175,6 +175,11 @@ int raw_notifier_call_chain_robust(struct 
raw_notifier_head *nh,
 
 bool blocking_notifier_call_chain_is_empty(struct blocking_notifier_head *nh);
 
+bool atomic_notifier_has_unique_priority(struct atomic_notifier_head *nh,
+   struct notifier_block *nb);
+bool blocking_notifier_has_unique_priority(struct blocking_notifier_head *nh,
+   struct notifier_block *nb);
+
 #define NOTIFY_DONE0x  /* Don't care */
 #define NOTIFY_OK  0x0001  /* Suits me */
 #define NOTIFY_STOP_MASK   0x8000  /* Don't call further */
diff --git a/kernel/notifier.c b/kernel/notifier.c
index b20cb7b9b1f0..7a325b742104 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -122,6 +122,19 @@ static int notifier_call_chain_robust(struct 
notifier_block **nl,
return ret;
 }
 
+static int notifier_has_unique_priority(struct notifier_block **nl,
+   struct notifier_block *n)
+{
+   while (*nl && (*nl)->priority >= n->priority) {
+   if ((*nl)->priority == n->priority && *nl != n)
+   return false;
+
+   nl = &((*nl)->next);
+   }
+
+   return true;
+}
+
 /*
  * Atomic notifier chain routines.  Registration and unregistration
  * use a spinlock, and call_chain is synchronized by RCU (no locks).
@@ -203,6 +216,30 @@ int atomic_notifier_call_chain(struct atomic_notifier_head 
*nh,
 EXPORT_SYMBOL_GPL(atomic_notifier_call_chain);
 NOKPROBE_SYMBOL(atomic_notifier_call_chain);
 
+/**
+ * atomic_notifier_has_unique_priority - Checks whether notifier's 
priority is unique
+ * @nh: Pointer to head of the atomic notifier chain
+ * @n: Entry in notifier chain to check
+ *
+ * Checks whether there is another notifier in the chain with the same 
priority.
+ * Must be called in process context.
+ *
+ * Returns true if priority is unique, false otherwise.
+ */
+bool atomic_notifier_has_unique_priority(struct atomic_notifier_head *nh,
+   struct notifier_block *n)
+{
+   unsigned long flags;
+   bool ret;
+
+   spin_lock_irqsave(&nh->lock, flags);
+   ret = notifier_has_unique_priority(&nh->head, n);
+   spin_unlock_irqrestore(&nh->lock, flags);
+
+   return ret;
+}
+EXPORT_SYMBOL_GPL(atomic_notifier_has_unique_priority);
+
 /*
  * Blocking notifier chain routines.  All access to the chain is
  * synchronized by an rwsem.
@@ -336,6 +373,38 @@ bool blocking_notifier_call_chain_is_empty(struct 
blocking_notifier_head *nh)
 }
 EXPORT_SYMBOL_GPL(blocking_notifier_call_chain_is_empty);
 
+/**
+ * blocking_notifier_has_unique_priority - Checks whether notifier's 
priority is unique
+ * @nh: Pointer to head of the blocking notifier chain
+ * @n: Entry in notifier chain to check
+ *
+ * Checks whether there is another notifier in the chain with the same 
priority.
+ * Must be called in process context.
+ *
+ * Returns true if priority is unique, false otherwise.
+ */
+bool blocking_notifier_has_unique_priority(struct blocking_notifier_head *nh,
+   struct notifier_block *n)
+{
+   bool ret;
+
+   /*
+* This code gets used during boot-up, when task switching is
+* not yet working and interrupts must remain disabled. At such
+* times we must not call down_read().
+*/
+   if (system_state != SYSTEM_BOOTING)
+   down_read(&nh->rwsem);
+
+   ret = notifier_has_unique_priority(&nh->head, n);
+
+   if (system_state != SYSTEM_BOOTING)
+   up_read(&nh->rwsem);
+
+   return ret;
+}
+EXPORT_SYMBOL_GPL(blocking_notifier_has_unique_priority);
+
 /*
  * Raw notifier chain routines.  There is no protection;
  * the caller must provide it.  Use at your own risk!
-- 
2.33.1




[PATCH v4 02/25] notifier: Add blocking_notifier_call_chain_is_empty()

2021-11-26 Thread Dmitry Osipenko
Add blocking_notifier_call_chain_is_empty() that returns true if call
chain is empty.

Signed-off-by: Dmitry Osipenko 
---
 include/linux/notifier.h |  2 ++
 kernel/notifier.c| 14 ++
 2 files changed, 16 insertions(+)

diff --git a/include/linux/notifier.h b/include/linux/notifier.h
index 4b80a815b666..924c9d7c8e73 100644
--- a/include/linux/notifier.h
+++ b/include/linux/notifier.h
@@ -173,6 +173,8 @@ int blocking_notifier_call_chain_robust(struct 
blocking_notifier_head *nh,
 int raw_notifier_call_chain_robust(struct raw_notifier_head *nh,
unsigned long val_up, unsigned long val_down, void *v);
 
+bool blocking_notifier_call_chain_is_empty(struct blocking_notifier_head *nh);
+
 #define NOTIFY_DONE0x  /* Don't care */
 #define NOTIFY_OK  0x0001  /* Suits me */
 #define NOTIFY_STOP_MASK   0x8000  /* Don't call further */
diff --git a/kernel/notifier.c b/kernel/notifier.c
index b8251dc0bc0f..b20cb7b9b1f0 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -322,6 +322,20 @@ int blocking_notifier_call_chain(struct 
blocking_notifier_head *nh,
 }
 EXPORT_SYMBOL_GPL(blocking_notifier_call_chain);
 
+/**
+ * blocking_notifier_call_chain_is_empty - Check whether notifier chain is 
empty
+ * @nh: Pointer to head of the blocking notifier chain
+ *
+ * Checks whether notifier chain is empty.
+ *
+ * Returns true if the notifier chain is empty, false otherwise.
+ */
+bool blocking_notifier_call_chain_is_empty(struct blocking_notifier_head *nh)
+{
+   return !rcu_access_pointer(nh->head);
+}
+EXPORT_SYMBOL_GPL(blocking_notifier_call_chain_is_empty);
+
 /*
  * Raw notifier chain routines.  There is no protection;
  * the caller must provide it.  Use at your own risk!
-- 
2.33.1




[PATCH v4 01/25] notifier: Remove extern annotation from function prototypes

2021-11-26 Thread Dmitry Osipenko
There is no need to annotate function prototypes with 'extern', it makes
code less readable. Remove unnecessary annotations from <linux/notifier.h>.

Signed-off-by: Dmitry Osipenko 
---
 include/linux/notifier.h | 30 +++---
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/include/linux/notifier.h b/include/linux/notifier.h
index 87069b8459af..4b80a815b666 100644
--- a/include/linux/notifier.h
+++ b/include/linux/notifier.h
@@ -90,7 +90,7 @@ struct srcu_notifier_head {
} while (0)
 
 /* srcu_notifier_heads must be cleaned up dynamically */
-extern void srcu_init_notifier_head(struct srcu_notifier_head *nh);
+void srcu_init_notifier_head(struct srcu_notifier_head *nh);
 #define srcu_cleanup_notifier_head(name)   \
cleanup_srcu_struct(&(name)->srcu);
 
@@ -141,36 +141,36 @@ extern void srcu_init_notifier_head(struct 
srcu_notifier_head *nh);
 
 #ifdef __KERNEL__
 
-extern int atomic_notifier_chain_register(struct atomic_notifier_head *nh,
+int atomic_notifier_chain_register(struct atomic_notifier_head *nh,
struct notifier_block *nb);
-extern int blocking_notifier_chain_register(struct blocking_notifier_head *nh,
+int blocking_notifier_chain_register(struct blocking_notifier_head *nh,
struct notifier_block *nb);
-extern int raw_notifier_chain_register(struct raw_notifier_head *nh,
+int raw_notifier_chain_register(struct raw_notifier_head *nh,
struct notifier_block *nb);
-extern int srcu_notifier_chain_register(struct srcu_notifier_head *nh,
+int srcu_notifier_chain_register(struct srcu_notifier_head *nh,
struct notifier_block *nb);
 
-extern int atomic_notifier_chain_unregister(struct atomic_notifier_head *nh,
+int atomic_notifier_chain_unregister(struct atomic_notifier_head *nh,
struct notifier_block *nb);
-extern int blocking_notifier_chain_unregister(struct blocking_notifier_head 
*nh,
+int blocking_notifier_chain_unregister(struct blocking_notifier_head *nh,
struct notifier_block *nb);
-extern int raw_notifier_chain_unregister(struct raw_notifier_head *nh,
+int raw_notifier_chain_unregister(struct raw_notifier_head *nh,
struct notifier_block *nb);
-extern int srcu_notifier_chain_unregister(struct srcu_notifier_head *nh,
+int srcu_notifier_chain_unregister(struct srcu_notifier_head *nh,
struct notifier_block *nb);
 
-extern int atomic_notifier_call_chain(struct atomic_notifier_head *nh,
+int atomic_notifier_call_chain(struct atomic_notifier_head *nh,
unsigned long val, void *v);
-extern int blocking_notifier_call_chain(struct blocking_notifier_head *nh,
+int blocking_notifier_call_chain(struct blocking_notifier_head *nh,
unsigned long val, void *v);
-extern int raw_notifier_call_chain(struct raw_notifier_head *nh,
+int raw_notifier_call_chain(struct raw_notifier_head *nh,
unsigned long val, void *v);
-extern int srcu_notifier_call_chain(struct srcu_notifier_head *nh,
+int srcu_notifier_call_chain(struct srcu_notifier_head *nh,
unsigned long val, void *v);
 
-extern int blocking_notifier_call_chain_robust(struct blocking_notifier_head 
*nh,
+int blocking_notifier_call_chain_robust(struct blocking_notifier_head *nh,
unsigned long val_up, unsigned long val_down, void *v);
-extern int raw_notifier_call_chain_robust(struct raw_notifier_head *nh,
+int raw_notifier_call_chain_robust(struct raw_notifier_head *nh,
unsigned long val_up, unsigned long val_down, void *v);
 
 #define NOTIFY_DONE0x  /* Don't care */
-- 
2.33.1




[PATCH v4 00/25] Introduce power-off+restart call chain API

2021-11-26 Thread Dmitry Osipenko
Problem
---

SoC devices require power-off call chaining functionality from kernel.
We have a widely used restart chaining provided by restart notifier API,
but nothing for power-off.

Solution


Introduce new API that provides both restart and power-off call chains.

Why combine restart with power-off? Because drivers often do both.
More practical to have API that provides both under the same roof.

The new API is designed with simplicity and extensibility in mind.
It's built upon the existing restart and reboot APIs. The simplicity
is in new helper functions that are convenient for drivers. The
extensibility is in the design that doesn't hardcode callback
arguments, making it easy to add new parameters and remove old ones.

This is a third attempt to introduce the new API. First was made by
Guenter Roeck back in 2014, second was made by Thierry Reding in 2017.
In fact the work didn't stop and recently arm_pm_restart() was removed
from v5.14 kernel, which was a part of preparatory work started by
Guenter Roeck. I took into account experience and ideas from the
previous attempts, extended and polished them.

Adoption plan
-

This patchset introduces the new API. It also converts multiple drivers
and arch code to the new API to demonstrate how it all looks in practice.

The plan is:

1. Merge new API (patches 1-8). This API will co-exist with the old APIs.

2. Convert arch code to do_kernel_power_off() (patches 9-21).

3. Convert drivers and platform code to the new API.

4. Remove obsolete pm_power_off and pm_power_off_prepare variables.

5. Make restart-notifier API private to kernel/reboot.c once no users left.

It's fully implemented here:

[1] https://github.com/grate-driver/linux/commits/sys-off-handler

For now I'm sending only the first 25 base patches out of ~180. It's
preferable to squash 1-2, partially 3 and 4 points of the plan into a
single patchset to ease and speed up applying of the rest of the patches.
Majority of drivers and platform patches depend on the base, hence they
will come later (and per subsystem), once base will land.

All [1] patches are compile-tested. Tegra and x86 ACPI patches are tested
on hardware. The remaining should be covered by unit tests (unpublished).

Results
---

1. Devices can be powered off properly.

2. Global variables are removed from drivers.

3. Global pm_power_off and pm_power_off_prepare callback variables are
removed once all users are converted to the new API. The latter callback
is removed by patch #25 of this series.

4. Ambiguous call chain ordering is prohibited. See patch #5 which adds
verification of restart handlers priorities, ensuring that they are unique.

Changelog:

v4: - Made a very minor improvement to doc comments, clarifying couple
  default values.

- Corrected list of emails recipient by adding Linus, Sebastian,
  Philipp and more NDS people. Removed bouncing emails.

- Added acks that were given to v3.

v3: - Renamed power_handler to sys_off_handler as was suggested by
  Rafael Wysocki.

- Improved doc-comments as was suggested by Rafael Wysocki. Added more
  doc-comments.

- Implemented full set of 180 patches which convert whole kernel in
  accordance to the plan, see link [1] above. Slightly adjusted API to
  better suit for the remaining converted drivers.

  * Added unregister_sys_off_handler() that is handy for a couple old
platform drivers.

  * Dropped devm_register_trivial_restart_handler(), 'simple' variant
is enough to have.

- Improved "Add atomic/blocking_notifier_has_unique_priority()" patch,
  as was suggested by Andy Shevchenko. Also replaced down_write() with
  down_read() and factored out common notifier_has_unique_priority().

- Added stop_chain field to struct restart_data and reboot_prep_data
  after discovering couple drivers wanting that feature.

- Added acks that were given to v2.

v2: - Replaced standalone power-off call chain demo-API with the combined
  power-off+restart API because this is what drivers want. It's a more
  comprehensive solution.

- Converted multiple drivers and arch code to the new API. Suggested by
  Andy Shevchenko. I skimmed through the rest of drivers, verifying that
  new API suits them. The rest of the drivers will be converted once we
  will settle on the new API, otherwise will be too many patches here.

- v2 API doesn't expose notifier to users and require handlers to
  have unique priority. Suggested by Guenter Roeck.

- v2 API has power-off chaining disabled by default and require
  drivers to explicitly opt-in to the chaining. This preserves old
  behaviour for existing drivers once they are converted to the new
  API.

Dmitry Osipenko (25):
  notifier: Remove extern annotation from function prototypes
  notifier: Add blocking_notifier_call_chain_is_empty()
  notifier: Add atomic/blocking_notifier_has_unique_priority()
  re

Re: [RFC PATCH] Added the logic to decode 32 bit ldr/str post-indexing instructions

2021-11-26 Thread Andre Przywara
On Fri, 26 Nov 2021 15:28:06 +
Ayan Kumar Halder  wrote:

Hi Ayan,

> Many thanks for your inputs.
> Apologies if I sound dumb, but I need a few clarifications.
> 
> On 26/11/2021 13:14, Andre Przywara wrote:
> > On Fri, 19 Nov 2021 16:52:02 +
> > Ayan Kumar Halder  wrote:
> > 
> > Hi,
> >   
> >> At present, post indexing instructions are not emulated by Xen.
> >> When Xen gets the exception, EL2_ESR.ISV bit not set. Thus as a
> >> result, data abort is triggered.
> >>
> >> Added the logic to decode ldr/str post indexing instructions.
> >> With this, Xen can decode instructions like these:-
> >> ldr w2, [x1], #4
> >> Thus, domU can read ioreg with post indexing instructions.  
> > 
> > Where do those instructions come from? A (C) compiler? (Some mail in
> > another thread from Stefano suggests so)
> > If yes, I would argue that is broken:
> > IIUC C compilers assume normal memory attributes for every pointer they
> > handle, so they are free to use unaligned accesses, load/store exclusives,
> > split accesses (two halfword reads) and what not when generating code.
> > The GIC needs to be mapped as device memory, which explicitly forbids
> > unaligned accesses and exclusives (as in: always traps), so you cannot let
> > compiler-generated code access the GIC (or most other MMIO devices, for
> > that matter).
> > I know, this somewhat works(TM) in practise, because a uint32_t assignment
> > is very likely to end up in an ldr/str, but please let me know which car
> > this code ends up in, so that can I avoid this brand ;-)
> > 
> > You can tell the compiler to avoid unaligned accesses with -mstrict-align
> > (and should definitely do so when you are running C code with the MMU
> > off), but that still leaves exclusives and split accesses at the
> > compiler's discretion. A variation on the topic of split access is merged
> > writes, where the compiler uses NEON or SVE instructions, for instance, to
> > cover multiple words at once, possibly via some memset()/memcpy() routine.  
> 
> I understand that we should be using inline assembly instructions to 
> access any MMIO region. This is to prevent the compiler doing any tricks.
> 
> But is there a restriction that post indexing instructions can never be 
> used to access MMIO region ?

No, this is a pure virtualisation restriction, see below. On real
hardware/bare-metal, ldr/str with post or pre-indexing works and is fine
to use for MMIO.
But we need to have the right access width, matching the MMIO device's
expectation. So ldp/stp would probably be problematic, for instance.

> > On top there is this architectural restriction of the ARMv7/v8
> > virtualisation extension to not decode many "advanced" load/store
> > instructions in ESR_EL2.  
> Where do I find this restriction ?

That's described in the ESR_ELx syndrome description in the ARMv8 ARM (DDI
0487G.b), section "ISS encoding for an exception from a Data Abort" (page
D13-3219 in my Issue G.b copy):
"For other faults reported in ESR_EL2, ISV is 0 except for the following stage 
2 aborts: "

> Are you telling me that load/store with post indexing is an "advanced" 
> instruction and ArmV8 does not allow decoding of these instructions in 
> ESR_EL2 ?

Yes, it is in the group of instructions for which the hardware does not
provide syndrome information in ESR_EL2: "  but excluding Load
Exclusive or Store Exclusive and excluding those with writeback)."

> Isn't that a very strong limitation ?

I don't know about that, it's what it is and what it was for years. Linux
deliberately chose ldr/str only for readl/writel to be able to trap and
handle MMIO aborts in hypervisors.

If you do MMIO accesses the right way, using (inline) assembly only, then
you don't have the problem, and also avoid many others, see my previous
mail.

If you think of it from an architectural and implementation point of view
(and keep the RISC idea in mind): it should happen rarely, but would
require many gates for something that you can do in software as well.

> Also what is your opinion on Xen decoding these instructions ?

I would be careful, we deliberately avoid this in KVM. This bubbles up
from time to time, though, so we now allow delegating this case to
userland, so the VMM can do the decoding there.
In Xen you have less issues with walking the guest's page tables,
though (a major problem in KVM), but it still adds complexity to a
hypervisor which aims to be lean by design.
Another argument would be that just post/pre does not cover everything, and
the cases start to pile up quickly: what about the immediate versions,
ldxr, stp, NEON/SVE load/stores, etc. Since many of those are not safe for
MMIO anyway, you add a lot of code for little use (and which gets little
testing!).

Cheers,
Andre

> > Linux deliberately coded readl/writel using inline assembly, to only
> > use instructions that provide syndrome information, plus guarantee
> > device-memory compatible semantics.
> > Check out https://lwn.net/Articles/698014/

Re: [PATCH] public: add RING_NR_UNCONSUMED_*() macros to ring.h

2021-11-26 Thread Manuel Bouyer
On Fri, Nov 26, 2021 at 07:55:47AM +0100, Juergen Gross wrote:
> Today RING_HAS_UNCONSUMED_*() macros are returning the number of
> unconsumed requests or responses instead of a boolean as the name of
> the macros would imply.
> 
> As this "feature" is already being used, rename the macros to
> RING_NR_UNCONSUMED_*() and define the RING_HAS_UNCONSUMED_*() macros
> by using the new RING_NR_UNCONSUMED_*() macros. In order to avoid
> future misuse let RING_HAS_UNCONSUMED_*() really return a boolean.
> 
> Note that the known misuses need to be switched to the new
> RING_NR_UNCONSUMED_*() macros when using this version of ring.h.

AFAIK NetBSD is using RING_HAS_UNCONSUMED as a boolean, so it should
be fine with this change.

-- 
Manuel Bouyer 
 NetBSD: 26 ans d'experience feront toujours la difference
--



[xen-unstable test] 166378: regressions - FAIL

2021-11-26 Thread osstest service owner
flight 166378 xen-unstable real [real]
http://logs.test-lab.xenproject.org/osstest/logs/166378/

Regressions :-(

Tests which did not succeed and are blocking,
including tests which could not be run:
 test-amd64-amd64-xl-pvshim   20 guest-localmigrate/x10   fail REGR. vs. 166304
 build-amd64-prev  6 xen-buildfail REGR. vs. 166304

Regressions which are regarded as allowable (not blocking):
 test-amd64-amd64-xl-rtds 20 guest-localmigrate/x10   fail REGR. vs. 166304

Tests which did not succeed, but are not blocking:
 test-amd64-amd64-migrupgrade  1 build-check(1)   blocked  n/a
 test-amd64-i386-migrupgrade   1 build-check(1)   blocked  n/a
 test-amd64-amd64-xl-qemut-debianhvm-i386-xsm 12 debian-hvm-install fail like 
166214
 test-amd64-amd64-xl-qemut-win7-amd64 19 guest-stopfail like 166304
 test-armhf-armhf-libvirt 16 saverestore-support-checkfail  like 166304
 test-amd64-amd64-qemuu-nested-amd 20 debian-hvm-install/l1/l2 fail like 166304
 test-amd64-amd64-xl-qemuu-ws16-amd64 19 guest-stopfail like 166304
 test-amd64-i386-xl-qemut-ws16-amd64 19 guest-stop fail like 166304
 test-amd64-i386-xl-qemut-win7-amd64 19 guest-stop fail like 166304
 test-armhf-armhf-libvirt-raw 15 saverestore-support-checkfail  like 166304
 test-armhf-armhf-libvirt-qcow2 15 saverestore-support-check   fail like 166304
 test-amd64-i386-xl-qemuu-win7-amd64 19 guest-stop fail like 166304
 test-amd64-amd64-xl-qemut-ws16-amd64 19 guest-stopfail like 166304
 test-amd64-amd64-xl-qemuu-win7-amd64 19 guest-stopfail like 166304
 test-amd64-i386-xl-qemuu-ws16-amd64 19 guest-stop fail like 166304
 test-arm64-arm64-xl  15 migrate-support-checkfail   never pass
 test-arm64-arm64-xl  16 saverestore-support-checkfail   never pass
 test-arm64-arm64-xl-seattle  15 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-seattle  16 saverestore-support-checkfail   never pass
 test-amd64-amd64-libvirt 15 migrate-support-checkfail   never pass
 test-amd64-i386-libvirt-xsm  15 migrate-support-checkfail   never pass
 test-amd64-amd64-libvirt-xsm 15 migrate-support-checkfail   never pass
 test-amd64-i386-libvirt  15 migrate-support-checkfail   never pass
 test-amd64-i386-xl-pvshim14 guest-start  fail   never pass
 test-arm64-arm64-xl-thunderx 15 migrate-support-checkfail   never pass
 test-arm64-arm64-libvirt-xsm 15 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-thunderx 16 saverestore-support-checkfail   never pass
 test-arm64-arm64-xl-credit1  15 migrate-support-checkfail   never pass
 test-arm64-arm64-libvirt-xsm 16 saverestore-support-checkfail   never pass
 test-arm64-arm64-xl-credit1  16 saverestore-support-checkfail   never pass
 test-arm64-arm64-xl-xsm  15 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-xsm  16 saverestore-support-checkfail   never pass
 test-amd64-amd64-libvirt-qemuu-debianhvm-amd64-xsm 13 migrate-support-check 
fail never pass
 test-armhf-armhf-xl-arndale  15 migrate-support-checkfail   never pass
 test-armhf-armhf-xl-arndale  16 saverestore-support-checkfail   never pass
 test-amd64-i386-libvirt-qemuu-debianhvm-amd64-xsm 13 migrate-support-check 
fail never pass
 test-amd64-i386-libvirt-raw  14 migrate-support-checkfail   never pass
 test-amd64-amd64-libvirt-vhd 14 migrate-support-checkfail   never pass
 test-arm64-arm64-libvirt-raw 14 migrate-support-checkfail   never pass
 test-arm64-arm64-libvirt-raw 15 saverestore-support-checkfail   never pass
 test-arm64-arm64-xl-vhd  14 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-vhd  15 saverestore-support-checkfail   never pass
 test-armhf-armhf-xl-credit1  15 migrate-support-checkfail   never pass
 test-armhf-armhf-xl-credit1  16 saverestore-support-checkfail   never pass
 test-armhf-armhf-xl  15 migrate-support-checkfail   never pass
 test-armhf-armhf-xl  16 saverestore-support-checkfail   never pass
 test-armhf-armhf-xl-multivcpu 15 migrate-support-checkfail  never pass
 test-armhf-armhf-xl-multivcpu 16 saverestore-support-checkfail  never pass
 test-armhf-armhf-libvirt 15 migrate-support-checkfail   never pass
 test-armhf-armhf-xl-rtds 15 migrate-support-checkfail   never pass
 test-armhf-armhf-xl-rtds 16 saverestore-support-checkfail   never pass
 test-armhf-armhf-libvirt-raw 14 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-credit2  15 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-credit2  16 saverestore-support-checkfail   never pass
 test-armhf-armhf-libvirt-qcow2 14 migrate-support-checkfail never pass
 test-armhf-armhf-xl-vh

[PATCH v1.1 64/65] x86/efi: Disable CET-IBT around Runtime Services calls

2021-11-26 Thread Andrew Cooper
At least one TigerLake NUC has UEFI firmware which isn't CET-IBT compatible.
Read under a function pointer to see whether an endbr64 instruction is
present, and use this as a heuristic.

Signed-off-by: Andrew Cooper 
---
CC: Jan Beulich 
CC: Roger Pau Monné 
CC: Wei Liu 

This was disappointing to discover.  I've pestered some folk and maybe
something will improve in due course, but it remains an open question how best
to discover that Runtime Services are CET-IBT compatible.

v2:
 * Switch to endbr helpers.
---
 xen/arch/x86/efi/stub.c  |  2 ++
 xen/common/efi/boot.c|  9 +
 xen/common/efi/runtime.c | 17 +
 xen/include/xen/efi.h|  1 +
 4 files changed, 29 insertions(+)

diff --git a/xen/arch/x86/efi/stub.c b/xen/arch/x86/efi/stub.c
index 998493262641..5e44913e52db 100644
--- a/xen/arch/x86/efi/stub.c
+++ b/xen/arch/x86/efi/stub.c
@@ -11,6 +11,8 @@
 #include 
 #include 
 
+bool __initdata efi_no_cet_ibt;
+
 /*
  * Here we are in EFI stub. EFI calls are not supported due to lack
  * of relevant functionality in compiler and/or linker.
diff --git a/xen/common/efi/boot.c b/xen/common/efi/boot.c
index f5af71837d5a..c19f993af922 100644
--- a/xen/common/efi/boot.c
+++ b/xen/common/efi/boot.c
@@ -21,6 +21,7 @@
 #include 
 #include 
 #ifdef CONFIG_X86
+#include 
 /*
  * Keep this arch-specific modified include in the common file, as moving
  * it to the arch specific include file would obscure that special care is
@@ -735,6 +736,14 @@ static void __init efi_init(EFI_HANDLE ImageHandle, 
EFI_SYSTEM_TABLE *SystemTabl
 
 StdOut = SystemTable->ConOut;
 StdErr = SystemTable->StdErr ?: StdOut;
+
+#ifdef CONFIG_X86
+/*
+ * Heuristic.  Look under an arbitrary function pointer to see if UEFI was
+ * compiled with CET-IBT support.  Experimentally some are not.
+ */
+efi_no_cet_ibt = !is_endbr64(efi_rs->GetTime);
+#endif
 }
 
 static void __init efi_console_set_mode(void)
diff --git a/xen/common/efi/runtime.c b/xen/common/efi/runtime.c
index d2fdc28df3e0..ef54863542db 100644
--- a/xen/common/efi/runtime.c
+++ b/xen/common/efi/runtime.c
@@ -21,6 +21,7 @@ struct efi_rs_state {
   * don't strictly need that.
   */
  unsigned long __aligned(32) cr3;
+unsigned long msr_s_cet;
 #endif
 };
 
@@ -61,6 +62,7 @@ UINTN __read_mostly efi_apple_properties_len;
 
 /* Bit field representing available EFI features/properties. */
 unsigned int efi_flags;
+bool __read_mostly efi_no_cet_ibt;
 
 struct efi __read_mostly efi = {
.acpi   = EFI_INVALID_TABLE_ADDR,
@@ -113,6 +115,17 @@ struct efi_rs_state efi_rs_enter(void)
 
 switch_cr3_cr4(mfn_to_maddr(efi_l4_mfn), read_cr4());
 
+/*
+ * If UEFI doesn't appear to be CET-IBT compatible, stash and clobber
+ * ENDBR_EN.  Always read the current CET setting, because CET-SS isn't
+ * configured until very late on the BSP.
+ */
+if ( cpu_has_xen_ibt && efi_no_cet_ibt )
+{
+rdmsrl(MSR_S_CET, state.msr_s_cet);
+wrmsrl(MSR_S_CET, state.msr_s_cet & ~CET_ENDBR_EN);
+}
+
 return state;
 }
 
@@ -122,6 +135,10 @@ void efi_rs_leave(struct efi_rs_state *state)
 
 if ( !state->cr3 )
 return;
+
+if ( state->msr_s_cet )
+wrmsrl(MSR_S_CET, state->msr_s_cet);
+
 switch_cr3_cr4(state->cr3, read_cr4());
 if ( is_pv_vcpu(curr) && !is_idle_vcpu(curr) )
 {
diff --git a/xen/include/xen/efi.h b/xen/include/xen/efi.h
index 94a7e547f97b..8c14f7f18718 100644
--- a/xen/include/xen/efi.h
+++ b/xen/include/xen/efi.h
@@ -30,6 +30,7 @@ union compat_pf_efi_info;
 
 struct xenpf_efi_runtime_call;
 struct compat_pf_efi_runtime_call;
+extern bool efi_no_cet_ibt;
 
 bool efi_enabled(unsigned int feature);
 void efi_init_memory(void);
-- 
2.11.0




[PATCH v1.1 61/65] x86/entry: Make syscall/sysenter entrypoints CET-IBT compatible

2021-11-26 Thread Andrew Cooper
Each of MSR_{L,C}STAR and MSR_SYSENTER_EIP need to land on an endbr64
instruction.  For sysenter, this is easy.

Unfortunately for syscall, the stubs are already 29 byte long with a limit of
32.  endbr64 is 4 bytes.  Luckily, there is a 1 byte instruction which can
move from the stubs into the main handlers.

Move the push %rax out of the stub and into {l,c}star_entry(), allowing room
for the endbr64 instruction when appropriate.  Update the comment describing
the entry state.

Signed-off-by: Andrew Cooper 
---
CC: Jan Beulich 
CC: Roger Pau Monné 
CC: Wei Liu 

v2:
 * Update to use endbr helpers.
---
 xen/arch/x86/x86_64/entry.S | 18 +-
 xen/arch/x86/x86_64/traps.c | 11 +++
 2 files changed, 16 insertions(+), 13 deletions(-)

diff --git a/xen/arch/x86/x86_64/entry.S b/xen/arch/x86/x86_64/entry.S
index 8494b97a54a2..9abcf95bd010 100644
--- a/xen/arch/x86/x86_64/entry.S
+++ b/xen/arch/x86/x86_64/entry.S
@@ -241,18 +241,17 @@ iret_exit_to_guest:
  * When entering SYSCALL from user mode:
  *  Vector directly to the registered arch.syscall_addr.
  *
- * Initial work is done by per-CPU trampolines. At this point %rsp has been
- * initialised to point at the correct Xen stack, %rsp has been saved, and
- * %rax needs to be restored from the %ss save slot. All other registers are
- * still to be saved onto the stack, starting with RFLAGS, and an appropriate
- * %ss must be saved into the space left by the trampoline.
+ * Initial work is done by per-CPU trampolines.
+ *  - Guest %rax stored in the %ss slot
+ *  - Guest %rsp stored in %rax
+ *  - Xen stack loaded, pointing at the %ss slot
  */
 ENTRY(lstar_enter)
 #ifdef CONFIG_XEN_SHSTK
 ALTERNATIVE "", "setssbsy", X86_FEATURE_XEN_SHSTK
 #endif
-/* sti could live here when we don't switch page tables below. */
-movq  8(%rsp),%rax /* Restore %rax. */
+push  %rax  /* Guest %rsp */
+movq  8(%rsp), %rax /* Restore guest %rax */
 movq  $FLAT_KERNEL_SS,8(%rsp)
 pushq %r11
 pushq $FLAT_KERNEL_CS64
@@ -288,9 +287,9 @@ ENTRY(cstar_enter)
 #ifdef CONFIG_XEN_SHSTK
 ALTERNATIVE "", "setssbsy", X86_FEATURE_XEN_SHSTK
 #endif
-/* sti could live here when we don't switch page tables below. */
+push  %rax  /* Guest %rsp */
 CR4_PV32_RESTORE
-movq  8(%rsp), %rax /* Restore %rax. */
+movq  8(%rsp), %rax /* Restore guest %rax. */
 movq  $FLAT_USER_SS32, 8(%rsp) /* Assume a 64bit domain.  Compat 
handled lower. */
 pushq %r11
 pushq $FLAT_USER_CS32
@@ -323,6 +322,7 @@ ENTRY(cstar_enter)
 jmp   switch_to_kernel
 
 ENTRY(sysenter_entry)
+ENDBR64
 #ifdef CONFIG_XEN_SHSTK
 ALTERNATIVE "", "setssbsy", X86_FEATURE_XEN_SHSTK
 #endif
diff --git a/xen/arch/x86/x86_64/traps.c b/xen/arch/x86/x86_64/traps.c
index 6f3c65bedc7a..ab66515a3c75 100644
--- a/xen/arch/x86/x86_64/traps.c
+++ b/xen/arch/x86/x86_64/traps.c
@@ -16,6 +16,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -295,6 +296,12 @@ static unsigned int write_stub_trampoline(
 {
 unsigned char *p = stub;
 
+if ( cpu_has_xen_ibt )
+{
+place_endbr64(p);
+p += 4;
+}
+
 /* Store guest %rax into %ss slot */
 /* movabsq %rax, stack_bottom - 8 */
 *p++ = 0x48;
@@ -315,10 +322,6 @@ static unsigned int write_stub_trampoline(
 *(uint64_t *)p = stack_bottom - 8;
 p += 8;
 
-/* Store guest %rsp into %rsp slot */
-/* pushq %rax */
-*p++ = 0x50;
-
 /* jmp target_va */
 *p++ = 0xe9;
 *(int32_t *)p = target_va - (stub_va + (p - stub) + 4);
-- 
2.11.0




[PATCH v1.1 60/65] x86/emul: Update emulation stubs to be CET-IBT compatible

2021-11-26 Thread Andrew Cooper
All indirect branches need to land on an endbr64 instruction.

For stub_selftests(), use endbr64 unconditionally for simplicity.  For ioport
and instruction emulation, add endbr64 conditionally.

Signed-off-by: Andrew Cooper 
---
CC: Jan Beulich 
CC: Roger Pau Monné 
CC: Wei Liu 

v2:
 * Update to use endbr helpers
---
 xen/arch/x86/extable.c | 14 +-
 xen/arch/x86/pv/emul-priv-op.c |  7 +++
 xen/arch/x86/x86_emulate.c | 13 +++--
 3 files changed, 27 insertions(+), 7 deletions(-)

diff --git a/xen/arch/x86/extable.c b/xen/arch/x86/extable.c
index 4aa1ab4b2a45..25c6fda00d28 100644
--- a/xen/arch/x86/extable.c
+++ b/xen/arch/x86/extable.c
@@ -129,19 +129,23 @@ search_exception_table(const struct cpu_user_regs *regs)
 static int __init cf_check stub_selftest(void)
 {
 static const struct {
-uint8_t opc[4];
+uint8_t opc[8];
 uint64_t rax;
 union stub_exception_token res;
 } tests[] __initconst = {
-{ .opc = { 0x0f, 0xb9, 0xc3, 0xc3 }, /* ud1 */
+{ .opc = { 0xf3, 0x0f, 0x1e, 0xfa,   /* endbr64 */
+   0x0f, 0xb9, 0xc3, 0xc3 }, /* ud1 */
   .res.fields.trapnr = TRAP_invalid_op },
-{ .opc = { 0x90, 0x02, 0x00, 0xc3 }, /* nop; add (%rax),%al */
+{ .opc = { 0xf3, 0x0f, 0x1e, 0xfa,   /* endbr64 */
+   0x90, 0x02, 0x00, 0xc3 }, /* nop; add (%rax),%al */
   .rax = 0x0123456789abcdef,
   .res.fields.trapnr = TRAP_gp_fault },
-{ .opc = { 0x02, 0x04, 0x04, 0xc3 }, /* add (%rsp,%rax),%al */
+{ .opc = { 0xf3, 0x0f, 0x1e, 0xfa,   /* endbr64 */
+   0x02, 0x04, 0x04, 0xc3 }, /* add (%rsp,%rax),%al */
   .rax = 0xfedcba9876543210,
   .res.fields.trapnr = TRAP_stack_error },
-{ .opc = { 0xcc, 0xc3, 0xc3, 0xc3 }, /* int3 */
+{ .opc = { 0xf3, 0x0f, 0x1e, 0xfa,   /* endbr64 */
+   0xcc, 0xc3, 0xc3, 0xc3 }, /* int3 */
   .res.fields.trapnr = TRAP_int3 },
 };
 unsigned long addr = this_cpu(stubs.addr) + STUB_BUF_SIZE / 2;
diff --git a/xen/arch/x86/pv/emul-priv-op.c b/xen/arch/x86/pv/emul-priv-op.c
index 808ff1873352..e35cb4b8669d 100644
--- a/xen/arch/x86/pv/emul-priv-op.c
+++ b/xen/arch/x86/pv/emul-priv-op.c
@@ -26,6 +26,7 @@
 
 #include 
 #include 
+#include <asm/endbr.h>
 #include 
 #include 
 #include 
@@ -111,6 +112,12 @@ static io_emul_stub_t *io_emul_stub_setup(struct priv_op_ctxt *ctxt, u8 opcode,
 
 p = ctxt->io_emul_stub;
 
+if ( cpu_has_xen_ibt )
+{
+place_endbr64(p);
+p += 4;
+}
+
 APPEND_BUFF(prologue);
 APPEND_CALL(load_guest_gprs);
 
diff --git a/xen/arch/x86/x86_emulate.c b/xen/arch/x86/x86_emulate.c
index 60191a94dc18..720740f29b84 100644
--- a/xen/arch/x86/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate.c
@@ -17,6 +17,7 @@
 #include 
 #include  /* cpu_has_amd_erratum() */
 #include 
+#include <asm/endbr.h>
 
 /* Avoid namespace pollution. */
 #undef cmpxchg
@@ -29,11 +30,19 @@
    cpu_has_amd_erratum(&current_cpu_data, AMD_ERRATUM_##nr)
 
 #define get_stub(stb) ({\
+void *ptr;  \
 BUILD_BUG_ON(STUB_BUF_SIZE / 2 < MAX_INST_LEN + 1); \
 ASSERT(!(stb).ptr); \
 (stb).addr = this_cpu(stubs.addr) + STUB_BUF_SIZE / 2;  \
-memset(((stb).ptr = map_domain_page(_mfn(this_cpu(stubs.mfn +  \
-   ((stb).addr & ~PAGE_MASK), 0xcc, STUB_BUF_SIZE / 2);\
+(stb).ptr = map_domain_page(_mfn(this_cpu(stubs.mfn))) +\
+((stb).addr & ~PAGE_MASK);  \
+ptr = memset((stb).ptr, 0xcc, STUB_BUF_SIZE / 2);   \
+if ( cpu_has_xen_ibt )  \
+{   \
+place_endbr64(ptr); \
+ptr += 4;   \
+}   \
+ptr;\
 })
 #define put_stub(stb) ({   \
 if ( (stb).ptr )   \
-- 
2.11.0




[PATCH 59.5/65] x86: Introduce helpers/checks for endbr64 instructions

2021-11-26 Thread Andrew Cooper
... to prevent the optimiser creating unsafe code.  See the code comment for
full details.

Also add a build time check for endbr64 embedded in imm32 operands, which
catches the obvious cases where the optimiser has done an unsafe thing.

Signed-off-by: Andrew Cooper 
---
CC: Jan Beulich 
CC: Roger Pau Monné 
CC: Wei Liu 
---
 xen/arch/x86/Makefile   |  4 
 xen/include/asm-x86/endbr.h | 55 +
 2 files changed, 59 insertions(+)
 create mode 100644 xen/include/asm-x86/endbr.h

diff --git a/xen/arch/x86/Makefile b/xen/arch/x86/Makefile
index 69b6cfaded25..64a5c0d20018 100644
--- a/xen/arch/x86/Makefile
+++ b/xen/arch/x86/Makefile
@@ -190,6 +190,10 @@ $(TARGET)-syms: prelink.o xen.lds
$(MAKE) -f $(BASEDIR)/Rules.mk efi-y= $(@D)/.$(@F).1.o
$(LD) $(XEN_LDFLAGS) -T xen.lds -N prelink.o $(build_id_linker) \
$(@D)/.$(@F).1.o -o $@
+ifeq ($(CONFIG_XEN_IBT),y)
+   $(OBJDUMP) -d $@ | grep 0xfa1e0ff3 >/dev/null && \
+   { echo "Found embedded endbr64 instructions" >&2; false; } || :
+endif
$(NM) -pa --format=sysv $(@D)/$(@F) \
| $(BASEDIR)/tools/symbols --all-symbols --xensyms --sysv 
--sort \
>$(@D)/$(@F).map
diff --git a/xen/include/asm-x86/endbr.h b/xen/include/asm-x86/endbr.h
new file mode 100644
index ..47f766024c12
--- /dev/null
+++ b/xen/include/asm-x86/endbr.h
@@ -0,0 +1,55 @@
+/**
+ * include/asm-x86/endbr.h
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Copyright (c) 2021 Citrix Systems Ltd.
+ */
+#ifndef XEN_ASM_ENDBR_H
+#define XEN_ASM_ENDBR_H
+
+#include <xen/types.h>
+
+/*
+ * In some cases we need to inspect/insert endbr64 instructions.
+ *
+ * The naive way, mem{cmp,cpy}(ptr, "\xf3\x0f\x1e\xfa", 4), optimises unsafely
+ * by placing 0xfa1e0ff3 in an imm32 operand, which marks a legal indirect
+ * branch target as far as the CPU is concerned.
+ *
+ * gen_endbr64() is written deliberately to avoid the problematic operand, and
+ * marked __const__ as it is safe for the optimiser to hoist/merge/etc.
+ */
+static inline uint32_t __attribute_const__ gen_endbr64(void)
+{
+uint32_t res;
+
+asm ( "mov $~0xfa1e0ff3, %[res]\n\t"
+  "not %[res]\n\t"
+  : [res] "=r" (res) );
+
+return res;
+}
+
+static inline bool is_endbr64(const void *ptr)
+{
+return *(const uint32_t *)ptr == gen_endbr64();
+}
+
+static inline void place_endbr64(void *ptr)
+{
+*(uint32_t *)ptr = gen_endbr64();
+}
+
+#endif /* XEN_ASM_ENDBR_H */
-- 
2.11.0




[GIT PULL] xen: branch for v5.16-rc3

2021-11-26 Thread Juergen Gross
Linus,

Please git pull the following tag:

 git://git.kernel.org/pub/scm/linux/kernel/git/xen/tip.git 
for-linus-5.16c-rc3-tag

xen: branch for v5.16-rc3

It contains the following changes:
- a Xen related Kconfig fix for making it possible to control building
  of the privcmd driver
- 3 patches for fixing issues identified by the kernel test robot
- a 5 patch series for simplifying timeout handling for Xen PV driver
  initialization
- 2 patches for fixing error paths in xenstore/xenbus driver initialization

Thanks.

Juergen

 arch/x86/include/asm/xen/hypercall.h   |  4 ++--
 arch/x86/include/asm/xen/hypervisor.h  |  1 +
 drivers/gpu/drm/xen/xen_drm_front.c|  1 +
 drivers/input/misc/xen-kbdfront.c  |  1 +
 drivers/tty/hvc/hvc_xen.c  |  1 +
 drivers/video/fbdev/xen-fbfront.c  |  1 +
 drivers/xen/Kconfig|  8 +++-
 drivers/xen/pvcalls-front.c|  1 +
 drivers/xen/xenbus/xenbus_probe.c  | 27 ++-
 drivers/xen/xenbus/xenbus_probe_frontend.c | 14 +++---
 include/xen/xenbus.h   |  1 +
 sound/xen/xen_snd_front.c  |  1 +
 12 files changed, 46 insertions(+), 15 deletions(-)

Juergen Gross (9):
  xen/privcmd: make option visible in Kconfig
  xen/pvh: add missing prototype to header
  xen: add "not_essential" flag to struct xenbus_driver
  xen: flag xen_drm_front to be not essential for system boot
  xen: flag hvc_xen to be not essential for system boot
  xen: flag pvcalls-front to be not essential for system boot
  xen: flag xen_snd_front to be not essential for system boot
  xen: make HYPERVISOR_get_debugreg() always_inline
  xen: make HYPERVISOR_set_debugreg() always_inline

Stefano Stabellini (2):
  xen: don't continue xenstore initialization in case of errors
  xen: detect uninitialized xenbus in xenbus_init



Re: [RFC PATCH] Added the logic to decode 32 bit ldr/str post-indexing instructions

2021-11-26 Thread Ayan Kumar Halder

Hi Andre,

Many thanks for your inputs.
Apologies if I sound dumb, but I need a few clarifications.

On 26/11/2021 13:14, Andre Przywara wrote:

On Fri, 19 Nov 2021 16:52:02 +
Ayan Kumar Halder  wrote:

Hi,


At present, post indexing instructions are not emulated by Xen.
When Xen gets the exception, EL2_ESR.ISV bit not set. Thus as a
result, data abort is triggered.

Added the logic to decode ldr/str post indexing instructions.
With this, Xen can decode instructions like these:-
ldr w2, [x1], #4
Thus, domU can read ioreg with post indexing instructions.


Where do those instructions come from? A (C) compiler? (Some mail in
another thread from Stefano suggests so)
If yes, I would argue that is broken:
IIUC C compilers assume normal memory attributes for every pointer they
handle, so they are free to use unaligned accesses, load/store exclusives,
split accesses (two halfword reads) and what not when generating code.
The GIC needs to be mapped as device memory, which explicitly forbids
unaligned accesses and exclusives (as in: always traps), so you cannot let
compiler-generated code access the GIC (or most other MMIO devices, for
that matter).
I know, this somewhat works(TM) in practise, because a uint32_t assignment
is very likely to end up in an ldr/str, but please let me know which car
this code ends up in, so that can I avoid this brand ;-)

You can tell the compiler to avoid unaligned accesses with -mstrict-align
(and should definitely do so when you are running C code with the MMU
off), but that still leaves exclusives and split accesses at the
compiler's discretion. A variation on the topic of split access is merged
writes, where the compiler uses NEON or SVE instructions, for instance, to
cover multiple words at once, possibly via some memset()/memcpy() routine.


I understand that we should be using inline assembly instructions to 
access any MMIO region. This is to prevent the compiler doing any tricks.


But is there a restriction that post indexing instructions can never be 
used to access MMIO region ?




On top there is this architectural restriction of the ARMv7/v8
virtualisation extension to not decode many "advanced" load/store
instructions in ESR_EL2.

Where do I find this restriction ?

Are you telling me that load/store with post indexing is an "advanced" 
instruction and ArmV8 does not allow decoding of these instructions in 
ESR_EL2 ? Isn't that a very strong limitation ?


Also what is your opinion on Xen decoding these instructions ?

- Ayan


Linux deliberately coded readl/writel using inline assembly, to only use
instructions that provide syndrome information, plus guarantee
device-memory compatible semantics.
Check out https://lwn.net/Articles/698014/ for a comprehensive discussion
of this whole MMIO topic.

So I think you should do the same in your guest/bare metal code: define
{read,write}{b,h,l,q} as inline assembly functions, using ldr?/str? only.
See xen/include/asm-arm/arm64/io.h for an example that uses static inline
functions in a header file, to generate most optimal code. Then always do
MMIO only via those accessors. That prevents any future compiler
surprises, plus makes you perfectly virtualisable.

Cheers,
Andre.


Signed-off-by: Ayan Kumar Halder 
---
Note to reviewer:-
This patch is based on an issue discussed in
https://lists.xenproject.org/archives/html/xen-devel/2021-11/msg00969.html
"Xen/ARM - Query about a data abort seen while reading GICD registers"


  xen/arch/arm/decode.c | 77 +++
  xen/arch/arm/io.c | 14 ++--
  2 files changed, 88 insertions(+), 3 deletions(-)

diff --git a/xen/arch/arm/decode.c b/xen/arch/arm/decode.c
index 792c2e92a7..7b60bedbc5 100644
--- a/xen/arch/arm/decode.c
+++ b/xen/arch/arm/decode.c
@@ -84,6 +84,80 @@ bad_thumb2:
  return 1;
  }
  
+static inline int32_t extract32(uint32_t value, int start, int length)

+{
+int32_t ret;
+
+if ( !(start >= 0 && length > 0 && length <= 32 - start) )
+return -EINVAL;
+
+ret = (value >> start) & (~0U >> (32 - length));
+
+return ret;
+}
+
+static int decode_64bit_loadstore_postindexing(register_t pc, struct hsr_dabt 
*dabt)
+{
+uint32_t instr;
+int size;
+int v;
+int opc;
+int rt;
+int imm9;
+
+/* For details on decoding, refer to Armv8 Architecture reference manual
+ * Section - "Load/store register (immediate post-indexed)", Pg 318
+*/
+if ( raw_copy_from_guest(&instr, (void * __user)pc, sizeof (instr)) )
+return -EFAULT;
+
+/* First, let's check for the fixed values */
+
+/*  As per the "Encoding table for the Loads and Stores group", Pg 299
+ * op4 = 1 - Load/store register (immediate post-indexed)
+ */
+if ( extract32(instr, 10, 2) != 1 )
+goto bad_64bit_loadstore;
+
+/* For the following, refer to "Load/store register (immediate 
post-indexed)"
+ * to get the fixed values at various bit positions.
+ */
+if ( extract32(instr

Re: [PATCH V3 4/6] xen/unpopulated-alloc: Add mechanism to use Xen resource

2021-11-26 Thread Oleksandr



On 26.11.21 17:17, Boris Ostrovsky wrote:

Hi Boris




On 11/24/21 3:53 PM, Oleksandr Tyshchenko wrote:

+    if (target_resource != &iomem_resource) {
+    tmp_res = kzalloc(sizeof(*tmp_res), GFP_KERNEL);
+    if (!res) {



If (!tmp_res)



Good catch, thank you!






+    ret = -ENOMEM;
+    goto err_insert;
+    }


--
Regards,

Oleksandr Tyshchenko




Re: [PATCH 01/65] x86: Introduce support for CET-IBT

2021-11-26 Thread Andrew Cooper
On 26/11/2021 14:10, Jan Beulich wrote:
> On 26.11.2021 13:33, Andrew Cooper wrote:
>> @@ -124,6 +129,18 @@ config XEN_SHSTK
>>When CET-SS is active, 32bit PV guests cannot be used.  Backwards
>>compatiblity can be provided via the PV Shim mechanism.  [sic: "compatibility"]
>>  
>> +config XEN_IBT
>> +bool "Supervisor Indirect Branch Tracking"
>> +depends on HAS_CC_CET_IBT
>> +default y
>> +help
>> +  Control-flow Enforcement Technology (CET) is a set of features in
>> +  hardware designed to combat Return-oriented Programming (ROP, also
>> +  call/jump COP/JOP) attacks.  Indirect Branch Tracking is one CET
>> +  feature designed to provide function pointer protection.
>> +
>> +  This option arranges for Xen to use CET-IBT for its own protection.
> Shouldn't this depend on BROKEN until it's actually functional?

It compiles fine right from now, and making it BROKEN would inhibit
bisection through the series.

Nothing actually matters until patch 65 turns on MSR_S_CET.ENDBR_EN.

>> --- a/xen/arch/x86/x86_emulate/x86_emulate.h
>> +++ b/xen/arch/x86/x86_emulate/x86_emulate.h
>> @@ -35,6 +35,11 @@
>>  # error Unknown compilation width
>>  #endif
>>  
>> +#ifndef cf_check
>> +/* Cope with userspace build not knowing about CET-IBT */
>> +#define cf_check
>> +#endif
> Imo this shouldn't go here, but in tools/tests/x86_emulator/x86-emulate.h,
> and then presumably without #ifdef.

I considered that, but the test harness isn't the only userspace
harness.  There is the fuzzing harness too, and I'm not sure we want to
force every userspace harness to provide the same workaround.

~Andrew



Re: [PATCH V3 4/6] xen/unpopulated-alloc: Add mechanism to use Xen resource

2021-11-26 Thread Boris Ostrovsky



On 11/24/21 3:53 PM, Oleksandr Tyshchenko wrote:

+   if (target_resource != &iomem_resource) {
+   tmp_res = kzalloc(sizeof(*tmp_res), GFP_KERNEL);
+   if (!res) {



If (!tmp_res)



+   ret = -ENOMEM;
+   goto err_insert;
+   }




Re: [PATCH 0/2] xen: make debugreg accessors always_inline

2021-11-26 Thread Boris Ostrovsky



On 11/25/21 4:20 AM, Juergen Gross wrote:

Juergen Gross (2):
   xen: make HYPERVISOR_get_debugreg() always_inline
   xen: make HYPERVISOR_set_debugreg() always_inline

  arch/x86/include/asm/xen/hypercall.h | 4 ++--
  1 file changed, 2 insertions(+), 2 deletions(-)




Applied to for-linus-5.16c


-boris




Re: [RFC?] xen/arm: memaccess: Pass struct npfec by reference in p2m_mem_access_check

2021-11-26 Thread Oleksandr



On 26.11.21 13:39, Andrew Cooper wrote:


Hi Andrew


On 26/11/2021 07:46, Jan Beulich wrote:

On 25.11.2021 23:49, Oleksandr Tyshchenko wrote:

From: Oleksandr Tyshchenko 

Today I noticed a "note" when building Xen on Arm64 with
aarch64-poky-linux-gcc (GCC) 9.3.0. It turned out that Andrew Cooper
had already reported it before [1]:

mem_access.c: In function 'p2m_mem_access_check':
mem_access.c:227:6: note: parameter passing for argument of type
'const struct npfec' changed in GCC 9.1
   227 | bool p2m_mem_access_check(paddr_t gpa, vaddr_t gla,
   const struct npfec npfec)

 From the explanation I understand that nothing bad actually is going
to happen in our case, it is harmless and shown to only draw our
attention that the ABI changed due to bug (with passing bit-fields
by value) fixed in GCC 9.1. This information doesn't mean much for us
as Xen is an embedded project with no external linkage. But, of course,
it would be better to eliminate the note. You can also find related
information about the bug at [2].

So make the note go away by passing bit-fields by reference.

[1] https://www.mail-archive.com/xen-devel@lists.xenproject.org/msg87439.html
[2] https://gcc.gnu.org/bugzilla/show_bug.cgi?id=88469

Reported-by: Andrew Cooper 
Signed-off-by: Oleksandr Tyshchenko 
---
Compile-tested only.
---
  xen/arch/arm/mem_access.c| 28 ++--
  xen/arch/arm/traps.c |  2 +-
  xen/include/asm-arm/mem_access.h |  2 +-
  3 files changed, 16 insertions(+), 16 deletions(-)

It's all Arm code, so I'm not the one to judge, but I'd like to recommend
to live with the note or convince distros to backport the gcc side fix.
This definitely was a compiler flaw; see
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=91710.

I too would recommend just living with the note.  The code change
proposed is a backwards step in terms of runtime complexity - you're now
passing around a pointer to 7 bits of information, which the compiler
cannot pull into a local because of C's aliasing rules.  At a guess, the
very best an optimising compiler could do is turn it into only two
dereferences of the pointer.


Thank you for the analysis. I don't think, we want to make things worse 
(less optimal) than they are currently.





~Andrew


--
Regards,

Oleksandr Tyshchenko




Re: [PATCH 04/65] x86/hypercall: Annotate fnptr targets

2021-11-26 Thread Andrew Cooper
On 26/11/2021 14:21, Jan Beulich wrote:
> On 26.11.2021 13:33, Andrew Cooper wrote:
>> Signed-off-by: Andrew Cooper 
> I understand there's not much to say here, but the title saying just
> "annotate" without any context as to the purpose of the annotation
> is too little information imo. I guess this then goes for many more
> titles in this series.

I really couldn't think of anything useful to say.  Lots of these
patches are entirely mechanical.

>
>> --- a/xen/include/xen/hypercall.h
>> +++ b/xen/include/xen/hypercall.h
>> @@ -18,12 +18,12 @@
>>  #include 
>>  #include 
>>  
>> -extern long
>> +extern long cf_check
>>  do_sched_op(
>>  int cmd,
>>  XEN_GUEST_HANDLE_PARAM(void) arg);
> What purpose does the attribute serve on a declaration? On the surface
> I would consider it meaningful only on definitions, like e.g. __init.

Because GCC treats cf_check (and nocf_check) as part of the function
type.  Simply getting it wrong will yield a "definition doesn't match
prototype" error.

Furthermore, it needs to be visible across translation units so one TU
can spot (and complain at) creating a function pointer to a non-local
non-endbr'd function.

~Andrew



Re: [PATCH 04/65] x86/hypercall: Annotate fnptr targets

2021-11-26 Thread Jan Beulich
On 26.11.2021 13:33, Andrew Cooper wrote:
> Signed-off-by: Andrew Cooper 

I understand there's not much to say here, but the title saying just
"annotate" without any context as to the purpose of the annotation
is too little information imo. I guess this then goes for many more
titles in this series.

> --- a/xen/include/xen/hypercall.h
> +++ b/xen/include/xen/hypercall.h
> @@ -18,12 +18,12 @@
>  #include 
>  #include 
>  
> -extern long
> +extern long cf_check
>  do_sched_op(
>  int cmd,
>  XEN_GUEST_HANDLE_PARAM(void) arg);

What purpose does the attribute serve on a declaration? On the surface
I would consider it meaningful only on definitions, like e.g. __init.

Jan




Re: [RFC?] xen/arm: memaccess: Pass struct npfec by reference in p2m_mem_access_check

2021-11-26 Thread Oleksandr



On 26.11.21 09:46, Jan Beulich wrote:

Hi Jan


On 25.11.2021 23:49, Oleksandr Tyshchenko wrote:

From: Oleksandr Tyshchenko 

Today I noticed a "note" when building Xen on Arm64 with
aarch64-poky-linux-gcc (GCC) 9.3.0. It turned out that Andrew Cooper
had already reported it before [1]:

mem_access.c: In function 'p2m_mem_access_check':
mem_access.c:227:6: note: parameter passing for argument of type
'const struct npfec' changed in GCC 9.1
   227 | bool p2m_mem_access_check(paddr_t gpa, vaddr_t gla,
   const struct npfec npfec)

 From the explanation I understand that nothing bad actually is going
to happen in our case, it is harmless and shown to only draw our
attention that the ABI changed due to bug (with passing bit-fields
by value) fixed in GCC 9.1. This information doesn't mean much for us
as Xen is an embedded project with no external linkage. But, of course,
it would be better to eliminate the note. You can also find related
information about the bug at [2].

So make the note go away by passing bit-fields by reference.

[1] https://www.mail-archive.com/xen-devel@lists.xenproject.org/msg87439.html
[2] https://gcc.gnu.org/bugzilla/show_bug.cgi?id=88469

Reported-by: Andrew Cooper 
Signed-off-by: Oleksandr Tyshchenko 
---
Compile-tested only.
---
  xen/arch/arm/mem_access.c| 28 ++--
  xen/arch/arm/traps.c |  2 +-
  xen/include/asm-arm/mem_access.h |  2 +-
  3 files changed, 16 insertions(+), 16 deletions(-)

It's all Arm code, so I'm not the one to judge, but I'd like to recommend
to live with the note or convince distros to backport the gcc side fix.
This definitely was a compiler flaw; see
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=91710.


Thank you for the pointer and suggestion. Actually, after the 
realization that note is harmless and doesn't matter in our case, we 
could indeed tolerate it.


It is up to the maintainers to decide. I will be ok either way.




Jan


--
Regards,

Oleksandr Tyshchenko




Re: [PATCH 01/65] x86: Introduce support for CET-IBT

2021-11-26 Thread Jan Beulich
On 26.11.2021 13:33, Andrew Cooper wrote:
> @@ -124,6 +129,18 @@ config XEN_SHSTK
> When CET-SS is active, 32bit PV guests cannot be used.  Backwards
> compatiblity can be provided via the PV Shim mechanism.  [sic: "compatibility"]
>  
> +config XEN_IBT
> + bool "Supervisor Indirect Branch Tracking"
> + depends on HAS_CC_CET_IBT
> + default y
> + help
> +   Control-flow Enforcement Technology (CET) is a set of features in
> +   hardware designed to combat Return-oriented Programming (ROP, also
> +   call/jump COP/JOP) attacks.  Indirect Branch Tracking is one CET
> +   feature designed to provide function pointer protection.
> +
> +   This option arranges for Xen to use CET-IBT for its own protection.

Shouldn't this depend on BROKEN until it's actually functional?

> --- a/xen/arch/x86/x86_emulate/x86_emulate.h
> +++ b/xen/arch/x86/x86_emulate/x86_emulate.h
> @@ -35,6 +35,11 @@
>  # error Unknown compilation width
>  #endif
>  
> +#ifndef cf_check
> +/* Cope with userspace build not knowing about CET-IBT */
> +#define cf_check
> +#endif

Imo this shouldn't go here, but in tools/tests/x86_emulator/x86-emulate.h,
and then presumably without #ifdef.

Jan




Re: [RFC PATCH V3] xen/gnttab: Store frame GFN in struct page_info on Arm

2021-11-26 Thread Oleksandr



On 25.11.21 21:04, Julien Grall wrote:

Hi Oleksandr,

Apologies for the late answer. I was waiting for XSA-387 to go out 
before commenting.



Hi Julien,


I got it, no problem





On 23/09/2021 20:32, Oleksandr Tyshchenko wrote:

From: Oleksandr Tyshchenko 

Rework Arm implementation to store grant table frame GFN
in struct page_info directly instead of keeping it in
standalone status/shared arrays.

To cover 64-bit/40-bit IPA on Arm64/Arm32 we need the space
to hold 52-bit/28-bit + extra bit value respectively. In order
to not grow the size of struct page_info borrow the required
amount of bits from type_info's count portion which current
context won't suffer (currently only 1 bit is used on Arm).
Please note, to minimize code changes and avoid introducing
an extra #ifdef-s to the header, we keep the same amount of
bits on both subarches, although the count portion on Arm64
could be wider, so we waste some bits here.

Introduce corresponding PGT_* constructs and access macros.
Update existing gnttab macros to deal with GFN value according
to new location. Also update the use of count portion on Arm
in share_xen_page_with_guest().

Update the P2M code to clean said GFN portion when putting
a reference on the grant table page in p2m_put_l3_page().
The added check is based on the assumption that grant table page
is the xen_heap page and its entry has p2m_ram_rw type, which
is correct. However, this check is not entirely precise and we
might end up clearing the GFN portion for other xen_heap pages
with the same p2m_type. But, this action is considered as
harmless, since only grant table pages really use that portion.

And for everything to work correctly introduce arch-specific
macros to be called from alloc_xenheap_pages()/free_xenheap_pages()
which purposes on Arm are to clear the portion before use and
make sure the portion is cleared after use, on x86 these are
just stubs.

This patch is intended to fix the potential issue on Arm
which might happen when remapping grant-table frame.
A guest (or the toolstack) will unmap the grant-table frame
using XENMEM_remove_physmap. This is a generic hypercall,
so on x86, we are relying on the fact the M2P entry will
be cleared on removal. For architecture without the M2P,
the GFN would still be present in the grant frame/status
array. So on the next call to map the page, we will end up to
request the P2M to remove whatever mapping was the given GFN.
This could well be another mapping.

Besides that, this patch simplifies arch code on Arm by
removing arrays and corresponding management code and
as the result gnttab_init_arch/gnttab_destroy_arch helpers
and struct grant_table_arch become useless and can be
dropped globally.


Before dropping the arch specific helpers, I would check with the 
RISC-v folks that they will not need it in the near future.


arch/riscv/configs/tiny64_defconfig says that CONFIG_GRANT_TABLE is not 
set, for me it sounds like unlikely for *the near* future. But, anyway, 
it would be better to clarify.







Suggested-by: Julien Grall 
Signed-off-by: Oleksandr Tyshchenko 
---
You can find the related discussions at:
https://lore.kernel.org/xen-devel/93d0df14-2c8a-c2e3-8c51-544121901...@xen.org/ 

https://lore.kernel.org/xen-devel/1628890077-12545-1-git-send-email-olekst...@gmail.com/ 

https://lore.kernel.org/xen-devel/1631652245-30746-1-git-send-email-olekst...@gmail.com/ 



! Please note, there is still unresolved locking question here for which
I failed to find a suitable solution. So, it is still an RFC !

According to the internal conversation:
Now the GFN field in the struct page_info is accessed from
gnttab_set_frame_gfn() in the grant table code and from 
page_set_frame_gfn()

in the P2M code (the former uses the latter).

We need to prevent the concurrent access to this field. But, we 
cannot grab
the grant lock from the P2M code because we will introduce a lock 
inversion.
The page_set_frame_gfn() will be called from the P2M code with the 
p2m lock held
and then acquire the grant table lock. The gnttab_map_frame() will do 
the inverse.


This is a tricky one. I think, we will:

  1) Need to use the P2M lock to protect the access to the GFN in the 
struct page_info *.
  2) Defer the call to page_set_frame_gfn() from gnttab_map_frame() to 
xenmem_add_to_physmap_one()
  3) In xenmem_add_to_physmap_one() hold the P2M lock while checking 
the page was not already mapped (e.g. page_get_frame_gfn() == 
INVALID_GFN) and do the mapping. Call page_set_frame_gfn() on success.


This would still allow the guest to shoot itself in the foot (e.g. 
potentially removing the wrong mapping) if it tries concurrent 
hypercalls, but I believe we would not introduce issues like XSA-380.


In the end this would look quite similar to how x86 deals with the M2P 
update.


Thank you for the suggestion. I need to analyze the code to better 
understand your idea and the technical feasibility of implementing it; I 
will come up with questions if any.






For the rec

[qemu-mainline test] 166370: tolerable FAIL - PUSHED

2021-11-26 Thread osstest service owner
flight 166370 qemu-mainline real [real]
http://logs.test-lab.xenproject.org/osstest/logs/166370/

Failures :-/ but no regressions.

Regressions which are regarded as allowable (not blocking):
 test-armhf-armhf-xl-rtds 14 guest-start  fail REGR. vs. 166300

Tests which did not succeed, but are not blocking:
 test-amd64-amd64-xl-qemuu-win7-amd64 19 guest-stopfail like 166300
 test-amd64-amd64-qemuu-nested-amd 20 debian-hvm-install/l1/l2 fail like 166300
 test-amd64-i386-xl-qemuu-win7-amd64 19 guest-stop fail like 166300
 test-armhf-armhf-libvirt-raw 15 saverestore-support-checkfail  like 166300
 test-armhf-armhf-libvirt-qcow2 15 saverestore-support-check   fail like 166300
 test-armhf-armhf-libvirt 16 saverestore-support-checkfail  like 166300
 test-amd64-i386-xl-qemuu-ws16-amd64 19 guest-stop fail like 166300
 test-amd64-amd64-xl-qemuu-ws16-amd64 19 guest-stopfail like 166300
 test-arm64-arm64-xl  15 migrate-support-checkfail   never pass
 test-arm64-arm64-xl  16 saverestore-support-checkfail   never pass
 test-arm64-arm64-xl-seattle  15 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-seattle  16 saverestore-support-checkfail   never pass
 test-amd64-amd64-libvirt 15 migrate-support-checkfail   never pass
 test-amd64-i386-libvirt-xsm  15 migrate-support-checkfail   never pass
 test-amd64-amd64-libvirt-xsm 15 migrate-support-checkfail   never pass
 test-amd64-i386-libvirt  15 migrate-support-checkfail   never pass
 test-amd64-i386-xl-pvshim14 guest-start  fail   never pass
 test-arm64-arm64-xl-credit1  15 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-credit1  16 saverestore-support-checkfail   never pass
 test-arm64-arm64-libvirt-xsm 15 migrate-support-checkfail   never pass
 test-arm64-arm64-libvirt-xsm 16 saverestore-support-checkfail   never pass
 test-arm64-arm64-xl-thunderx 15 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-thunderx 16 saverestore-support-checkfail   never pass
 test-arm64-arm64-xl-xsm  15 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-xsm  16 saverestore-support-checkfail   never pass
 test-arm64-arm64-xl-credit2  15 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-credit2  16 saverestore-support-checkfail   never pass
 test-amd64-amd64-libvirt-qemuu-debianhvm-amd64-xsm 13 migrate-support-check 
fail never pass
 test-armhf-armhf-xl-arndale  15 migrate-support-checkfail   never pass
 test-armhf-armhf-xl-arndale  16 saverestore-support-checkfail   never pass
 test-amd64-i386-libvirt-qemuu-debianhvm-amd64-xsm 13 migrate-support-check 
fail never pass
 test-amd64-i386-libvirt-raw  14 migrate-support-checkfail   never pass
 test-amd64-amd64-libvirt-vhd 14 migrate-support-checkfail   never pass
 test-arm64-arm64-libvirt-raw 14 migrate-support-checkfail   never pass
 test-arm64-arm64-libvirt-raw 15 saverestore-support-checkfail   never pass
 test-arm64-arm64-xl-vhd  14 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-vhd  15 saverestore-support-checkfail   never pass
 test-armhf-armhf-xl-multivcpu 15 migrate-support-checkfail  never pass
 test-armhf-armhf-xl-multivcpu 16 saverestore-support-checkfail  never pass
 test-armhf-armhf-xl-credit2  15 migrate-support-checkfail   never pass
 test-armhf-armhf-xl-credit2  16 saverestore-support-checkfail   never pass
 test-armhf-armhf-xl-credit1  15 migrate-support-checkfail   never pass
 test-armhf-armhf-xl-credit1  16 saverestore-support-checkfail   never pass
 test-armhf-armhf-xl  15 migrate-support-checkfail   never pass
 test-armhf-armhf-xl  16 saverestore-support-checkfail   never pass
 test-armhf-armhf-xl-cubietruck 15 migrate-support-checkfail never pass
 test-armhf-armhf-xl-cubietruck 16 saverestore-support-checkfail never pass
 test-armhf-armhf-libvirt-raw 14 migrate-support-checkfail   never pass
 test-armhf-armhf-libvirt-qcow2 14 migrate-support-checkfail never pass
 test-armhf-armhf-xl-vhd  14 migrate-support-checkfail   never pass
 test-armhf-armhf-xl-vhd  15 saverestore-support-checkfail   never pass
 test-armhf-armhf-libvirt 15 migrate-support-checkfail   never pass

version targeted for testing:
 qemuu14d02cfbe4adaeebe7cb833a8cc71191352cf03b
baseline version:
 qemuu89d2f9e4c63799f7f03e9180c63b7dc45fc2a04a

Last test of basis   166300  2021-11-22 21:08:31 Z3 days
Failing since166307  2021-11-23 08:11:25 Z3 days3 attempts
Testing same since   166370  2021-11-25 03:10:11 Z1 days1 attempts


People who touched revisions under test

Re: [PATCH 00/65] x86: Support for CET Indirect Branch Tracking

2021-11-26 Thread Jan Beulich
On 26.11.2021 14:13, Andrew Cooper wrote:
> On 26/11/2021 12:48, Jan Beulich wrote:
>> On 26.11.2021 13:33, Andrew Cooper wrote:
>>>   * I have not checked for misaligned endbr64's, and I'm not sure there is
>>> anything useful we could do upon discovering that there were any.
>>> Naively, there is a 1 in 2^32 chance (endbr64 being 4 bytes long), but
>>> this doesn't account for the structure of x86 code, which is most
>>> certainly not a uniform random distribution of bytes.
>> Do you really mean "misaligned" here? The 2nd sentence rather might suggest
>> that you mean byte sequences resembling ENDBR, despite actually being part
>> of other insns. If so, checking might not allow to prove anything, as e.g.
>> displacements change with about every build.
> 
> I do mean "any sequence of bytes resembling ENDBR", because that is
> ultimately how the CPU instruction decode will behave.
> 
> And yes - you certainly can hide it in a 4-byte disp/imm, but it's an
> incredibly rare imm32 to find (except for tasks such as in patch 64). 

A disp alone won't do in general, as the top byte will only ever be 0x00
or 0xFF (as long as our binary image doesn't go beyond 16Mb). But a
ModR/M or SIB byte could start such a sequence, with only two or three
of the (lower) disp bytes used to complete the pattern.

> You can also hide it in an disp/imm8 followed by a specific nopl, but
> I'm not sure if we'd ever emit 0F 1E FA as a nopl by default.

We don't, and the tool chain doesn't either. Only canonical NOPs (opcode
0x1F) are to be used there, as all others may gain a meaning beyond
plain NOP.

Jan




Re: [PATCH 00/65] x86: Support for CET Indirect Branch Tracking

2021-11-26 Thread Andrew Cooper
On 26/11/2021 13:13, Andrew Cooper wrote:
> On 26/11/2021 12:48, Jan Beulich wrote:
>> On 26.11.2021 13:33, Andrew Cooper wrote:
>>> Various note accumulated through the work:
>>>   * I have already posted patches fixing some of the most egregious 
>>> (ab)uses of
>>> function pointers.  There are plenty of other areas which could do with
>>> cleanup.
>>>   * With everything turned on, we get 1688 runtime endbr64's, and 233 init
>>> time.  The number of runtime endbr64's is expected to reduce with
>>> Juergen's hypercall series (see later), and in common deployment cases
>>> where not everything is compiled in by default.
>>>   * I have not checked for misaligned endbr64's, and I'm not sure there is
>>> anything useful we could do upon discovering that there were any.
>>> Naively, there is a 1 in 2^32 chance (endbr64 being 4 bytes long), but
>>> this doesn't account for the structure of x86 code, which is most
>>> certainly not a uniform random distribution of bytes.
>> Do you really mean "misaligned" here? The 2nd sentence rather might suggest
>> that you mean byte sequences resembling ENDBR, despite actually being part
>> of other insns. If so, checking might not allow to prove anything, as e.g.
>> displacements change with about every build.
> I do mean "any sequence of bytes resembling ENDBR", because that is
> ultimately how the CPU instruction decode will behave.
>
> And yes - you certainly can hide it in a 4-byte disp/imm, but it's an
> incredibly rare imm32 to find (except for tasks such as in patch 64).

To this point, I have a cunning idea.  I'll write a custom is_endbr64()
helper which reads a dword, not's it, and then compares to imm32.  That
is for all intents and purposes the same performance, but doesn't have
an embedded endbr64.

~Andrew



Re: [RFC PATCH] Added the logic to decode 32 bit ldr/str post-indexing instructions

2021-11-26 Thread Andre Przywara
On Fri, 19 Nov 2021 16:52:02 +
Ayan Kumar Halder  wrote:

Hi,

> At present, post indexing instructions are not emulated by Xen.
> When Xen gets the exception, EL2_ESR.ISV bit not set. Thus as a
> result, data abort is triggered.
> 
> Added the logic to decode ldr/str post indexing instructions.
> With this, Xen can decode instructions like these:-
> ldr w2, [x1], #4
> Thus, domU can read ioreg with post indexing instructions.

Where do those instructions come from? A (C) compiler? (Some mail in
another thread from Stefano suggests so)
If yes, I would argue that is broken:
IIUC C compilers assume normal memory attributes for every pointer they
handle, so they are free to use unaligned accesses, load/store exclusives,
split accesses (two halfword reads) and what not when generating code.
The GIC needs to be mapped as device memory, which explicitly forbids
unaligned accesses and exclusives (as in: always traps), so you cannot let
compiler-generated code access the GIC (or most other MMIO devices, for
that matter).
I know, this somewhat works(TM) in practise, because a uint32_t assignment
is very likely to end up in an ldr/str, but please let me know which car
this code ends up in, so that can I avoid this brand ;-)

You can tell the compiler to avoid unaligned accesses with -mstrict-align
(and should definitely do so when you are running C code with the MMU
off), but that still leaves exclusives and split accesses at the
compiler's discretion. A variation on the topic of split access is merged
writes, where the compiler uses NEON or SVE instructions, for instance, to
cover multiple words at once, possibly via some memset()/memcpy() routine.

On top there is this architectural restriction of the ARMv7/v8
virtualisation extension to not decode many "advanced" load/store
instructions in ESR_EL2.
Linux deliberately coded readl/writel using inline assembly, to only use
instructions that provide syndrome information, plus guarantee
device-memory compatible semantics.
Check out https://lwn.net/Articles/698014/ for a comprehensive discussion
of this whole MMIO topic.

So I think you should do the same in your guest/bare metal code: define
{read,write}{b,h,l,q} as inline assembly functions, using ldr?/str? only.
See xen/include/asm-arm/arm64/io.h for an example that uses static inline
functions in a header file, to generate most optimal code. Then always do
MMIO only via those accessors. That prevents any future compiler
surprises, plus makes you perfectly virtualisable.

Cheers,
Andre.

> Signed-off-by: Ayan Kumar Halder 
> ---
> Note to reviewer:-
> This patch is based on an issue discussed in 
> https://lists.xenproject.org/archives/html/xen-devel/2021-11/msg00969.html
> "Xen/ARM - Query about a data abort seen while reading GICD registers"
> 
> 
>  xen/arch/arm/decode.c | 77 +++
>  xen/arch/arm/io.c | 14 ++--
>  2 files changed, 88 insertions(+), 3 deletions(-)
> 
> diff --git a/xen/arch/arm/decode.c b/xen/arch/arm/decode.c
> index 792c2e92a7..7b60bedbc5 100644
> --- a/xen/arch/arm/decode.c
> +++ b/xen/arch/arm/decode.c
> @@ -84,6 +84,80 @@ bad_thumb2:
>  return 1;
>  }
>  
> +static inline int32_t extract32(uint32_t value, int start, int length)
> +{
> +int32_t ret;
> +
> +if ( !(start >= 0 && length > 0 && length <= 32 - start) )
> +return -EINVAL;
> +
> +ret = (value >> start) & (~0U >> (32 - length));
> +
> +return ret;
> +}
> +
> +static int decode_64bit_loadstore_postindexing(register_t pc, struct 
> hsr_dabt *dabt)
> +{
> +uint32_t instr;
> +int size;
> +int v;
> +int opc;
> +int rt;
> +int imm9;
> +
> +/* For details on decoding, refer to Armv8 Architecture reference manual
> + * Section - "Load/store register (immediate post-indexed)", Pg 318
> +*/
> +if ( raw_copy_from_guest(&instr, (void * __user)pc, sizeof (instr)) )
> +return -EFAULT;
> +
> +/* First, let's check for the fixed values */
> +
> +/*  As per the "Encoding table for the Loads and Stores group", Pg 299
> + * op4 = 1 - Load/store register (immediate post-indexed)
> + */
> +if ( extract32(instr, 10, 2) != 1 )
> +goto bad_64bit_loadstore;
> +
> +/* For the following, refer to "Load/store register (immediate 
> post-indexed)"
> + * to get the fixed values at various bit positions.
> + */
> +if ( extract32(instr, 21, 1) != 0 )
> +goto bad_64bit_loadstore;
> +
> +if ( extract32(instr, 24, 2) != 0 )
> +goto bad_64bit_loadstore;
> +
> +if ( extract32(instr, 27, 3) != 7 )
> +goto bad_64bit_loadstore;
> +
> +size = extract32(instr, 30, 2);
> +v = extract32(instr, 26, 1);
> +opc = extract32(instr, 22, 1);
> +
> +/* At the moment, we support STR(immediate) - 32 bit variant and
> + * LDR(immediate) - 32 bit variant only.
> + */
> +if (!((size==2) && (v==0) && ((opc==0) || (opc==1
> +goto bad_64bit_loadstore;
> +

Re: [PATCH 00/65] x86: Support for CET Indirect Branch Tracking

2021-11-26 Thread Andrew Cooper
On 26/11/2021 12:48, Jan Beulich wrote:
> On 26.11.2021 13:33, Andrew Cooper wrote:
>> CET Indirect Branch Tracking is a hardware feature designed to protect 
>> against
>> forward-edge control flow hijacking (Call/Jump oriented programming), and is 
>> a
>> companion feature to CET Shadow Stacks added in Xen 4.14.
>>
>> This series depends on lots of previously posted patches.  See
>> xenbits/xen-cet-ibt for the full branch with all dependencies.
>>
>> Patch 1 introduces some compile time infrastructure.
>>
>> Patches 2 thru 56 annotate all function pointer targets in the common and x86
>> hypervisor code.  Patches are split by API and in no particular order, and
>> largely mechanical.  As such, I'm limiting review mainly to The Rest.  While
>> doing this work does depend on an experimental GCC change (patch 56), the
>> result does actually work properly with GCC 9 onwards.
> I wonder what this means. Are you talking about a gcc 9 with the experimental
> change backported?

No - plain GCC 9 as released (give or take the bug with retpoline which
was fixed in 9.4).  See patch 1.

This entire series, on GCC 9.4 or 10, will compile and function
correctly with CET-IBT active in hardware.

> Or are you saying that things build fine there (but don't
> work as far as IBT is concerned) in the absence of the experimental change?
> In which case what about older gcc?

The only thing the experimental change does is provide more
typechecking, so the compiler can identify when there is a call to a
non-ENDBR'd function.  See patch 56.

There is no possible way I could have done this work without the
experimental change, because there are far too many function pointers to
have found blind.

The typechecking isn't perfect, but it's pretty good.  In the short
term, we're going to have to be careful with new code, and I ought to
put something in Gitlab CI.  In the longer term, I hope for something
suitable to get into GCC 12.

That said, there are also a huge number of errors new in GCC 12 to do
with array bounds checks, and I'm not sure sprinkling more gcc11_wrap()
is going to work this time.

>> Various note accumulated through the work:
>>   * I have already posted patches fixing some of the most egregious (ab)uses 
>> of
>> function pointers.  There are plenty of other areas which could do with
>> cleanup.
>>   * With everything turned on, we get 1688 runtime endbr64's, and 233 init
>> time.  The number of runtime endbr64's is expected to reduce with
>> Juergen's hypercall series (see later), and in common deployment cases
>> where not everything is compiled in by default.
>>   * I have not checked for misaligned endbr64's, and I'm not sure there is
>> anything useful we could do upon discovering that there were any.
>> Naively, there is a 1 in 2^32 chance (endbr64 being 4 bytes long), but
>> this doesn't account for the structure of x86 code, which is most
>> certainly not a uniform random distribution of bytes.
> Do you really mean "misaligned" here? The 2nd sentence rather might suggest
> that you mean byte sequences resembling ENDBR, despite actually being part
> of other insns. If so, checking might not allow to prove anything, as e.g.
> displacements change with about every build.

I do mean "any sequence of bytes resembling ENDBR", because that is
ultimately how the CPU instruction decode will behave.

And yes - you certainly can hide it in a 4-byte disp/imm, but it's an
incredibly rare imm32 to find (except for tasks such as in patch 64). 
You can also hide it in a disp/imm8 followed by a specific nopl, but
I'm not sure if we'd ever emit 0F 1E FA as a nopl by default.

~Andrew



[PATCH 56/65] x86: Use control flow typechecking where possible

2021-11-26 Thread Andrew Cooper
Signed-off-by: Andrew Cooper 
---
CC: Jan Beulich 
CC: Roger Pau Monné 
CC: Wei Liu 

RFC.  This is still an experimental compiler extension
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=102953

However, it is also the entire basis of being able to sanely use
-mmanual-endbr in the first place, so is very important.
---
 xen/arch/x86/arch.mk | 1 +
 1 file changed, 1 insertion(+)

diff --git a/xen/arch/x86/arch.mk b/xen/arch/x86/arch.mk
index 1c8381f7c9d8..429a9ea00f92 100644
--- a/xen/arch/x86/arch.mk
+++ b/xen/arch/x86/arch.mk
@@ -48,6 +48,7 @@ CFLAGS-$(CONFIG_INDIRECT_THUNK) += -fno-jump-tables
 
 ifdef CONFIG_HAS_CC_CET_IBT
 CFLAGS += -fcf-protection=branch -mmanual-endbr
+$(call cc-option-add,CFLAGS,CC,-fcf-check-attribute=no)
 else
 $(call cc-option-add,CFLAGS,CC,-fcf-protection=none)
 endif
-- 
2.11.0




[PATCH 46/65] x86/p2m: Annotate fnptr targets

2021-11-26 Thread Andrew Cooper
Signed-off-by: Andrew Cooper 
---
CC: Jan Beulich 
CC: Roger Pau Monné 
CC: Wei Liu 
---
 xen/arch/x86/mm/hap/hap.c|  2 +-
 xen/arch/x86/mm/hap/nested_hap.c |  2 +-
 xen/arch/x86/mm/p2m-ept.c| 32 +++-
 xen/arch/x86/mm/p2m-pt.c | 19 +--
 xen/include/asm-x86/p2m.h|  4 ++--
 5 files changed, 28 insertions(+), 31 deletions(-)

diff --git a/xen/arch/x86/mm/hap/hap.c b/xen/arch/x86/mm/hap/hap.c
index 9d67a47f5fe9..c19e337d6585 100644
--- a/xen/arch/x86/mm/hap/hap.c
+++ b/xen/arch/x86/mm/hap/hap.c
@@ -778,7 +778,7 @@ static void cf_check hap_update_paging_modes(struct vcpu *v)
 put_gfn(d, cr3_gfn);
 }
 
-static void
+static void cf_check
 hap_write_p2m_entry_post(struct p2m_domain *p2m, unsigned int oflags)
 {
 struct domain *d = p2m->domain;
diff --git a/xen/arch/x86/mm/hap/nested_hap.c b/xen/arch/x86/mm/hap/nested_hap.c
index 50fa2dd9f405..aa8495be4510 100644
--- a/xen/arch/x86/mm/hap/nested_hap.c
+++ b/xen/arch/x86/mm/hap/nested_hap.c
@@ -71,7 +71,7 @@
 /*NESTED VIRT P2M FUNCTIONS */
 //
 
-void
+void cf_check
 nestedp2m_write_p2m_entry_post(struct p2m_domain *p2m, unsigned int oflags)
 {
 if ( oflags & _PAGE_PRESENT )
diff --git a/xen/arch/x86/mm/p2m-ept.c b/xen/arch/x86/mm/p2m-ept.c
index ac36afcc1d64..d2c540b81fdf 100644
--- a/xen/arch/x86/mm/p2m-ept.c
+++ b/xen/arch/x86/mm/p2m-ept.c
@@ -625,7 +625,7 @@ int epte_get_entry_emt(struct domain *d, gfn_t gfn, mfn_t 
mfn,
  * - zero if no adjustment was done,
  * - a positive value if at least one adjustment was done.
  */
-static int resolve_misconfig(struct p2m_domain *p2m, unsigned long gfn)
+static int cf_check resolve_misconfig(struct p2m_domain *p2m, unsigned long 
gfn)
 {
 struct ept_data *ept = &p2m->ept;
 unsigned int level = ept->wl;
@@ -794,7 +794,7 @@ bool_t ept_handle_misconfig(uint64_t gpa)
  *
  * Returns: 0 for success, -errno for failure
  */
-static int
+static int cf_check
 ept_set_entry(struct p2m_domain *p2m, gfn_t gfn_, mfn_t mfn,
   unsigned int order, p2m_type_t p2mt, p2m_access_t p2ma,
   int sve)
@@ -1003,10 +1003,9 @@ ept_set_entry(struct p2m_domain *p2m, gfn_t gfn_, mfn_t 
mfn,
 }
 
 /* Read ept p2m entries */
-static mfn_t ept_get_entry(struct p2m_domain *p2m,
-   gfn_t gfn_, p2m_type_t *t, p2m_access_t* a,
-   p2m_query_t q, unsigned int *page_order,
-   bool_t *sve)
+static mfn_t cf_check ept_get_entry(
+struct p2m_domain *p2m, gfn_t gfn_, p2m_type_t *t, p2m_access_t* a,
+p2m_query_t q, unsigned int *page_order, bool_t *sve)
 {
 ept_entry_t *table =
 map_domain_page(pagetable_get_mfn(p2m_get_pagetable(p2m)));
@@ -1166,8 +1165,8 @@ void ept_walk_table(struct domain *d, unsigned long gfn)
 return;
 }
 
-static void ept_change_entry_type_global(struct p2m_domain *p2m,
- p2m_type_t ot, p2m_type_t nt)
+static void cf_check ept_change_entry_type_global(
+struct p2m_domain *p2m, p2m_type_t ot, p2m_type_t nt)
 {
 unsigned long mfn = p2m->ept.mfn;
 
@@ -1178,10 +1177,9 @@ static void ept_change_entry_type_global(struct 
p2m_domain *p2m,
 ept_sync_domain(p2m);
 }
 
-static int ept_change_entry_type_range(struct p2m_domain *p2m,
-   p2m_type_t ot, p2m_type_t nt,
-   unsigned long first_gfn,
-   unsigned long last_gfn)
+static int cf_check ept_change_entry_type_range(
+struct p2m_domain *p2m, p2m_type_t ot, p2m_type_t nt,
+unsigned long first_gfn, unsigned long last_gfn)
 {
 unsigned int i, wl = p2m->ept.wl;
 unsigned long mask = (1 << EPT_TABLE_ORDER) - 1;
@@ -1225,7 +1223,7 @@ static int ept_change_entry_type_range(struct p2m_domain 
*p2m,
 return rc < 0 ? rc : 0;
 }
 
-static void ept_memory_type_changed(struct p2m_domain *p2m)
+static void cf_check ept_memory_type_changed(struct p2m_domain *p2m)
 {
 unsigned long mfn = p2m->ept.mfn;
 
@@ -1284,7 +1282,7 @@ void ept_sync_domain(struct p2m_domain *p2m)
 ept_sync_domain_mask(p2m, d->dirty_cpumask);
 }
 
-static void ept_tlb_flush(struct p2m_domain *p2m)
+static void cf_check ept_tlb_flush(struct p2m_domain *p2m)
 {
 ept_sync_domain_mask(p2m, p2m->domain->dirty_cpumask);
 }
@@ -1347,7 +1345,7 @@ static void ept_disable_pml(struct p2m_domain *p2m)
 vmx_domain_update_eptp(p2m->domain);
 }
 
-static void ept_enable_hardware_log_dirty(struct p2m_domain *p2m)
+static void cf_check ept_enable_hardware_log_dirty(struct p2m_domain *p2m)
 {
 struct p2m_domain *hostp2m = p2m_get_hostp2m(p2m->domain);
 
@@ -1356,7 +1354,7 @@ static void ept_enable_hardware_log_dirty(struct 
p2m_domain *p2m)
 p2m_unlock(hostp2m);
 }
 
-static void ept_disable_hardware_log_dirty(struct p2m_domain *p2m)
+static void cf_check ept_disable_hardware_log_

[PATCH 30/65] x86/emul: Annotate fnptr targets

2021-11-26 Thread Andrew Cooper
pv_emul_is_mem_write() only has a single user.  Having it as a static inline
is pointless because it can't be inlined to begin with.

Signed-off-by: Andrew Cooper 
---
CC: Jan Beulich 
CC: Roger Pau Monné 
CC: Wei Liu 
---
 xen/arch/x86/hvm/emulate.c | 72 +-
 xen/arch/x86/hvm/hvm.c |  8 ++--
 xen/arch/x86/hvm/svm/svm.c |  4 +-
 xen/arch/x86/mm.c  |  4 +-
 xen/arch/x86/mm/shadow/hvm.c   |  8 ++--
 xen/arch/x86/pv/emul-gate-op.c |  5 ++-
 xen/arch/x86/pv/emul-priv-op.c | 65 +++---
 xen/arch/x86/pv/emulate.h  |  7 
 xen/arch/x86/pv/ro-page-fault.c| 25 +++-
 xen/arch/x86/x86_emulate.c | 21 +-
 xen/arch/x86/x86_emulate/x86_emulate.c | 10 ++---
 xen/arch/x86/x86_emulate/x86_emulate.h | 33 
 xen/include/asm-x86/hvm/emulate.h  |  8 ++--
 xen/include/asm-x86/mm.h   | 16 +++-
 14 files changed, 142 insertions(+), 144 deletions(-)

diff --git a/xen/arch/x86/hvm/emulate.c b/xen/arch/x86/hvm/emulate.c
index bd4e3ab6456e..952e28e5b212 100644
--- a/xen/arch/x86/hvm/emulate.c
+++ b/xen/arch/x86/hvm/emulate.c
@@ -1272,7 +1272,7 @@ static int __hvmemul_read(
 return linear_read(addr, bytes, p_data, pfec, hvmemul_ctxt);
 }
 
-static int hvmemul_read(
+static int cf_check hvmemul_read(
 enum x86_segment seg,
 unsigned long offset,
 void *p_data,
@@ -1290,7 +1290,7 @@ static int hvmemul_read(
 container_of(ctxt, struct hvm_emulate_ctxt, ctxt));
 }
 
-int hvmemul_insn_fetch(
+int cf_check hvmemul_insn_fetch(
 enum x86_segment seg,
 unsigned long offset,
 void *p_data,
@@ -1337,7 +1337,7 @@ int hvmemul_insn_fetch(
 return X86EMUL_OKAY;
 }
 
-static int hvmemul_write(
+static int cf_check hvmemul_write(
 enum x86_segment seg,
 unsigned long offset,
 void *p_data,
@@ -1385,7 +1385,7 @@ static int hvmemul_write(
 return X86EMUL_OKAY;
 }
 
-static int hvmemul_rmw(
+static int cf_check hvmemul_rmw(
 enum x86_segment seg,
 unsigned long offset,
 unsigned int bytes,
@@ -1438,7 +1438,7 @@ static int hvmemul_rmw(
 return rc;
 }
 
-static int hvmemul_blk(
+static int cf_check hvmemul_blk(
 enum x86_segment seg,
 unsigned long offset,
 void *p_data,
@@ -1479,7 +1479,7 @@ static int hvmemul_blk(
 return rc;
 }
 
-static int hvmemul_write_discard(
+static int cf_check hvmemul_write_discard(
 enum x86_segment seg,
 unsigned long offset,
 void *p_data,
@@ -1490,7 +1490,7 @@ static int hvmemul_write_discard(
 return X86EMUL_OKAY;
 }
 
-static int hvmemul_rep_ins_discard(
+static int cf_check hvmemul_rep_ins_discard(
 uint16_t src_port,
 enum x86_segment dst_seg,
 unsigned long dst_offset,
@@ -1501,7 +1501,7 @@ static int hvmemul_rep_ins_discard(
 return X86EMUL_OKAY;
 }
 
-static int hvmemul_rep_movs_discard(
+static int cf_check hvmemul_rep_movs_discard(
enum x86_segment src_seg,
unsigned long src_offset,
enum x86_segment dst_seg,
@@ -1513,7 +1513,7 @@ static int hvmemul_rep_movs_discard(
 return X86EMUL_OKAY;
 }
 
-static int hvmemul_rep_stos_discard(
+static int cf_check hvmemul_rep_stos_discard(
 void *p_data,
 enum x86_segment seg,
 unsigned long offset,
@@ -1524,7 +1524,7 @@ static int hvmemul_rep_stos_discard(
 return X86EMUL_OKAY;
 }
 
-static int hvmemul_rep_outs_discard(
+static int cf_check hvmemul_rep_outs_discard(
 enum x86_segment src_seg,
 unsigned long src_offset,
 uint16_t dst_port,
@@ -1535,7 +1535,7 @@ static int hvmemul_rep_outs_discard(
 return X86EMUL_OKAY;
 }
 
-static int hvmemul_cmpxchg_discard(
+static int cf_check hvmemul_cmpxchg_discard(
 enum x86_segment seg,
 unsigned long offset,
 void *p_old,
@@ -1547,7 +1547,7 @@ static int hvmemul_cmpxchg_discard(
 return X86EMUL_OKAY;
 }
 
-static int hvmemul_read_io_discard(
+static int cf_check hvmemul_read_io_discard(
 unsigned int port,
 unsigned int bytes,
 unsigned long *val,
@@ -1556,7 +1556,7 @@ static int hvmemul_read_io_discard(
 return X86EMUL_OKAY;
 }
 
-static int hvmemul_write_io_discard(
+static int cf_check hvmemul_write_io_discard(
 unsigned int port,
 unsigned int bytes,
 unsigned long val,
@@ -1565,7 +1565,7 @@ static int hvmemul_write_io_discard(
 return X86EMUL_OKAY;
 }
 
-static int hvmemul_write_msr_discard(
+static int cf_check hvmemul_write_msr_discard(
 unsigned int reg,
 uint64_t val,
 struct x86_emulate_ctxt *ctxt)
@@ -1573,7 +1573,7 @@ static int hvmemul_write_msr_discard(
 return X86EMUL_OKAY;
 }
 
-static int hvmemul_cache_op_discard(
+static int cf_check hvmemul_cache_op_discard(
 enum x86emul_cache_op op,
 enum x86_segment seg,
 unsigned long offset,
@@ -1582,7 +1582,7 @@ static int hvmemul_cache_op_discard(
 return X86EMUL_OKAY;
 }
 
-static int hvmemul_cmpxchg(
+static int cf

  1   2   >