Re: [PATCH] dma-contiguous: Prioritize restricted DMA pool over shared DMA pool

2022-02-16 Thread Robin Murphy

On 2022-02-15 22:43, Florian Fainelli wrote:

Some platforms might define the same memory region to be suitable for
use by a kernel supporting CONFIG_DMA_RESTRICTED_POOL while maintaining
compatibility with older kernels that do not support that. This is
achieved by declaring the node this way:


Those platforms are doing something inexplicably wrong, then.

"restricted-dma-pool" says that DMA for the device has to be bounced 
through a dedicated pool because it can't be trusted with visibility of 
regular OS memory. "reusable" tells the OS that it's safe to use the 
pool as regular OS memory while it's idle. Do you see how those concepts 
are fundamentally incompatible?


Linux is right to reject contradictory information rather than attempt 
to guess at what might be safe or not.


Furthermore, down at the practical level, a SWIOTLB pool is used for 
bouncing streaming DMA API mappings, while a coherent/CMA pool is used 
for coherent DMA API allocations; they are not functionally 
interchangeable either. If a device depends on coherent allocations 
rather than streaming DMA, it should still have a coherent pool even 
under a physical memory protection scheme, and if it needs both 
streaming DMA and coherent buffers then it can have both types of pool - 
the bindings explicitly call that out. It's true that SWIOTLB pools can 
act as an emergency fallback for small allocations for I/O-coherent 
devices, but that's not really intended to be relied upon heavily.
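
For illustration only (not part of the patch under discussion), a minimal sketch of the two DMA API paths that the two pool types serve - the device, buffer and size here are hypothetical:

#include <linux/dma-mapping.h>

/* Hypothetical example: which pool type backs which DMA API path. */
static int example_dma_paths(struct device *dev, void *buf, size_t size)
{
	dma_addr_t dma_handle;
	void *cpu_addr;

	/* Streaming mapping: what a restricted-dma-pool (SWIOTLB) bounces */
	dma_handle = dma_map_single(dev, buf, size, DMA_TO_DEVICE);
	if (dma_mapping_error(dev, dma_handle))
		return -ENOMEM;
	dma_unmap_single(dev, dma_handle, size, DMA_TO_DEVICE);

	/* Coherent allocation: what a coherent/CMA "shared-dma-pool" serves */
	cpu_addr = dma_alloc_coherent(dev, size, &dma_handle, GFP_KERNEL);
	if (!cpu_addr)
		return -ENOMEM;
	dma_free_coherent(dev, size, cpu_addr, dma_handle);

	return 0;
}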


So no, I do not see anything wrong with the current handling of 
nonsensical DTs here, sorry.


Robin.


cma@4000 {
compatible = "restricted-dma-pool", "shared-dma-pool";
reusable;
...
};

A newer kernel would leverage the 'restricted-dma-pool' compatible
string for that reserved region, while an older kernel would use the
'shared-dma-pool' compatibility string.

Due to the link ordering between files under kernel/dma/ however,
contiguous.c would try to claim the region and we would never get a
chance for swiotlb.c to claim that reserved memory region.

To that extent, update kernel/dma/contiguous.c in order to check
specifically for the 'restricted-dma-pool' compatibility string when
CONFIG_DMA_RESTRICTED_POOL is enabled and give up that region such that
kernel/dma/swiotlb.c has a chance to claim it.

Similarly, kernel/dma/swiotlb.c is updated to remove the check for the
'reusable' property because that check is not useful. When a region is
defined with a compatible string set to 'restricted-dma-pool', no code
under kernel/dma/{contiguous,coherent}.c will match that region since
there is no 'shared-dma-pool' compatible string. If a
region is defined with a compatible string set as above with both
'restricted-dma-pool' *and* 'shared-dma-pool' however, checking for
'reusable' will prevent kernel/dma/swiotlb.c from claiming the region
while it is still perfectly suitable since contiguous.c gave it up.

Signed-off-by: Florian Fainelli 
---
  kernel/dma/contiguous.c | 7 +++
  kernel/dma/swiotlb.c| 3 +--
  2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c
index 3d63d91cba5c..3c418af6d306 100644
--- a/kernel/dma/contiguous.c
+++ b/kernel/dma/contiguous.c
@@ -416,6 +416,13 @@ static int __init rmem_cma_setup(struct reserved_mem *rmem)
of_get_flat_dt_prop(node, "no-map", NULL))
return -EINVAL;
  
+#ifdef CONFIG_DMA_RESTRICTED_POOL
+   if (of_flat_dt_is_compatible(node, "restricted-dma-pool")) {
+       pr_warn("Giving up %pa for restricted DMA pool\n", &rmem->base);
+       return -ENOENT;
+   }
+#endif
+
if ((rmem->base & mask) || (rmem->size & mask)) {
pr_err("Reserved memory: incorrect alignment of CMA region\n");
return -EINVAL;
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index f1e7ea160b43..9d6e4ae74d04 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -883,8 +883,7 @@ static int __init rmem_swiotlb_setup(struct reserved_mem 
*rmem)
  {
unsigned long node = rmem->fdt_node;
  
-   if (of_get_flat_dt_prop(node, "reusable", NULL) ||
-       of_get_flat_dt_prop(node, "linux,cma-default", NULL) ||
+   if (of_get_flat_dt_prop(node, "linux,cma-default", NULL) ||
of_get_flat_dt_prop(node, "linux,dma-default", NULL) ||
of_get_flat_dt_prop(node, "no-map", NULL))
return -EINVAL;



Re: [PATCH v3 8/8] iommu/arm-smmu-v3: Make default domain type of HiSilicon PTT device to identity

2022-02-15 Thread Robin Murphy

On 2022-02-15 13:42, Will Deacon wrote:

On Tue, Feb 15, 2022 at 01:30:26PM +, Robin Murphy wrote:

On 2022-02-15 13:00, Will Deacon wrote:

On Mon, Feb 14, 2022 at 08:55:20PM +0800, Yicong Yang wrote:

On 2022/1/24 21:11, Yicong Yang wrote:

The DMA of the HiSilicon PTT device can only work with an identity
mapping, so add a quirk for the device to force the domain to
passthrough.

Signed-off-by: Yicong Yang 
---
   drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 16 
   1 file changed, 16 insertions(+)

diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c 
b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
index 6dc6d8b6b368..6f67a2b1dd27 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
@@ -2838,6 +2838,21 @@ static int arm_smmu_dev_disable_feature(struct device 
*dev,
}
   }
+#define IS_HISI_PTT_DEVICE(pdev)   ((pdev)->vendor == PCI_VENDOR_ID_HUAWEI 
&& \
+(pdev)->device == 0xa12e)
+
+static int arm_smmu_def_domain_type(struct device *dev)
+{
+   if (dev_is_pci(dev)) {
+   struct pci_dev *pdev = to_pci_dev(dev);
+
+   if (IS_HISI_PTT_DEVICE(pdev))
+   return IOMMU_DOMAIN_IDENTITY;
+   }
+
+   return 0;
+}
+
   static struct iommu_ops arm_smmu_ops = {
.capable= arm_smmu_capable,
.domain_alloc   = arm_smmu_domain_alloc,
@@ -2863,6 +2878,7 @@ static struct iommu_ops arm_smmu_ops = {
.sva_unbind = arm_smmu_sva_unbind,
.sva_get_pasid  = arm_smmu_sva_get_pasid,
.page_response  = arm_smmu_page_response,
+   .def_domain_type= arm_smmu_def_domain_type,
.pgsize_bitmap  = -1UL, /* Restricted during device attach */
.owner  = THIS_MODULE,
   };



Is this quirk OK with the SMMUv3 driver? Just want to confirm that I'm on the
right track for dealing with the issue with our device.


I don't think the quirk should be in the SMMUv3 driver. Assumedly, you would
have the exact same problem if you stuck the PTT device behind a different
type of IOMMU, and so the quirk should be handled by a higher level of the
stack.


Conceptually, yes, but I'm inclined to be pragmatic here. Default domain
quirks could only move out as far as the other end of the call from
iommu_get_def_domain_type() - it's not like we could rely on some flag in a
driver which may not even be loaded yet, let alone matched to the device.
And even then there's an equal and opposite argument for why the core code
should have to maintain a list of platform-specific quirks rather than code
specific to the relevant platforms. The fact is that a HiSilicon RCiEP is
not going to end up behind anything other than a HiSilicon IOMMU, and if
those ever stop being SMMUv3 *and* such a quirk still exists we can worry
about it then.


Perhaps, but you know that by adding this hook it's only a matter of time
before we get random compatible string matches in there, so I'd rather keep
the flood gates closed as long as we can.

Given that this is a PCI device, why can't we have a PCI quirk for devices
which require an identity mapping and then handle that in the IOMMU core?


Oh, don't think I *like* having quirks in the driver, it just seems like 
the least-worst choice from a bad bunch. All of the default domain 
quirks so far (including this one) exist for integrated devices and/or 
dodgy firmware setups such that they are platform-specific, so there is 
no technical reason for trying to split *some* of them off into a 
generic mechanism when the driver-based platform-specific mechanism 
still needs to exist anyway (some of them do depend on driver state as 
well).


Feel free to test the waters with a patch punting 
qcom_smmu_def_domain_type() to core code, but I think you'll struggle to 
find a reason to give in the commit message other than "I don't like it".



Ugly as it is, this is the status quo. I don't recall anyone ever arguing
that the equivalent quirks for Intel integrated graphics should be made
generic ;)


I don't know anything about Intel integrated graphics. Have they solved this
problem in a better way, or could they equally make use of a generic quirk?


See intel-iommu's device_def_domain_type() implementation. The shape of 
it may seem quite familiar...
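
For reference, the shape being alluded to is roughly the following (paraphrased from memory rather than quoted, so exact names and details may differ):

/* Approximate sketch of intel-iommu's device_def_domain_type() */
static int device_def_domain_type(struct device *dev)
{
	if (dev_is_pci(dev)) {
		struct pci_dev *pdev = to_pci_dev(dev);

		if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA_DEVICE(pdev))
			return IOMMU_DOMAIN_IDENTITY;

		if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
			return IOMMU_DOMAIN_IDENTITY;
	}

	return 0;
}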


Robin.


Re: [PATCH v3 8/8] iommu/arm-smmu-v3: Make default domain type of HiSilicon PTT device to identity

2022-02-15 Thread Robin Murphy

On 2022-02-15 13:00, Will Deacon wrote:

On Mon, Feb 14, 2022 at 08:55:20PM +0800, Yicong Yang wrote:

On 2022/1/24 21:11, Yicong Yang wrote:

The DMA of the HiSilicon PTT device can only work with an identity
mapping, so add a quirk for the device to force the domain to
passthrough.

Signed-off-by: Yicong Yang 
---
  drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 16 
  1 file changed, 16 insertions(+)

diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c 
b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
index 6dc6d8b6b368..6f67a2b1dd27 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
@@ -2838,6 +2838,21 @@ static int arm_smmu_dev_disable_feature(struct device 
*dev,
}
  }
  
+#define IS_HISI_PTT_DEVICE(pdev)	((pdev)->vendor == PCI_VENDOR_ID_HUAWEI && \

+(pdev)->device == 0xa12e)
+
+static int arm_smmu_def_domain_type(struct device *dev)
+{
+   if (dev_is_pci(dev)) {
+   struct pci_dev *pdev = to_pci_dev(dev);
+
+   if (IS_HISI_PTT_DEVICE(pdev))
+   return IOMMU_DOMAIN_IDENTITY;
+   }
+
+   return 0;
+}
+
  static struct iommu_ops arm_smmu_ops = {
.capable= arm_smmu_capable,
.domain_alloc   = arm_smmu_domain_alloc,
@@ -2863,6 +2878,7 @@ static struct iommu_ops arm_smmu_ops = {
.sva_unbind = arm_smmu_sva_unbind,
.sva_get_pasid  = arm_smmu_sva_get_pasid,
.page_response  = arm_smmu_page_response,
+   .def_domain_type= arm_smmu_def_domain_type,
.pgsize_bitmap  = -1UL, /* Restricted during device attach */
.owner  = THIS_MODULE,
  };



Is this quirk OK with the SMMUv3 driver? Just want to confirm that I'm on the
right track for dealing with the issue with our device.


I don't think the quirk should be in the SMMUv3 driver. Assumedly, you would
have the exact same problem if you stuck the PTT device behind a different
type of IOMMU, and so the quirk should be handled by a higher level of the
stack.


Conceptually, yes, but I'm inclined to be pragmatic here. Default domain 
quirks could only move out as far as the other end of the call from 
iommu_get_def_domain_type() - it's not like we could rely on some flag 
in a driver which may not even be loaded yet, let alone matched to the 
device. And even then there's an equal and opposite argument for why the 
core code should have to maintain a list of platform-specific quirks 
rather than code specific to the relevant platforms. The fact is that a 
HiSilicon RCiEP is not going to end up behind anything other than a 
HiSilicon IOMMU, and if those ever stop being SMMUv3 *and* such a quirk 
still exists we can worry about it then.


Ugly as it is, this is the status quo. I don't recall anyone ever 
arguing that the equivalent quirks for Intel integrated graphics should 
be made generic ;)


Cheers,
Robin.


Re: [PATCH v1 5/8] iommu/amd: Use iommu_attach/detach_device()

2022-02-15 Thread Robin Murphy

On 2022-02-15 09:11, Joerg Roedel wrote:

On Mon, Feb 14, 2022 at 11:00:59AM -0400, Jason Gunthorpe wrote:

On Mon, Feb 14, 2022 at 03:23:07PM +0100, Joerg Roedel wrote:


Device drivers calling into iommu_attach_device() is seldom a good
idea.  In this case the sound device has some generic hardware
interface so that an existing sound driver can be re-used. Making this
driver call iommu-specific functions for some devices is something hard
to justify.


Er, so this is transparent to the generic sound device? I guess
something fixed up the dma_api on that device to keep working?


Right, this is completely transparent to the sound device. The IOMMU code
will not set dma_ops on the device because it uses a direct mapping and
so the standard implementation will be used.


But, then, the requirement is that nobody is using the dma API when we
make this change?


That is the tricky part. DMA-API keeps working after the change is made,
because the new domain is also direct mapped. The new domain just has
the ability to assign host page-tables to device PASIDs, so that DMA
requests with a PASID TLP will be remapped.

It was actually a requirement for this code that when it jumps in, the
DMA-API mappings stay live. And the reason a direct mapping is used at
all is that the page-table walker of the IOMMU is a two-dimensional
walker, which will treat the addresses found in the host page-tables as
IO-virtual and translate them through the underlying page-table. So to
use host-pagetables the underlying mapping must be direct mapped.


Given how things have evolved since that code was originally written, 
and that we seemingly now have the def_domain_type override kicking in 
as soon as we first see an IOMMUv2-capable device, do we even need to 
then subsequently switch to this special unmanaged domain with its 
pagetable sucked out, or could we just install the PASID table in the 
default domain itself?


Robin.


I don't think it matters how big/small the group is, only that when we
change the domain we know everything flowing through the domain is
still happy.


Yes, that matters. The group size matters too for DMA-API performance.
If two devices compete for the same lock in the allocator and/or the
same cached magazines, things will slow down. That only matters for
high-throughput devices, but still...

Regards,

Joerg




Re: [PATCH 1/9] dt-bindings: Add arm,mali-valhall compatible

2022-02-15 Thread Robin Murphy

On 2022-02-11 20:27, alyssa.rosenzw...@collabora.com wrote:

From: Alyssa Rosenzweig 

From the kernel's perspective, pre-CSF Valhall is more or less
compatible with Bifrost, although they differ to userspace. Add a
compatible for Valhall to the existing Bifrost bindings documentation.

Signed-off-by: Alyssa Rosenzweig 
Cc: devicet...@vger.kernel.org
---
  Documentation/devicetree/bindings/gpu/arm,mali-bifrost.yaml | 1 +
  1 file changed, 1 insertion(+)

diff --git a/Documentation/devicetree/bindings/gpu/arm,mali-bifrost.yaml 
b/Documentation/devicetree/bindings/gpu/arm,mali-bifrost.yaml
index 63a08f3f321d..48aeabd2ed68 100644
--- a/Documentation/devicetree/bindings/gpu/arm,mali-bifrost.yaml
+++ b/Documentation/devicetree/bindings/gpu/arm,mali-bifrost.yaml
@@ -23,6 +23,7 @@ properties:
- rockchip,px30-mali
- rockchip,rk3568-mali
- const: arm,mali-bifrost # Mali Bifrost GPU model/revision is fully 
discoverable
+  - const: arm,mali-valhall # Mali Valhall GPU model/revision is fully 
discoverable


This requires all existing Bifrost users to add the Valhall compatible 
as well - I don't think that's what you want. From what we tossed about 
on IRC the other week, I'd imagined something more in the shape of:


  compatible:
oneOf:
  - items:
  - enum:
  - vendor,soc-mali
  - ...
  - const: arm,mali-bifrost
  - items:
  - enum:
  - vendor,soc-mali
  - ...
  - const: arm,mali-valhall
  - const: arm,mali-bifrost #or not, depending on 
forward-compatibility preferences


Cheers,
Robin.


Re: [PATCH] drm/panfrost: Dynamically allocate pm_domains

2022-02-15 Thread Robin Murphy

On 2022-02-14 20:31, Alyssa Rosenzweig wrote:

MT8192 requires 5 power domains. Rather than bump MAX_PM_DOMAINS and
waste memory on every supported Panfrost chip, instead dynamically
allocate pm_domain_devs and pm_domain_links. This adds some flexibility;
it seems inevitable a new MediaTek device will require more than 5
domains.

On non-MediaTek devices, this saves a small amount of memory.

Suggested-by: AngeloGioacchino Del Regno 

Signed-off-by: Alyssa Rosenzweig 
---
  drivers/gpu/drm/panfrost/panfrost_device.c | 14 ++
  drivers/gpu/drm/panfrost/panfrost_device.h |  5 ++---
  2 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/panfrost/panfrost_device.c 
b/drivers/gpu/drm/panfrost/panfrost_device.c
index ee612303f076..661cdec320af 100644
--- a/drivers/gpu/drm/panfrost/panfrost_device.c
+++ b/drivers/gpu/drm/panfrost/panfrost_device.c
@@ -127,7 +127,10 @@ static void panfrost_pm_domain_fini(struct panfrost_device 
*pfdev)
  {
int i;
  
-	for (i = 0; i < ARRAY_SIZE(pfdev->pm_domain_devs); i++) {

+   if (!pfdev->pm_domain_devs || !pfdev->pm_domain_links)
+   return;
+
+   for (i = 0; i < pfdev->comp->num_pm_domains; i++) {
if (!pfdev->pm_domain_devs[i])
break;
  
@@ -161,9 +164,12 @@ static int panfrost_pm_domain_init(struct panfrost_device *pfdev)

return -EINVAL;
}
  
-	if (WARN(num_domains > ARRAY_SIZE(pfdev->pm_domain_devs),

-   "Too many supplies in compatible structure.\n"))
-   return -EINVAL;
+   pfdev->pm_domain_devs = devm_kcalloc(pfdev->dev, num_domains,
+sizeof(*pfdev->pm_domain_devs),
+GFP_KERNEL);
+   pfdev->pm_domain_links = devm_kcalloc(pfdev->dev, num_domains,
+ sizeof(*pfdev->pm_domain_links),
+ GFP_KERNEL);


Since we're not really doing any detailed management of our device 
links, could we get away with using AUTOREMOVE_CONSUMER instead of 
STATELESS to avoid having to explicitly keep track of them ourselves?


Robin.
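
For illustration, the kind of thing that suggestion might look like in panfrost_pm_domain_init() - an untested sketch, with the error label assumed from the surrounding code:

	/* Sketch: let the driver core drop the link on unbind, so there is
	 * no need to keep it in pfdev->pm_domain_links at all. */
	struct device_link *link = device_link_add(pfdev->dev,
						   pfdev->pm_domain_devs[i],
						   DL_FLAG_PM_RUNTIME |
						   DL_FLAG_RPM_ACTIVE |
						   DL_FLAG_AUTOREMOVE_CONSUMER);
	if (!link) {
		dev_err(pfdev->dev, "adding pm domain device link failed\n");
		goto err;
	}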

  
  	for (i = 0; i < num_domains; i++) {

pfdev->pm_domain_devs[i] =
diff --git a/drivers/gpu/drm/panfrost/panfrost_device.h 
b/drivers/gpu/drm/panfrost/panfrost_device.h
index 8b25278f34c8..98e3039696f9 100644
--- a/drivers/gpu/drm/panfrost/panfrost_device.h
+++ b/drivers/gpu/drm/panfrost/panfrost_device.h
@@ -22,7 +22,6 @@ struct panfrost_job;
  struct panfrost_perfcnt;
  
  #define NUM_JOB_SLOTS 3

-#define MAX_PM_DOMAINS 3
  
  struct panfrost_features {

u16 id;
@@ -87,8 +86,8 @@ struct panfrost_device {
struct regulator_bulk_data *regulators;
struct reset_control *rstc;
/* pm_domains for devices with more than one. */
-   struct device *pm_domain_devs[MAX_PM_DOMAINS];
-   struct device_link *pm_domain_links[MAX_PM_DOMAINS];
+   struct device **pm_domain_devs;
+   struct device_link **pm_domain_links;
bool coherent;
  
  	struct panfrost_features features;


Re: [PATCH 08/14] arm64: simplify access_ok()

2022-02-14 Thread Robin Murphy

On 2022-02-14 16:34, Arnd Bergmann wrote:

From: Arnd Bergmann 

arm64 has an inline asm implementation of access_ok() that is derived from
the 32-bit arm version and optimized for the case that both the limit and
the size are variable. With set_fs() gone, the limit is always constant,
and the size usually is as well, so just using the default implementation
reduces the check into a comparison against a constant that can be
scheduled by the compiler.


Aww, I still vividly remember the birth of this madness, sat with my 
phone on a Saturday morning waiting for my bike to be MOT'd, staring at 
the 7-instruction sequence that Mark and I had come up with and certain 
that it could be shortened still. Kinda sad to see it go, but at the 
same time, glad that it can.


Acked-by: Robin Murphy 


On a defconfig build, this saves over 28KB of .text.


Not to mention saving those "WTF is going on there... oh yeah, 
access_ok()" moments when looking through disassembly :)


Cheers,
Robin.
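
For context, the default __access_ok() that the patch switches to boils down to roughly this shape (simplified sketch of the generic version, from memory):

static inline int __access_ok(const void __user *ptr, unsigned long size)
{
	unsigned long limit = TASK_SIZE_MAX;
	unsigned long addr = (unsigned long)ptr;

	return (size <= limit) && (addr <= (limit - size));
}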


Signed-off-by: Arnd Bergmann 
---
  arch/arm64/include/asm/uaccess.h | 28 +---
  1 file changed, 5 insertions(+), 23 deletions(-)

diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h
index 357f7bd9c981..e8dce0cc5eaa 100644
--- a/arch/arm64/include/asm/uaccess.h
+++ b/arch/arm64/include/asm/uaccess.h
@@ -26,6 +26,8 @@
  #include 
  #include 
  
+static inline int __access_ok(const void __user *ptr, unsigned long size);

+
  /*
   * Test whether a block of memory is a valid user space address.
   * Returns 1 if the range is valid, 0 otherwise.
@@ -33,10 +35,8 @@
   * This is equivalent to the following test:
   * (u65)addr + (u65)size <= (u65)TASK_SIZE_MAX
   */
-static inline unsigned long __access_ok(const void __user *addr, unsigned long 
size)
+static inline int access_ok(const void __user *addr, unsigned long size)
  {
-   unsigned long ret, limit = TASK_SIZE_MAX - 1;
-
/*
 * Asynchronous I/O running in a kernel thread does not have the
 * TIF_TAGGED_ADDR flag of the process owning the mm, so always untag
@@ -46,27 +46,9 @@ static inline unsigned long __access_ok(const void __user 
*addr, unsigned long s
(current->flags & PF_KTHREAD || test_thread_flag(TIF_TAGGED_ADDR)))
addr = untagged_addr(addr);
  
-   __chk_user_ptr(addr);
-   asm volatile(
-   // A + B <= C + 1 for all A,B,C, in four easy steps:
-   //  1: X = A + B; X' = X % 2^64
-   "   adds    %0, %3, %2\n"
-   // 2: Set C = 0 if X > 2^64, to guarantee X' > C in step 4
-   "   csel    %1, xzr, %1, hi\n"
-   // 3: Set X' = ~0 if X >= 2^64. For X == 2^64, this decrements X'
-   //    to compensate for the carry flag being set in step 4. For
-   //    X > 2^64, X' merely has to remain nonzero, which it does.
-   "   csinv   %0, %0, xzr, cc\n"
-   // 4: For X < 2^64, this gives us X' - C - 1 <= 0, where the -1
-   //    comes from the carry in being clear. Otherwise, we are
-   //    testing X' - C == 0, subject to the previous adjustments.
-   "   sbcs    xzr, %0, %1\n"
-   "   cset    %0, ls\n"
-   : "=&r" (ret), "+r" (limit) : "Ir" (size), "0" (addr) : "cc");
-
-   return ret;
+   return likely(__access_ok(addr, size));
  }
-#define __access_ok __access_ok
+#define access_ok access_ok
  
  #include 
  



Re: [PATCH v1 1/8] iommu: Add iommu_group_replace_domain()

2022-02-14 Thread Robin Murphy

On 2022-02-14 14:56, Jason Gunthorpe via iommu wrote:

On Mon, Feb 14, 2022 at 02:10:19PM +, Robin Murphy wrote:

On 2022-02-14 12:45, Jason Gunthorpe wrote:

On Mon, Feb 14, 2022 at 12:09:36PM +, Robin Murphy wrote:

On 2022-01-06 02:20, Lu Baolu wrote:

Expose an interface to replace the domain of an iommu group for frameworks
like vfio which claims the ownership of the whole iommu group.


But if the underlying point is the new expectation that
iommu_{attach,detach}_device() operate on the device's whole group where
relevant, why should we invent some special mechanism for VFIO to be
needlessly inconsistent?

I said before that it's trivial for VFIO to resolve a suitable device if it
needs to; by now I've actually written the patch ;)

https://gitlab.arm.com/linux-arm/linux-rm/-/commit/9f37d8c17c9b606abc96e1f1001c0b97c8b93ed5


Er, how does locking work there? What keeps busdev from being
concurrently unplugged?


Same thing that prevents the bus pointer from suddenly becoming invalid in
the current code, I guess :)


Oooh, yes, that does look broken now too. :(


How can iommu_group_get() be safely called on
this pointer?


What matters is being able to call *other* device-based IOMMU API
interfaces in the long term.


Yes, this is what I mean, those are the ones that call
iommu_group_get().


All of the above only works normally inside a probe/remove context
where the driver core is blocking concurrent unplug and descruction.

I think I said this last time you brought it up that lifetime was the
challenge with this idea.


Indeed, but it's a challenge that needs tackling, because the bus-based
interfaces need to go away. So either we figure it out now and let this
attach interface rework benefit immediately, or I spend three times as long


IMHO your path is easier if you let VFIO stay with the group interface
and use something like:

domain = iommu_group_alloc_domain(group)

Which is what VFIO is trying to accomplish. Since Lu removed the only
other user of iommu_group_for_each_dev() it means we can de-export
that interface.

This works better because the iommu code can hold the internal group
while it finds the bus/device and then invokes the driver op. We don't
have a lifetime problem anymore under that lock.


That's certainly one of the cleaner possibilities - per the theme of 
this thread I'm not hugely keen on proliferating special VFIO-specific 
versions of IOMMU APIs, but trying to take the dev->mutex might be a bit 
heavy-handed and risky, and getting at the vfio_group->device_lock a bit 
fiddly, so if I can't come up with anything nicer or more general it 
might be a fair compromise.
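
For the record, a rough sketch of what such a helper could look like if it lived in drivers/iommu/iommu.c, where struct group_device is visible - hypothetical code, iommu_group_alloc_domain() is not an existing interface:

struct iommu_domain *iommu_group_alloc_domain(struct iommu_group *group)
{
	struct iommu_domain *domain = NULL;
	struct group_device *device;

	mutex_lock(&group->mutex);
	/* Any member device will do - they all sit behind the same IOMMU */
	device = list_first_entry_or_null(&group->devices,
					  struct group_device, list);
	if (device)
		domain = iommu_domain_alloc(device->dev->bus);
	mutex_unlock(&group->mutex);

	return domain;
}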



The remaining VFIO use of bus for iommu_capable() is better done
against the domain or the group object, as appropriate.


Indeed, although half the implementations of .capable are nonsense 
already, so I'm treating that one as a secondary priority for the moment 
(with an aim to come back afterwards and just try to kill it off as far 
as possible). RDMA and VFIO shouldn't be a serious concern for the kind 
of systems with heterogeneous IOMMUs at this point.



In the bigger picture, VFIO should stop doing
'iommu_group_alloc_domain' by moving the domain alloc to
VFIO_GROUP_GET_DEVICE_FD where we have a struct device to use.

We've already been experimenting with this for iommufd and the subtle
difference in the uapi doesn't seem relevant.


solving it on my own and end up deleting
iommu_group_replace_domain() in about 6 months' time anyway.


I expect this API to remain until we figure out a solution to the PPC
problem, and come up with an alternative way to change the attached
domain on the fly.


I though PPC wasn't using the IOMMU API at all... or is that the problem?

Thanks,
Robin.


Re: [PATCH v1 3/8] iommu: Extend iommu_at[de]tach_device() for multi-device groups

2022-02-14 Thread Robin Murphy

On 2022-02-14 14:39, Joerg Roedel wrote:

On Mon, Feb 14, 2022 at 09:03:13AM -0400, Jason Gunthorpe wrote:

Groups should disappear into an internal implementation detail, not be
so prominent in the API.


Not going to happen, IOMMU groups are ABI and todays device assignment
code, including user-space, relies on them.

Groups implement an important aspect of hardware IOMMUs that the API
cannot abstract away: that there are devices which share the same
request-id.

This is not an issue for devices concerned by iommufd, but for legacy
device assignment it is. The IOMMU-API needs to handle both in a clean
API, even if it means that drivers need to lookup the sub-group of a
device first.

And I don't see how a per-device API can handle both in a device-centric
way. For sure it is not making it 'device centric but operate on groups
under the hood'.


Arguably, iommu_attach_device() could be renamed something like 
iommu_attach_group_for_dev(), since that's effectively the semantic that 
all the existing API users want anyway (even VFIO at the high level - 
the group is the means for the user to assign their GPU/NIC/whatever 
device to their process, not the end in itself). That's just a lot more 
churn.
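
To illustrate the semantic (a hypothetical wrapper built purely from existing API calls, not proposed code):

static int iommu_attach_group_for_dev(struct iommu_domain *domain,
				      struct device *dev)
{
	struct iommu_group *group = iommu_group_get(dev);
	int ret;

	if (!group)
		return -ENODEV;

	ret = iommu_attach_group(domain, group);
	iommu_group_put(group);

	return ret;
}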


It's not that callers should be blind to the entire concept of groups 
altogether - they remain a significant reason why iommu_attach_device() 
might fail, for one thing - however what callers really shouldn't need 
to be bothered with is the exact *implementation* of groups. I do 
actually quite like the idea of refining the group abstraction into 
isolation groups as a superset of alias groups, but if anything that's a 
further argument for not having the guts of the current abstraction 
exposed in places that don't need to care - otherwise that would be 
liable to be a microcosm of this series in itself: widespread churn vs. 
"same name, new meaning" compromises.


Robin.


Re: [PATCH v2] iommu/iova: Separate out rcache init

2022-02-14 Thread Robin Murphy

On 2022-02-03 09:59, John Garry wrote:

Currently the rcache structures are allocated for all IOVA domains, even if
they do not use "fast" alloc+free interface. This is wasteful of memory.

In addition, fails in init_iova_rcaches() are not handled safely, which is
less than ideal.

Make "fast" users call a separate rcache init explicitly, which includes
error checking.


Reviewed-by: Robin Murphy 


Signed-off-by: John Garry 
---
Differences to v1:
- Drop stubs for iova_domain_init_rcaches() and iova_domain_free_rcaches()
- Use put_iova_domain() in vdpa code

diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index d85d54f2b549..b22034975301 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -525,6 +525,7 @@ static int iommu_dma_init_domain(struct iommu_domain 
*domain, dma_addr_t base,
struct iommu_dma_cookie *cookie = domain->iova_cookie;
unsigned long order, base_pfn;
struct iova_domain *iovad;
+   int ret;
  
  	if (!cookie || cookie->type != IOMMU_DMA_IOVA_COOKIE)

return -EINVAL;
@@ -559,6 +560,9 @@ static int iommu_dma_init_domain(struct iommu_domain 
*domain, dma_addr_t base,
}
  
  	init_iova_domain(iovad, 1UL << order, base_pfn);

+   ret = iova_domain_init_rcaches(iovad);
+   if (ret)
+   return ret;
  
  	/* If the FQ fails we can simply fall back to strict mode */

if (domain->type == IOMMU_DOMAIN_DMA_FQ && iommu_dma_init_fq(domain))
diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
index b28c9435b898..7e9c3a97c040 100644
--- a/drivers/iommu/iova.c
+++ b/drivers/iommu/iova.c
@@ -15,13 +15,14 @@
  /* The anchor node sits above the top of the usable address space */
  #define IOVA_ANCHOR   ~0UL
  
+#define IOVA_RANGE_CACHE_MAX_SIZE 6	/* log of max cached IOVA range size (in pages) */

+
  static bool iova_rcache_insert(struct iova_domain *iovad,
   unsigned long pfn,
   unsigned long size);
  static unsigned long iova_rcache_get(struct iova_domain *iovad,
 unsigned long size,
 unsigned long limit_pfn);
-static void init_iova_rcaches(struct iova_domain *iovad);
  static void free_cpu_cached_iovas(unsigned int cpu, struct iova_domain 
*iovad);
  static void free_iova_rcaches(struct iova_domain *iovad);
  
@@ -64,8 +65,6 @@ init_iova_domain(struct iova_domain *iovad, unsigned long granule,

	iovad->anchor.pfn_lo = iovad->anchor.pfn_hi = IOVA_ANCHOR;
	rb_link_node(&iovad->anchor.node, NULL, &iovad->rbroot.rb_node);
	rb_insert_color(&iovad->anchor.node, &iovad->rbroot);
-	cpuhp_state_add_instance_nocalls(CPUHP_IOMMU_IOVA_DEAD, &iovad->cpuhp_dead);
-	init_iova_rcaches(iovad);
  }
  EXPORT_SYMBOL_GPL(init_iova_domain);
  
@@ -488,6 +487,13 @@ free_iova_fast(struct iova_domain *iovad, unsigned long pfn, unsigned long size)

  }
  EXPORT_SYMBOL_GPL(free_iova_fast);
  
+static void iova_domain_free_rcaches(struct iova_domain *iovad)

+{
+   cpuhp_state_remove_instance_nocalls(CPUHP_IOMMU_IOVA_DEAD,
+   &iovad->cpuhp_dead);
+   free_iova_rcaches(iovad);
+}
+
  /**
   * put_iova_domain - destroys the iova domain
   * @iovad: - iova domain in question.
@@ -497,9 +503,9 @@ void put_iova_domain(struct iova_domain *iovad)
  {
struct iova *iova, *tmp;
  
-	cpuhp_state_remove_instance_nocalls(CPUHP_IOMMU_IOVA_DEAD,
-					    &iovad->cpuhp_dead);
-	free_iova_rcaches(iovad);
+	if (iovad->rcaches)
+		iova_domain_free_rcaches(iovad);
+
	rbtree_postorder_for_each_entry_safe(iova, tmp, &iovad->rbroot, node)
free_iova_mem(iova);
  }
@@ -608,6 +614,7 @@ EXPORT_SYMBOL_GPL(reserve_iova);
   */
  
  #define IOVA_MAG_SIZE 128

+#define MAX_GLOBAL_MAGS 32 /* magazines per bin */
  
  struct iova_magazine {

unsigned long size;
@@ -620,6 +627,13 @@ struct iova_cpu_rcache {
struct iova_magazine *prev;
  };
  
+struct iova_rcache {

+   spinlock_t lock;
+   unsigned long depot_size;
+   struct iova_magazine *depot[MAX_GLOBAL_MAGS];
+   struct iova_cpu_rcache __percpu *cpu_rcaches;
+};
+
  static struct iova_magazine *iova_magazine_alloc(gfp_t flags)
  {
return kzalloc(sizeof(struct iova_magazine), flags);
@@ -693,28 +707,54 @@ static void iova_magazine_push(struct iova_magazine *mag, 
unsigned long pfn)
mag->pfns[mag->size++] = pfn;
  }
  
-static void init_iova_rcaches(struct iova_domain *iovad)

+int iova_domain_init_rcaches(struct iova_domain *iovad)
  {
-   struct iova_cpu_rcache *cpu_rcache;
-   struct iova_rcache *rcache;
unsigned int cpu;
-   int i;
+   int i, ret;
+
+   iovad->rcaches = kcalloc(IOVA_RANGE_CACHE_MAX_SIZE,
+sizeof(struct iova_rcache),
+   

Re: [PATCH v1 1/8] iommu: Add iommu_group_replace_domain()

2022-02-14 Thread Robin Murphy

On 2022-02-14 12:45, Jason Gunthorpe wrote:

On Mon, Feb 14, 2022 at 12:09:36PM +, Robin Murphy wrote:

On 2022-01-06 02:20, Lu Baolu wrote:

Expose an interface to replace the domain of an iommu group for frameworks
like vfio which claims the ownership of the whole iommu group.


But if the underlying point is the new expectation that
iommu_{attach,detach}_device() operate on the device's whole group where
relevant, why should we invent some special mechanism for VFIO to be
needlessly inconsistent?

I said before that it's trivial for VFIO to resolve a suitable device if it
needs to; by now I've actually written the patch ;)

https://gitlab.arm.com/linux-arm/linux-rm/-/commit/9f37d8c17c9b606abc96e1f1001c0b97c8b93ed5


Er, how does locking work there? What keeps busdev from being
concurrently unplugged?


Same thing that prevents the bus pointer from suddenly becoming invalid 
in the current code, I guess :)


But yes, holding a group reference alone can't prevent the group itself 
from changing, and the finer points of locking still need working out - 
there's a reason you got a link to a WIP branch in my tree rather than a 
proper patch in your inbox (TBH at the moment that one represents about 
a 5:1 ratio of time spent on the reasoning behind the commit message vs. 
the implementation itself).



How can iommu_group_get() be safely called on
this pointer?


VFIO hardly needs to retrieve the iommu_group from a device which it 
derived from the iommu_group it holds in the first place. What matters 
is being able to call *other* device-based IOMMU API interfaces in the 
long term. And once a robust solution for that is in place, it should 
inevitably work for a device-based attach interface too.



All of the above only works normally inside a probe/remove context
where the driver core is blocking concurrent unplug and descruction.

I think I said this last time you brought it up that lifetime was the
challenge with this idea.


Indeed, but it's a challenge that needs tackling, because the bus-based 
interfaces need to go away. So either we figure it out now and let this 
attach interface rework benefit immediately, or I spend three times as 
long solving it on my own and end up deleting 
iommu_group_replace_domain() in about 6 months' time anyway.


Thanks,
Robin.


Re: [PATCH] iommu/ipmmu-vmsa: Check for error num after setting mask

2022-02-14 Thread Robin Murphy

On 2022-02-14 11:43, Joerg Roedel wrote:

Adding more potential reviewers.

On Thu, Jan 06, 2022 at 10:43:02AM +0800, Jiasheng Jiang wrote:

Because dma_supported() can fail, dma_set_mask_and_coherent() may
return an error. Therefore, it is better to check the return value and
propagate the error if it fails.


In this particular case it cannot fail on any system the driver actually 
runs on - it's a platform device so the dma_mask pointer is always 
initialised, then dma_direct_supported() on arm64 will always return 
true for any mask wider than 32 bits, while arm_dma_supported() will 
also always pass since a 32-bit system cannot have memory above 40 bits 
either.


There's no great harm in adding the check for the sake of consistency, I 
guess, but it's purely cosmetic and not fixing anything.


Thanks,
Robin.


Fixes: 1c894225bf5b ("iommu/ipmmu-vmsa: IPMMU device is 40-bit bus master")
Signed-off-by: Jiasheng Jiang 
---
  drivers/iommu/ipmmu-vmsa.c | 4 +++-
  1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/iommu/ipmmu-vmsa.c b/drivers/iommu/ipmmu-vmsa.c
index aaa6a4d59057..7df5da44a004 100644
--- a/drivers/iommu/ipmmu-vmsa.c
+++ b/drivers/iommu/ipmmu-vmsa.c
@@ -1003,7 +1003,9 @@ static int ipmmu_probe(struct platform_device *pdev)
bitmap_zero(mmu->ctx, IPMMU_CTX_MAX);
mmu->features = of_device_get_match_data(>dev);
memset(mmu->utlb_ctx, IPMMU_CTX_INVALID, mmu->features->num_utlbs);
-   dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(40));
+   ret = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(40));
+   if (ret)
+   return ret;
  
  	/* Map I/O memory and request IRQ. */

res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
--
2.25.1



Re: [PATCH v1 1/8] iommu: Add iommu_group_replace_domain()

2022-02-14 Thread Robin Murphy

On 2022-01-06 02:20, Lu Baolu wrote:

Expose an interface to replace the domain of an iommu group for frameworks
like vfio which claims the ownership of the whole iommu group.


But if the underlying point is the new expectation that 
iommu_{attach,detach}_device() operate on the device's whole group where 
relevant, why should we invent some special mechanism for VFIO to be 
needlessly inconsistent?


I said before that it's trivial for VFIO to resolve a suitable device if 
it needs to; by now I've actually written the patch ;)


https://gitlab.arm.com/linux-arm/linux-rm/-/commit/9f37d8c17c9b606abc96e1f1001c0b97c8b93ed5

Robin.


Signed-off-by: Lu Baolu 
---
  include/linux/iommu.h | 10 ++
  drivers/iommu/iommu.c | 37 +
  2 files changed, 47 insertions(+)

diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 408a6d2b3034..66ebce3d1e11 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -677,6 +677,9 @@ void iommu_device_unuse_dma_api(struct device *dev);
  int iommu_group_set_dma_owner(struct iommu_group *group, void *owner);
  void iommu_group_release_dma_owner(struct iommu_group *group);
  bool iommu_group_dma_owner_claimed(struct iommu_group *group);
+int iommu_group_replace_domain(struct iommu_group *group,
+  struct iommu_domain *old,
+  struct iommu_domain *new);
  
  #else /* CONFIG_IOMMU_API */
  
@@ -1090,6 +1093,13 @@ static inline bool iommu_group_dma_owner_claimed(struct iommu_group *group)

  {
return false;
  }
+
+static inline int
+iommu_group_replace_domain(struct iommu_group *group, struct iommu_domain *old,
+  struct iommu_domain *new)
+{
+   return -ENODEV;
+}
  #endif /* CONFIG_IOMMU_API */
  
  /**

diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 72a95dea688e..ab8ab95969f5 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -3431,3 +3431,40 @@ bool iommu_group_dma_owner_claimed(struct iommu_group 
*group)
return user;
  }
  EXPORT_SYMBOL_GPL(iommu_group_dma_owner_claimed);
+
+/**
+ * iommu_group_replace_domain() - Replace group's domain
+ * @group: The group.
+ * @old: The previous attached domain. NULL for none.
+ * @new: The new domain about to be attached.
+ *
+ * This is to support backward compatibility for vfio which manages the dma
+ * ownership in iommu_group level.
+ */
+int iommu_group_replace_domain(struct iommu_group *group,
+  struct iommu_domain *old,
+  struct iommu_domain *new)
+{
+   int ret = 0;
+
+   mutex_lock(&group->mutex);
+   if (!group->owner || group->domain != old) {
+   ret = -EPERM;
+   goto unlock_out;
+   }
+
+   if (old)
+   __iommu_detach_group(old, group);
+
+   if (new) {
+   ret = __iommu_attach_group(new, group);
+   if (ret && old)
+   __iommu_attach_group(old, group);
+   }
+
+unlock_out:
+   mutex_unlock(&group->mutex);
+
+   return ret;
+}
+EXPORT_SYMBOL_GPL(iommu_group_replace_domain);



Re: Error when running fio against nvme-of rdma target (mlx5 driver)

2022-02-11 Thread Robin Murphy

On 2022-02-10 23:58, Martin Oliveira wrote:

On 2/9/22 1:41 AM, Chaitanya Kulkarni wrote:

On 2/8/22 6:50 PM, Martin Oliveira wrote:

Hello,

We have been hitting an error when running IO over our nvme-of setup, using the 
mlx5 driver and we are wondering if anyone has seen anything similar/has any 
suggestions.

Both initiator and target are AMD EPYC 7502 machines connected over RDMA using 
a Mellanox MT28908. Target has 12 NVMe SSDs which are exposed as a single NVMe 
fabrics device, one physical SSD per namespace.



Thanks for reporting this, if you can bisect the problem on your setup
it will help others to help you better.

-ck


Hi Chaitanya,

I went back to a kernel as old as 4.15 and the problem was still there, so I 
don't know of a good commit to start from.

I also learned that I can reproduce this with as little as 3 cards and I 
updated the firmware on the Mellanox cards to the latest version.

I'd be happy to try any tests if someone has any suggestions.


The IOMMU is probably your friend here - one thing that might be worth 
trying is capturing the iommu:map and iommu:unmap tracepoints to see if 
the address reported in subsequent IOMMU faults was previously mapped as 
a valid DMA address (be warned that there will likely be a *lot* of 
trace generated). With 5.13 or newer, booting with "iommu.forcedac=1" 
should also make it easier to tell real DMA IOVAs from rogue physical 
addresses or other nonsense, as real DMA addresses should then look more 
like 0x24d08000.


That could at least help narrow down whether it's some kind of 
use-after-free race or a completely bogus address creeping in somehow.


Robin.


[PATCH] iommu: Remove trivial ops->capable implementations

2022-02-10 Thread Robin Murphy
Implementing ops->capable to always return false is pointless since it's
the default behaviour anyway. Clean up the unnecessary implementations.

Signed-off-by: Robin Murphy 
---

Spinning this out of my bus ops stuff (currently 30 patches and
counting...) since it would be better off alongside Baolu's cleanup
series to avoid conflicts, and I want to depend on those patches for
dev_iommu_ops() anyway.
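
For context, the core fallback that makes those implementations redundant looks roughly like this (paraphrased sketch of iommu_capable(), details may differ):

bool iommu_capable(struct bus_type *bus, enum iommu_cap cap)
{
	if (!bus->iommu_ops || !bus->iommu_ops->capable)
		return false;

	return bus->iommu_ops->capable(cap);
}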

 drivers/iommu/msm_iommu.c  | 6 --
 drivers/iommu/tegra-gart.c | 6 --
 drivers/iommu/tegra-smmu.c | 6 --
 3 files changed, 18 deletions(-)

diff --git a/drivers/iommu/msm_iommu.c b/drivers/iommu/msm_iommu.c
index 06bde6b66732..22061ddbd5df 100644
--- a/drivers/iommu/msm_iommu.c
+++ b/drivers/iommu/msm_iommu.c
@@ -558,11 +558,6 @@ static phys_addr_t msm_iommu_iova_to_phys(struct 
iommu_domain *domain,
return ret;
 }
 
-static bool msm_iommu_capable(enum iommu_cap cap)
-{
-   return false;
-}
-
 static void print_ctx_regs(void __iomem *base, int ctx)
 {
unsigned int fsr = GET_FSR(base, ctx);
@@ -672,7 +667,6 @@ irqreturn_t msm_iommu_fault_handler(int irq, void *dev_id)
 }
 
 static struct iommu_ops msm_iommu_ops = {
-   .capable = msm_iommu_capable,
.domain_alloc = msm_iommu_domain_alloc,
.domain_free = msm_iommu_domain_free,
.attach_dev = msm_iommu_attach_dev,
diff --git a/drivers/iommu/tegra-gart.c b/drivers/iommu/tegra-gart.c
index 6a358f92c7e5..bbd287d19324 100644
--- a/drivers/iommu/tegra-gart.c
+++ b/drivers/iommu/tegra-gart.c
@@ -238,11 +238,6 @@ static phys_addr_t gart_iommu_iova_to_phys(struct 
iommu_domain *domain,
return pte & GART_PAGE_MASK;
 }
 
-static bool gart_iommu_capable(enum iommu_cap cap)
-{
-   return false;
-}
-
 static struct iommu_device *gart_iommu_probe_device(struct device *dev)
 {
if (!dev_iommu_fwspec_get(dev))
@@ -276,7 +271,6 @@ static void gart_iommu_sync(struct iommu_domain *domain,
 }
 
 static const struct iommu_ops gart_iommu_ops = {
-   .capable= gart_iommu_capable,
.domain_alloc   = gart_iommu_domain_alloc,
.domain_free= gart_iommu_domain_free,
.attach_dev = gart_iommu_attach_dev,
diff --git a/drivers/iommu/tegra-smmu.c b/drivers/iommu/tegra-smmu.c
index e900e3c46903..43df44f918a1 100644
--- a/drivers/iommu/tegra-smmu.c
+++ b/drivers/iommu/tegra-smmu.c
@@ -272,11 +272,6 @@ static void tegra_smmu_free_asid(struct tegra_smmu *smmu, 
unsigned int id)
clear_bit(id, smmu->asids);
 }
 
-static bool tegra_smmu_capable(enum iommu_cap cap)
-{
-   return false;
-}
-
 static struct iommu_domain *tegra_smmu_domain_alloc(unsigned type)
 {
struct tegra_smmu_as *as;
@@ -967,7 +962,6 @@ static int tegra_smmu_of_xlate(struct device *dev,
 }
 
 static const struct iommu_ops tegra_smmu_ops = {
-   .capable = tegra_smmu_capable,
.domain_alloc = tegra_smmu_domain_alloc,
.domain_free = tegra_smmu_domain_free,
.attach_dev = tegra_smmu_attach_dev,
-- 
2.28.0.dirty



Re: [PATCH] iommu: explicitly check for NULL in iommu_dma_get_resv_regions()

2022-02-10 Thread Robin Murphy

On 2022-02-09 14:09, Aleksandr Fedorov wrote:

iommu_dma_get_resv_regions() assumes that the iommu_fwspec field for the
corresponding device is set, which is not always true.  Since
iommu_dma_get_resv_regions() seems to be a future-proof generic API
that can be used by any IOMMU driver, add an explicit check for NULL.


Except it's not a "generic" interface for drivers to call at random, 
it's a helper for retrieving common firmware-based information 
specifically for drivers already using the fwspec mechanism for common 
firmware bindings. If any driver calls this with a device *without* a 
valid fwnode, it deserves to crash because it's done something 
fundamentally wrong.


I concur that it's not exactly obvious that "non-IOMMU-specific" means 
"based on common firmware bindings, thus implying fwspec".


Robin.


Currently it can work by accident, since the compiler can eliminate
the 'iommu_fwspec' check altogether when CONFIG_ACPI_IORT=n, but
relying on such optimization-dependent elimination is not reliable.

Signed-off-by: Aleksandr Fedorov 
---
A compilation failure has been observed on a gcc-compatible compiler based on 
EDG.

diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index d85d54f2b549..474b1b7211d7 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -382,10 +382,10 @@ void iommu_put_dma_cookie(struct iommu_domain *domain)
   */
  void iommu_dma_get_resv_regions(struct device *dev, struct list_head *list)
  {
+   struct iommu_fwspec *iommu_fwspec = dev_iommu_fwspec_get(dev);
  
-	if (!is_of_node(dev_iommu_fwspec_get(dev)->iommu_fwnode))

+   if (iommu_fwspec && !is_of_node(iommu_fwspec->iommu_fwnode))
iort_iommu_msi_get_resv_regions(dev, list);
-
  }
  EXPORT_SYMBOL(iommu_dma_get_resv_regions);
  


Re: [PATCH v2 08/10] iommu: Remove unused argument in is_attach_deferred

2022-02-09 Thread Robin Murphy

On 2022-02-08 01:25, Lu Baolu wrote:

The is_attach_deferred iommu_ops callback is a device op. The domain
argument is unnecessary and never used. Remove it to make code clean.

Suggested-by: Robin Murphy 
Signed-off-by: Lu Baolu 
---
  include/linux/iommu.h |  2 +-
  drivers/iommu/amd/amd_iommu.h |  3 +--
  drivers/iommu/amd/iommu.c |  3 +--
  drivers/iommu/amd/iommu_v2.c  |  2 +-
  drivers/iommu/intel/iommu.c   |  3 +--
  drivers/iommu/iommu.c | 15 ++-
  6 files changed, 11 insertions(+), 17 deletions(-)

diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index eb2684f95018..47ca7eca5d7b 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -269,7 +269,7 @@ struct iommu_ops {
void (*put_resv_regions)(struct device *dev, struct list_head *list);
  
  	int (*of_xlate)(struct device *dev, struct of_phandle_args *args);

-   bool (*is_attach_deferred)(struct iommu_domain *domain, struct device 
*dev);
+   bool (*is_attach_deferred)(struct device *dev);
  
  	/* Per device IOMMU features */

bool (*dev_has_feat)(struct device *dev, enum iommu_dev_features f);
diff --git a/drivers/iommu/amd/amd_iommu.h b/drivers/iommu/amd/amd_iommu.h
index 416815a525d6..3b2f06b7aca6 100644
--- a/drivers/iommu/amd/amd_iommu.h
+++ b/drivers/iommu/amd/amd_iommu.h
@@ -116,8 +116,7 @@ void amd_iommu_domain_clr_pt_root(struct protection_domain 
*domain)
  
  
  extern bool translation_pre_enabled(struct amd_iommu *iommu);

-extern bool amd_iommu_is_attach_deferred(struct iommu_domain *domain,
-struct device *dev);
+extern bool amd_iommu_is_attach_deferred(struct device *dev);
  extern int __init add_special_device(u8 type, u8 id, u16 *devid,
 bool cmd_line);
  
diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c

index 461f1844ed1f..37f2fbb4b129 100644
--- a/drivers/iommu/amd/iommu.c
+++ b/drivers/iommu/amd/iommu.c
@@ -2215,8 +2215,7 @@ static void amd_iommu_get_resv_regions(struct device *dev,
list_add_tail(>list, head);
  }
  
-bool amd_iommu_is_attach_deferred(struct iommu_domain *domain,

- struct device *dev)
+bool amd_iommu_is_attach_deferred(struct device *dev)
  {
struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev);
  
diff --git a/drivers/iommu/amd/iommu_v2.c b/drivers/iommu/amd/iommu_v2.c

index 58da08cc3d01..7c94ec05d289 100644
--- a/drivers/iommu/amd/iommu_v2.c
+++ b/drivers/iommu/amd/iommu_v2.c
@@ -537,7 +537,7 @@ static int ppr_notifier(struct notifier_block *nb, unsigned 
long e, void *data)
ret = NOTIFY_DONE;
  
  	/* In kdump kernel pci dev is not initialized yet -> send INVALID */

-   if (amd_iommu_is_attach_deferred(NULL, >dev)) {
+   if (amd_iommu_is_attach_deferred(>dev)) {
amd_iommu_complete_ppr(pdev, iommu_fault->pasid,
   PPR_INVALID, tag);
goto out;
diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index 2b5f4e57a8bb..80f1294be634 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -5052,8 +5052,7 @@ intel_iommu_dev_disable_feat(struct device *dev, enum 
iommu_dev_features feat)
}
  }
  
-static bool intel_iommu_is_attach_deferred(struct iommu_domain *domain,

-  struct device *dev)
+static bool intel_iommu_is_attach_deferred(struct device *dev)
  {
return attach_deferred(dev);


Seems like there's no need to wrap this now?

Robin.


  }
diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index de593a9cc09c..3dff30ddebdd 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -825,13 +825,12 @@ static int iommu_create_device_direct_mappings(struct 
iommu_group *group,
return ret;
  }
  
-static bool iommu_is_attach_deferred(struct iommu_domain *domain,

-struct device *dev)
+static bool iommu_is_attach_deferred(struct device *dev)
  {
const struct iommu_ops *ops = dev_iommu_ops(dev);
  
  	if (ops->is_attach_deferred)

-   return ops->is_attach_deferred(domain, dev);
+   return ops->is_attach_deferred(dev);
  
  	return false;

  }
@@ -888,7 +887,7 @@ int iommu_group_add_device(struct iommu_group *group, 
struct device *dev)
  
  	mutex_lock(>mutex);

list_add_tail(>list, >devices);
-   if (group->domain  && !iommu_is_attach_deferred(group->domain, dev))
+   if (group->domain  && !iommu_is_attach_deferred(dev))
ret = __iommu_attach_device(group->domain, dev);
mutex_unlock(>mutex);
if (ret)
@@ -1743,7 +1742,7 @@ static int iommu_group_do_dma_attach(struct device *dev, 
void *data)
struct iommu_domain *domain = data;
int ret = 0;
  
-	if (!iommu_is_attach_deferred(domain, dev))

Re: Error when running fio against nvme-of rdma target (mlx5 driver)

2022-02-09 Thread Robin Murphy

On 2022-02-09 02:50, Martin Oliveira wrote:

Hello,

We have been hitting an error when running IO over our nvme-of setup, using the 
mlx5 driver and we are wondering if anyone has seen anything similar/has any 
suggestions.

Both initiator and target are AMD EPYC 7502 machines connected over RDMA using 
a Mellanox MT28908. Target has 12 NVMe SSDs which are exposed as a single NVMe 
fabrics device, one physical SSD per namespace.

When running an fio job targeting directly the fabrics devices (no filesystem, 
see script at the end), within a minute or so we start seeing errors like this:

[  408.368677] mlx5_core :c1:00.0: AMD-Vi: Event logged [IO_PAGE_FAULT 
domain=0x002f address=0x24d08000 flags=0x]
[  408.372201] infiniband mlx5_0: mlx5_handle_error_cqe:332:(pid 0): WC error: 
4, Message: local protection error
[  408.380181] infiniband mlx5_0: dump_cqe:272:(pid 0): dump error cqe
[  408.380187] : 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
[  408.380189] 0010: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
[  408.380191] 0020: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
[  408.380192] 0030: 00 00 00 00 a9 00 56 04 00 00 01 e9 00 54 e8 e2
[  408.380230] nvme nvme15: RECV for CQE 0xce392ed9 failed with status 
local protection error (4)
[  408.380235] nvme nvme15: starting error recovery
[  408.380238] nvme_ns_head_submit_bio: 726 callbacks suppressed
[  408.380246] block nvme15n2: no usable path - requeuing I/O
[  408.380284] block nvme15n5: no usable path - requeuing I/O
[  408.380298] block nvme15n1: no usable path - requeuing I/O
[  408.380304] block nvme15n11: no usable path - requeuing I/O
[  408.380304] block nvme15n11: no usable path - requeuing I/O
[  408.380330] block nvme15n1: no usable path - requeuing I/O
[  408.380350] block nvme15n2: no usable path - requeuing I/O
[  408.380371] block nvme15n6: no usable path - requeuing I/O
[  408.380377] block nvme15n6: no usable path - requeuing I/O
[  408.380382] block nvme15n4: no usable path - requeuing I/O
[  408.380472] mlx5_core :c1:00.0: AMD-Vi: Event logged [IO_PAGE_FAULT 
domain=0x002f address=0x24d09000 flags=0x]
[  408.391265] mlx5_core :c1:00.0: AMD-Vi: Event logged [IO_PAGE_FAULT 
domain=0x002f address=0x24d0a000 flags=0x]
[  415.125967] nvmet: ctrl 1 keep-alive timer (5 seconds) expired!
[  415.131898] nvmet: ctrl 1 fatal error occurred!

Occasionally, we've seen the following stack trace:


FWIW this is indicative the scatterlist passed to dma_unmap_sg_attrs() 
was wrong - specifically it looks like an attempt to unmap a region 
that's already unmapped (or was never mapped in the first place). 
Whatever race or data corruption issue is causing that is almost 
certainly happening much earlier, since the IO_PAGE_FAULT logs further 
imply that either some pages have been spuriously unmapped while the 
device was still accessing them, or some DMA address in the scatterlist 
was already bogus by the time it was handed off to the device.
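
To illustrate the invariant being violated, here is a minimal sketch of
correct streaming-DMA scatterlist usage (a generic example, not the
nvmet/rdma code itself; error handling trimmed):

        int nents_mapped;

        nents_mapped = dma_map_sg(dev, sgl, nents, DMA_FROM_DEVICE);
        if (nents_mapped == 0)
                return -EIO;

        /* ... hand sgl off to the device, wait for completion ... */

        /*
         * Unmap exactly once, with the *same* sgl and the original nents,
         * while the entries still hold the DMA addresses that were mapped.
         */
        dma_unmap_sg(dev, sgl, nents, DMA_FROM_DEVICE);

Unmapping entries that were never mapped, were already unmapped, or were
modified in the meantime is the kind of thing that can trip the BUG() in
the IOMMU page-table code seen below.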


Robin.


[ 1158.152464] kernel BUG at drivers/iommu/amd/io_pgtable.c:485!
[ 1158.427696] invalid opcode:  [#1] SMP NOPTI
[ 1158.432228] CPU: 51 PID: 796 Comm: kworker/51:1H Tainted: P   OE 
5.13.0-eid-athena-g6fb4e704d11c-dirty #14
[ 1158.443867] Hardware name: GIGABYTE R272-Z32-00/MZ32-AR0-00, BIOS R21 
10/08/2020
[ 1158.451252] Workqueue: ib-comp-wq ib_cq_poll_work [ib_core]
[ 1158.456884] RIP: 0010:iommu_v1_unmap_page+0xed/0x100
[ 1158.461849] Code: 48 8b 45 d0 65 48 33 04 25 28 00 00 00 75 1d 48 83 c4 10 4c 89 
f0 5b 41 5c 41 5d 41 5e 41 5f 5d c3 49 8d 46 ff 4c 85 f0 74 d6 <0f> 0b e8 1c 38 
46 00 66 66 2e 0f 1f 84 00 00 00 00 00 90 0f 1f 44
[ 1158.480589] RSP: 0018:abb520587bd0 EFLAGS: 00010206
[ 1158.485812] RAX: 000100061fff RBX: 0010 RCX: 0027
[ 1158.492938] RDX: 30562000 RSI:  RDI: 
[ 1158.500071] RBP: abb520587c08 R08: abb520587bd0 R09: 
[ 1158.507202] R10: 0001 R11: 000ff000 R12: 9984abd9e318
[ 1158.514326] R13: 9984abd9e310 R14: 000100062000 R15: 0001
[ 1158.521452] FS:  () GS:99a36c8c() 
knlGS:
[ 1158.529540] CS:  0010 DS:  ES:  CR0: 80050033
[ 1158.535286] CR2: 7f75b04f1000 CR3: 0001eddd8000 CR4: 00350ee0
[ 1158.542419] Call Trace:
[ 1158.544877]  amd_iommu_unmap+0x2c/0x40
[ 1158.548653]  __iommu_unmap+0xc4/0x170
[ 1158.552344]  iommu_unmap_fast+0xe/0x10
[ 1158.556100]  __iommu_dma_unmap+0x85/0x120
[ 1158.560115]  iommu_dma_unmap_sg+0x95/0x110
[ 1158.564213]  dma_unmap_sg_attrs+0x42/0x50
[ 1158.568225]  rdma_rw_ctx_destroy+0x6e/0xc0 [ib_core]
[ 1158.573201]  nvmet_rdma_rw_ctx_destroy+0xa7/0xc0 [nvmet_rdma]
[ 1158.578944]  nvmet_rdma_read_data_done+0x5c/0xf0 [nvmet_rdma]
[ 1158.584683]  __ib_process_cq+0x8e/0x150 [ib_core]
[ 1158.589398]  ib_cq_poll_work+0x2b/0x80 [ib_core]
[ 1158.594027]  

Re: [PATCH 1/2] iommu/arm-smmu: Use platform_irq_count() to get the interrupt count

2022-02-08 Thread Robin Murphy

On 2022-02-08 15:19, Will Deacon wrote:

On Thu, Dec 23, 2021 at 02:14:35PM +, Robin Murphy wrote:

On 2021-12-23 13:00, Lad Prabhakar wrote:

platform_get_resource(pdev, IORESOURCE_IRQ, ..) relies on static
allocation of IRQ resources in the DT core code. This causes an issue
when using hierarchical interrupt domains with the "interrupts" property
in the node, as it bypasses the hierarchical setup and messes up the
IRQ chaining.

In preparation for removal of static setup of IRQ resource from DT core
code use platform_get_irq_count().


Nit: platform_irq_count()


Signed-off-by: Lad Prabhakar 
---
   drivers/iommu/arm/arm-smmu/arm-smmu.c | 12 ++--
   1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c 
b/drivers/iommu/arm/arm-smmu/arm-smmu.c
index 4bc75c4ce402..4844cd075644 100644
--- a/drivers/iommu/arm/arm-smmu/arm-smmu.c
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c
@@ -2105,12 +2105,12 @@ static int arm_smmu_device_probe(struct platform_device 
*pdev)
if (IS_ERR(smmu))
return PTR_ERR(smmu);
-   num_irqs = 0;
-   while ((res = platform_get_resource(pdev, IORESOURCE_IRQ, num_irqs))) {
-   num_irqs++;
-   if (num_irqs > smmu->num_global_irqs)
-   smmu->num_context_irqs++;
-   }
+   num_irqs = platform_irq_count(pdev);
+   if (num_irqs < 0)
+   return num_irqs;
+
+   if (num_irqs > smmu->num_global_irqs)
+   smmu->num_context_irqs += (num_irqs - smmu->num_global_irqs);


This seems a bit overcomplicated. I reckon:

smmu->num_context_irqs = num_irqs - smmu->num_global_irqs;
if (num_irqs <= smmu->num_global_irqs) {
dev_err(...

should do it.
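
Spelled out a little more fully, that suggestion might end up looking
something like this (a sketch only - the error message is modelled on the
one already in the probe path and may not match any final patch):

        num_irqs = platform_irq_count(pdev);
        if (num_irqs < 0)
                return num_irqs;

        if (num_irqs <= smmu->num_global_irqs) {
                dev_err(dev, "found %d interrupts but expected at least %d\n",
                        num_irqs, smmu->num_global_irqs + 1);
                return -ENODEV;
        }
        smmu->num_context_irqs = num_irqs - smmu->num_global_irqs;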

However, FYI I have some patches refactoring most of the IRQ stuff here that
I plan to post next cycle (didn't quite have time to get them done for 5.17
as I'd hoped...), so unless this needs to go in right now as an urgent fix,
I'm happy to take care of removing platform_get_resource() as part of that
if it's easier.


Did you get anywhere with this? December 23rd is long forgotten by now ;)


Yup: 
https://gitlab.arm.com/linux-arm/linux-rm/-/commit/b2a40caaf1622eb35c555074a0d72f4f0513cff9


I'm still cleaning up the PMU driver that justifies the whole thing, but 
I can send that patch now if you reckon it's not complete nonsense out 
of context. Otherwise, I'll aim to get the whole lot out next week.



Cheers,
Robin.


Re: [PATCH v2 1/1] iommu/dma: Use DMA ops setter instead of direct assignment

2022-02-07 Thread Robin Murphy

On 2022-02-07 14:13, Andy Shevchenko wrote:

Use the DMA ops setter instead of direct assignment. Even though we know that
this module doesn't otherwise access the dma_ops member of struct device,
it's better to use the setter to avoid potential problems in the future.


What potential problems are you imagining? This whole file is a DMA ops 
implementation, not driver code (and definitely not a module); if anyone 
removes the "select DMA_OPS" from CONFIG_IOMMU_DMA they deserve whatever 
breakage they get.


I concur that there's no major harm in using the helper here, but I also 
see no point in pretending that there's any value to abstracting the 
operation in this particular context.


Thanks,
Robin.


Signed-off-by: Andy Shevchenko 
---
v2: rebased on top of the latest codebase
  drivers/iommu/dma-iommu.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index d85d54f2b549..b585a3fdbc56 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -1482,7 +1482,7 @@ void iommu_setup_dma_ops(struct device *dev, u64 
dma_base, u64 dma_limit)
if (iommu_is_dma_domain(domain)) {
if (iommu_dma_init_domain(domain, dma_base, dma_limit, dev))
goto out_err;
-   dev->dma_ops = _dma_ops;
+   set_dma_ops(dev, _dma_ops);
}
  
  	return;



Re: [PATCH 2/2] iommu/arm-smmu-v3: Simplify memory allocation

2022-02-07 Thread Robin Murphy

On 2022-02-05 17:11, Christophe JAILLET wrote:

Use devm_bitmap_zalloc() instead of hand writing it.


Heh, that reminds me that I have more or less the same patch sat locally 
somewhere, except IIRC I took it further and removed the unhelpful error 
message and pruned the local variables as well - I think that would 
still be my preference here (or I could dig out my patch and post it if 
you like).
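
For the record, the further simplification described above might look
roughly like this (a sketch, not the actual local patch):

        cmdq->valid_map = (atomic_long_t *)devm_bitmap_zalloc(smmu->dev, nents,
                                                               GFP_KERNEL);
        if (!cmdq->valid_map)
                return -ENOMEM;

        return 0;

i.e. no intermediate 'bitmap' variable, no 'ret', and no dev_err() for a
plain allocation failure, since the allocator already warns in that case.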


Cheers,
Robin.


Signed-off-by: Christophe JAILLET 
---
This is NOT compile tested.
I don't have the needed cross compiling tools.
---
  drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 10 ++
  1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c 
b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
index 14d06aad0726..ba0e7f1f7dbf 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
@@ -2911,12 +2911,6 @@ static int arm_smmu_init_one_queue(struct 
arm_smmu_device *smmu,
return 0;
  }
  
-static void arm_smmu_cmdq_free_bitmap(void *data)

-{
-   unsigned long *bitmap = data;
-   bitmap_free(bitmap);
-}
-
  static int arm_smmu_cmdq_init(struct arm_smmu_device *smmu)
  {
int ret = 0;
@@ -2927,13 +2921,13 @@ static int arm_smmu_cmdq_init(struct arm_smmu_device 
*smmu)
atomic_set(>owner_prod, 0);
atomic_set(>lock, 0);
  
-	bitmap = (atomic_long_t *)bitmap_zalloc(nents, GFP_KERNEL);

+   bitmap = (atomic_long_t *)devm_bitmap_zalloc(smmu->dev, nents,
+GFP_KERNEL);
if (!bitmap) {
dev_err(smmu->dev, "failed to allocate cmdq bitmap\n");
ret = -ENOMEM;
} else {
cmdq->valid_map = bitmap;
-   devm_add_action(smmu->dev, arm_smmu_cmdq_free_bitmap, bitmap);
}
  
  	return ret;



Re: [PATCH 1/2] iommu/arm-smmu-v3: Avoid open coded arithmetic in memory allocation

2022-02-07 Thread Robin Murphy

On 2022-02-05 17:10, Christophe JAILLET wrote:

kmalloc_array()/kcalloc() should be used to avoid potential overflow when
a multiplication is needed to compute the size of the requested memory.

So turn a devm_kzalloc()+explicit size computation into an equivalent
devm_kcalloc().

Signed-off-by: Christophe JAILLET 
---
This is NOT compile tested.
I don't have the needed cross compiling tools.


FYI, https://cdn.kernel.org/pub/tools/crosstool/

Either way, the patch looks reasonable, thanks!

Acked-by: Robin Murphy 


---
  drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 4 ++--
  1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c 
b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
index 6dc6d8b6b368..14d06aad0726 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
@@ -2981,10 +2981,10 @@ static int arm_smmu_init_l1_strtab(struct 
arm_smmu_device *smmu)
  {
unsigned int i;
struct arm_smmu_strtab_cfg *cfg = >strtab_cfg;
-   size_t size = sizeof(*cfg->l1_desc) * cfg->num_l1_ents;
void *strtab = smmu->strtab_cfg.strtab;
  
-	cfg->l1_desc = devm_kzalloc(smmu->dev, size, GFP_KERNEL);

+   cfg->l1_desc = devm_kcalloc(smmu->dev, cfg->num_l1_ents,
+   sizeof(*cfg->l1_desc), GFP_KERNEL);
if (!cfg->l1_desc)
return -ENOMEM;
  



[PATCH] iommu/rockchip: : Use standard driver registration

2022-02-04 Thread Robin Murphy
It's been a long time since there was any reason to register IOMMU
drivers early. Convert to the standard platform driver helper.

CC: Heiko Stuebner 
Signed-off-by: Robin Murphy 
---
 drivers/iommu/rockchip-iommu.c | 7 +--
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/drivers/iommu/rockchip-iommu.c b/drivers/iommu/rockchip-iommu.c
index 7f23ad61c094..204a93a72572 100644
--- a/drivers/iommu/rockchip-iommu.c
+++ b/drivers/iommu/rockchip-iommu.c
@@ -1407,9 +1407,4 @@ static struct platform_driver rk_iommu_driver = {
   .suppress_bind_attrs = true,
},
 };
-
-static int __init rk_iommu_init(void)
-{
-   return platform_driver_register(_iommu_driver);
-}
-subsys_initcall(rk_iommu_init);
+builtin_platform_driver(rk_iommu_driver);
-- 
2.28.0.dirty



[PATCH] iommu/msm: Use standard driver registration

2022-02-04 Thread Robin Murphy
It's been a long time since there was any reason to register IOMMU
drivers early. Convert to the standard platform driver helper.

CC: Andy Gross 
CC: Bjorn Andersson 
Signed-off-by: Robin Murphy 
---
 drivers/iommu/msm_iommu.c | 14 +-
 1 file changed, 1 insertion(+), 13 deletions(-)

diff --git a/drivers/iommu/msm_iommu.c b/drivers/iommu/msm_iommu.c
index 3a38352b603f..06bde6b66732 100644
--- a/drivers/iommu/msm_iommu.c
+++ b/drivers/iommu/msm_iommu.c
@@ -831,16 +831,4 @@ static struct platform_driver msm_iommu_driver = {
.probe  = msm_iommu_probe,
.remove = msm_iommu_remove,
 };
-
-static int __init msm_iommu_driver_init(void)
-{
-   int ret;
-
-   ret = platform_driver_register(_iommu_driver);
-   if (ret != 0)
-   pr_err("Failed to register IOMMU driver\n");
-
-   return ret;
-}
-subsys_initcall(msm_iommu_driver_init);
-
+builtin_platform_driver(msm_iommu_driver);
-- 
2.28.0.dirty



Re: iommu non-strict mode for arm64

2022-02-04 Thread Robin Murphy

On 2022-02-04 05:46, Josh Poimboeuf wrote:

Hi all,

We've gotten significant slowdowns on arm64 with 4k pages compared to
64k.  The slowdowns can be alleviated by setting iommu.strict=0 or
iommu.passthrough=1.

Is there a reason x86 defaults to lazy iommu, while arm64 does not?  Are
there security implications which are specific to arm64?


The x86 behaviour is basically 2 decades of legacy where nobody now 
feels brave enough to flip the default. At the time the arm64 IOMMU DMA 
ops were first added, strict mode was the only thing feasible to 
implement, but there was also a conscious consideration that having a 
default assumption of "IOMMU == more protection" wasn't a bad thing 
anyway. Given what played out a couple of years later, and everyone now 
being that much more security-aware, I think that decision has only been 
reinforced.


Passthrough and non-strict mode in iommu-dma only came along later, and 
most IOMMU drivers for arm64 still don't support them, which is another 
reason I'm still against changing the default today. However, if you're 
confident that your arm64 users care more about high-bandwidth I/O 
throughput than memory protection then feel free to set 
IOMMU_DEFAULT_DMA_LAZY or IOMMU_DEFAULT_PASSTHROUGH in your config.
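
For example, a config fragment along these lines selects the lazy default
(a sketch; option names as of roughly that era's drivers/iommu/Kconfig, so
check your own tree):

        CONFIG_IOMMU_DEFAULT_DMA_LAZY=y
        # CONFIG_IOMMU_DEFAULT_DMA_STRICT is not set
        # CONFIG_IOMMU_DEFAULT_PASSTHROUGH is not set

The same behaviour can also be requested per-boot with iommu.strict=0 (or
iommu.passthrough=1) on the kernel command line, as noted above.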


Cheers,
Robin.


Re: [PATCH v10 2/6] dt-bindings: gpio: logicvc: Add a compatible with major version only

2022-02-02 Thread Robin Murphy

On 2022-01-30 00:46, Linus Walleij wrote:

On Thu, Jan 20, 2022 at 4:00 PM Paul Kocialkowski
 wrote:


There are lots of different versions of the logicvc block and it
makes little sense to list them all in compatibles since all versions
with the same major are found to be register-compatible.


The reason we try to be precise is because sometime, long after the driver
has been merged and maintained for a few years, a bug is discovered
in a specific version of the silicon.

What happens is that a fix is applied on all silicon whether it is needed
or not.

If you have the precise silicon compatible, you can avoid this and target
only a specific version.


Indeed, the better approach would be something like:

  compatible:
    oneOf:
      - items:
          - enum:
              - foo,bar-v1.0
              - foo,bar-v1.1
          - const: foo,bar-v1
      - items:
          - enum:
              - foo,bar-v2.0
          - const: foo,bar-v2

That way the DTs are future-proof, while drivers can still match on only 
the less-specific strings until a need arises. Plus it avoids the 
problem that if an existing OS that only understands "foo,bar-v1.0" is 
given a new DT with only "foo,bar-v1" for v1.0 hardware it won't be able 
to use the device, even though it's *functionally* capable of doing so.
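
The driver side of that scheme could then stay as simple as matching only
the major-version strings (an illustrative sketch using the placeholder
names from the binding example above):

        static const struct of_device_id foo_bar_of_match[] = {
                { .compatible = "foo,bar-v1" },
                { .compatible = "foo,bar-v2" },
                { /* sentinel */ }
        };

with the more specific "foo,bar-v1.0"-style strings only ever consulted if
a version-specific quirk turns out to be needed later.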


However, from skimming patch #5, it looks possible that none of these 
changes are needed at all. If LOGICVC_IP_VERSION_REG tells you the exact 
revision, and is always present (as the unconditional reading of it 
implies), then the only reason for adding new compatibles would be if, 
say, v5 has more clocks from v4 and you want the binding to enforce 
that; otherwise, newer versions are literally compatible with the 
currently-defined binding and therefore should continue to bind against 
the existing string(s) to maximise forward- and backward-compatibility. 
Sure, it's not the prettiest thing for a "generic" compatible to be 
based on an oddly-specific version number that doesn't necessarily match 
the actual software-discoverable version, but what's done is done and 
that's the cost of ABI.


Cheers,
Robin.

(also, nitpick for that part of patch #5 since I'm here: please include 
linux/bitfield.h rather than reinventing FIELD_GET() locally)


Re: [PATCH 0/2] x86/hyperv/Swiotlb: Add swiotlb_alloc_from_low_pages switch

2022-02-02 Thread Robin Murphy

On 2022-02-02 08:12, Christoph Hellwig wrote:

I think this interface is a little too hacky.  In the end all the
non-trusted hypervisor schemes (including the per-device swiotlb one)
can allocate the memory from everywhere and want for force use of
swiotlb.  I think we need some kind of proper interface for that instead
of setting all kinds of global variables.


Right, if platform/hypervisor code knows enough to be able to set magic 
non-standard allocation flags correctly, surely it could equally just 
perform whatever non-standard allocation it needs and call 
swiotlb_init_with_tbl() instead of swiotlb_init().
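
In other words, something in the spirit of the following (a sketch only -
the swiotlb_init_with_tbl() signature is quoted from memory and may differ
by kernel version, and hv_alloc_bounce_pages() is a hypothetical
placeholder for whatever the hypervisor-specific allocation actually is):

        unsigned long nslabs = 1UL << 16;       /* example size only */
        char *tlb = hv_alloc_bounce_pages(nslabs << IO_TLB_SHIFT); /* hypothetical */

        if (tlb)
                swiotlb_init_with_tbl(tlb, nslabs, 1);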


Robin.


Re: [PATCH 1/2] iommu/vt-d: Fix PCI bus rescan device hot add

2022-01-31 Thread Robin Murphy

On 2022-01-30 07:43, Joerg Roedel wrote:

Hi Jacob, Baolu,

On Fri, Jan 28, 2022 at 11:10:01AM +0800, Lu Baolu wrote:

During PCI bus rescan, adding new devices involves two notifiers:
1. dmar_pci_bus_notifier()
2. iommu_bus_notifier()
The current code sets #1 as low priority (INT_MIN), which results in #2
being invoked first. The result is that the struct device pointer cannot be
found in the DRHD search for the new device's DMAR/IOMMU. Subsequently, the
device is put under the "catch-all" IOMMU instead of the correct one.
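
For reference, the registration whose priority is at issue looks roughly
like this in the mainline code of the time (quoted from memory, so treat
the details as approximate):

        static struct notifier_block dmar_pci_bus_nb = {
                .notifier_call = dmar_pci_bus_notifier,
                .priority = INT_MIN,    /* lowest priority: runs after iommu_bus_notifier() */
        };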


There are actually iommu_ops pointers invoked from iommu_bus_notifier()
into IOMMU driver code. Can those be used to enforce the ordering in a
more reliable way?


Indeed I very nearly asked whether we couldn't just call these from 
intel_iommu_{probe,release}_device() directly, but it looks like they 
also interact with the interrupt remapping stuff which can be built 
independently of the IOMMU API :(


Robin.


Re: [PATCH] iommu/iova: Separate out rcache init

2022-01-28 Thread Robin Murphy

On 2022-01-28 11:32, John Garry wrote:

On 26/01/2022 17:00, Robin Murphy wrote:

As above, I vote for just forward-declaring the free routine in iova.c
and keeping it entirely private.


BTW, speaking of forward declarations, it's possible to remove all the 
forward declarations in iova.c now that the FQ code is gone - but with a 
good bit of rearranging. However I am not sure how much people care 
about that or whether the code layout is sane...


Indeed, I was very tempted to raise the question there of whether there 
was any more cleanup or refactoring that could be done to justify 
collecting all the rcache code together at the top of iova.c. But in the 
end I didn't, so my opinion still remains a secret...


Robin.


Re: [PATCH] iommu: Fix some W=1 warnings

2022-01-28 Thread Robin Murphy

On 2022-01-28 10:44, John Garry wrote:

The code is mostly free of W=1 warning, so fix the following:

drivers/iommu/iommu.c:996: warning: expecting prototype for 
iommu_group_for_each_dev(). Prototype was for __iommu_group_for_each_dev() 
instead
drivers/iommu/iommu.c:3048: warning: Function parameter or member 'drvdata' not 
described in 'iommu_sva_bind_device'
drivers/iommu/ioasid.c:354: warning: Function parameter or member 'ioasid' not 
described in 'ioasid_get'
drivers/iommu/omap-iommu.c:1098: warning: expecting prototype for 
omap_iommu_suspend_prepare(). Prototype was for omap_iommu_prepare() instead


Certainly no harm in keeping the documentation up to date!

Reviewed-by: Robin Murphy 


Signed-off-by: John Garry 

diff --git a/drivers/iommu/ioasid.c b/drivers/iommu/ioasid.c
index 50ee27bbd04e..06fee7416816 100644
--- a/drivers/iommu/ioasid.c
+++ b/drivers/iommu/ioasid.c
@@ -349,6 +349,7 @@ EXPORT_SYMBOL_GPL(ioasid_alloc);
  
  /**

   * ioasid_get - obtain a reference to the IOASID
+ * @ioasid: the ID to get
   */
  void ioasid_get(ioasid_t ioasid)
  {
diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 8b86406b7162..75741ce748d5 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -980,17 +980,6 @@ static int iommu_group_device_count(struct iommu_group 
*group)
return ret;
  }
  
-/**

- * iommu_group_for_each_dev - iterate over each device in the group
- * @group: the group
- * @data: caller opaque data to be passed to callback function
- * @fn: caller supplied callback function
- *
- * This function is called by group users to iterate over group devices.
- * Callers should hold a reference count to the group during callback.
- * The group->mutex is held across callbacks, which will block calls to
- * iommu_group_add/remove_device.
- */
  static int __iommu_group_for_each_dev(struct iommu_group *group, void *data,
  int (*fn)(struct device *, void *))
  {
@@ -1005,7 +994,17 @@ static int __iommu_group_for_each_dev(struct iommu_group 
*group, void *data,
return ret;
  }
  
-

+/**
+ * iommu_group_for_each_dev - iterate over each device in the group
+ * @group: the group
+ * @data: caller opaque data to be passed to callback function
+ * @fn: caller supplied callback function
+ *
+ * This function is called by group users to iterate over group devices.
+ * Callers should hold a reference count to the group during callback.
+ * The group->mutex is held across callbacks, which will block calls to
+ * iommu_group_add/remove_device.
+ */
  int iommu_group_for_each_dev(struct iommu_group *group, void *data,
 int (*fn)(struct device *, void *))
  {
@@ -3032,6 +3031,7 @@ EXPORT_SYMBOL_GPL(iommu_aux_get_pasid);
   * iommu_sva_bind_device() - Bind a process address space to a device
   * @dev: the device
   * @mm: the mm to bind, caller must hold a reference to it
+ * @drvdata: opaque data pointer to pass to bind callback
   *
   * Create a bond between device and address space, allowing the device to 
access
   * the mm using the returned PASID. If a bond already exists between @device 
and
diff --git a/drivers/iommu/omap-iommu.c b/drivers/iommu/omap-iommu.c
index 91749654fd49..980e4af3f06b 100644
--- a/drivers/iommu/omap-iommu.c
+++ b/drivers/iommu/omap-iommu.c
@@ -1085,7 +1085,7 @@ static __maybe_unused int 
omap_iommu_runtime_resume(struct device *dev)
  }
  
  /**

- * omap_iommu_suspend_prepare - prepare() dev_pm_ops implementation
+ * omap_iommu_prepare - prepare() dev_pm_ops implementation
   * @dev:  iommu device
   *
   * This function performs the necessary checks to determine if the IOMMU



Re: [RFC PATCH] component: Add common helpers for compare/release functions

2022-01-28 Thread Robin Murphy

On 2022-01-28 08:11, Yong Wu wrote:
[...]

diff --git a/include/linux/component.h b/include/linux/component.h
index 16de18f473d7..5a7468ea827c 100644
--- a/include/linux/component.h
+++ b/include/linux/component.h
@@ -2,6 +2,8 @@
  #ifndef COMPONENT_H
  #define COMPONENT_H
  
+#include 

+#include 
  #include 
  
  
@@ -82,6 +84,22 @@ struct component_master_ops {

void (*unbind)(struct device *master);
  };
  
+/* A set common helpers for compare/release functions */

+static inline int compare_of(struct device *dev, void *data)
+{
+   return dev->of_node == data;
+}


Note that this is effectively just device_match_of_node(), although I 
guess there is an argument that having a nice consistent set of 
component_match API helpers might be worth more than a tiny code saving 
by borrowing one from a different API.


Either way, however, I don't think there's any good argument for 
instantiating separate copies of these functions in every driver that 
uses them. If they're used as callbacks then they can't actually be 
inlined anyway, so they may as well be exported from component.c as 
normal so that the code really is shared (plus then there's nice 
symmetry with the aforementioned device_match API helpers too).
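
Concretely, that suggestion would amount to something like the following in
drivers/base/component.c (a sketch of the idea, not a tested patch; the
function names here are just illustrative):

        int component_compare_of(struct device *dev, void *data)
        {
                return device_match_of_node(dev, data);
        }
        EXPORT_SYMBOL_GPL(component_compare_of);

        void component_release_of(struct device *dev, void *data)
        {
                of_node_put(data);
        }
        EXPORT_SYMBOL_GPL(component_release_of);

with plain declarations in component.h, so only one copy of the code ever
exists regardless of how many drivers use it.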


Thanks,
Robin.


+static inline void release_of(struct device *dev, void *data)
+{
+   of_node_put(data);
+}
+
+static inline int compare_dev(struct device *dev, void *data)
+{
+   return dev == data;
+}
+
  void component_master_del(struct device *,
const struct component_master_ops *);
  
diff --git a/sound/soc/codecs/wcd938x.c b/sound/soc/codecs/wcd938x.c

index eff200a07d9f..992132cbfb9f 100644
--- a/sound/soc/codecs/wcd938x.c
+++ b/sound/soc/codecs/wcd938x.c
@@ -4417,16 +4417,6 @@ static const struct component_master_ops 
wcd938x_comp_ops = {
.unbind = wcd938x_unbind,
  };
  
-static int wcd938x_compare_of(struct device *dev, void *data)

-{
-   return dev->of_node == data;
-}
-
-static void wcd938x_release_of(struct device *dev, void *data)
-{
-   of_node_put(data);
-}
-
  static int wcd938x_add_slave_components(struct wcd938x_priv *wcd938x,
struct device *dev,
struct component_match **matchptr)
@@ -4442,8 +4432,7 @@ static int wcd938x_add_slave_components(struct 
wcd938x_priv *wcd938x,
}
  
  	of_node_get(wcd938x->rxnode);

-   component_match_add_release(dev, matchptr, wcd938x_release_of,
-   wcd938x_compare_of, wcd938x->rxnode);
+   component_match_add_release(dev, matchptr, release_of, compare_of, 
wcd938x->rxnode);
  
  	wcd938x->txnode = of_parse_phandle(np, "qcom,tx-device", 0);

if (!wcd938x->txnode) {
@@ -4451,8 +4440,7 @@ static int wcd938x_add_slave_components(struct 
wcd938x_priv *wcd938x,
return -ENODEV;
}
of_node_get(wcd938x->txnode);
-   component_match_add_release(dev, matchptr, wcd938x_release_of,
-   wcd938x_compare_of, wcd938x->txnode);
+   component_match_add_release(dev, matchptr, release_of, compare_of, 
wcd938x->txnode);
return 0;
  }
  



Re: [PATCH 21/27] arm64: dts: rockchip: rk356x: Add HDMI nodes

2022-01-26 Thread Robin Murphy

On 2022-01-26 18:44, Peter Geis wrote:

On Wed, Jan 26, 2022 at 12:56 PM Robin Murphy  wrote:


On 2022-01-26 16:04, Peter Geis wrote:

On Wed, Jan 26, 2022 at 9:58 AM Sascha Hauer  wrote:


Add support for the HDMI port found on RK3568.

Signed-off-by: Sascha Hauer 
---
   arch/arm64/boot/dts/rockchip/rk356x.dtsi | 37 +++-
   1 file changed, 36 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/boot/dts/rockchip/rk356x.dtsi 
b/arch/arm64/boot/dts/rockchip/rk356x.dtsi
index 4008bd666d01..e38fb223e9b8 100644
--- a/arch/arm64/boot/dts/rockchip/rk356x.dtsi
+++ b/arch/arm64/boot/dts/rockchip/rk356x.dtsi
@@ -10,7 +10,6 @@
   #include 
   #include 
   #include 
-#include 
   #include 

   / {
@@ -502,6 +501,42 @@ vop_mmu: iommu@fe043e00 {
  status = "disabled";
  };

+   hdmi: hdmi@fe0a {
+   compatible = "rockchip,rk3568-dw-hdmi";
+   reg = <0x0 0xfe0a 0x0 0x2>;
+   interrupts = ;
+   clocks = < PCLK_HDMI_HOST>,
+< CLK_HDMI_SFR>,
+< CLK_HDMI_CEC>,
+< CLK_HDMI_REF>,
+< HCLK_VOP>;
+   clock-names = "iahb", "isfr", "cec", "ref", "hclk";
+   pinctrl-names = "default";
+   pinctrl-0 = <_scl _sda _cec>;


I looked into CEC support here, and it seems that it does work with one change.
Please add the two following lines to your patch:
assigned-clocks = < CLK_HDMI_CEC>;
assigned-clock-rates = <32768>;

The issue is that the clk_rtc32k_frac clock, which feeds clk_rtc_32k, which
in turn feeds clk_hdmi_cec, runs at 24MHz at boot, which is too high for CEC
to function.


Wouldn't it make far more sense to just stick a suitable clk_set_rate()
call in the driver? AFAICS it's already explicitly aware of the CEC clock.


This is handled purely in the
drivers/gpu/drm/bridge/synopsys/dw-hdmi.c driver, so I'm hesitant to
touch it there as it would affect all users, not just Rockchip.


I'd have a strong hunch that it's a standard thing for the DesignWare IP 
and not affected by platform integration. I don't have the magical 
Synopsys databook, but between the trusty old i.MX6 manual and most of 
the other in-tree DTs getting their dw-hdmi "cec" clock from 
suspiciously-obviously-named sources, I'd be somewhat surprised if it 
was ever anything other than 32KHz.
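
As an illustration, the in-driver approach being suggested would be on the
order of the following in dw-hdmi's CEC clock setup path (a sketch only;
the field names and the error label are assumptions, not the actual driver
code):

        ret = clk_set_rate(hdmi->cec_clk, 32768);
        if (ret) {
                dev_err(hdmi->dev, "Cannot set HDMI cec clock rate: %d\n", ret);
                goto err_cec_clk;       /* hypothetical error label */
        }

placed before the existing clk_prepare_enable() of the "cec" clock, so
every platform gets the same 32 kHz rate without needing DT overrides.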


Robin.


Could someone familiar with the dw-hdmi IP weigh in on the minimum and
maximum clock rate the CEC block can handle?



Robin.


+   power-domains = < RK3568_PD_VO>;
+   reg-io-width = <4>;
+   rockchip,grf = <>;
+   #sound-dai-cells = <0>;
+   status = "disabled";
+
+   ports {
+   #address-cells = <1>;
+   #size-cells = <0>;
+
+   hdmi_in: port@0 {
+   reg = <0>;
+   #address-cells = <1>;
+   #size-cells = <0>;
+   };
+
+   hdmi_out: port@1 {
+   reg = <1>;
+   #address-cells = <1>;
+   #size-cells = <0>;
+   };
+   };
+   };
+
  qos_gpu: qos@fe128000 {
  compatible = "rockchip,rk3568-qos", "syscon";
  reg = <0x0 0xfe128000 0x0 0x20>;
--
2.30.2





Re: [PATCH 21/27] arm64: dts: rockchip: rk356x: Add HDMI nodes

2022-01-26 Thread Robin Murphy

On 2022-01-26 16:04, Peter Geis wrote:

On Wed, Jan 26, 2022 at 9:58 AM Sascha Hauer  wrote:


Add support for the HDMI port found on RK3568.

Signed-off-by: Sascha Hauer 
---
  arch/arm64/boot/dts/rockchip/rk356x.dtsi | 37 +++-
  1 file changed, 36 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/boot/dts/rockchip/rk356x.dtsi 
b/arch/arm64/boot/dts/rockchip/rk356x.dtsi
index 4008bd666d01..e38fb223e9b8 100644
--- a/arch/arm64/boot/dts/rockchip/rk356x.dtsi
+++ b/arch/arm64/boot/dts/rockchip/rk356x.dtsi
@@ -10,7 +10,6 @@
  #include 
  #include 
  #include 
-#include 
  #include 

  / {
@@ -502,6 +501,42 @@ vop_mmu: iommu@fe043e00 {
 status = "disabled";
 };

+   hdmi: hdmi@fe0a {
+   compatible = "rockchip,rk3568-dw-hdmi";
+   reg = <0x0 0xfe0a 0x0 0x2>;
+   interrupts = ;
+   clocks = < PCLK_HDMI_HOST>,
+< CLK_HDMI_SFR>,
+< CLK_HDMI_CEC>,
+< CLK_HDMI_REF>,
+< HCLK_VOP>;
+   clock-names = "iahb", "isfr", "cec", "ref", "hclk";
+   pinctrl-names = "default";
+   pinctrl-0 = <_scl _sda _cec>;


I looked into CEC support here, and it seems that it does work with one change.
Please add the two following lines to your patch:
assigned-clocks = < CLK_HDMI_CEC>;
assigned-clock-rates = <32768>;

The issue is that the clk_rtc32k_frac clock, which feeds clk_rtc_32k, which
in turn feeds clk_hdmi_cec, runs at 24MHz at boot, which is too high for CEC
to function.


Wouldn't it make far more sense to just stick a suitable clk_set_rate() 
call in the driver? AFAICS it's already explicitly aware of the CEC clock.


Robin.


+   power-domains = < RK3568_PD_VO>;
+   reg-io-width = <4>;
+   rockchip,grf = <>;
+   #sound-dai-cells = <0>;
+   status = "disabled";
+
+   ports {
+   #address-cells = <1>;
+   #size-cells = <0>;
+
+   hdmi_in: port@0 {
+   reg = <0>;
+   #address-cells = <1>;
+   #size-cells = <0>;
+   };
+
+   hdmi_out: port@1 {
+   reg = <1>;
+   #address-cells = <1>;
+   #size-cells = <0>;
+   };
+   };
+   };
+
 qos_gpu: qos@fe128000 {
 compatible = "rockchip,rk3568-qos", "syscon";
 reg = <0x0 0xfe128000 0x0 0x20>;
--
2.30.2





Re: [PATCH] iommu/iova: Separate out rcache init

2022-01-26 Thread Robin Murphy

On 2022-01-26 13:55, John Garry wrote:

Currently the rcache structures are allocated for all IOVA domains, even if
they do not use the "fast" alloc+free interface. This is wasteful of memory.

In addition, failures in init_iova_rcaches() are not handled safely, which is
less than ideal.

Make "fast" users call a separate rcache init explicitly, which includes
error checking.

Signed-off-by: John Garry 


Mangled patch? (no "---" separator here)

Overall this looks great, just a few comments further down...


diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index 3a46f2cc9e5d..dd066d990809 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -525,6 +525,7 @@ static int iommu_dma_init_domain(struct iommu_domain 
*domain, dma_addr_t base,
struct iommu_dma_cookie *cookie = domain->iova_cookie;
unsigned long order, base_pfn;
struct iova_domain *iovad;
+   int ret;
  
  	if (!cookie || cookie->type != IOMMU_DMA_IOVA_COOKIE)

return -EINVAL;
@@ -559,6 +560,9 @@ static int iommu_dma_init_domain(struct iommu_domain 
*domain, dma_addr_t base,
}
  
  	init_iova_domain(iovad, 1UL << order, base_pfn);

+   ret = iova_domain_init_rcaches(iovad);
+   if (ret)
+   return ret;
  
  	/* If the FQ fails we can simply fall back to strict mode */

if (domain->type == IOMMU_DOMAIN_DMA_FQ && iommu_dma_init_fq(domain))
diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
index b28c9435b898..d3adc6ea5710 100644
--- a/drivers/iommu/iova.c
+++ b/drivers/iommu/iova.c
@@ -15,13 +15,14 @@
  /* The anchor node sits above the top of the usable address space */
  #define IOVA_ANCHOR   ~0UL
  
+#define IOVA_RANGE_CACHE_MAX_SIZE 6	/* log of max cached IOVA range size (in pages) */

+
  static bool iova_rcache_insert(struct iova_domain *iovad,
   unsigned long pfn,
   unsigned long size);
  static unsigned long iova_rcache_get(struct iova_domain *iovad,
 unsigned long size,
 unsigned long limit_pfn);
-static void init_iova_rcaches(struct iova_domain *iovad);
  static void free_cpu_cached_iovas(unsigned int cpu, struct iova_domain 
*iovad);
  static void free_iova_rcaches(struct iova_domain *iovad);
  
@@ -64,8 +65,6 @@ init_iova_domain(struct iova_domain *iovad, unsigned long granule,

iovad->anchor.pfn_lo = iovad->anchor.pfn_hi = IOVA_ANCHOR;
rb_link_node(>anchor.node, NULL, >rbroot.rb_node);
rb_insert_color(>anchor.node, >rbroot);
-   cpuhp_state_add_instance_nocalls(CPUHP_IOMMU_IOVA_DEAD, 
>cpuhp_dead);
-   init_iova_rcaches(iovad);
  }
  EXPORT_SYMBOL_GPL(init_iova_domain);
  
@@ -497,9 +496,9 @@ void put_iova_domain(struct iova_domain *iovad)

  {
struct iova *iova, *tmp;
  
-	cpuhp_state_remove_instance_nocalls(CPUHP_IOMMU_IOVA_DEAD,

-   >cpuhp_dead);
-   free_iova_rcaches(iovad);
+   if (iovad->rcaches)
+   iova_domain_free_rcaches(iovad);
+
rbtree_postorder_for_each_entry_safe(iova, tmp, >rbroot, node)
free_iova_mem(iova);
  }
@@ -608,6 +607,7 @@ EXPORT_SYMBOL_GPL(reserve_iova);
   */
  
  #define IOVA_MAG_SIZE 128

+#define MAX_GLOBAL_MAGS 32 /* magazines per bin */
  
  struct iova_magazine {

unsigned long size;
@@ -620,6 +620,13 @@ struct iova_cpu_rcache {
struct iova_magazine *prev;
  };
  
+struct iova_rcache {

+   spinlock_t lock;
+   unsigned long depot_size;
+   struct iova_magazine *depot[MAX_GLOBAL_MAGS];
+   struct iova_cpu_rcache __percpu *cpu_rcaches;
+};
+
  static struct iova_magazine *iova_magazine_alloc(gfp_t flags)
  {
return kzalloc(sizeof(struct iova_magazine), flags);
@@ -693,28 +700,62 @@ static void iova_magazine_push(struct iova_magazine *mag, 
unsigned long pfn)
mag->pfns[mag->size++] = pfn;
  }
  
-static void init_iova_rcaches(struct iova_domain *iovad)

+int iova_domain_init_rcaches(struct iova_domain *iovad)
  {
-   struct iova_cpu_rcache *cpu_rcache;
-   struct iova_rcache *rcache;
unsigned int cpu;
-   int i;
+   int i, ret;
+
+   iovad->rcaches = kcalloc(IOVA_RANGE_CACHE_MAX_SIZE,
+sizeof(struct iova_rcache),
+GFP_KERNEL);
+   if (!iovad->rcaches)
+   return -ENOMEM;
  
  	for (i = 0; i < IOVA_RANGE_CACHE_MAX_SIZE; ++i) {

+   struct iova_cpu_rcache *cpu_rcache;
+   struct iova_rcache *rcache;
+
rcache = >rcaches[i];
spin_lock_init(>lock);
rcache->depot_size = 0;
-   rcache->cpu_rcaches = __alloc_percpu(sizeof(*cpu_rcache), 
cache_line_size());
-   if (WARN_ON(!rcache->cpu_rcaches))
-   continue;
+   rcache->cpu_rcaches = 


Re: [PATCH v3 1/2] dt-bindings: arm: xen: document Xen iommu device

2022-01-26 Thread Robin Murphy

On 2022-01-26 15:09, Sergiy Kibrik wrote:

Hi Robin,



This could break Linux guests, since depending on the deferred probe
timeout setting it could lead to drivers never probing because the "IOMMU"
never becomes available.



I've noticed no deferred probe timeouts when booting with this patch. Could you 
please explain more on how this would break guests?


Right now I think it would actually require command-line intervention, 
e.g. "fw_devlink=on" or "deferred_probe_timeout=3600" (with modules 
enabled for the latter to take full effect), but I'm wary of the 
potential for future config options to control those behaviours by default.


Robin.


Thank you!

  -- Sergiy




Re: [PATCH 0/7] iommu cleanup and refactoring

2022-01-26 Thread Robin Murphy

On 2022-01-26 13:27, Jason Gunthorpe via iommu wrote:

On Wed, Jan 26, 2022 at 09:51:36AM +0800, Lu Baolu wrote:


they are fundamentally different things in their own right, and the ideal
API should give us the orthogonality to also bind a device to an SVA domain
without PASID (e.g. for KVM stage 2, or userspace assignment of simpler
fault/stall-tolerant devices), or attach PASIDs to regular iommu_domains.


Yes, these are orthogonal things. A iommu driver that supports PASID
ideally should support PASID enabled attach/detatch for every
iommu_domain type it supports.

SVA should not be entangled with PASID beyond that SVA is often used
with PASID - a SVA iommu_domain should be fully usable with a RID too.


The prototype of PASID enabled attach/detach ops could look like:

int (*attach_dev_pasid)(struct iommu_domain *domain,
struct device *dev, ioasid_t id);
void (*detach_dev_pasid)(struct iommu_domain *domain,
 struct device *dev, ioasid_t id);


It seems reasonable and straightforward to me..

These would be domain ops?
  

But the iommu driver should implement different callbacks for

1) attaching an IOMMU DMA domain to a PASID on device;
- kernel DMA with PASID
- mdev-like device passthrough
- etc.
2) attaching a CPU-shared domain to a PASID on device;
- SVA
- guest PASID
- etc.


But this you mean domain->ops would be different? Seems fine, up to
the driver.


Indeed, it makes little practical difference whether the driver provides 
distinct sets of ops or just common callbacks which switch on the domain 
type internally. I expect that's largely going to come down to personal 
preference and how much internal driver code is common between the paths.



I'd hope to see some flow like:

  domain = device->bus->iommu_ops->alloc_sva_domain(dev)
  domain->ops->attach_dev_pasid(domain, dev, current->pasid)

To duplicate the current SVA APIs


+1 - I'd concur that attach/detach belong as domain ops in general. 
There's quite a nice logical split forming here where domain ops are the 
ones that make sense for external subsystems and endpoint drivers to use 
too, while device ops, with the sole exception of domain_alloc, are 
IOMMU API internals. By that reasoning, attach/bind/etc. are firmly in 
the former category.
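
To make the shape of that split concrete, here is a purely hypothetical
sketch of what the ops being discussed might look like (none of this is
existing mainline API at the time of the thread):

        struct iommu_domain_ops {
                int  (*attach_dev)(struct iommu_domain *domain, struct device *dev);
                void (*detach_dev)(struct iommu_domain *domain, struct device *dev);
                int  (*attach_dev_pasid)(struct iommu_domain *domain,
                                         struct device *dev, ioasid_t pasid);
                void (*detach_dev_pasid)(struct iommu_domain *domain,
                                         struct device *dev, ioasid_t pasid);
                /* map/unmap/iova_to_phys etc. would also live here */
        };

with domain_alloc remaining the one device/instance-level op that callers
need in order to obtain a domain in the first place.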


Thanks,
Robin.


Re: [RFC][DISCUSSION] dma-mapping: allocating noncoherent buffer without mapping

2022-01-26 Thread Robin Murphy

On 2022-01-26 11:54, Hyeonggon Yoo wrote:

Last month we discussed that drivers using kmalloc(GFP_DMA) for
noncoherent mappings should be converted to use the DMA API [1]. A simple
grep for GFP_DMA shows that many drivers are mistakenly using the
GFP_DMA flag.

So our goal was to make the DMA API choose the right zone depending on
the device's DMA mask. Baoquan and I are trying to convert drivers to
use dma_alloc_noncoherent() when allocating the buffer.

But it's not simple for some drivers; there is a gap between
dma_alloc_noncoherent() and the previous convention (allocating a buffer
from the buddy or slab allocator and mapping it when needed).

For example, some drivers allocate a buffer and reuse it, just mapping
and unmapping it whenever needed. And some drivers do not even map the
buffer (some drivers do not use the DMA API at all).

So I think we need to implement a version of dma_alloc_noncoherent()
that does not map the buffer.


This doesn't make sense to me. The point of dma_alloc_* is you get back 
a dedicated DMA buffer which can then be used at any time. In the 
noncoherent case you have to put in explicit dma_sync_*() calls around 
accesses when the CPU or device is expected to have updated the buffer 
contents, but it's still fundamentally the same paradigm.
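
A minimal sketch of that paradigm, for reference (a generic illustration,
not any specific driver; error handling trimmed):

        dma_addr_t dma;
        void *buf;

        buf = dma_alloc_noncoherent(dev, size, &dma, DMA_BIDIRECTIONAL, GFP_KERNEL);
        if (!buf)
                return -ENOMEM;

        /* CPU fills the buffer, then hands ownership to the device */
        dma_sync_single_for_device(dev, dma, size, DMA_BIDIRECTIONAL);
        /* ... device reads/writes the buffer via DMA ... */
        dma_sync_single_for_cpu(dev, dma, size, DMA_BIDIRECTIONAL);
        /* CPU may now safely look at the results */

        dma_free_noncoherent(dev, size, buf, dma, DMA_BIDIRECTIONAL);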


If you want to update drivers from using streaming mappings on 
specially-allocated pages to using proper dedicated DMA buffers, then 
update the logic in those drivers to use dedicated DMA buffers 
appropriately. Adding an API to allocate a DMA buffer which isn't 
actually a DMA buffer so we can "update" drivers from misusing streaming 
mappings to still misusing streaming mappings is nonsense.


Robin.


I think implementing a helper that internally calls
__dma_direct_alloc_pages() will be okay.

As I'm not expert in this area, I want to hear
others' opinions.

[1] https://lkml.org/lkml/2021/12/13/1121

Thanks,
Hyeonggon.





Re: [PATCH] [RFC] fbcon: Add option to enable legacy hardware acceleration

2022-01-25 Thread Robin Murphy

On 2022-01-25 19:12, Helge Deller wrote:

Add a config option CONFIG_FRAMEBUFFER_CONSOLE_LEGACY_ACCELERATION to
enable bitblt and fillrect hardware acceleration in the framebuffer
console. If disabled, such acceleration will not be used, even if it is
supported by the graphics hardware driver.

If you plan to use DRM as your main graphics output system, you should
disable this option since it will prevent compiling in code which isn't
used later on when DRM takes over.

For all other configurations, e.g. if none of your graphic cards support
DRM (yet), DRM isn't available for your architecture, or you can't be
sure that the graphic card in the target system will support DRM, you
most likely want to enable this option.


This patch is the first RFC.


Independed of this patch I did some timing experiments with a qemu
virtual machine running a PA-RISC Debian Linux installation with a
screen resolution of 2048x1024 with 8bpp. In that case qemu emulated the
graphics hardware bitblt and fillrect acceleration by using the native
(x86) CPU.

The test case was simply to run "time dmesg", where the syslog
had 284 lines. The results showed a huge speedup:
a) time dmesg (without acceleration):
-> 19.0 seconds
b) time dmesg (with acceleration):
->  2.6 seconds

Signed-off-by: Helge Deller 

diff --git a/drivers/video/console/Kconfig b/drivers/video/console/Kconfig
index 840d9813b0bc..da84d1c21c21 100644
--- a/drivers/video/console/Kconfig
+++ b/drivers/video/console/Kconfig
@@ -78,6 +78,17 @@ config FRAMEBUFFER_CONSOLE
help
  Low-level framebuffer-based console driver.

+config FRAMEBUFFER_CONSOLE_LEGACY_ACCELERATION
+   bool "Framebuffer Console hardware acceleration support"
+   depends on FRAMEBUFFER_CONSOLE
+   default y if !DRM
+   default y if !(X86 || ARM)


You probably mean ARM64 there, if you're trying to encapsulate "modern 
systems where this really isn't relevant". Some supported ARM platforms 
do date back to the days of weird and wonderful fbdev hardware.


Conversely I recently cobbled an ancient PCI VGA card into an arm64 
machine and was slightly disappointed that there didn't seem to be any 
driver that was usable straight off :)


(Yes, I might give v86d + uvesafb a go at some point...)

Robin.


+   help
+ If you use a system on which DRM is fully supported you usually want to say N,
+ otherwise you probably want to enable this option.
+ If enabled the framebuffer console will utilize the hardware acceleration
+ of your graphics card by using hardware bitblt and fillrect features.
+
  config FRAMEBUFFER_CONSOLE_DETECT_PRIMARY
 bool "Map the console to the primary display device"
 depends on FRAMEBUFFER_CONSOLE
diff --git a/drivers/video/fbdev/core/fbcon.c b/drivers/video/fbdev/core/fbcon.c
index b813985f1403..f7b7d35953e8 100644
--- a/drivers/video/fbdev/core/fbcon.c
+++ b/drivers/video/fbdev/core/fbcon.c
@@ -1136,11 +1136,13 @@ static void fbcon_init(struct vc_data *vc, int init)

ops->graphics = 0;

+#ifdef CONFIG_FRAMEBUFFER_CONSOLE_LEGACY_ACCELERATION
if ((cap & FBINFO_HWACCEL_COPYAREA) &&
!(cap & FBINFO_HWACCEL_DISABLED))
p->scrollmode = SCROLL_MOVE;
else /* default to something safe */
p->scrollmode = SCROLL_REDRAW;
+#endif

/*
 *  ++guenther: console.c:vc_allocate() relies on initializing
@@ -1705,7 +1707,7 @@ static bool fbcon_scroll(struct vc_data *vc, unsigned int 
t, unsigned int b,
count = vc->vc_rows;
if (logo_shown >= 0)
goto redraw_up;
-   switch (p->scrollmode) {
+   switch (fb_scrollmode(p)) {
case SCROLL_MOVE:
fbcon_redraw_blit(vc, info, p, t, b - t - count,
 count);
@@ -1795,7 +1797,7 @@ static bool fbcon_scroll(struct vc_data *vc, unsigned int 
t, unsigned int b,
count = vc->vc_rows;
if (logo_shown >= 0)
goto redraw_down;
-   switch (p->scrollmode) {
+   switch (fb_scrollmode(p)) {
case SCROLL_MOVE:
fbcon_redraw_blit(vc, info, p, b - 1, b - t - count,
 -count);
@@ -1946,12 +1948,12 @@ static void fbcon_bmove_rec(struct vc_data *vc, struct 
fbcon_display *p, int sy,
   height, width);
  }

-static void updatescrollmode(struct fbcon_display *p,
+static void updatescrollmode_accel(struct fbcon_display *p,
struct fb_info *info,
struct vc_data *vc)
  {
+#ifdef CONFIG_FRAMEBUFFER_CONSOLE_LEGACY_ACCELERATION
struct fbcon_ops *ops = info->fbcon_par;
-   int fh = vc->vc_font.height;
int cap = info->flags;
u16 t = 0;
int ypan = 

Re: [PATCH v7 0/9] ACPI/IORT: Support for IORT RMR node

2022-01-25 Thread Robin Murphy

On 2022-01-25 13:00, Shameerali Kolothum Thodi wrote:

Hi Robin/Lorenzo,


-Original Message-
From: iommu [mailto:iommu-boun...@lists.linux-foundation.org] On Behalf
Of Shameer Kolothum
Sent: 05 August 2021 09:07
To: linux-arm-ker...@lists.infradead.org; linux-a...@vger.kernel.org;
iommu@lists.linux-foundation.org
Cc: robin.mur...@arm.com; j...@solid-run.com; Linuxarm
; steven.pr...@arm.com; Guohanjun (Hanjun Guo)
; yangyicong ;
sami.muja...@arm.com; w...@kernel.org; wanghuiqiang

Subject: [PATCH v7 0/9] ACPI/IORT: Support for IORT RMR node

Hi,

The series adds support to IORT RMR nodes specified in IORT
Revision E.b -ARM DEN 0049E[0]. RMR nodes are used to describe
memory ranges that are used by endpoints and require a unity
mapping in SMMU.

We have faced issues with 3408iMR RAID controller cards which
fail to boot when SMMU is enabled. This is because these
controllers make use of host memory for various caching related
purposes and when SMMU is enabled the iMR firmware fails to
access these memory regions as there is no mapping for them.
IORT RMR provides a way for UEFI to describe and report these
memory regions so that the kernel can make a unity mapping for
these in SMMU.

Change History:

v6 --> v7

The only change from v6 is the fix pointed out by Steve to
the SMMUv2 SMR bypass install in patch #8.

Thanks to the Tested-by tags by Laurentiu with SMMUv2 and
Hanjun/Huiqiang with SMMUv3 for v6. I haven't added the tags
yet as the series still needs more review[1].

Feedback and tests on this series is very much appreciated.


Since we now have an update to the IORT spec (E.c)[1] which includes additional
attributes/flags for the RMR node, I am planning to respin this series soon.

Going through the new spec, I have a few queries,

The memory range attributes can now be described as one of the following,

0x00: Device-nGnRnE memory
0x01: Device-nGnRE memory
0x02: Device-nGRE memory
0x03: Device-GRE memory
0x04: Normal Inner Non-cacheable Outer Non-cacheable
0x05: Normal Inner Write-back Outer Write-back Inner Shareable

I am not sure how this needs to be captured and used in the kernel. Is there
any intention of using these fine-grained attributes in the kernel now,
or is a generic mapping of the above to the struct iommu_resv_region prot
field enough? i.e., something like,

{
 
 prot = IOMMU_READ | IOMMU_WRITE;

 if (rmr_attr == normal_mem) // 0x05
 prot |= IOMMU_CACHE;
 
 if (rmr_attr == device_mem) { //0x00 - 0x03

 prot |= IOMMU_MMIO;
 prot |= IOMMU_NOEXEC;
 }
 
}


Yup, pretty much that, except don't bother with IOMMU_NOEXEC. We can't 
reliably infer it - e.g. on an AXI-based interconnect AxCACHE and AxPROT 
are entirely orthogonal, so a Device-type read with the "Instruction 
access" hint is perfectly legal - and in the common IORT code we're not 
in a position to second-guess what any given RMR might represent for 
whatever agent is accessing it.


All we can reasonably do here is map the Device types to IOMMU_MMIO and 
Write-back to IOMMU_CACHE, and if anyone ever does want to insist that 
that's not sufficient, then they're welcome to send patches to make the 
IOMMU API more expressive :)
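
In code terms that mapping amounts to no more than this (sketch only; "attr" 
stands for the memory range attribute value listed above, and the helper name 
is made up):

#include <linux/iommu.h>

static int example_rmr_attr_to_prot(u8 attr)
{
	int prot = IOMMU_READ | IOMMU_WRITE;

	if (attr <= 0x03)		/* Device-nGnRnE ... Device-GRE */
		prot |= IOMMU_MMIO;
	else if (attr == 0x05)		/* Normal Inner/Outer WB, Inner Shareable */
		prot |= IOMMU_CACHE;
	/* 0x04 (Normal Non-cacheable) gets neither flag, and no IOMMU_NOEXEC */

	return prot;
}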



Similarly for the 'flags' field, is the new 'Access Privilege' intended to set
IOMMU_PRIV?


Yes, exactly!

Cheers,
Robin.

   
Please let me know.


Thanks,
Shameer

[1] https://developer.arm.com/documentation/den0049/ec/?lang=en




Re: [PATCH 0/7] iommu cleanup and refactoring

2022-01-25 Thread Robin Murphy

On 2022-01-24 17:44, Jason Gunthorpe wrote:

On Mon, Jan 24, 2022 at 09:46:26AM +, Tian, Kevin wrote:

From: Lu Baolu 
Sent: Monday, January 24, 2022 3:11 PM

Hi,

The guest pasid and aux-domain related code are dead code in current
iommu subtree. As we have reached a consensus that all these features
should be based on the new iommufd framework (which is under active
development), the first part of this series removes and cleanups all
the dead code.

The second part of this series refactors the iommu_domain by moving all
domain-specific ops from iommu_ops to a new domain_ops. This makes an
iommu_domain self-contained and represent the abstraction of an I/O
translation table in the IOMMU subsystem. With different type of
iommu_domain providing different set of ops, it's easier to support more
types of I/O translation tables.


You may want to give more background on this end goal. In general there
are four IOPT types in iommufd discussions:

1) The one currently tracked by iommu_domain, with a map/unmap semantics
2) The one managed by mm and shared to iommu via sva_bind/unbind ops
3) The one managed by userspace and bound to iommu via iommufd (require nesting)
4) The one managed by KVM (e.g. EPT) and shared to iommu via a TBD interface


Yes, at least from an iommufd perspective I'd like to see one struct
for all of these types, mainly so we can have a uniform alloc, attach
and detach flow for all io page table types.


Agreed, certainly an IOMMU_DOMAIN_SVA type that can both encapsulate the 
mm and effectively replace iommu_sva seems like a logical and fairly 
small next step. We already have the paradigm of different domain types 
supporting different ops, so initially an SVA domain would simply allow 
bind/unbind rather than attach/detach/map/unmap.


It might then further be possible to hide SVA bind/unbind behind the 
attach/detach interface, but AFAICS we'd still need distinct flows for 
attaching/binding the whole device vs. attaching/binding to a PASID, 
since they are fundamentally different things in their own right, and 
the ideal API should give us the orthogonality to also bind a device to 
an SVA domain without PASID (e.g. for KVM stage 2, or userspace 
assignment of simpler fault/stall-tolerant devices), or attach PASIDs to 
regular iommu_domains.


That distinction could of course be consolidated by flipping to the 
other approach of explicitly allocating the PASID first, then wrapping 
it in a struct device that could then be passed through the same 
attach/detach interfaces and distinguished internally, but although I 
still have a fondness for that approach I know I'm about the only one :)


Cheers,
Robin.


If we want to use the iommu_domain, or make iommu_domain a sub-class
of a new struct, can be determined as we go along.

Regardless, I think this cleanup stands on its own. Moving the ops and
purging the dead code is clearly the right thing to do.

Thanks,
Jason



Re: [PATCH 7/7] iommu: Add iommu_domain::domain_ops

2022-01-25 Thread Robin Murphy

On 2022-01-25 06:27, Lu Baolu wrote:

On 1/25/22 8:57 AM, Robin Murphy wrote:

On 2022-01-24 07:11, Lu Baolu wrote:

Add a domain specific callback set, domain_ops, for vendor iommu driver
to provide domain specific operations. Move domain-specific callbacks
from iommu_ops to the domain_ops and hook them when a domain is 
allocated.


I think it would make a lot of sense for iommu_domain_ops to be a 
substructure *within* iommu_ops. That way we can save most of the 
driver boilerplate here by not needing extra variables everywhere, and 
letting iommu_domain_alloc() still do a default initialisation like 
"domain->ops = bus->iommu_ops->domain_ops;"


In the new model, iommu_domain_ops and iommu_ops are not 1:1 mapped.
For example, a PASID-capable IOMMU could support DMA domain (which
supports map/unmap), SVA domain (which does not), and others in the
future. Each domain type has its own domain_ops.


Sure, it's clear that that's the direction in which this is headed, and 
as I say I'm quite excited about that. However there are a couple of 
points I think are really worth considering:


Where it's just about which operations are valid for which domains, it's 
even simpler for the core interface wrappers to validate the domain 
type, rather than forcing drivers to implement multiple ops structures 
purely for the sake of having different callbacks populated. We already 
have this in places, e.g. where iommu_map() checks for 
__IOMMU_DOMAIN_PAGING.
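
(For reference, that existing check is about this much code; paraphrased from 
drivers/iommu/iommu.c as a sketch, with a made-up helper name:)

#include <linux/iommu.h>

static int example_check_paging_domain(struct iommu_domain *domain)
{
	/* map/unmap are only meaningful for paging domains */
	if (!(domain->type & __IOMMU_DOMAIN_PAGING))
		return -EINVAL;

	return 0;
}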


Paging domains are also effectively the baseline level of IOMMU API 
functionality. All drivers support them, and for the majority of drivers 
it's all they will ever support. Those drivers really don't benefit from 
any of the churn and boilerplate in this patch as-is, and it's so easy 
to compromise with a couple of lines of core code to handle the common 
case by default when the driver *isn't* one of the handful which ever 
actually cares to install their own per-domain ops. Consider how much 
cleaner this patch would look if the typical driver diff could be 
something completely minimal like this:


->8-
diff --git a/drivers/iommu/mtk_iommu_v1.c b/drivers/iommu/mtk_iommu_v1.c
index be22fcf988ce..6aff493e37ee 100644
--- a/drivers/iommu/mtk_iommu_v1.c
+++ b/drivers/iommu/mtk_iommu_v1.c
@@ -514,12 +514,14 @@ static int mtk_iommu_hw_init(const struct 
mtk_iommu_data *data)


 static const struct iommu_ops mtk_iommu_ops = {
.domain_alloc   = mtk_iommu_domain_alloc,
-   .domain_free= mtk_iommu_domain_free,
-   .attach_dev = mtk_iommu_attach_device,
-   .detach_dev = mtk_iommu_detach_device,
-   .map= mtk_iommu_map,
-   .unmap  = mtk_iommu_unmap,
-   .iova_to_phys   = mtk_iommu_iova_to_phys,
+   .domain_ops = &(const struct iommu_domain_ops){
+   .domain_free= mtk_iommu_domain_free,
+   .attach_dev = mtk_iommu_attach_device,
+   .detach_dev = mtk_iommu_detach_device,
+   .map= mtk_iommu_map,
+   .unmap  = mtk_iommu_unmap,
+   .iova_to_phys   = mtk_iommu_iova_to_phys,
+   }
.probe_device   = mtk_iommu_probe_device,
.probe_finalize = mtk_iommu_probe_finalize,
.release_device = mtk_iommu_release_device,

-8<-

And of course I have to shy away from calling it default_domain_ops, 
since although it's logically default ops for domains, it is not 
specifically ops for default domains, sigh... :)
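
(To spell out the core-side counterpart of the driver diff above; this is 
purely a sketch, assuming iommu_ops grows a "domain_ops" member as shown, and 
the real iommu_domain_alloc() would of course keep its existing type and 
geometry initialisation too:)

struct iommu_domain *iommu_domain_alloc(struct bus_type *bus)
{
	struct iommu_domain *domain;

	if (!bus || !bus->iommu_ops)
		return NULL;

	domain = bus->iommu_ops->domain_alloc(IOMMU_DOMAIN_UNMANAGED);
	if (!domain)
		return NULL;

	/* drivers which never care get the common per-domain ops for free */
	if (!domain->ops)
		domain->ops = bus->iommu_ops->domain_ops;

	return domain;
}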


Cheers,
Robin.


Re: [PATCH 7/7] iommu: Add iommu_domain::domain_ops

2022-01-24 Thread Robin Murphy

On 2022-01-24 07:11, Lu Baolu wrote:

Add a domain specific callback set, domain_ops, for vendor iommu driver
to provide domain specific operations. Move domain-specific callbacks
from iommu_ops to the domain_ops and hook them when a domain is allocated.


I think it would make a lot of sense for iommu_domain_ops to be a 
substructure *within* iommu_ops. That way we can save most of the driver 
boilerplate here by not needing extra variables everywhere, and letting 
iommu_domain_alloc() still do a default initialisation like "domain->ops 
= bus->iommu_ops->domain_ops;"


I do like the future possibility of trying to flatten some of the 
io-pgtable indirection by having the SMMU drivers subsequently swizzle 
in alternate domain ops once they've picked a format, though. That was a 
bit too clunky to achieve at the whole iommu_ops level when I 
experimented with it years ago, but now it might well be worth another 
try...


One other comment below.


Signed-off-by: Lu Baolu 
---
  include/linux/iommu.h   | 93 -
  drivers/iommu/amd/iommu.c   | 21 +++--
  drivers/iommu/apple-dart.c  | 24 --
  drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 22 +++--
  drivers/iommu/arm/arm-smmu/arm-smmu.c   | 23 +++--
  drivers/iommu/arm/arm-smmu/qcom_iommu.c | 17 ++--
  drivers/iommu/exynos-iommu.c| 17 ++--
  drivers/iommu/fsl_pamu_domain.c | 13 ++-
  drivers/iommu/intel/iommu.c | 25 --
  drivers/iommu/iommu.c   | 15 ++--
  drivers/iommu/ipmmu-vmsa.c  | 21 +++--
  drivers/iommu/msm_iommu.c   | 17 ++--
  drivers/iommu/mtk_iommu.c   | 24 --
  drivers/iommu/mtk_iommu_v1.c| 19 +++--
  drivers/iommu/omap-iommu.c  | 15 ++--
  drivers/iommu/rockchip-iommu.c  | 17 ++--
  drivers/iommu/s390-iommu.c  | 15 ++--
  drivers/iommu/sprd-iommu.c  | 19 +++--
  drivers/iommu/sun50i-iommu.c| 18 ++--
  drivers/iommu/tegra-gart.c  | 15 ++--
  drivers/iommu/tegra-smmu.c  | 16 ++--
  drivers/iommu/virtio-iommu.c| 18 ++--
  22 files changed, 305 insertions(+), 179 deletions(-)

diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 111b3e9c79bb..33c5c0e5c465 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -88,7 +88,7 @@ struct iommu_domain_geometry {
  
  struct iommu_domain {

unsigned type;
-   const struct iommu_ops *ops;
+   const struct domain_ops *ops;
unsigned long pgsize_bitmap;/* Bitmap of page sizes in use */
iommu_fault_handler_t handler;
void *handler_token;
@@ -192,26 +192,11 @@ struct iommu_iotlb_gather {
   * struct iommu_ops - iommu ops and capabilities
   * @capable: check capability
   * @domain_alloc: allocate iommu domain
- * @domain_free: free iommu domain
- * @attach_dev: attach device to an iommu domain
- * @detach_dev: detach device from an iommu domain
- * @map: map a physically contiguous memory region to an iommu domain
- * @map_pages: map a physically contiguous set of pages of the same size to
- * an iommu domain.
- * @unmap: unmap a physically contiguous memory region from an iommu domain
- * @unmap_pages: unmap a number of pages of the same size from an iommu domain
- * @flush_iotlb_all: Synchronously flush all hardware TLBs for this domain
- * @iotlb_sync_map: Sync mappings created recently using @map to the hardware
- * @iotlb_sync: Flush all queued ranges from the hardware TLBs and empty flush
- *queue
- * @iova_to_phys: translate iova to physical address
   * @probe_device: Add device to iommu driver handling
   * @release_device: Remove device from iommu driver handling
   * @probe_finalize: Do final setup work after the device is added to an IOMMU
   *  group and attached to the groups domain
   * @device_group: find iommu group for a particular device
- * @enable_nesting: Enable nesting
- * @set_pgtable_quirks: Set io page table quirks (IO_PGTABLE_QUIRK_*)
   * @get_resv_regions: Request list of reserved regions for a device
   * @put_resv_regions: Free list of reserved regions for a device
   * @apply_resv_region: Temporary helper call-back for iova reserved ranges
@@ -237,33 +222,11 @@ struct iommu_ops {
  
  	/* Domain allocation and freeing by the iommu driver */

struct iommu_domain *(*domain_alloc)(unsigned iommu_domain_type);
-   void (*domain_free)(struct iommu_domain *);
  
-	int (*attach_dev)(struct iommu_domain *domain, struct device *dev);

-   void (*detach_dev)(struct iommu_domain *domain, struct device *dev);
-   int (*map)(struct iommu_domain *domain, unsigned long iova,
-  phys_addr_t paddr, size_t size, int prot, gfp_t gfp);
-   int (*map_pages)(struct iommu_domain *domain, unsigned long iova,
- 

Re: [PATCH 6/7] iommu: Use right way to retrieve iommu_ops

2022-01-24 Thread Robin Murphy

On 2022-01-24 07:11, Lu Baolu wrote:

The common iommu_ops is hooked to both device and domain. When a helper
has both device and domain pointer, the way to get the iommu_ops looks
messy in iommu core. This sorts out the way to get iommu_ops. The device
related helpers go through device pointer, while the domain related ones
go through domain pointer.

Signed-off-by: Lu Baolu 
---
  include/linux/iommu.h |  8 
  drivers/iommu/iommu.c | 25 ++---
  2 files changed, 22 insertions(+), 11 deletions(-)

diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index aa5486243892..111b3e9c79bb 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -385,6 +385,14 @@ static inline void iommu_iotlb_gather_init(struct 
iommu_iotlb_gather *gather)
};
  }
  
+static inline const struct iommu_ops *dev_iommu_ops_get(struct device *dev)

+{
+   if (dev && dev->iommu && dev->iommu->iommu_dev)
+   return dev->iommu->iommu_dev->ops;
+
+   return NULL;


This probably warrants at least a WARN, but it's arguable to just assume 
that valid ops must be installed if iommu_probe_device() has succeeded. 
The device ops are essentially for internal use within the IOMMU 
subsystem itself, so we should be able to trust ourselves not to misuse 
the helper.
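
(Concretely, something along these lines; a sketch of the variant with the 
WARN, not a demand for this exact code:)

static inline const struct iommu_ops *dev_iommu_ops_get(struct device *dev)
{
	/* Valid ops are expected once iommu_probe_device() has succeeded */
	if (WARN_ON(!dev || !dev->iommu || !dev->iommu->iommu_dev))
		return NULL;

	return dev->iommu->iommu_dev->ops;
}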



+}
+
  #define IOMMU_BUS_NOTIFY_PRIORITY 0
  #define IOMMU_GROUP_NOTIFY_ADD_DEVICE 1 /* Device added */
  #define IOMMU_GROUP_NOTIFY_DEL_DEVICE 2 /* Pre Device removed */
diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 5230c6d90ece..6631e2ea44df 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -764,6 +764,7 @@ EXPORT_SYMBOL_GPL(iommu_group_set_name);
  static int iommu_create_device_direct_mappings(struct iommu_group *group,
   struct device *dev)
  {
+   const struct iommu_ops *ops = dev_iommu_ops_get(dev);
struct iommu_domain *domain = group->default_domain;
struct iommu_resv_region *entry;
struct list_head mappings;
@@ -785,8 +786,8 @@ static int iommu_create_device_direct_mappings(struct 
iommu_group *group,
dma_addr_t start, end, addr;
size_t map_size = 0;
  
-		if (domain->ops->apply_resv_region)

-   domain->ops->apply_resv_region(dev, domain, entry);
+   if (ops->apply_resv_region)
+   ops->apply_resv_region(dev, domain, entry);


Strictly I think this was a domain op, as it was about reserving the 
IOVA range in the given DMA domain. Also taking the domain as an 
argument is a bit of a giveaway. However it's now just dead code either 
way since there are no remaining implementations, and no reason for any 
new ones.


  
  		start = ALIGN(entry->start, pg_size);

end   = ALIGN(entry->start + entry->length, pg_size);
@@ -831,8 +832,10 @@ static int iommu_create_device_direct_mappings(struct 
iommu_group *group,
  static bool iommu_is_attach_deferred(struct iommu_domain *domain,
 struct device *dev)
  {
-   if (domain->ops->is_attach_deferred)
-   return domain->ops->is_attach_deferred(domain, dev);
+   const struct iommu_ops *ops = dev_iommu_ops_get(dev);
+
+   if (ops->is_attach_deferred)
+   return ops->is_attach_deferred(domain, dev);


Similarly if this takes a domain as its first argument then it's de 
facto a domain method. However, I'd concur that logically it *is* a 
device op, so let's drop that (unused) domain argument if we're cleaning up.


Maybe there's even an argument for factoring this out to a standard flag 
in dev_iommu rather than an op at all?
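
(Say, something like this; a sketch with an invented field name, purely to 
show the shape of that alternative:)

struct dev_iommu {
	/* ...existing members (fwspec, iommu_dev, priv, ...)... */
	unsigned int attach_deferred:1;		/* invented name, sketch only */
};

static bool iommu_is_attach_deferred(struct device *dev)
{
	return dev->iommu && dev->iommu->attach_deferred;
}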


The others covered here look OK - we can blame PCI for page response 
being weirdly device-centric - however could we also convert all the 
feasible instances of dev->bus->iommu_ops to dev_iommu_ops() as well? 
(Subtly implying that I'm also not a fan of having "_get" in the name 
for a non-refcounted lookup...) Obviously iommu_probe_device() and 
iommmu_domain_alloc() still need bus ops at this point, but I'm working 
on that... :)


Thanks,
Robin.


return false;
  }
@@ -1251,10 +1254,10 @@ int iommu_page_response(struct device *dev,
struct iommu_fault_event *evt;
struct iommu_fault_page_request *prm;
struct dev_iommu *param = dev->iommu;
+   const struct iommu_ops *ops = dev_iommu_ops_get(dev);
bool has_pasid = msg->flags & IOMMU_PAGE_RESP_PASID_VALID;
-   struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
  
-	if (!domain || !domain->ops->page_response)

+   if (!ops || !ops->page_response)
return -ENODEV;
  
  	if (!param || !param->fault_param)

@@ -1295,7 +1298,7 @@ int iommu_page_response(struct device *dev,
msg->pasid = 0;
}
  
-		ret = domain->ops->page_response(dev, evt, msg);

+   ret = 

[PATCH 3/3] swiotlb: Simplify array allocation

2022-01-24 Thread Robin Murphy
Prefer kcalloc() to kzalloc(array_size()) for allocating an array.

Signed-off-by: Robin Murphy 
---
 kernel/dma/swiotlb.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index f56d6504903c..44c49c3616fe 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -820,8 +820,7 @@ static int rmem_swiotlb_device_init(struct reserved_mem 
*rmem,
if (!mem)
return -ENOMEM;
 
-   mem->slots = kzalloc(array_size(sizeof(*mem->slots), nslabs),
-GFP_KERNEL);
+   mem->slots = kcalloc(nslabs, sizeof(*mem->slots), GFP_KERNEL);
if (!mem->slots) {
kfree(mem);
return -ENOMEM;
-- 
2.28.0.dirty



[PATCH 1/3] swiotlb: Simplify debugfs setup

2022-01-24 Thread Robin Murphy
Debugfs functions are already stubbed out for !CONFIG_DEBUG_FS, so we
can remove most of the #ifdefs, just keeping one to manually optimise
away the initcall when it would do nothing. We can also simplify the
code itself by factoring out the directory creation and realising that
the global io_tlb_default_mem now makes debugfs_dir redundant.

Signed-off-by: Robin Murphy 
---
 kernel/dma/swiotlb.c | 40 ++--
 1 file changed, 10 insertions(+), 30 deletions(-)

diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index f1e7ea160b43..bdce89e053bd 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -36,9 +36,7 @@
 #include 
 #include 
 #include 
-#ifdef CONFIG_DEBUG_FS
 #include 
-#endif
 #ifdef CONFIG_DMA_RESTRICTED_POOL
 #include 
 #include 
@@ -758,47 +756,29 @@ bool is_swiotlb_active(struct device *dev)
 }
 EXPORT_SYMBOL_GPL(is_swiotlb_active);
 
-#ifdef CONFIG_DEBUG_FS
-static struct dentry *debugfs_dir;
-
-static void swiotlb_create_debugfs_files(struct io_tlb_mem *mem)
+static void swiotlb_create_debugfs_files(struct io_tlb_mem *mem,
+const char *dirname)
 {
+   mem->debugfs = debugfs_create_dir(dirname, io_tlb_default_mem.debugfs);
+   if (!mem->nslabs)
+   return;
+
debugfs_create_ulong("io_tlb_nslabs", 0400, mem->debugfs, >nslabs);
debugfs_create_ulong("io_tlb_used", 0400, mem->debugfs, >used);
 }
 
-static int __init swiotlb_create_default_debugfs(void)
+static int __init __maybe_unused swiotlb_create_default_debugfs(void)
 {
-   struct io_tlb_mem *mem = &io_tlb_default_mem;
-
-   debugfs_dir = debugfs_create_dir("swiotlb", NULL);
-   if (mem->nslabs) {
-   mem->debugfs = debugfs_dir;
-   swiotlb_create_debugfs_files(mem);
-   }
+   swiotlb_create_debugfs_files(&io_tlb_default_mem, "swiotlb");
return 0;
 }
 
+#ifdef CONFIG_DEBUG_FS
 late_initcall(swiotlb_create_default_debugfs);
-
 #endif
 
 #ifdef CONFIG_DMA_RESTRICTED_POOL
 
-#ifdef CONFIG_DEBUG_FS
-static void rmem_swiotlb_debugfs_init(struct reserved_mem *rmem)
-{
-   struct io_tlb_mem *mem = rmem->priv;
-
-   mem->debugfs = debugfs_create_dir(rmem->name, debugfs_dir);
-   swiotlb_create_debugfs_files(mem);
-}
-#else
-static void rmem_swiotlb_debugfs_init(struct reserved_mem *rmem)
-{
-}
-#endif
-
 struct page *swiotlb_alloc(struct device *dev, size_t size)
 {
struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
@@ -860,7 +840,7 @@ static int rmem_swiotlb_device_init(struct reserved_mem 
*rmem,
 
rmem->priv = mem;
 
-   rmem_swiotlb_debugfs_init(rmem);
+   swiotlb_create_debugfs_files(mem, rmem->name);
}
 
dev->dma_io_tlb_mem = mem;
-- 
2.28.0.dirty



[PATCH 2/3] swiotlb: Tidy up includes

2022-01-24 Thread Robin Murphy
SWIOTLB's includes have become a great big mess. Restore some order by
consolidating the random different blocks, sorting alphabetically, and
purging some clearly unnecessary entries - linux/io.h is now included
unconditionally, so need not be duplicated in the restricted DMA pool
case; similarly, linux/io.h subsumes asm/io.h; and by now it's a
mystery why asm/dma.h was ever here at all.

Signed-off-by: Robin Murphy 
---
 kernel/dma/swiotlb.c | 31 +--
 1 file changed, 13 insertions(+), 18 deletions(-)

diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index bdce89e053bd..f56d6504903c 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -21,38 +21,33 @@
 #define pr_fmt(fmt) "software IO TLB: " fmt
 
 #include 
+#include 
+#include 
+#include 
 #include 
 #include 
-#include 
 #include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
 #include 
 #include 
 #include 
-#include 
 #include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
 #ifdef CONFIG_DMA_RESTRICTED_POOL
-#include 
 #include 
 #include 
 #include 
 #include 
 #endif
 
-#include 
-#include 
-
-#include 
-#include 
-#include 
-#include 
-
 #define CREATE_TRACE_POINTS
 #include 
 
-- 
2.28.0.dirty



[PATCH 0/3] Some minor SWIOTLB cleanups

2022-01-24 Thread Robin Murphy
Hi all,

Here's a little collection of cleanup patches for some annoyances
that have built up while looking at SWIOTLB code recently.

Cheers,
Robin.


Robin Murphy (3):
  swiotlb: Simplify debugfs setup
  swiotlb: Tidy up includes
  swiotlb: Simplify array allocation

 kernel/dma/swiotlb.c | 74 ++--
 1 file changed, 24 insertions(+), 50 deletions(-)

-- 
2.28.0.dirty



Re: [PATCH v3] iommu: Fix potential use-after-free during probe

2022-01-21 Thread Robin Murphy

On 2022-01-21 07:16, Vijayanand Jitta wrote:



On 1/18/2022 9:27 PM, Vijayanand Jitta wrote:



On 1/18/2022 7:19 PM, Robin Murphy wrote:

On 2022-01-12 13:13, Vijayanand Jitta wrote:

KASAN has reported the following use-after-free on dev->iommu.
When a device probe fails and it is in the process of freeing dev->iommu
in the dev_iommu_free() function, a deferred_probe_work_func runs in parallel
and tries to access dev->iommu->fwspec in the of_iommu_configure() path, thus
causing a use-after-free.

BUG: KASAN: use-after-free in of_iommu_configure+0xb4/0x4a4
Read of size 8 at addr ff87a2f1acb8 by task kworker/u16:2/153

Workqueue: events_unbound deferred_probe_work_func
Call trace:
   dump_backtrace+0x0/0x33c
   show_stack+0x18/0x24
   dump_stack_lvl+0x16c/0x1e0
   print_address_description+0x84/0x39c
   __kasan_report+0x184/0x308
   kasan_report+0x50/0x78
   __asan_load8+0xc0/0xc4
   of_iommu_configure+0xb4/0x4a4
   of_dma_configure_id+0x2fc/0x4d4
   platform_dma_configure+0x40/0x5c
   really_probe+0x1b4/0xb74
   driver_probe_device+0x11c/0x228
   __device_attach_driver+0x14c/0x304
   bus_for_each_drv+0x124/0x1b0
   __device_attach+0x25c/0x334
   device_initial_probe+0x24/0x34
   bus_probe_device+0x78/0x134
   deferred_probe_work_func+0x130/0x1a8
   process_one_work+0x4c8/0x970
   worker_thread+0x5c8/0xaec
   kthread+0x1f8/0x220
   ret_from_fork+0x10/0x18

Allocated by task 1:
   kasan_kmalloc+0xd4/0x114
   __kasan_kmalloc+0x10/0x1c
   kmem_cache_alloc_trace+0xe4/0x3d4
   __iommu_probe_device+0x90/0x394
   probe_iommu_group+0x70/0x9c
   bus_for_each_dev+0x11c/0x19c
   bus_iommu_probe+0xb8/0x7d4
   bus_set_iommu+0xcc/0x13c
   arm_smmu_bus_init+0x44/0x130 [arm_smmu]
   arm_smmu_device_probe+0xb88/0xc54 [arm_smmu]
   platform_drv_probe+0xe4/0x13c
   really_probe+0x2c8/0xb74
   driver_probe_device+0x11c/0x228
   device_driver_attach+0xf0/0x16c
   __driver_attach+0x80/0x320
   bus_for_each_dev+0x11c/0x19c
   driver_attach+0x38/0x48
   bus_add_driver+0x1dc/0x3a4
   driver_register+0x18c/0x244
   __platform_driver_register+0x88/0x9c
   init_module+0x64/0xff4 [arm_smmu]
   do_one_initcall+0x17c/0x2f0
   do_init_module+0xe8/0x378
   load_module+0x3f80/0x4a40
   __se_sys_finit_module+0x1a0/0x1e4
   __arm64_sys_finit_module+0x44/0x58
   el0_svc_common+0x100/0x264
   do_el0_svc+0x38/0xa4
   el0_svc+0x20/0x30
   el0_sync_handler+0x68/0xac
   el0_sync+0x160/0x180

Freed by task 1:
   kasan_set_track+0x4c/0x84
   kasan_set_free_info+0x28/0x4c
   kasan_slab_free+0x120/0x15c
   __kasan_slab_free+0x18/0x28
   slab_free_freelist_hook+0x204/0x2fc
   kfree+0xfc/0x3a4
   __iommu_probe_device+0x284/0x394
   probe_iommu_group+0x70/0x9c
   bus_for_each_dev+0x11c/0x19c
   bus_iommu_probe+0xb8/0x7d4
   bus_set_iommu+0xcc/0x13c
   arm_smmu_bus_init+0x44/0x130 [arm_smmu]
   arm_smmu_device_probe+0xb88/0xc54 [arm_smmu]
   platform_drv_probe+0xe4/0x13c
   really_probe+0x2c8/0xb74
   driver_probe_device+0x11c/0x228
   device_driver_attach+0xf0/0x16c
   __driver_attach+0x80/0x320
   bus_for_each_dev+0x11c/0x19c
   driver_attach+0x38/0x48
   bus_add_driver+0x1dc/0x3a4
   driver_register+0x18c/0x244
   __platform_driver_register+0x88/0x9c
   init_module+0x64/0xff4 [arm_smmu]
   do_one_initcall+0x17c/0x2f0
   do_init_module+0xe8/0x378
   load_module+0x3f80/0x4a40
   __se_sys_finit_module+0x1a0/0x1e4
   __arm64_sys_finit_module+0x44/0x58
   el0_svc_common+0x100/0x264
   do_el0_svc+0x38/0xa4
   el0_svc+0x20/0x30
   el0_sync_handler+0x68/0xac
   el0_sync+0x160/0x180

Fix this by taking device_lock during probe_iommu_group.

Signed-off-by: Vijayanand Jitta 
---
   drivers/iommu/iommu.c | 12 
   1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index dd7863e..261792d 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -1617,7 +1617,7 @@ static int probe_iommu_group(struct device *dev,
void *data)
   {
   struct list_head *group_list = data;
   struct iommu_group *group;
-    int ret;
+    int ret = 0;
     /* Device is probed already if in a group */
   group = iommu_group_get(dev);
@@ -1626,9 +1626,13 @@ static int probe_iommu_group(struct device
*dev, void *data)
   return 0;
   }
   -    ret = __iommu_probe_device(dev, group_list);
-    if (ret == -ENODEV)
-    ret = 0;
+    ret = device_trylock(dev);
+    if (ret) {


This doesn't seem right - we can't have a non-deterministic situation
where __iommu_probe_device() may or may not be called depending on what
anyone else might be doing with the device at the same time.

I don't fully understand how __iommu_probe_device() and
of_iommu_configure() can be running for the same device at the same
time, but if that's not a race which can be fixed in its own right, then


Thanks for the review comments.

During arm_smmu probe, bus_for_each_dev() is called, which calls
__iommu_probe_device() for all the devices on that bus.

__iommu_probe_device+0x90/0x394
probe_iommu_gro

Re: Phyr Starter

2022-01-20 Thread Robin Murphy

On 2022-01-20 15:27, Keith Busch wrote:

On Thu, Jan 20, 2022 at 02:56:02PM +0100, Christoph Hellwig wrote:

  - on the input side to dma mapping the bio_vecs (or phyrs) are chained
as bios or whatever the containing structure is.  These already exist
and have infrastructure at least in the block layer
  - on the output side I plan for two options:

1) we have a sane IOMMU and everyting will be coalesced into a
   single dma_range.  This requires setting the block layer
   merge boundary to match the IOMMU page size, but that is
   a very good thing to do anyway.


It doesn't look like IOMMU page sizes are exported, or even necessarily
consistently sized on at least one arch (power).


FWIW POWER does its own thing separate from the IOMMU API. For all the 
regular IOMMU API players, page sizes are published in the iommu_domain 
that the common iommu-dma layer operates on. In fact it already uses 
them to pick chunk sizes for composing large buffer allocations.
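
(Concretely, anyone holding a domain can derive a usable page size straight 
from that bitmap; a sketch assuming the bitmap is non-empty:)

#include <linux/bitops.h>
#include <linux/iommu.h>

static size_t example_iommu_largest_pgsize(struct iommu_domain *domain)
{
	/* every supported IOMMU page size has its bit set in pgsize_bitmap */
	return 1UL << __fls(domain->pgsize_bitmap);
}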


Robin.


Re: [PATCH v3] iommu: Fix potential use-after-free during probe

2022-01-18 Thread Robin Murphy

On 2022-01-12 13:13, Vijayanand Jitta wrote:

KASAN has reported the following use-after-free on dev->iommu.
When a device probe fails and it is in the process of freeing dev->iommu
in the dev_iommu_free() function, a deferred_probe_work_func runs in parallel
and tries to access dev->iommu->fwspec in the of_iommu_configure() path, thus
causing a use-after-free.

BUG: KASAN: use-after-free in of_iommu_configure+0xb4/0x4a4
Read of size 8 at addr ff87a2f1acb8 by task kworker/u16:2/153

Workqueue: events_unbound deferred_probe_work_func
Call trace:
  dump_backtrace+0x0/0x33c
  show_stack+0x18/0x24
  dump_stack_lvl+0x16c/0x1e0
  print_address_description+0x84/0x39c
  __kasan_report+0x184/0x308
  kasan_report+0x50/0x78
  __asan_load8+0xc0/0xc4
  of_iommu_configure+0xb4/0x4a4
  of_dma_configure_id+0x2fc/0x4d4
  platform_dma_configure+0x40/0x5c
  really_probe+0x1b4/0xb74
  driver_probe_device+0x11c/0x228
  __device_attach_driver+0x14c/0x304
  bus_for_each_drv+0x124/0x1b0
  __device_attach+0x25c/0x334
  device_initial_probe+0x24/0x34
  bus_probe_device+0x78/0x134
  deferred_probe_work_func+0x130/0x1a8
  process_one_work+0x4c8/0x970
  worker_thread+0x5c8/0xaec
  kthread+0x1f8/0x220
  ret_from_fork+0x10/0x18

Allocated by task 1:
  kasan_kmalloc+0xd4/0x114
  __kasan_kmalloc+0x10/0x1c
  kmem_cache_alloc_trace+0xe4/0x3d4
  __iommu_probe_device+0x90/0x394
  probe_iommu_group+0x70/0x9c
  bus_for_each_dev+0x11c/0x19c
  bus_iommu_probe+0xb8/0x7d4
  bus_set_iommu+0xcc/0x13c
  arm_smmu_bus_init+0x44/0x130 [arm_smmu]
  arm_smmu_device_probe+0xb88/0xc54 [arm_smmu]
  platform_drv_probe+0xe4/0x13c
  really_probe+0x2c8/0xb74
  driver_probe_device+0x11c/0x228
  device_driver_attach+0xf0/0x16c
  __driver_attach+0x80/0x320
  bus_for_each_dev+0x11c/0x19c
  driver_attach+0x38/0x48
  bus_add_driver+0x1dc/0x3a4
  driver_register+0x18c/0x244
  __platform_driver_register+0x88/0x9c
  init_module+0x64/0xff4 [arm_smmu]
  do_one_initcall+0x17c/0x2f0
  do_init_module+0xe8/0x378
  load_module+0x3f80/0x4a40
  __se_sys_finit_module+0x1a0/0x1e4
  __arm64_sys_finit_module+0x44/0x58
  el0_svc_common+0x100/0x264
  do_el0_svc+0x38/0xa4
  el0_svc+0x20/0x30
  el0_sync_handler+0x68/0xac
  el0_sync+0x160/0x180

Freed by task 1:
  kasan_set_track+0x4c/0x84
  kasan_set_free_info+0x28/0x4c
  kasan_slab_free+0x120/0x15c
  __kasan_slab_free+0x18/0x28
  slab_free_freelist_hook+0x204/0x2fc
  kfree+0xfc/0x3a4
  __iommu_probe_device+0x284/0x394
  probe_iommu_group+0x70/0x9c
  bus_for_each_dev+0x11c/0x19c
  bus_iommu_probe+0xb8/0x7d4
  bus_set_iommu+0xcc/0x13c
  arm_smmu_bus_init+0x44/0x130 [arm_smmu]
  arm_smmu_device_probe+0xb88/0xc54 [arm_smmu]
  platform_drv_probe+0xe4/0x13c
  really_probe+0x2c8/0xb74
  driver_probe_device+0x11c/0x228
  device_driver_attach+0xf0/0x16c
  __driver_attach+0x80/0x320
  bus_for_each_dev+0x11c/0x19c
  driver_attach+0x38/0x48
  bus_add_driver+0x1dc/0x3a4
  driver_register+0x18c/0x244
  __platform_driver_register+0x88/0x9c
  init_module+0x64/0xff4 [arm_smmu]
  do_one_initcall+0x17c/0x2f0
  do_init_module+0xe8/0x378
  load_module+0x3f80/0x4a40
  __se_sys_finit_module+0x1a0/0x1e4
  __arm64_sys_finit_module+0x44/0x58
  el0_svc_common+0x100/0x264
  do_el0_svc+0x38/0xa4
  el0_svc+0x20/0x30
  el0_sync_handler+0x68/0xac
  el0_sync+0x160/0x180

Fix this by taking device_lock during probe_iommu_group.

Signed-off-by: Vijayanand Jitta 
---
  drivers/iommu/iommu.c | 12 
  1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index dd7863e..261792d 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -1617,7 +1617,7 @@ static int probe_iommu_group(struct device *dev, void 
*data)
  {
struct list_head *group_list = data;
struct iommu_group *group;
-   int ret;
+   int ret = 0;
  
  	/* Device is probed already if in a group */

group = iommu_group_get(dev);
@@ -1626,9 +1626,13 @@ static int probe_iommu_group(struct device *dev, void 
*data)
return 0;
}
  
-	ret = __iommu_probe_device(dev, group_list);

-   if (ret == -ENODEV)
-   ret = 0;
+   ret = device_trylock(dev);
+   if (ret) {


This doesn't seem right - we can't have a non-deterministic situation 
where __iommu_probe_device() may or may not be called depending on what 
anyone else might be doing with the device at the same time.


I don't fully understand how __iommu_probe_device() and 
of_iommu_configure() can be running for the same device at the same 
time, but if that's not a race which can be fixed in its own right, then 
I think adding a refcount to dev_iommu would be a more sensible way to 
mitigate it.
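
(Roughly along these lines; a sketch with invented helper names, just to show 
the shape of the idea:)

#include <linux/kref.h>
#include <linux/slab.h>

struct dev_iommu {
	struct kref ref;
	/* ...existing members (fwspec, iommu_dev, priv, ...)... */
};

static void dev_iommu_release(struct kref *kref)
{
	struct dev_iommu *param = container_of(kref, struct dev_iommu, ref);

	kfree(param);
}

/* of_iommu_configure() would hold a reference for as long as it may
 * dereference dev->iommu, and drop it here when done: */
static void dev_iommu_put(struct dev_iommu *param)
{
	kref_put(&param->ref, dev_iommu_release);
}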


Robin.


+   ret = __iommu_probe_device(dev, group_list);
+   if (ret == -ENODEV)
+   ret = 0;
+   device_unlock(dev);
+   }
  
  	return ret;

  }


Re: [PATCH v3 1/2] dt-bindings: arm: xen: document Xen iommu device

2022-01-17 Thread Robin Murphy

On 2022-01-17 12:32, Sergiy Kibrik wrote:

In an IOMMU-capable system the hypervisor usually takes over IOMMU control.
Generally guest domains don't need to care about IOMMU management or take any
extra actions. Yet in some cases knowledge of which devices are protected
may be useful for a privileged domain.

In compliance with the iommu bindings this can be achieved with a device-level
"iommus" property referring to a dummy Xen iommu device.


This could break Linux guests, since depending on the deferred probe 
timeout setting it could lead to drivers never probing because the 
"IOMMU" never becomes available.


Unless you intend to expose actual paravirtualised IOMMU translation 
functionality to guests (in which case virtio-iommu would be highly 
preferable anyway), I don't think this is the right approach. If there's 
no better alternative to using DT to communicate Xen-specific policy, 
then at least it should logically be via a Xen-specific DT property.


Thanks,
Robin.


Signed-off-by: Sergiy Kibrik 
---
  Documentation/devicetree/bindings/arm/xen.txt | 26 +++
  1 file changed, 26 insertions(+)

diff --git a/Documentation/devicetree/bindings/arm/xen.txt 
b/Documentation/devicetree/bindings/arm/xen.txt
index db5c56db30ec..98efa95c0d1b 100644
--- a/Documentation/devicetree/bindings/arm/xen.txt
+++ b/Documentation/devicetree/bindings/arm/xen.txt
@@ -58,3 +58,29 @@ Documentation/arm/uefi.rst, which are provided by the 
regular UEFI stub. However
  they differ because they are provided by the Xen hypervisor, together with a 
set
  of UEFI runtime services implemented via hypercalls, see
  
http://xenbits.xen.org/docs/unstable/hypercall/x86_64/include,public,platform.h.html.
+
+* XEN IOMMU device
+
+In compliance with iommu bindings Xen virtual IOMMU device node represents
+hypervisor-managed IOMMU [1]. Platform devices specified as IOMMU masters of
+this xen-iommu device are protected by hypervisor-managed platform IOMMU.
+
+Required properties:
+
+- compatible:  Should be "xen,iommu-el2-v1"
+- #iommu-cells: must be 0
+
+Example:
+
+xen-iommu {
+   compatible = "xen,iommu-el2-v1";
+   #iommu-cells = <0>;
+};
+
+video@fe001000 {
+   ...
+   /* this platform device is IOMMU-protected by hypervisor */
+   iommus = < 0x0>;
+};
+
+[1] Documentation/devicetree/bindings/iommu/iommu.txt




Re: [PATCH 1/2] drm/panfrost: mmu: improved memory attributes

2021-12-24 Thread Robin Murphy

On 2021-12-23 11:06, asheplya...@basealt.ru wrote:

From: Alexey Sheplyakov 

T62x/T60x GPUs are known to not work with panfrost as of now.
One of the reasons is wrong/incomplete memory attributes which
the panfrost driver sets in the page tables:

- MEMATTR_IMP_DEF should be 0x48ULL, not 0x88ULL.
   0x88ULL is MEMATTR_OUTER_IMP_DEF


I guess the macro could be renamed if anyone's particularly bothered, 
but using the outer-cacheable attribute is deliberate because it is 
necessary for I/O-coherent GPUs to work properly (and should be 
irrelevant for non-coherent integrations). I'm away from my Juno board 
for the next couple of weeks but I'm fairly confident that this change 
as-is will break cache snooping.



- MEMATTR_FORCE_CACHE_ALL and MEMATTR_OUTER_WA are missing.


They're "missing" because they're not used, and there's currently no 
mechanism by which they *could* be used. Also note that the indices in 
MEMATTR have to line up with the euqivalent LPAE indices for the closest 
match to what the IOMMU API's IOMMU_{CACHE,MMIO} flags represent, so 
moving those around is yet more subtle breakage.



T72x and newer GPUs work just fine with such incomplete/wrong memory
attributes. However T62x are quite picky and quickly lock up.

To avoid the problem set the same memory attributes (in LPAE page
tables) as mali_kbase.

The patch has been tested (for regressions) with T860 GPU (rk3399 SoC).
At the first glance (using GNOME desktop, running glmark) it does
not cause any changes for this GPU.

Note: this patch is necessary, but *not* enough to get panfrost
working with T62x


I'd note that panfrost has been working OK - to the extent that Mesa 
supports its older ISA - on the T624 (single core group) in Arm's Juno 
SoC for over a year now since commit 268af50f38b1.


If you have to force outer non-cacheable to avoid getting translation 
faults and other errors that look like the GPU is inexplicably seeing 
the wrong data, I'd check whether you have the same thing where your 
integration is actually I/O-coherent and you're missing the 
"dma-coherent" property in your DT.


Thanks,
Robin.


Signed-off-by: Alexey Sheplyakov 
Signed-off-by: Vadim V. Vlasov 

Cc: Rob Herring 
Cc: Tomeu Vizoso 
Cc: Steven Price 
Cc: Alyssa Rosenzweig 
Cc: Vadim V. Vlasov 
---
  drivers/gpu/drm/panfrost/panfrost_mmu.c |  3 ---
  drivers/iommu/io-pgtable-arm.c  | 16 
  2 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/panfrost/panfrost_mmu.c 
b/drivers/gpu/drm/panfrost/panfrost_mmu.c
index 39562f2d11a4..2f4f8a17bc82 100644
--- a/drivers/gpu/drm/panfrost/panfrost_mmu.c
+++ b/drivers/gpu/drm/panfrost/panfrost_mmu.c
@@ -133,9 +133,6 @@ static void panfrost_mmu_enable(struct panfrost_device 
*pfdev, struct panfrost_m
mmu_write(pfdev, AS_TRANSTAB_LO(as_nr), lower_32_bits(transtab));
mmu_write(pfdev, AS_TRANSTAB_HI(as_nr), upper_32_bits(transtab));
  
-	/* Need to revisit mem attrs.

-* NC is the default, Mali driver is inner WT.
-*/
mmu_write(pfdev, AS_MEMATTR_LO(as_nr), lower_32_bits(memattr));
mmu_write(pfdev, AS_MEMATTR_HI(as_nr), upper_32_bits(memattr));
  
diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c

index dd9e47189d0d..15b39c337e20 100644
--- a/drivers/iommu/io-pgtable-arm.c
+++ b/drivers/iommu/io-pgtable-arm.c
@@ -122,13 +122,17 @@
  #define ARM_LPAE_MAIR_ATTR_IDX_CACHE  1
  #define ARM_LPAE_MAIR_ATTR_IDX_DEV2
  #define ARM_LPAE_MAIR_ATTR_IDX_INC_OCACHE 3
+#define ARM_LPAE_MAIR_ATTR_IDX_OUTER_WA 4
  
  #define ARM_MALI_LPAE_TTBR_ADRMODE_TABLE (3u << 0)

  #define ARM_MALI_LPAE_TTBR_READ_INNER BIT(2)
  #define ARM_MALI_LPAE_TTBR_SHARE_OUTERBIT(4)
  
-#define ARM_MALI_LPAE_MEMATTR_IMP_DEF	0x88ULL

-#define ARM_MALI_LPAE_MEMATTR_WRITE_ALLOC 0x8DULL
+#define ARM_MALI_LPAE_MEMATTR_IMP_DEF  0x48ULL
+#define ARM_MALI_LPAE_MEMATTR_FORCE_CACHE_ALL 0x4FULL
+#define ARM_MALI_LPAE_MEMATTR_WRITE_ALLOC 0x4DULL
+#define ARM_MALI_LPAE_MEMATTR_OUTER_IMP_DEF 0x88ULL
+#define ARM_MALI_LPAE_MEMATTR_OUTER_WA 0x8DULL
  
  #define APPLE_DART_PTE_PROT_NO_WRITE (1<<7)

  #define APPLE_DART_PTE_PROT_NO_READ (1<<8)
@@ -1080,10 +1084,14 @@ arm_mali_lpae_alloc_pgtable(struct io_pgtable_cfg *cfg, 
void *cookie)
cfg->arm_mali_lpae_cfg.memattr =
(ARM_MALI_LPAE_MEMATTR_IMP_DEF
 << ARM_LPAE_MAIR_ATTR_SHIFT(ARM_LPAE_MAIR_ATTR_IDX_NC)) |
+   (ARM_MALI_LPAE_MEMATTR_FORCE_CACHE_ALL
+<< ARM_LPAE_MAIR_ATTR_SHIFT(ARM_LPAE_MAIR_ATTR_IDX_CACHE)) |
(ARM_MALI_LPAE_MEMATTR_WRITE_ALLOC
 << ARM_LPAE_MAIR_ATTR_SHIFT(ARM_LPAE_MAIR_ATTR_IDX_CACHE)) |
-   (ARM_MALI_LPAE_MEMATTR_IMP_DEF
-<< ARM_LPAE_MAIR_ATTR_SHIFT(ARM_LPAE_MAIR_ATTR_IDX_DEV));
+   (ARM_MALI_LPAE_MEMATTR_OUTER_IMP_DEF
+<< ARM_LPAE_MAIR_ATTR_SHIFT(ARM_LPAE_MAIR_ATTR_IDX_DEV)) |
+   

Re: [PATCH v3 5/5] iommu/nvidia-grace-cmdqv: Limit CMDs for guest owned VINTF

2021-12-24 Thread Robin Murphy

On 2021-12-24 08:02, Nicolin Chen wrote:

On Thu, Dec 23, 2021 at 11:14:17AM +, Robin Murphy wrote:

External email: Use caution opening links or attachments


On 2021-12-22 22:52, Nicolin Chen wrote:

On Wed, Dec 22, 2021 at 12:32:29PM +, Robin Murphy wrote:

External email: Use caution opening links or attachments


On 2021-11-19 07:19, Nicolin Chen via iommu wrote:

When VCMDQs are assigned to a VINTF that is owned by a guest, not
hypervisor (HYP_OWN bit is unset), only TLB invalidation commands
are supported. This requires get_cmd() function to scan the input
cmd before selecting cmdq between smmu->cmdq and vintf->vcmdq, so
unsupported commands can still go through emulated smmu->cmdq.

Also the guest shouldn't have HYP_OWN bit being set regardless of
guest kernel driver writing it or not, i.e. the user space driver
running in the host OS should wire this bit to zero when trapping
a write access to this VINTF_CONFIG register from a guest kernel.
So instead of using the existing regval, this patch reads out the
register value explicitly to cache in vintf->cfg.

Signed-off-by: Nicolin Chen 
---
drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c   |  6 ++--
drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h   |  5 +--
.../arm/arm-smmu-v3/nvidia-grace-cmdqv.c  | 32 +--
3 files changed, 36 insertions(+), 7 deletions(-)

diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c 
b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
index b1182dd825fd..73941ccc1a3e 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
@@ -337,10 +337,10 @@ static int arm_smmu_cmdq_build_cmd(u64 *cmd, struct 
arm_smmu_cmdq_ent *ent)
return 0;
}

-static struct arm_smmu_cmdq *arm_smmu_get_cmdq(struct arm_smmu_device *smmu)
+static struct arm_smmu_cmdq *arm_smmu_get_cmdq(struct arm_smmu_device *smmu, 
u64 *cmds, int n)
{
if (smmu->nvidia_grace_cmdqv)
- return nvidia_grace_cmdqv_get_cmdq(smmu);
+ return nvidia_grace_cmdqv_get_cmdq(smmu, cmds, n);

	return &smmu->cmdq;
}
@@ -747,7 +747,7 @@ static int arm_smmu_cmdq_issue_cmdlist(struct 
arm_smmu_device *smmu,
u32 prod;
unsigned long flags;
bool owner;
- struct arm_smmu_cmdq *cmdq = arm_smmu_get_cmdq(smmu);
+ struct arm_smmu_cmdq *cmdq = arm_smmu_get_cmdq(smmu, cmds, n);
struct arm_smmu_ll_queue llq, head;
int ret = 0;

diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h 
b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
index 24f93444aeeb..085c775c2eea 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
@@ -832,7 +832,8 @@ struct nvidia_grace_cmdqv *
nvidia_grace_cmdqv_acpi_probe(struct arm_smmu_device *smmu,
  struct acpi_iort_node *node);
int nvidia_grace_cmdqv_device_reset(struct arm_smmu_device *smmu);
-struct arm_smmu_cmdq *nvidia_grace_cmdqv_get_cmdq(struct arm_smmu_device 
*smmu);
+struct arm_smmu_cmdq *nvidia_grace_cmdqv_get_cmdq(struct arm_smmu_device *smmu,
+   u64 *cmds, int n);
#else /* CONFIG_NVIDIA_GRACE_CMDQV */
static inline struct nvidia_grace_cmdqv *
nvidia_grace_cmdqv_acpi_probe(struct arm_smmu_device *smmu,
@@ -847,7 +848,7 @@ static inline int nvidia_grace_cmdqv_device_reset(struct 
arm_smmu_device *smmu)
}

static inline struct arm_smmu_cmdq *
-nvidia_grace_cmdqv_get_cmdq(struct arm_smmu_device *smmu)
+nvidia_grace_cmdqv_get_cmdq(struct arm_smmu_device *smmu, u64 *cmds, int n)
{
return NULL;
}
diff --git a/drivers/iommu/arm/arm-smmu-v3/nvidia-grace-cmdqv.c 
b/drivers/iommu/arm/arm-smmu-v3/nvidia-grace-cmdqv.c
index c0d7351f13e2..71f6bc684e64 100644
--- a/drivers/iommu/arm/arm-smmu-v3/nvidia-grace-cmdqv.c
+++ b/drivers/iommu/arm/arm-smmu-v3/nvidia-grace-cmdqv.c
@@ -166,7 +166,8 @@ static int nvidia_grace_cmdqv_init_one_vcmdq(struct 
nvidia_grace_cmdqv *cmdqv,
return arm_smmu_cmdq_init(cmdqv->smmu, cmdq);
}

-struct arm_smmu_cmdq *nvidia_grace_cmdqv_get_cmdq(struct arm_smmu_device *smmu)
+struct arm_smmu_cmdq *
+nvidia_grace_cmdqv_get_cmdq(struct arm_smmu_device *smmu, u64 *cmds, int n)
{
struct nvidia_grace_cmdqv *cmdqv = smmu->nvidia_grace_cmdqv;
	struct nvidia_grace_cmdqv_vintf *vintf0 = &cmdqv->vintf0;
@@ -176,6 +177,24 @@ struct arm_smmu_cmdq *nvidia_grace_cmdqv_get_cmdq(struct 
arm_smmu_device *smmu)
if (!FIELD_GET(VINTF_STATUS, vintf0->status))
		return &smmu->cmdq;

+ /* Check for supported CMDs if VINTF is owned by guest (not hypervisor) */
+ if (!FIELD_GET(VINTF_HYP_OWN, vintf0->cfg)) {
+ u64 opcode = (n) ? FIELD_GET(CMDQ_0_OP, cmds[0]) : 
CMDQ_OP_CMD_SYNC;


I'm not sure there was ever a conscious design decision that batches
only ever contain one type of command - if something needs to start


Hmm, I thin

Re: [PATCH 2/2] iommu/arm-smmu: Propagate errors from platform_get_irq()

2021-12-23 Thread Robin Murphy

On 2021-12-23 13:00, Lad Prabhakar wrote:

The driver overrides the error code returned by platform_get_irq() to
-ENODEV. Switch to propagating the error code upstream so that errors
such as -EPROBE_DEFER are handled.


I wouldn't usually expect an SMMU to be wired up to a secondary 
interrupt controller that could cause deferral, but on the other hand I 
don't think there's any good reason *not* to propagate the original 
error anyway, so sure, why not.


Reviewed-by: Robin Murphy 


Fixes: 9ec36cafe43b ("of/irq: do irq resolution in platform_get_irq")
Signed-off-by: Lad Prabhakar 
---
  drivers/iommu/arm/arm-smmu/arm-smmu.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c 
b/drivers/iommu/arm/arm-smmu/arm-smmu.c
index 4844cd075644..6cf5612efcda 100644
--- a/drivers/iommu/arm/arm-smmu/arm-smmu.c
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c
@@ -2129,7 +2129,7 @@ static int arm_smmu_device_probe(struct platform_device 
*pdev)
int irq = platform_get_irq(pdev, i);
  
  		if (irq < 0)

-   return -ENODEV;
+   return irq;
smmu->irqs[i] = irq;
}
  



Re: [PATCH 1/2] iommu/arm-smmu: Use platform_irq_count() to get the interrupt count

2021-12-23 Thread Robin Murphy

On 2021-12-23 13:00, Lad Prabhakar wrote:

platform_get_resource(pdev, IORESOURCE_IRQ, ..) relies on static
allocation of IRQ resources in DT core code; this causes an issue
when using hierarchical interrupt domains with the "interrupts" property
in the node, as this bypasses the hierarchical setup and messes up the
irq chaining.

In preparation for removal of static setup of IRQ resource from DT core
code use platform_get_irq_count().


Nit: platform_irq_count()


Signed-off-by: Lad Prabhakar 
---
  drivers/iommu/arm/arm-smmu/arm-smmu.c | 12 ++--
  1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c 
b/drivers/iommu/arm/arm-smmu/arm-smmu.c
index 4bc75c4ce402..4844cd075644 100644
--- a/drivers/iommu/arm/arm-smmu/arm-smmu.c
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c
@@ -2105,12 +2105,12 @@ static int arm_smmu_device_probe(struct platform_device 
*pdev)
if (IS_ERR(smmu))
return PTR_ERR(smmu);
  
-	num_irqs = 0;

-   while ((res = platform_get_resource(pdev, IORESOURCE_IRQ, num_irqs))) {
-   num_irqs++;
-   if (num_irqs > smmu->num_global_irqs)
-   smmu->num_context_irqs++;
-   }
+   num_irqs = platform_irq_count(pdev);
+   if (num_irqs < 0)
+   return num_irqs;
+
+   if (num_irqs > smmu->num_global_irqs)
+   smmu->num_context_irqs += (num_irqs - smmu->num_global_irqs);


This seems a bit overcomplicated. I reckon:

smmu->num_context_irqs = num_irqs - smmu->num_global_irqs;
if (num_irqs <= smmu->num_global_irqs) {
dev_err(...

should do it.
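
(Filled out, that might read something like the following; a sketch reusing 
the driver's existing error message, where the exact arguments and return 
value are guesses:)

	smmu->num_context_irqs = num_irqs - smmu->num_global_irqs;
	if (num_irqs <= smmu->num_global_irqs) {
		dev_err(dev, "found %d interrupts but expected at least %d\n",
			num_irqs, smmu->num_global_irqs + 1);
		return -ENODEV;
	}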

However, FYI I have some patches refactoring most of the IRQ stuff here 
that I plan to post next cycle (didn't quite have time to get them done 
for 5.17 as I'd hoped...), so unless this needs to go in right now as an 
urgent fix, I'm happy to take care of removing platform_get_resource() 
as part of that if it's easier.


Thanks,
Robin.

  
  	if (!smmu->num_context_irqs) {

dev_err(dev, "found %d interrupts but expected at least %d\n",



Re: [PATCH v3 5/5] iommu/nvidia-grace-cmdqv: Limit CMDs for guest owned VINTF

2021-12-23 Thread Robin Murphy

On 2021-12-22 22:52, Nicolin Chen wrote:

On Wed, Dec 22, 2021 at 12:32:29PM +, Robin Murphy wrote:

External email: Use caution opening links or attachments


On 2021-11-19 07:19, Nicolin Chen via iommu wrote:

When VCMDQs are assigned to a VINTF that is owned by a guest, not
hypervisor (HYP_OWN bit is unset), only TLB invalidation commands
are supported. This requires get_cmd() function to scan the input
cmd before selecting cmdq between smmu->cmdq and vintf->vcmdq, so
unsupported commands can still go through emulated smmu->cmdq.

Also the guest shouldn't have HYP_OWN bit being set regardless of
guest kernel driver writing it or not, i.e. the user space driver
running in the host OS should wire this bit to zero when trapping
a write access to this VINTF_CONFIG register from a guest kernel.
So instead of using the existing regval, this patch reads out the
register value explicitly to cache in vintf->cfg.

Signed-off-by: Nicolin Chen 
---
   drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c   |  6 ++--
   drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h   |  5 +--
   .../arm/arm-smmu-v3/nvidia-grace-cmdqv.c  | 32 +--
   3 files changed, 36 insertions(+), 7 deletions(-)

diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c 
b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
index b1182dd825fd..73941ccc1a3e 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
@@ -337,10 +337,10 @@ static int arm_smmu_cmdq_build_cmd(u64 *cmd, struct 
arm_smmu_cmdq_ent *ent)
   return 0;
   }

-static struct arm_smmu_cmdq *arm_smmu_get_cmdq(struct arm_smmu_device *smmu)
+static struct arm_smmu_cmdq *arm_smmu_get_cmdq(struct arm_smmu_device *smmu, 
u64 *cmds, int n)
   {
   if (smmu->nvidia_grace_cmdqv)
- return nvidia_grace_cmdqv_get_cmdq(smmu);
+ return nvidia_grace_cmdqv_get_cmdq(smmu, cmds, n);

   return &smmu->cmdq;
   }
@@ -747,7 +747,7 @@ static int arm_smmu_cmdq_issue_cmdlist(struct 
arm_smmu_device *smmu,
   u32 prod;
   unsigned long flags;
   bool owner;
- struct arm_smmu_cmdq *cmdq = arm_smmu_get_cmdq(smmu);
+ struct arm_smmu_cmdq *cmdq = arm_smmu_get_cmdq(smmu, cmds, n);
   struct arm_smmu_ll_queue llq, head;
   int ret = 0;

diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h 
b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
index 24f93444aeeb..085c775c2eea 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
@@ -832,7 +832,8 @@ struct nvidia_grace_cmdqv *
   nvidia_grace_cmdqv_acpi_probe(struct arm_smmu_device *smmu,
 struct acpi_iort_node *node);
   int nvidia_grace_cmdqv_device_reset(struct arm_smmu_device *smmu);
-struct arm_smmu_cmdq *nvidia_grace_cmdqv_get_cmdq(struct arm_smmu_device 
*smmu);
+struct arm_smmu_cmdq *nvidia_grace_cmdqv_get_cmdq(struct arm_smmu_device *smmu,
+   u64 *cmds, int n);
   #else /* CONFIG_NVIDIA_GRACE_CMDQV */
   static inline struct nvidia_grace_cmdqv *
   nvidia_grace_cmdqv_acpi_probe(struct arm_smmu_device *smmu,
@@ -847,7 +848,7 @@ static inline int nvidia_grace_cmdqv_device_reset(struct 
arm_smmu_device *smmu)
   }

   static inline struct arm_smmu_cmdq *
-nvidia_grace_cmdqv_get_cmdq(struct arm_smmu_device *smmu)
+nvidia_grace_cmdqv_get_cmdq(struct arm_smmu_device *smmu, u64 *cmds, int n)
   {
   return NULL;
   }
diff --git a/drivers/iommu/arm/arm-smmu-v3/nvidia-grace-cmdqv.c 
b/drivers/iommu/arm/arm-smmu-v3/nvidia-grace-cmdqv.c
index c0d7351f13e2..71f6bc684e64 100644
--- a/drivers/iommu/arm/arm-smmu-v3/nvidia-grace-cmdqv.c
+++ b/drivers/iommu/arm/arm-smmu-v3/nvidia-grace-cmdqv.c
@@ -166,7 +166,8 @@ static int nvidia_grace_cmdqv_init_one_vcmdq(struct 
nvidia_grace_cmdqv *cmdqv,
   return arm_smmu_cmdq_init(cmdqv->smmu, cmdq);
   }

-struct arm_smmu_cmdq *nvidia_grace_cmdqv_get_cmdq(struct arm_smmu_device *smmu)
+struct arm_smmu_cmdq *
+nvidia_grace_cmdqv_get_cmdq(struct arm_smmu_device *smmu, u64 *cmds, int n)
   {
   struct nvidia_grace_cmdqv *cmdqv = smmu->nvidia_grace_cmdqv;
   struct nvidia_grace_cmdqv_vintf *vintf0 = &cmdqv->vintf0;
@@ -176,6 +177,24 @@ struct arm_smmu_cmdq *nvidia_grace_cmdqv_get_cmdq(struct 
arm_smmu_device *smmu)
   if (!FIELD_GET(VINTF_STATUS, vintf0->status))
   return &smmu->cmdq;

+ /* Check for supported CMDs if VINTF is owned by guest (not hypervisor) */
+ if (!FIELD_GET(VINTF_HYP_OWN, vintf0->cfg)) {
+ u64 opcode = (n) ? FIELD_GET(CMDQ_0_OP, cmds[0]) : 
CMDQ_OP_CMD_SYNC;


I'm not sure there was ever a conscious design decision that batches
only ever contain one type of command - if something needs to start


Hmm, I think that's a good catch -- as it could be a potential
bug here. Though the SMMUv3 driver currently seems to use loop
by adding one type of cmds to any batch and submitting it right
away so ch

Re: [PATCH v4 07/13] iommu: Add iommu_at[de]tach_device_shared() for multi-device groups

2021-12-22 Thread Robin Murphy

On 21/12/2021 6:46 pm, Jason Gunthorpe wrote:

On Tue, Dec 21, 2021 at 04:50:56PM +, Robin Murphy wrote:


this proposal is the worst of both worlds, in that drivers still have to be
just as aware of groups in order to know whether to call the _shared
interface or not, except it's now entirely implicit and non-obvious.


Drivers are not aware of groups, where did you see that?


`git grep iommu_attach_group -- :^drivers/iommu :^include`

Did I really have to explain that?

The drivers other than vfio_iommu_type1, however, do have a complete 
failure to handle, or even consider, any group that does not fit the 
particular set of assumptions they are making, but at least they only 
work in a context where that should not occur.



Drivers have to indicate their intention, based entirely on their own
internal design. If groups are present, or not is irrelevant to the
driver.

If the driver uses a single struct device (which is most) then it uses
iommu_attach_device().

If the driver uses multiple struct devices and intends to connect them
all to the same domain then it uses the _shared variant. The only
difference between the two is the _shared variant lacks some of the
protections against driver abuse of the API.


You've lost me again; how are those intentions any different? Attaching 
one device to a private domain is a literal subset of attaching more 
than one device to a private domain. There is no "abuse" of any API 
anywhere; the singleton group restriction exists as a protective measure 
because iommu_attach_device() was already in use before groups were 
really a thing, in contexts where groups happened to be singleton 
already, but anyone adding *new* uses in contexts where that assumption 
might *not* hold would be in trouble. Thus it enforces DMA ownership by 
the most trivial and heavy-handed means of simply preventing it ever 
becoming shared in the first place.


Yes, I'm using the term "DMA ownership" in a slightly different context 
to the one in which you originally proposed it. Please step out of the 
userspace-device-assignment-focused bubble for a moment and stay with me...


So then we have the iommu_attach_group() interface for new code (and 
still nobody has got round to updating the old code to it yet), for 
which the basic use-case is still fundamentally "I want to attach my 
thing to my domain", but at least now forcing explicit awareness that 
"my thing" could possibly be inextricably intertwined with more than 
just the one device they expect, so potential callers should have a good 
think about that. Unfortunately this leaves the matter of who "owns" the 
group entirely in the hands of those callers, which as we've now 
concluded is not great.


One of the main reasons for non-singleton groups to occur is due to ID 
aliasing or lack of isolation well beyond the scope and control of 
endpoint devices themselves, so it's not really fair to expect every 
IOMMU-aware driver to also be aware of that, have any idea of how to 
actually handle it, or especially try to negotiate with random other 
drivers as to whether it might be OK to take control of their DMA 
address space too. The whole point is that *every* domain attach really 
*has* to be considered "shared" because in general drivers can't know 
otherwise. Hence the easy, if crude, fix for the original API.



Nothing uses the group interface except for VFIO and stuff inside
drivers/iommu. VFIO has a uAPI tied to the group interface and it
is stuck with it.


Self-contradiction is getting stronger, careful...

Otherwise just add the housekeeping stuff to iommu_{attach,detach}_group() -
there's no way we want *three* attach/detach interfaces all with different
semantics.


I'm not sure why you think 3 APIs is a bad thing. Three APIs, with
clearly intended purposes, are a lot better than one giant API with a
bunch of parameters that tries to do everything.


Because there's only one problem to solve! We have the original API 
which does happen to safely enforce ownership, but in an implicit way 
that doesn't scale; then we have the second API which got past the 
topology constraint but unfortunately turns out to just be unsafe in a 
slightly different way, and was supposed to replace the first one but 
hasn't, and is a bit clunky to boot; now you're proposing a third one 
which can correctly enforce safe ownership for any group topology, which 
is simply combining the good bits of the first two. It makes no sense to 
maintain two bad versions of a thing alongside one which works better.


I don't see why anything would be a giant API with a bunch of parameters 
- depending on how you look at it, this new proposal is basically either 
iommu_attach_device() with the ability to scale up to non-trivial groups 
properly, or iommu_attach_group() with a potentially better interface 
and actual safety. The former is still more prevalent (and the interface 
argu

Re: [PATCH v3 5/5] iommu/nvidia-grace-cmdqv: Limit CMDs for guest owned VINTF

2021-12-22 Thread Robin Murphy

On 2021-11-19 07:19, Nicolin Chen via iommu wrote:

When VCMDQs are assigned to a VINTF that is owned by a guest, not the
hypervisor (HYP_OWN bit is unset), only TLB invalidation commands are
supported. This requires the get_cmd() function to scan the input cmd
before selecting a cmdq between smmu->cmdq and vintf->vcmdq, so that
unsupported commands can still go through the emulated smmu->cmdq.

Also, the guest shouldn't have the HYP_OWN bit set regardless of whether
the guest kernel driver writes it, i.e. the user space driver running in
the host OS should wire this bit to zero when trapping a write access to
this VINTF_CONFIG register from a guest kernel. So instead of using the
existing regval, this patch reads out the register value explicitly and
caches it in vintf->cfg.

Signed-off-by: Nicolin Chen 
---
  drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c   |  6 ++--
  drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h   |  5 +--
  .../arm/arm-smmu-v3/nvidia-grace-cmdqv.c  | 32 +--
  3 files changed, 36 insertions(+), 7 deletions(-)

diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c 
b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
index b1182dd825fd..73941ccc1a3e 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
@@ -337,10 +337,10 @@ static int arm_smmu_cmdq_build_cmd(u64 *cmd, struct 
arm_smmu_cmdq_ent *ent)
return 0;
  }
  
-static struct arm_smmu_cmdq *arm_smmu_get_cmdq(struct arm_smmu_device *smmu)

+static struct arm_smmu_cmdq *arm_smmu_get_cmdq(struct arm_smmu_device *smmu, 
u64 *cmds, int n)
  {
if (smmu->nvidia_grace_cmdqv)
-   return nvidia_grace_cmdqv_get_cmdq(smmu);
+   return nvidia_grace_cmdqv_get_cmdq(smmu, cmds, n);
  
  	return &smmu->cmdq;

  }
@@ -747,7 +747,7 @@ static int arm_smmu_cmdq_issue_cmdlist(struct 
arm_smmu_device *smmu,
u32 prod;
unsigned long flags;
bool owner;
-   struct arm_smmu_cmdq *cmdq = arm_smmu_get_cmdq(smmu);
+   struct arm_smmu_cmdq *cmdq = arm_smmu_get_cmdq(smmu, cmds, n);
struct arm_smmu_ll_queue llq, head;
int ret = 0;
  
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h

index 24f93444aeeb..085c775c2eea 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
@@ -832,7 +832,8 @@ struct nvidia_grace_cmdqv *
  nvidia_grace_cmdqv_acpi_probe(struct arm_smmu_device *smmu,
  struct acpi_iort_node *node);
  int nvidia_grace_cmdqv_device_reset(struct arm_smmu_device *smmu);
-struct arm_smmu_cmdq *nvidia_grace_cmdqv_get_cmdq(struct arm_smmu_device 
*smmu);
+struct arm_smmu_cmdq *nvidia_grace_cmdqv_get_cmdq(struct arm_smmu_device *smmu,
+ u64 *cmds, int n);
  #else /* CONFIG_NVIDIA_GRACE_CMDQV */
  static inline struct nvidia_grace_cmdqv *
  nvidia_grace_cmdqv_acpi_probe(struct arm_smmu_device *smmu,
@@ -847,7 +848,7 @@ static inline int nvidia_grace_cmdqv_device_reset(struct 
arm_smmu_device *smmu)
  }
  
  static inline struct arm_smmu_cmdq *

-nvidia_grace_cmdqv_get_cmdq(struct arm_smmu_device *smmu)
+nvidia_grace_cmdqv_get_cmdq(struct arm_smmu_device *smmu, u64 *cmds, int n)
  {
return NULL;
  }
diff --git a/drivers/iommu/arm/arm-smmu-v3/nvidia-grace-cmdqv.c 
b/drivers/iommu/arm/arm-smmu-v3/nvidia-grace-cmdqv.c
index c0d7351f13e2..71f6bc684e64 100644
--- a/drivers/iommu/arm/arm-smmu-v3/nvidia-grace-cmdqv.c
+++ b/drivers/iommu/arm/arm-smmu-v3/nvidia-grace-cmdqv.c
@@ -166,7 +166,8 @@ static int nvidia_grace_cmdqv_init_one_vcmdq(struct 
nvidia_grace_cmdqv *cmdqv,
return arm_smmu_cmdq_init(cmdqv->smmu, cmdq);
  }
  
-struct arm_smmu_cmdq *nvidia_grace_cmdqv_get_cmdq(struct arm_smmu_device *smmu)

+struct arm_smmu_cmdq *
+nvidia_grace_cmdqv_get_cmdq(struct arm_smmu_device *smmu, u64 *cmds, int n)
  {
struct nvidia_grace_cmdqv *cmdqv = smmu->nvidia_grace_cmdqv;
	struct nvidia_grace_cmdqv_vintf *vintf0 = &cmdqv->vintf0;
@@ -176,6 +177,24 @@ struct arm_smmu_cmdq *nvidia_grace_cmdqv_get_cmdq(struct 
arm_smmu_device *smmu)
if (!FIELD_GET(VINTF_STATUS, vintf0->status))
	return &smmu->cmdq;
  
+	/* Check for supported CMDs if VINTF is owned by guest (not hypervisor) */

+   if (!FIELD_GET(VINTF_HYP_OWN, vintf0->cfg)) {
+   u64 opcode = (n) ? FIELD_GET(CMDQ_0_OP, cmds[0]) : 
CMDQ_OP_CMD_SYNC;


I'm not sure there was ever a conscious design decision that batches 
only ever contain one type of command - if something needs to start 
depending on that behaviour then that dependency probably wants to be 
clearly documented. Also, a sync on its own gets trapped to the main 
cmdq but a sync on the end of a batch of TLBIs or ATCIs goes to the 
VCMDQ, huh?
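
If the assumption is going to stay, it might at least be worth something
like the sketch below - purely illustrative, with an abbreviated opcode
list - which scans the whole batch rather than trusting cmds[0] to speak
for everything:

	if (!FIELD_GET(VINTF_HYP_OWN, vintf0->cfg)) {
		int i;

		/* Fall back to the emulated queue unless *every* command is supported */
		for (i = 0; i < n; i++) {
			switch (FIELD_GET(CMDQ_0_OP, cmds[i])) {
			case CMDQ_OP_TLBI_NH_ASID:
			case CMDQ_OP_TLBI_NH_VA:
			case CMDQ_OP_ATC_INV:
			case CMDQ_OP_CMD_SYNC:
				continue;
			default:
				return &smmu->cmdq;
			}
		}
	}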



+
+   /* List all supported CMDs for vintf->cmdq pathway */
+   switch (opcode) {
+   case CMDQ_OP_TLBI_NH_ASID:
+   case 

Re: [PATCH v3 4/5] iommu/arm-smmu-v3: Add host support for NVIDIA Grace CMDQ-V

2021-12-22 Thread Robin Murphy

On 2021-12-21 22:00, Nicolin Chen wrote:
[...]

The challenge to make ECMDQ useful to Linux is how to make sure that all
the commands expected to be within scope of a future CMD_SYNC plus that
sync itself all get issued on the same queue, so I'd be mildly surprised
if you didn't have the same problem.


PATCH-3 in this series actually helps align the command queues,
between issued commands and SYNC, if bool sync == true. Yet, if
doing something like issue->issue->issue_with_sync, it could be
tricker.


Indeed between the iommu_iotlb_gather mechanism and low-level command
batching things are already a lot more concentrated than they could be,
but arm_smmu_cmdq_batch_add() and its callers stand out as examples of
where we'd still be vulnerable to preemption. What I haven't even tried
to reason about yet is assumptions in the higher-level APIs, e.g. if
io-pgtable might chuck out a TLBI during an iommu_unmap() which we
implicitly expect a later iommu_iotlb_sync() to cover.


Though I might have oversimplified the situation here, I see
the arm_smmu_cmdq_batch_add() calls are typically followed by
arm_smmu_cmdq_batch_submit(). Could we just add a SYNC in the
_batch_submit() to all the queues that it previously touched
in the _batch_add()?


Keeping track of which queues a batch has touched is certainly possible, 
but it's yet more overhead to impose across the board when intra-batch 
preemption should (hopefully) be very rare in practice. I was thinking 
more along the lines of disabling preemption/migration for the lifetime 
of a batch, or more pragmatically just hoisting the queue selection all 
the way out to the scope of the batch itself (which also conveniently 
seems about the right shape for potentially forking off a whole other 
dedicated command submission flow from that point later).
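
As a rough illustration of that last idea - not real code, and it assumes
a cmdq pointer added to struct arm_smmu_cmdq_batch plus a hypothetical
init helper:

	static void arm_smmu_cmdq_batch_init(struct arm_smmu_device *smmu,
					     struct arm_smmu_cmdq_batch *cmds)
	{
		cmds->num = 0;
		cmds->cmdq = arm_smmu_get_cmdq(smmu);	/* selected once per batch */
	}

with arm_smmu_cmdq_batch_add() and arm_smmu_cmdq_batch_submit() then
passing cmds->cmdq down to the issue path instead of re-selecting a queue
on every call.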


We still can't mitigate inter-batch preemption, though, so we'll just 
have to audit everything very carefully to make sure we don't have (or 
inadvertently introduce in future) any places where that could be 
problematic. We really want to avoid over-syncing as that's liable to 
end up being just as bad for performance as the contention that we're 
nominally avoiding.



I've been thinking that in many ways per-domain queues make quite a bit
of sense and would be easier to manage than per-CPU ones - plus that's
pretty much the usage model once we get to VMs anyway - but that fails
to help the significant cases like networking and storage where many
CPUs are servicing a big monolithic device in a single domain :(


Yea, and it's hard to assume which client would use CMDQ more
frequently, in order to balance or assign more queues to that
client, which feels like a QoS conundrum.


Indeed, plus once we start assigning queues to VMs we're going to want 
to remove them from the general pool for host usage, so we definitely 
want to plan ahead here.


Cheers,
Robin.


Re: [PATCH] Swiotlb: remove a duplicate include

2021-12-22 Thread Robin Murphy

On 2021-12-22 02:54, Guo Zhengkui wrote:

Remove a duplicate "#include ". The deleted one in line 43
is under "#ifdef CONFIG_DMA_RESTRICTED_POOL". However, there is already
one in line 53 with no conditional compile.

Signed-off-by: Guo Zhengkui 
---
  kernel/dma/swiotlb.c | 1 -
  1 file changed, 1 deletion(-)

diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 1b0501fd3e0e..8c091626ca35 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -33,21 +33,20 @@
  #include 
  #include 
  #include 
  #include 
  #include 
  #include 
  #ifdef CONFIG_DEBUG_FS
  #include 
  #endif
  #ifdef CONFIG_DMA_RESTRICTED_POOL
-#include 
  #include 
  #include 
  #include 
  #include 
  #endif
  
  #include 


By the same token we don't need this one either - as a general rule, 
linux/* headers can be assumed to include their asm/* equivalent, and 
that is certainly true for io.h.



  #include 
  
  #include 


TBH, now that I'm looking, the whole lot is a mess and I'm sure there's 
more legacy cruft that doesn't need to be here. If I remember, I might 
have a crack at a cleanup once rc1 is out.


Robin.


Re: [PATCH v3 4/5] iommu/arm-smmu-v3: Add host support for NVIDIA Grace CMDQ-V

2021-12-21 Thread Robin Murphy

On 2021-12-20 19:27, Nicolin Chen wrote:

Hi Robin,

Thank you for the reply!

On Mon, Dec 20, 2021 at 06:42:26PM +, Robin Murphy wrote:

On 2021-11-19 07:19, Nicolin Chen wrote:

From: Nate Watterson 

NVIDIA's Grace SoC has CMDQ-Virtualization (CMDQV) hardware, which
extends the standard ARM SMMU v3 IP to support multiple VCMDQs with
virtualization capabilities. In the host OS kernel, they're used to
reduce contention on a single queue. As command queues they are very
like the standard CMDQ/ECMDQs, but only support CS_NONE in the CS
field of the CMD_SYNC command.

This patch adds a new nvidia-grace-cmdqv file and inserts its
structure pointer into the existing arm_smmu_device, and then
adds related function calls in the arm-smmu-v3 driver.

In the CMDQV driver itself, this patch only adds the minimal parts
for host kernel support. Upon probe(), VINTF0 is reserved for
in-kernel use, and some of the VCMDQs are assigned to VINTF0. The
driver then selects one of the VCMDQs in VINTF0, based on the CPU
currently executing, to issue commands.


Is there a tangible difference to DMA API or VFIO performance?


Our testing environment is currently running on a single-core
CPU, so unfortunately we don't have a perf data at this point.


OK, as for the ECMDQ patches I think we'll need some investigation with 
real workloads to judge whether we can benefit from these things enough 
to justify the complexity, and whether the design is right.


My gut feeling is that if these multi-queue schemes really can live up 
to their promise of making contention negligible, then they should 
further stand to benefit from bypassing the complex lock-free command 
batching in favour of something more lightweight, which could change the 
direction of much of the refactoring.



[...]

+struct arm_smmu_cmdq *nvidia_grace_cmdqv_get_cmdq(struct arm_smmu_device *smmu)
+{
+ struct nvidia_grace_cmdqv *cmdqv = smmu->nvidia_grace_cmdqv;
+ struct nvidia_grace_cmdqv_vintf *vintf0 = &cmdqv->vintf0;
+ u16 qidx;
+
+ /* Check error status of vintf0 */
+ if (!FIELD_GET(VINTF_STATUS, vintf0->status))
+ return &smmu->cmdq;
+
+ /*
+  * Select a vcmdq to use. Here we use a temporary solution to
+  * balance out traffic on cmdq issuing: each cmdq has its own
+  * lock, if all cpus issue cmdlist using the same cmdq, only
+  * one CPU at a time can enter the process, while the others
+  * will be spinning at the same lock.
+  */
+ qidx = smp_processor_id() % cmdqv->num_vcmdqs_per_vintf;


How does ordering work between queues? Do they follow a global order
such that a sync on any queue is guaranteed to complete all prior
commands on all queues?


CMDQV internal scheduler would insert a SYNC when (for example)
switching from VCMDQ0 to VCMDQ1 while last command in VCMDQ0 is
not SYNC. HW has a configuration bit in the register to disable
this feature, which is by default enabled.


Interesting, thanks. So it sounds like this is something you can get 
away with for the moment, but may need to revisit once people chasing 
real-world performance start wanting to turn that bit off.



The challenge to make ECMDQ useful to Linux is how to make sure that all
the commands expected to be within scope of a future CMD_SYNC plus that
sync itself all get issued on the same queue, so I'd be mildly surprised
if you didn't have the same problem.


PATCH-3 in this series actually helps align the command queues,
between issued commands and SYNC, if bool sync == true. Yet, if
doing something like issue->issue->issue_with_sync, it could be
tricker.


Indeed between the iommu_iotlb_gather mechanism and low-level command 
batching things are already a lot more concentrated than they could be, 
but arm_smmu_cmdq_batch_add() and its callers stand out as examples of 
where we'd still be vulnerable to preemption. What I haven't even tried 
to reason about yet is assumptions in the higher-level APIs, e.g. if 
io-pgtable might chuck out a TLBI during an iommu_unmap() which we 
implicitly expect a later iommu_iotlb_sync() to cover.


I've been thinking that in many ways per-domain queues make quite a bit 
of sense and would be easier to manage than per-CPU ones - plus that's 
pretty much the usage model once we get to VMs anyway - but that fails 
to help the significant cases like networking and storage where many 
CPUs are servicing a big monolithic device in a single domain :(


Robin.


Re: [PATCH v2 00/33] Separate struct slab from struct page

2021-12-21 Thread Robin Murphy

On 2021-12-20 23:58, Vlastimil Babka wrote:

On 12/16/21 16:00, Hyeonggon Yoo wrote:

On Tue, Dec 14, 2021 at 01:57:22PM +0100, Vlastimil Babka wrote:

On 12/1/21 19:14, Vlastimil Babka wrote:

Folks from non-slab subsystems are Cc'd only to patches affecting them, and
this cover letter.

Series also available in git, based on 5.16-rc3:
https://git.kernel.org/pub/scm/linux/kernel/git/vbabka/linux.git/log/?h=slab-struct_slab-v2r2


Pushed a new branch slab-struct-slab-v3r3 with accumulated fixes and small 
tweaks
and a new patch from Hyeonggon Yoo on top. To avoid too much spam, here's a 
range diff:


Reviewing the whole patch series takes longer than I thought.
I'll try to review and test rest of patches when I have time.

I added Tested-by if kernel builds okay and kselftests
does not break the kernel on my machine.
(with CONFIG_SLAB/SLUB/SLOB depending on the patch),


Thanks!


Let me know me if you know better way to test a patch.


Testing on your machine is just fine.


# mm/slub: Define struct slab fields for CONFIG_SLUB_CPU_PARTIAL only when 
enabled

Reviewed-by: Hyeonggon Yoo <42.hye...@gmail.com>
Tested-by: Hyeonggon Yoo <42.hye...@gmail.com>

Comment:
Works on both SLUB_CPU_PARTIAL and !SLUB_CPU_PARTIAL.
btw, do we need slabs_cpu_partial attribute when we don't use
cpu partials? (!SLUB_CPU_PARTIAL)


The sysfs attribute? Yeah we should be consistent to userspace expecting to
read it (even with zeroes), regardless of config.


# mm/slub: Simplify struct slab slabs field definition
Comment:

This is how struct page looks on the top of v3r3 branch:
struct page {
[...]
 struct {/* slab, slob and slub */
 union {
 struct list_head slab_list;
 struct {/* Partial pages */
 struct page *next;
#ifdef CONFIG_64BIT
 int pages;  /* Nr of pages left */
#else
 short int pages;
#endif
 };
 };
[...]

It's not consistent with struct slab.


Hm right. But as we don't actually use the struct page version anymore, and
it's not one of the fields checked by SLAB_MATCH(), we can ignore this.


I think this is because "mm: Remove slab from struct page" was dropped.


That was just postponed until iommu changes are in. Matthew mentioned those
might be merged too, so that final cleanup will happen too and take care of
the discrepancy above, so no need for extra churn to address it specifically.


FYI the IOMMU changes are now queued in linux-next, so if all goes well 
you might be able to sneak that final patch in too.


Robin.




Would you update some of patches?

# mm/sl*b: Differentiate struct slab fields by sl*b implementations
Reviewed-by: Hyeonggon Yoo <42.hye...@gmail.com>
Tested-by: Hyeonggon Yoo <42.hye...@gmail.com>
Works SL[AUO]B on my machine and makes code much better.

# mm/slob: Convert SLOB to use struct slab and struct folio
Reviewed-by: Hyeonggon Yoo <42.hye...@gmail.com>
Tested-by: Hyeonggon Yoo <42.hye...@gmail.com>
It still works fine on SLOB.

# mm/slab: Convert kmem_getpages() and kmem_freepages() to struct slab
Reviewed-by: Hyeonggon Yoo <42.hye...@gmail.com>
Tested-by: Hyeonggon Yoo <42.hye...@gmail.com>

# mm/slub: Convert __free_slab() to use struct slab
Reviewed-by: Hyeonggon Yoo <42.hye...@gmail.com>
Tested-by: Hyeonggon Yoo <42.hye...@gmail.com>

Thanks,
Hyeonggon.


Thanks again,
Vlastimil


Re: [PATCH v4 07/13] iommu: Add iommu_at[de]tach_device_shared() for multi-device groups

2021-12-21 Thread Robin Murphy

On 2021-12-17 06:37, Lu Baolu wrote:

The iommu_attach/detach_device() interfaces were exposed for the device
drivers to attach/detach their own domains. The commit <426a273834eae>
("iommu: Limit iommu_attach/detach_device to device with their own group")
restricted them to singleton groups to avoid different devices in a group
attaching different domains.

Now that we've introduced device DMA ownership into the iommu core, we can
introduce interfaces for multiple-device groups, and "all devices are in the
same address space" is still guaranteed.

The iommu_attach/detach_device_shared() could be used when multiple drivers
sharing the group claim the DMA_OWNER_PRIVATE_DOMAIN ownership. The first
call of iommu_attach_device_shared() attaches the domain to the group.
Other drivers could join it later. The domain will be detached from the
group after all drivers unjoin it.


I don't see the point of this at all - if you really want to hide the 
concept of IOMMU groups away from drivers then just make 
iommu_{attach,detach}_device() do the right thing. At least the 
iommu_group_get_for_dev() plus iommu_{attach,detach}_group() API is 
clear - this proposal is the worst of both worlds, in that drivers still 
have to be just as aware of groups in order to know whether to call the 
_shared interface or not, except it's now entirely implicit and non-obvious.


Otherwise just add the housekeeping stuff to 
iommu_{attach,detach}_group() - there's no way we want *three* 
attach/detach interfaces all with different semantics.


It's worth taking a step back and realising that overall, this is really 
just a more generalised and finer-grained extension of what 426a273834ea 
already did for non-group-aware code, so it makes little sense *not* to 
integrate it into the existing interfaces.


Robin.


Signed-off-by: Jason Gunthorpe 
Signed-off-by: Lu Baolu 
Tested-by: Dmitry Osipenko 
---
  include/linux/iommu.h | 13 +++
  drivers/iommu/iommu.c | 79 +++
  2 files changed, 92 insertions(+)

diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 5ad4cf13370d..1bc03118dfb3 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -703,6 +703,8 @@ int iommu_group_set_dma_owner(struct iommu_group *group, 
enum iommu_dma_owner ow
  void *owner_cookie);
  void iommu_group_release_dma_owner(struct iommu_group *group, enum 
iommu_dma_owner owner);
  bool iommu_group_dma_owner_unclaimed(struct iommu_group *group);
+int iommu_attach_device_shared(struct iommu_domain *domain, struct device 
*dev);
+void iommu_detach_device_shared(struct iommu_domain *domain, struct device 
*dev);
  
  #else /* CONFIG_IOMMU_API */
  
@@ -743,11 +745,22 @@ static inline int iommu_attach_device(struct iommu_domain *domain,

return -ENODEV;
  }
  
+static inline int iommu_attach_device_shared(struct iommu_domain *domain,

+struct device *dev)
+{
+   return -ENODEV;
+}
+
  static inline void iommu_detach_device(struct iommu_domain *domain,
   struct device *dev)
  {
  }
  
+static inline void iommu_detach_device_shared(struct iommu_domain *domain,

+ struct device *dev)
+{
+}
+
  static inline struct iommu_domain *iommu_get_domain_for_dev(struct device 
*dev)
  {
return NULL;
diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 8bec71b1cc18..3ad66cb9bedc 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -50,6 +50,7 @@ struct iommu_group {
struct list_head entry;
enum iommu_dma_owner dma_owner;
unsigned int owner_cnt;
+   unsigned int attach_cnt;
void *owner_cookie;
  };
  
@@ -3512,3 +3513,81 @@ void iommu_device_release_dma_owner(struct device *dev, enum iommu_dma_owner own

iommu_group_put(group);
  }
  EXPORT_SYMBOL_GPL(iommu_device_release_dma_owner);
+
+/**
+ * iommu_attach_device_shared() - Attach shared domain to a device
+ * @domain: The shared domain.
+ * @dev: The device.
+ *
+ * Similar to iommu_attach_device(), but allowed for shared-group devices
+ * and guarantees that all devices in an iommu group could only be attached
+ * by a same iommu domain. The caller should explicitly set the dma ownership
+ * of DMA_OWNER_PRIVATE_DOMAIN or DMA_OWNER_PRIVATE_DOMAIN_USER type before
+ * calling it and use the paired helper iommu_detach_device_shared() for
+ * cleanup.
+ */
+int iommu_attach_device_shared(struct iommu_domain *domain, struct device *dev)
+{
+   struct iommu_group *group;
+   int ret = 0;
+
+   group = iommu_group_get(dev);
+   if (!group)
+   return -ENODEV;
+
+   mutex_lock(&group->mutex);
+   if (group->dma_owner != DMA_OWNER_PRIVATE_DOMAIN &&
+   group->dma_owner != DMA_OWNER_PRIVATE_DOMAIN_USER) {
+   ret = -EPERM;
+   goto unlock_out;
+   }
+
+   if 

Re: [PATCH v3 4/5] iommu/arm-smmu-v3: Add host support for NVIDIA Grace CMDQ-V

2021-12-20 Thread Robin Murphy

On 2021-11-19 07:19, Nicolin Chen wrote:

From: Nate Watterson 

NVIDIA's Grace SoC has CMDQ-Virtualization (CMDQV) hardware, which
extends the standard ARM SMMU v3 IP to support multiple VCMDQs with
virtualization capabilities. In the host OS kernel, they're used to
reduce contention on a single queue. As command queues they are very
like the standard CMDQ/ECMDQs, but only support CS_NONE in the CS
field of the CMD_SYNC command.

This patch adds a new nvidia-grace-cmdqv file and inserts its
structure pointer into the existing arm_smmu_device, and then
adds related function calls in the arm-smmu-v3 driver.

In the CMDQV driver itself, this patch only adds the minimal parts
for host kernel support. Upon probe(), VINTF0 is reserved for
in-kernel use, and some of the VCMDQs are assigned to VINTF0. The
driver then selects one of the VCMDQs in VINTF0, based on the CPU
currently executing, to issue commands.


Is there a tangible difference to DMA API or VFIO performance?

[...]

+struct arm_smmu_cmdq *nvidia_grace_cmdqv_get_cmdq(struct arm_smmu_device *smmu)
+{
+   struct nvidia_grace_cmdqv *cmdqv = smmu->nvidia_grace_cmdqv;
+   struct nvidia_grace_cmdqv_vintf *vintf0 = &cmdqv->vintf0;
+   u16 qidx;
+
+   /* Check error status of vintf0 */
+   if (!FIELD_GET(VINTF_STATUS, vintf0->status))
+   return &smmu->cmdq;
+
+   /*
+* Select a vcmdq to use. Here we use a temporary solution to
+* balance out traffic on cmdq issuing: each cmdq has its own
+* lock, if all cpus issue cmdlist using the same cmdq, only
+* one CPU at a time can enter the process, while the others
+* will be spinning at the same lock.
+*/
+   qidx = smp_processor_id() % cmdqv->num_vcmdqs_per_vintf;


How does ordering work between queues? Do they follow a global order 
such that a sync on any queue is guaranteed to complete all prior 
commands on all queues?


The challenge to make ECMDQ useful to Linux is how to make sure that all 
the commands expected to be within scope of a future CMD_SYNC plus that 
sync itself all get issued on the same queue, so I'd be mildly surprised 
if you didn't have the same problem.


Robin.


+   return &vintf0->vcmdqs[qidx];
+}



Re: [PATCH 4/5] iommu: Separate IOVA rcache memories from iova_domain structure

2021-12-20 Thread Robin Murphy

Hi John,

On 2021-12-20 08:49, John Garry wrote:

On 24/09/2021 11:01, John Garry wrote:

Only dma-iommu.c and vdpa actually use the "fast" mode of IOVA alloc and
free. As such, it's wasteful that all other IOVA domains hold the rcache
memories.

In addition, the current IOVA domain init implementation is poor
(init_iova_domain()), in that errors are ignored and not passed to the
caller. The only errors can come from the IOVA rcache init, and fixing up
all the IOVA domain init callsites to handle the errors would take some
work.

Separate the IOVA rcache out of the IOVA domain, and create a new IOVA
domain structure, iova_caching_domain.

Signed-off-by: John Garry 


Hi Robin,

Do you have any thoughts on this patch? The decision is whether we stick 
with a single iova domain structure or support this super structure for 
iova domains which support the rcache. I did not try the former - it 
would be do-able but I am not sure on how it would look.


TBH I feel inclined to take the simpler approach of just splitting the 
rcache array to a separate allocation, making init_iova_rcaches() public 
(with a proper return value), and tweaking put_iova_domain() to make 
rcache cleanup conditional. A residual overhead of 3 extra pointers in 
iova_domain doesn't seem like *too* much for non-DMA-API users to bear. 
Unless you want to try generalising the rcache mechanism completely away 
from IOVA API specifics, it doesn't seem like there's really enough to 
justify the bother of having its own distinct abstraction layer.
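
In other words, roughly this shape - treat it as an illustrative sketch
rather than a proposal, with names borrowed from the existing iova code:

	struct iova_domain {
		/* ... existing fields ... */
		struct iova_rcache	*rcaches;	/* separate allocation, NULL for non-DMA-API users */
	};

	int init_iova_rcaches(struct iova_domain *iovad);	/* now public, can return -ENOMEM */

	void put_iova_domain(struct iova_domain *iovad)
	{
		/* ... existing teardown ... */
		if (iovad->rcaches)
			free_iova_rcaches(iovad);
	}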


Cheers,
Robin.




Re: [PATCH] iommu/iova: Temporarily include dma-mapping.h from iova.h

2021-12-20 Thread Robin Murphy

On 2021-12-20 12:34, Joerg Roedel wrote:

From: Joerg Roedel 

Some users of iova.h still expect that dma-mapping.h is also included.
Re-add the include until these users are updated to fix compile
failures in the iommu tree.


Acked-by: Robin Murphy 


Signed-off-by: Joerg Roedel 
---
  include/linux/iova.h | 1 +
  1 file changed, 1 insertion(+)

diff --git a/include/linux/iova.h b/include/linux/iova.h
index 0abd48c5e622..cea79cb9f26c 100644
--- a/include/linux/iova.h
+++ b/include/linux/iova.h
@@ -12,6 +12,7 @@
  #include 
  #include 
  #include 
+#include 
  
  /* iova structure */

  struct iova {



Re: [Nouveau] [PATCH 1/1] drm/nouveau/device: Get right pgsize_bitmap of iommu_domain

2021-12-20 Thread Robin Murphy

On 2021-12-18 07:45, Lu Baolu wrote:

The supported page sizes of an iommu_domain are saved in the pgsize_bitmap
field. Retrieve the value from the right place.

Fixes: 58fd9375c2c534 ("drm/nouveau/platform: probe IOMMU if present")


...except domain->pgsize_bitmap was introduced more than a year after 
that commit ;)


As an improvement rather than a fix, though,

Reviewed-by: Robin Murphy 


Signed-off-by: Lu Baolu 
---
  drivers/gpu/drm/nouveau/nvkm/engine/device/tegra.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/device/tegra.c 
b/drivers/gpu/drm/nouveau/nvkm/engine/device/tegra.c
index d0d52c1d4aee..992cc285f2fe 100644
--- a/drivers/gpu/drm/nouveau/nvkm/engine/device/tegra.c
+++ b/drivers/gpu/drm/nouveau/nvkm/engine/device/tegra.c
@@ -133,7 +133,7 @@ nvkm_device_tegra_probe_iommu(struct nvkm_device_tegra 
*tdev)
 * or equal to the system's PAGE_SIZE, with a preference if
 * both are equal.
 */
-   pgsize_bitmap = tdev->iommu.domain->ops->pgsize_bitmap;
+   pgsize_bitmap = tdev->iommu.domain->pgsize_bitmap;
if (pgsize_bitmap & PAGE_SIZE) {
tdev->iommu.pgshift = PAGE_SHIFT;
} else {



[PATCH v3 9/9] iommu: Move flush queue data into iommu_dma_cookie

2021-12-17 Thread Robin Murphy
Complete the move into iommu-dma by refactoring the flush queues
themselves to belong to the DMA cookie rather than the IOVA domain.

The refactoring may as well extend to some minor cosmetic aspects
too, to help us stay one step ahead of the style police.

Signed-off-by: Robin Murphy 
---
 drivers/iommu/dma-iommu.c | 171 +-
 drivers/iommu/iova.c  |   2 -
 include/linux/iova.h  |  44 +-
 3 files changed, 95 insertions(+), 122 deletions(-)

diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index 54be57fb08f0..b0568fa848e9 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -9,9 +9,12 @@
  */
 
 #include 
+#include 
+#include 
 #include 
-#include 
+#include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -20,11 +23,10 @@
 #include 
 #include 
 #include 
-#include 
 #include 
+#include 
+#include 
 #include 
-#include 
-#include 
 
 struct iommu_dma_msi_page {
	struct list_head	list;
@@ -41,7 +43,19 @@ struct iommu_dma_cookie {
enum iommu_dma_cookie_type  type;
union {
/* Full allocator for IOMMU_DMA_IOVA_COOKIE */
-   struct iova_domain  iovad;
+   struct {
+   struct iova_domain  iovad;
+
+   struct iova_fq __percpu *fq;/* Flush queue */
+   /* Number of TLB flushes that have been started */
+   atomic64_t  fq_flush_start_cnt;
+   /* Number of TLB flushes that have been finished */
+   atomic64_t  fq_flush_finish_cnt;
+   /* Timer to regularily empty the flush queues */
+   struct timer_list   fq_timer;
+   /* 1 when timer is active, 0 when not */
+			atomic_t		fq_timer_on;
+   };
/* Trivial linear page allocator for IOMMU_DMA_MSI_COOKIE */
dma_addr_t  msi_iova;
};
@@ -64,6 +78,27 @@ static int __init iommu_dma_forcedac_setup(char *str)
 }
 early_param("iommu.forcedac", iommu_dma_forcedac_setup);
 
+/* Number of entries per flush queue */
+#define IOVA_FQ_SIZE   256
+
+/* Timeout (in ms) after which entries are flushed from the queue */
+#define IOVA_FQ_TIMEOUT10
+
+/* Flush queue entry for deferred flushing */
+struct iova_fq_entry {
+   unsigned long iova_pfn;
+   unsigned long pages;
+   struct list_head freelist;
+   u64 counter; /* Flush counter when this entry was added */
+};
+
+/* Per-CPU flush queue structure */
+struct iova_fq {
+   struct iova_fq_entry entries[IOVA_FQ_SIZE];
+   unsigned int head, tail;
+   spinlock_t lock;
+};
+
 #define fq_ring_for_each(i, fq) \
for ((i) = (fq)->head; (i) != (fq)->tail; (i) = ((i) + 1) % 
IOVA_FQ_SIZE)
 
@@ -73,9 +108,9 @@ static inline bool fq_full(struct iova_fq *fq)
return (((fq->tail + 1) % IOVA_FQ_SIZE) == fq->head);
 }
 
-static inline unsigned fq_ring_add(struct iova_fq *fq)
+static inline unsigned int fq_ring_add(struct iova_fq *fq)
 {
-   unsigned idx = fq->tail;
+   unsigned int idx = fq->tail;
 
	assert_spin_locked(&fq->lock);
 
@@ -84,10 +119,10 @@ static inline unsigned fq_ring_add(struct iova_fq *fq)
return idx;
 }
 
-static void fq_ring_free(struct iova_domain *iovad, struct iova_fq *fq)
+static void fq_ring_free(struct iommu_dma_cookie *cookie, struct iova_fq *fq)
 {
-   u64 counter = atomic64_read(&iovad->fq_flush_finish_cnt);
-   unsigned idx;
+   u64 counter = atomic64_read(&cookie->fq_flush_finish_cnt);
+   unsigned int idx;
 
	assert_spin_locked(&fq->lock);
 
@@ -97,7 +132,7 @@ static void fq_ring_free(struct iova_domain *iovad, struct 
iova_fq *fq)
break;
 
	put_pages_list(&fq->entries[idx].freelist);
-   free_iova_fast(iovad,
+   free_iova_fast(&cookie->iovad,
   fq->entries[idx].iova_pfn,
   fq->entries[idx].pages);
 
@@ -105,50 +140,50 @@ static void fq_ring_free(struct iova_domain *iovad, 
struct iova_fq *fq)
}
 }
 
-static void iova_domain_flush(struct iova_domain *iovad)
+static void fq_flush_iotlb(struct iommu_dma_cookie *cookie)
 {
-   atomic64_inc(&iovad->fq_flush_start_cnt);
-   iovad->fq_domain->ops->flush_iotlb_all(iovad->fq_domain);
-   atomic64_inc(&iovad->fq_flush_finish_cnt);
+   atomic64_inc(&cookie->fq_flush_start_cnt);
+   cookie->fq_domain->ops->flush_iotlb_all(cookie->fq_domain);
+   atomic64_inc(&cookie->fq_flush_finish_cnt);
 }
 
 static void fq_flush_timeout(struct timer_list *t)
 {
-   struct iova_domain *iovad = from_timer(iovad, t, fq_timer);
+   struct iommu_dma_cookie *cookie = from_timer(cookie, t, fq_timer);
int cpu;
 
-   atomic_se

[PATCH v3 8/9] iommu/iova: Move flush queue code to iommu-dma

2021-12-17 Thread Robin Murphy
Flush queues are specific to DMA ops, which are now handled exclusively
by iommu-dma. As such, now that the historical artefacts from being
shared directly with drivers have been cleaned up, move the flush queue
code into iommu-dma itself to get it out of the way of other IOVA users.

This is pure code movement with no functional change; refactoring to
clean up the headers and definitions will follow.

Reviewed-by: John Garry 
Signed-off-by: Robin Murphy 
---
 drivers/iommu/dma-iommu.c | 177 +-
 drivers/iommu/iova.c  | 175 -
 2 files changed, 176 insertions(+), 176 deletions(-)

diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index f139b77caee0..54be57fb08f0 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -64,6 +64,179 @@ static int __init iommu_dma_forcedac_setup(char *str)
 }
 early_param("iommu.forcedac", iommu_dma_forcedac_setup);
 
+#define fq_ring_for_each(i, fq) \
+   for ((i) = (fq)->head; (i) != (fq)->tail; (i) = ((i) + 1) % 
IOVA_FQ_SIZE)
+
+static inline bool fq_full(struct iova_fq *fq)
+{
+   assert_spin_locked(&fq->lock);
+   return (((fq->tail + 1) % IOVA_FQ_SIZE) == fq->head);
+}
+
+static inline unsigned fq_ring_add(struct iova_fq *fq)
+{
+   unsigned idx = fq->tail;
+
+   assert_spin_locked(&fq->lock);
+
+   fq->tail = (idx + 1) % IOVA_FQ_SIZE;
+
+   return idx;
+}
+
+static void fq_ring_free(struct iova_domain *iovad, struct iova_fq *fq)
+{
+   u64 counter = atomic64_read(&iovad->fq_flush_finish_cnt);
+   unsigned idx;
+
+   assert_spin_locked(&fq->lock);
+
+   fq_ring_for_each(idx, fq) {
+
+   if (fq->entries[idx].counter >= counter)
+   break;
+
+   put_pages_list(&fq->entries[idx].freelist);
+   free_iova_fast(iovad,
+  fq->entries[idx].iova_pfn,
+  fq->entries[idx].pages);
+
+   fq->head = (fq->head + 1) % IOVA_FQ_SIZE;
+   }
+}
+
+static void iova_domain_flush(struct iova_domain *iovad)
+{
+   atomic64_inc(&iovad->fq_flush_start_cnt);
+   iovad->fq_domain->ops->flush_iotlb_all(iovad->fq_domain);
+   atomic64_inc(&iovad->fq_flush_finish_cnt);
+}
+
+static void fq_flush_timeout(struct timer_list *t)
+{
+   struct iova_domain *iovad = from_timer(iovad, t, fq_timer);
+   int cpu;
+
+   atomic_set(&iovad->fq_timer_on, 0);
+   iova_domain_flush(iovad);
+
+   for_each_possible_cpu(cpu) {
+   unsigned long flags;
+   struct iova_fq *fq;
+
+   fq = per_cpu_ptr(iovad->fq, cpu);
+   spin_lock_irqsave(&fq->lock, flags);
+   fq_ring_free(iovad, fq);
+   spin_unlock_irqrestore(&fq->lock, flags);
+   }
+}
+
+void queue_iova(struct iova_domain *iovad,
+   unsigned long pfn, unsigned long pages,
+   struct list_head *freelist)
+{
+   struct iova_fq *fq;
+   unsigned long flags;
+   unsigned idx;
+
+   /*
+* Order against the IOMMU driver's pagetable update from unmapping
+* @pte, to guarantee that iova_domain_flush() observes that if called
+* from a different CPU before we release the lock below. Full barrier
+* so it also pairs with iommu_dma_init_fq() to avoid seeing partially
+* written fq state here.
+*/
+   smp_mb();
+
+   fq = raw_cpu_ptr(iovad->fq);
+   spin_lock_irqsave(&fq->lock, flags);
+
+   /*
+* First remove all entries from the flush queue that have already been
+* flushed out on another CPU. This makes the fq_full() check below less
+* likely to be true.
+*/
+   fq_ring_free(iovad, fq);
+
+   if (fq_full(fq)) {
+   iova_domain_flush(iovad);
+   fq_ring_free(iovad, fq);
+   }
+
+   idx = fq_ring_add(fq);
+
+   fq->entries[idx].iova_pfn = pfn;
+   fq->entries[idx].pages= pages;
+   fq->entries[idx].counter  = atomic64_read(&iovad->fq_flush_start_cnt);
+   list_splice(freelist, >entries[idx].freelist);
+
+   spin_unlock_irqrestore(&fq->lock, flags);
+
+   /* Avoid false sharing as much as possible. */
+   if (!atomic_read(&iovad->fq_timer_on) &&
+   !atomic_xchg(&iovad->fq_timer_on, 1))
+   mod_timer(&iovad->fq_timer,
+ jiffies + msecs_to_jiffies(IOVA_FQ_TIMEOUT));
+}
+
+static void free_iova_flush_queue(struct iova_domain *iovad)
+{
+   int cpu, idx;
+
+   if (!iovad->fq)
+   return;
+
+   del_timer_sync(&iovad->fq_timer);
+   /*
+* This code runs when the iova_domain is being detroyed, so don't
+* bother to free iovas, just free any remaining pagetable pages.
+*/
+   for_each_possible_cpu(cpu) {
+   struct iova_fq *f

[PATCH v3 7/9] iommu/iova: Consolidate flush queue code

2021-12-17 Thread Robin Murphy
Squash and simplify some of the freeing code, and move the init
and free routines down into the rest of the flush queue code to
obviate the forward declarations.

Reviewed-by: John Garry 
Signed-off-by: Robin Murphy 
---
 drivers/iommu/iova.c | 131 +++
 1 file changed, 58 insertions(+), 73 deletions(-)

diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
index a0250cebcdcf..c19f9a749070 100644
--- a/drivers/iommu/iova.c
+++ b/drivers/iommu/iova.c
@@ -24,8 +24,6 @@ static unsigned long iova_rcache_get(struct iova_domain 
*iovad,
 static void init_iova_rcaches(struct iova_domain *iovad);
 static void free_cpu_cached_iovas(unsigned int cpu, struct iova_domain *iovad);
 static void free_iova_rcaches(struct iova_domain *iovad);
-static void fq_destroy_all_entries(struct iova_domain *iovad);
-static void fq_flush_timeout(struct timer_list *t);
 
 static int iova_cpuhp_dead(unsigned int cpu, struct hlist_node *node)
 {
@@ -73,60 +71,6 @@ init_iova_domain(struct iova_domain *iovad, unsigned long 
granule,
 }
 EXPORT_SYMBOL_GPL(init_iova_domain);
 
-static bool has_iova_flush_queue(struct iova_domain *iovad)
-{
-   return !!iovad->fq;
-}
-
-static void free_iova_flush_queue(struct iova_domain *iovad)
-{
-   if (!has_iova_flush_queue(iovad))
-   return;
-
-   del_timer_sync(&iovad->fq_timer);
-
-   fq_destroy_all_entries(iovad);
-
-   free_percpu(iovad->fq);
-
-   iovad->fq = NULL;
-   iovad->fq_domain  = NULL;
-}
-
-int init_iova_flush_queue(struct iova_domain *iovad, struct iommu_domain 
*fq_domain)
-{
-   struct iova_fq __percpu *queue;
-   int i, cpu;
-
-   atomic64_set(&iovad->fq_flush_start_cnt,  0);
-   atomic64_set(&iovad->fq_flush_finish_cnt, 0);
-
-   queue = alloc_percpu(struct iova_fq);
-   if (!queue)
-   return -ENOMEM;
-
-   for_each_possible_cpu(cpu) {
-   struct iova_fq *fq;
-
-   fq = per_cpu_ptr(queue, cpu);
-   fq->head = 0;
-   fq->tail = 0;
-
-   spin_lock_init(&fq->lock);
-
-   for (i = 0; i < IOVA_FQ_SIZE; i++)
-   INIT_LIST_HEAD(&fq->entries[i].freelist);
-   }
-
-   iovad->fq_domain = fq_domain;
-   iovad->fq = queue;
-
-   timer_setup(&iovad->fq_timer, fq_flush_timeout, 0);
-   atomic_set(&iovad->fq_timer_on, 0);
-
-   return 0;
-}
-
 static struct rb_node *
 __get_cached_rbnode(struct iova_domain *iovad, unsigned long limit_pfn)
 {
@@ -585,23 +529,6 @@ static void iova_domain_flush(struct iova_domain *iovad)
	atomic64_inc(&iovad->fq_flush_finish_cnt);
 }
 
-static void fq_destroy_all_entries(struct iova_domain *iovad)
-{
-   int cpu;
-
-   /*
-* This code runs when the iova_domain is being detroyed, so don't
-* bother to free iovas, just free any remaining pagetable pages.
-*/
-   for_each_possible_cpu(cpu) {
-   struct iova_fq *fq = per_cpu_ptr(iovad->fq, cpu);
-   int idx;
-
-   fq_ring_for_each(idx, fq)
-   put_pages_list(&fq->entries[idx].freelist);
-   }
-}
-
 static void fq_flush_timeout(struct timer_list *t)
 {
struct iova_domain *iovad = from_timer(iovad, t, fq_timer);
@@ -669,6 +596,64 @@ void queue_iova(struct iova_domain *iovad,
  jiffies + msecs_to_jiffies(IOVA_FQ_TIMEOUT));
 }
 
+static void free_iova_flush_queue(struct iova_domain *iovad)
+{
+   int cpu, idx;
+
+   if (!iovad->fq)
+   return;
+
+   del_timer_sync(&iovad->fq_timer);
+   /*
+* This code runs when the iova_domain is being detroyed, so don't
+* bother to free iovas, just free any remaining pagetable pages.
+*/
+   for_each_possible_cpu(cpu) {
+   struct iova_fq *fq = per_cpu_ptr(iovad->fq, cpu);
+
+   fq_ring_for_each(idx, fq)
+   put_pages_list(&fq->entries[idx].freelist);
+   }
+
+   free_percpu(iovad->fq);
+
+   iovad->fq = NULL;
+   iovad->fq_domain = NULL;
+}
+
+int init_iova_flush_queue(struct iova_domain *iovad, struct iommu_domain 
*fq_domain)
+{
+   struct iova_fq __percpu *queue;
+   int i, cpu;
+
+   atomic64_set(&iovad->fq_flush_start_cnt,  0);
+   atomic64_set(&iovad->fq_flush_finish_cnt, 0);
+
+   queue = alloc_percpu(struct iova_fq);
+   if (!queue)
+   return -ENOMEM;
+
+   for_each_possible_cpu(cpu) {
+   struct iova_fq *fq = per_cpu_ptr(queue, cpu);
+
+   fq->head = 0;
+   fq->tail = 0;
+
+   spin_lock_init(&fq->lock);
+
+   for (i = 0; i < IOVA_FQ_SIZE; i++)
+   INIT_LIST_HEAD(&fq->entries[i].freelist);
+   }
+
+   iovad->fq_domain = fq_domain;
+   iovad->fq = queue;
+
+   timer_setup(&iovad->fq_timer, fq_flush_timeout, 0);
+   atomic_set(&iovad->fq_timer

[PATCH v3 6/9] iommu/vt-d: Use put_pages_list

2021-12-17 Thread Robin Murphy
From: "Matthew Wilcox (Oracle)" 

page->freelist is for the use of slab.  We already have the ability
to free a list of pages in the core mm, but it requires the use of a
list_head and for the pages to be chained together through page->lru.
Switch the Intel IOMMU and IOVA code over to using free_pages_list().

Signed-off-by: Matthew Wilcox (Oracle) 
[rm: split from original patch, cosmetic tweaks, fix fq entries]
Signed-off-by: Robin Murphy 
---
 drivers/iommu/dma-iommu.c   |  2 +-
 drivers/iommu/intel/iommu.c | 89 +
 drivers/iommu/iova.c| 26 ---
 include/linux/iommu.h   |  3 +-
 include/linux/iova.h|  4 +-
 5 files changed, 45 insertions(+), 79 deletions(-)

diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index cde887530549..f139b77caee0 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -455,7 +455,7 @@ static void iommu_dma_free_iova(struct iommu_dma_cookie 
*cookie,
else if (gather && gather->queued)
queue_iova(iovad, iova_pfn(iovad, iova),
size >> iova_shift(iovad),
-   gather->freelist);
+   &gather->freelist);
else
free_iova_fast(iovad, iova_pfn(iovad, iova),
size >> iova_shift(iovad));
diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index b6a8f3282411..17b3d97111f3 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -1303,35 +1303,30 @@ static void dma_pte_free_pagetable(struct dmar_domain 
*domain,
know the hardware page-walk will no longer touch them.
The 'pte' argument is the *parent* PTE, pointing to the page that is to
be freed. */
-static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
-   int level, struct dma_pte *pte,
-   struct page *freelist)
+static void dma_pte_list_pagetables(struct dmar_domain *domain,
+   int level, struct dma_pte *pte,
+   struct list_head *freelist)
 {
struct page *pg;
 
pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
-   pg->freelist = freelist;
-   freelist = pg;
+   list_add_tail(&pg->lru, freelist);
 
if (level == 1)
-   return freelist;
+   return;
 
pte = page_address(pg);
do {
if (dma_pte_present(pte) && !dma_pte_superpage(pte))
-   freelist = dma_pte_list_pagetables(domain, level - 1,
-  pte, freelist);
+   dma_pte_list_pagetables(domain, level - 1, pte, 
freelist);
pte++;
} while (!first_pte_in_page(pte));
-
-   return freelist;
 }
 
-static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
-   struct dma_pte *pte, unsigned long pfn,
-   unsigned long start_pfn,
-   unsigned long last_pfn,
-   struct page *freelist)
+static void dma_pte_clear_level(struct dmar_domain *domain, int level,
+   struct dma_pte *pte, unsigned long pfn,
+   unsigned long start_pfn, unsigned long last_pfn,
+   struct list_head *freelist)
 {
struct dma_pte *first_pte = NULL, *last_pte = NULL;
 
@@ -1350,7 +1345,7 @@ static struct page *dma_pte_clear_level(struct 
dmar_domain *domain, int level,
/* These suborbinate page tables are going away 
entirely. Don't
   bother to clear them; we're just going to *free* 
them. */
if (level > 1 && !dma_pte_superpage(pte))
-   freelist = dma_pte_list_pagetables(domain, 
level - 1, pte, freelist);
+   dma_pte_list_pagetables(domain, level - 1, pte, 
freelist);
 
dma_clear_pte(pte);
if (!first_pte)
@@ -1358,10 +1353,10 @@ static struct page *dma_pte_clear_level(struct 
dmar_domain *domain, int level,
last_pte = pte;
} else if (level > 1) {
/* Recurse down into a level that isn't *entirely* 
obsolete */
-   freelist = dma_pte_clear_level(domain, level - 1,
-  
phys_to_virt(dma_pte_addr(pte)),
-  level_pfn, start_pfn, 
last_pfn,
-  freelist);
+   dma_pte_clear_level(domain, level

[PATCH v3 5/9] iommu/amd: Use put_pages_list

2021-12-17 Thread Robin Murphy
From: "Matthew Wilcox (Oracle)" 

page->freelist is for the use of slab.  We already have the ability
to free a list of pages in the core mm, but it requires the use of a
list_head and for the pages to be chained together through page->lru.
Switch the AMD IOMMU code over to using put_pages_list().

Signed-off-by: Matthew Wilcox (Oracle) 
[rm: split from original patch, cosmetic tweaks]
Signed-off-by: Robin Murphy 
---
 drivers/iommu/amd/io_pgtable.c | 50 --
 1 file changed, 18 insertions(+), 32 deletions(-)

diff --git a/drivers/iommu/amd/io_pgtable.c b/drivers/iommu/amd/io_pgtable.c
index 4165e1372b6e..b1bf4125b0f7 100644
--- a/drivers/iommu/amd/io_pgtable.c
+++ b/drivers/iommu/amd/io_pgtable.c
@@ -74,26 +74,14 @@ static u64 *first_pte_l7(u64 *pte, unsigned long *page_size,
  *
  /
 
-static void free_page_list(struct page *freelist)
-{
-   while (freelist != NULL) {
-   unsigned long p = (unsigned long)page_address(freelist);
-
-   freelist = freelist->freelist;
-   free_page(p);
-   }
-}
-
-static struct page *free_pt_page(u64 *pt, struct page *freelist)
+static void free_pt_page(u64 *pt, struct list_head *freelist)
 {
struct page *p = virt_to_page(pt);
 
-   p->freelist = freelist;
-
-   return p;
+   list_add_tail(&p->lru, freelist);
 }
 
-static struct page *free_pt_lvl(u64 *pt, struct page *freelist, int lvl)
+static void free_pt_lvl(u64 *pt, struct list_head *freelist, int lvl)
 {
u64 *p;
int i;
@@ -114,22 +102,22 @@ static struct page *free_pt_lvl(u64 *pt, struct page 
*freelist, int lvl)
 */
p = IOMMU_PTE_PAGE(pt[i]);
if (lvl > 2)
-   freelist = free_pt_lvl(p, freelist, lvl - 1);
+   free_pt_lvl(p, freelist, lvl - 1);
else
-   freelist = free_pt_page(p, freelist);
+   free_pt_page(p, freelist);
}
 
-   return free_pt_page(pt, freelist);
+   free_pt_page(pt, freelist);
 }
 
-static struct page *free_sub_pt(u64 *root, int mode, struct page *freelist)
+static void free_sub_pt(u64 *root, int mode, struct list_head *freelist)
 {
switch (mode) {
case PAGE_MODE_NONE:
case PAGE_MODE_7_LEVEL:
break;
case PAGE_MODE_1_LEVEL:
-   freelist = free_pt_page(root, freelist);
+   free_pt_page(root, freelist);
break;
case PAGE_MODE_2_LEVEL:
case PAGE_MODE_3_LEVEL:
@@ -141,8 +129,6 @@ static struct page *free_sub_pt(u64 *root, int mode, struct 
page *freelist)
default:
BUG();
}
-
-   return freelist;
 }
 
 void amd_iommu_domain_set_pgtable(struct protection_domain *domain,
@@ -350,7 +336,7 @@ static u64 *fetch_pte(struct amd_io_pgtable *pgtable,
return pte;
 }
 
-static struct page *free_clear_pte(u64 *pte, u64 pteval, struct page *freelist)
+static void free_clear_pte(u64 *pte, u64 pteval, struct list_head *freelist)
 {
u64 *pt;
int mode;
@@ -361,12 +347,12 @@ static struct page *free_clear_pte(u64 *pte, u64 pteval, 
struct page *freelist)
}
 
if (!IOMMU_PTE_PRESENT(pteval))
-   return freelist;
+   return;
 
pt   = IOMMU_PTE_PAGE(pteval);
mode = IOMMU_PTE_MODE(pteval);
 
-   return free_sub_pt(pt, mode, freelist);
+   free_sub_pt(pt, mode, freelist);
 }
 
 /*
@@ -380,7 +366,7 @@ static int iommu_v1_map_page(struct io_pgtable_ops *ops, 
unsigned long iova,
  phys_addr_t paddr, size_t size, int prot, gfp_t gfp)
 {
struct protection_domain *dom = io_pgtable_ops_to_domain(ops);
-   struct page *freelist = NULL;
+   LIST_HEAD(freelist);
bool updated = false;
u64 __pte, *pte;
int ret, i, count;
@@ -400,9 +386,9 @@ static int iommu_v1_map_page(struct io_pgtable_ops *ops, 
unsigned long iova,
goto out;
 
for (i = 0; i < count; ++i)
-   freelist = free_clear_pte(&pte[i], pte[i], freelist);
+   free_clear_pte(&pte[i], pte[i], &freelist);

-   if (freelist != NULL)
+   if (!list_empty(&freelist))
updated = true;
 
if (count > 1) {
@@ -437,7 +423,7 @@ static int iommu_v1_map_page(struct io_pgtable_ops *ops, 
unsigned long iova,
}
 
/* Everything flushed out, free pages now */
-   free_page_list(freelist);
+   put_pages_list(&freelist);
 
return ret;
 }
@@ -499,7 +485,7 @@ static void v1_free_pgtable(struct io_pgtable *iop)
 {
struct amd_io_pgtable *pgtable = container_of(iop, struct 
amd_io_pgtable, iop);
struct protection_domain *dom;
-   struct page *freelist = NULL;
+   LIST_HEAD(freelist);
 
if (pgtable->mode == PAGE_MODE_NONE)
re

[PATCH v3 4/9] iommu/amd: Simplify pagetable freeing

2021-12-17 Thread Robin Murphy
For reasons unclear, pagetable freeing is an effectively recursive
method implemented via an elaborate system of templated functions that
turns out to account for 25% of the object file size. Implementing it
using regular straightforward recursion makes the code simpler, and
seems like a good thing to do before we work on it further. As part of
that, also fix the types to avoid all the needless casting back and
forth which just gets in the way.

Signed-off-by: Robin Murphy 
---
 drivers/iommu/amd/io_pgtable.c | 82 ++
 1 file changed, 34 insertions(+), 48 deletions(-)

diff --git a/drivers/iommu/amd/io_pgtable.c b/drivers/iommu/amd/io_pgtable.c
index 182c93a43efd..4165e1372b6e 100644
--- a/drivers/iommu/amd/io_pgtable.c
+++ b/drivers/iommu/amd/io_pgtable.c
@@ -84,49 +84,45 @@ static void free_page_list(struct page *freelist)
}
 }
 
-static struct page *free_pt_page(unsigned long pt, struct page *freelist)
+static struct page *free_pt_page(u64 *pt, struct page *freelist)
 {
-   struct page *p = virt_to_page((void *)pt);
+   struct page *p = virt_to_page(pt);
 
p->freelist = freelist;
 
return p;
 }
 
-#define DEFINE_FREE_PT_FN(LVL, FN) 
\
-static struct page *free_pt_##LVL (unsigned long __pt, struct page *freelist)  
\
-{  
\
-   unsigned long p;
\
-   u64 *pt;
\
-   int i;  
\
-   
\
-   pt = (u64 *)__pt;   
\
-   
\
-   for (i = 0; i < 512; ++i) { 
\
-   /* PTE present? */  
\
-   if (!IOMMU_PTE_PRESENT(pt[i]))  
\
-   continue;   
\
-   
\
-   /* Large PTE? */
\
-   if (PM_PTE_LEVEL(pt[i]) == 0 || 
\
-   PM_PTE_LEVEL(pt[i]) == 7)   
\
-   continue;   
\
-   
\
-   p = (unsigned long)IOMMU_PTE_PAGE(pt[i]);   
\
-   freelist = FN(p, freelist); 
\
-   }   
\
-   
\
-   return free_pt_page((unsigned long)pt, freelist);   
\
+static struct page *free_pt_lvl(u64 *pt, struct page *freelist, int lvl)
+{
+   u64 *p;
+   int i;
+
+   for (i = 0; i < 512; ++i) {
+   /* PTE present? */
+   if (!IOMMU_PTE_PRESENT(pt[i]))
+   continue;
+
+   /* Large PTE? */
+   if (PM_PTE_LEVEL(pt[i]) == 0 ||
+   PM_PTE_LEVEL(pt[i]) == 7)
+   continue;
+
+   /*
+* Free the next level. No need to look at l1 tables here since
+* they can only contain leaf PTEs; just free them directly.
+*/
+   p = IOMMU_PTE_PAGE(pt[i]);
+   if (lvl > 2)
+   freelist = free_pt_lvl(p, freelist, lvl - 1);
+   else
+   freelist = free_pt_page(p, freelist);
+   }
+
+   return free_pt_page(pt, freelist);
 }
 
-DEFINE_FREE_PT_FN(l2, free_pt_page)
-DEFINE_FREE_PT_FN(l3, free_pt_l2)
-DEFINE_FREE_PT_FN(l4, free_pt_l3)
-DEFINE_FREE_PT_FN(l5, free_pt_l4)
-DEFINE_FREE_PT_FN(l6, free_pt_l5)
-
-static struct page *free_sub_pt(unsigned long root, int mode,
-   struct page *freelist)
+static struct page *free_sub_pt(u64 *root, int mode, struct page *freelist)
 {
switch (mode) {
case PAGE_MODE_NONE:
@@ -136,19 +132,11 @@ static struct page *free_sub_pt(unsigned long root, int 
mode,
freelist = free_pt_page(root, freelist);
break;
case PAGE_MODE_2_LEVEL:
-   freelist = free_pt_l2(root, freelist);
-   break;
case PAGE_MODE_3_LEVEL:
-   freelist = free_pt_l3(root, freelist);
-   break;
case PAGE_MODE_4_LEVEL:
-   freelist = free_pt_l4(ro

[PATCH v3 3/9] iommu/iova: Squash flush_cb abstraction

2021-12-17 Thread Robin Murphy
Once again, with iommu-dma now being the only flush queue user, we no
longer need the extra level of indirection through flush_cb. Squash that
and let the flush queue code call the domain method directly. This does
mean temporarily having to carry an additional copy of the IOMMU domain
pointer around instead, but only until a later patch untangles it again.

Reviewed-by: John Garry 
Reviewed-by: Christoph Hellwig 
Signed-off-by: Robin Murphy 
---
 drivers/iommu/dma-iommu.c | 13 +
 drivers/iommu/iova.c  | 11 +--
 include/linux/iova.h  | 11 +++
 3 files changed, 9 insertions(+), 26 deletions(-)

diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index fa21b9141b71..cde887530549 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -282,17 +282,6 @@ static int iova_reserve_iommu_regions(struct device *dev,
return ret;
 }
 
-static void iommu_dma_flush_iotlb_all(struct iova_domain *iovad)
-{
-   struct iommu_dma_cookie *cookie;
-   struct iommu_domain *domain;
-
-   cookie = container_of(iovad, struct iommu_dma_cookie, iovad);
-   domain = cookie->fq_domain;
-
-   domain->ops->flush_iotlb_all(domain);
-}
-
 static bool dev_is_untrusted(struct device *dev)
 {
return dev_is_pci(dev) && to_pci_dev(dev)->untrusted;
@@ -312,7 +301,7 @@ int iommu_dma_init_fq(struct iommu_domain *domain)
if (cookie->fq_domain)
return 0;
 
-   ret = init_iova_flush_queue(&cookie->iovad, iommu_dma_flush_iotlb_all);
+   ret = init_iova_flush_queue(&cookie->iovad, domain);
if (ret) {
pr_warn("iova flush queue initialization failed\n");
return ret;
diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
index 280dd0c7fe1b..76bc6f39fac7 100644
--- a/drivers/iommu/iova.c
+++ b/drivers/iommu/iova.c
@@ -63,7 +63,7 @@ init_iova_domain(struct iova_domain *iovad, unsigned long 
granule,
iovad->start_pfn = start_pfn;
iovad->dma_32bit_pfn = 1UL << (32 - iova_shift(iovad));
iovad->max32_alloc_size = iovad->dma_32bit_pfn;
-   iovad->flush_cb = NULL;
+   iovad->fq_domain = NULL;
iovad->fq = NULL;
iovad->anchor.pfn_lo = iovad->anchor.pfn_hi = IOVA_ANCHOR;
rb_link_node(>anchor.node, NULL, >rbroot.rb_node);
@@ -90,10 +90,10 @@ static void free_iova_flush_queue(struct iova_domain *iovad)
free_percpu(iovad->fq);
 
iovad->fq = NULL;
-   iovad->flush_cb   = NULL;
+   iovad->fq_domain  = NULL;
 }
 
-int init_iova_flush_queue(struct iova_domain *iovad, iova_flush_cb flush_cb)
+int init_iova_flush_queue(struct iova_domain *iovad, struct iommu_domain 
*fq_domain)
 {
struct iova_fq __percpu *queue;
int cpu;
@@ -105,8 +105,6 @@ int init_iova_flush_queue(struct iova_domain *iovad, 
iova_flush_cb flush_cb)
if (!queue)
return -ENOMEM;
 
-   iovad->flush_cb   = flush_cb;
-
for_each_possible_cpu(cpu) {
struct iova_fq *fq;
 
@@ -117,6 +115,7 @@ int init_iova_flush_queue(struct iova_domain *iovad, 
iova_flush_cb flush_cb)
spin_lock_init(&fq->lock);
}
 
+   iovad->fq_domain = fq_domain;
iovad->fq = queue;
 
timer_setup(&iovad->fq_timer, fq_flush_timeout, 0);
@@ -589,7 +588,7 @@ static void fq_ring_free(struct iova_domain *iovad, struct 
iova_fq *fq)
 static void iova_domain_flush(struct iova_domain *iovad)
 {
atomic64_inc(&iovad->fq_flush_start_cnt);
-   iovad->flush_cb(iovad);
+   iovad->fq_domain->ops->flush_iotlb_all(iovad->fq_domain);
atomic64_inc(&iovad->fq_flush_finish_cnt);
 }
 
diff --git a/include/linux/iova.h b/include/linux/iova.h
index e746d8e41449..99be4fcea4f3 100644
--- a/include/linux/iova.h
+++ b/include/linux/iova.h
@@ -14,6 +14,7 @@
 #include 
 #include 
 #include 
+#include 
 
 /* iova structure */
 struct iova {
@@ -35,11 +36,6 @@ struct iova_rcache {
struct iova_cpu_rcache __percpu *cpu_rcaches;
 };
 
-struct iova_domain;
-
-/* Call-Back from IOVA code into IOMMU drivers */
-typedef void (* iova_flush_cb)(struct iova_domain *domain);
-
 /* Number of entries per Flush Queue */
 #define IOVA_FQ_SIZE   256
 
@@ -82,8 +78,7 @@ struct iova_domain {
struct iova anchor; /* rbtree lookup anchor */
struct iova_rcache rcaches[IOVA_RANGE_CACHE_MAX_SIZE];  /* IOVA range 
caches */
 
-   iova_flush_cb   flush_cb;   /* Call-Back function to flush IOMMU
-  TLBs */
+   struct iommu_domain *fq_domain;
 
struct timer_list fq_timer; /* Timer to regularily empty the
   flush-queues */
@@ -147,7 +142,7 @@ struct iova *reserve_iova(struct iova_domain *iovad, 
unsigned long pfn_lo,
unsigned long pfn_hi);
 void init_iov

[PATCH v3 2/9] iommu/iova: Squash entry_dtor abstraction

2021-12-17 Thread Robin Murphy
All flush queues are driven by iommu-dma now, so there is no need to
abstract entry_dtor or its data any more. Squash the now-canonical
implementation directly into the IOVA code to get it out of the way.

Reviewed-by: John Garry 
Reviewed-by: Christoph Hellwig 
Signed-off-by: Robin Murphy 
---
 drivers/iommu/dma-iommu.c | 17 ++---
 drivers/iommu/iova.c  | 28 +++-
 include/linux/iova.h  | 26 +++---
 3 files changed, 20 insertions(+), 51 deletions(-)

diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index b42e38a0dbe2..fa21b9141b71 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -64,18 +64,6 @@ static int __init iommu_dma_forcedac_setup(char *str)
 }
 early_param("iommu.forcedac", iommu_dma_forcedac_setup);
 
-static void iommu_dma_entry_dtor(unsigned long data)
-{
-   struct page *freelist = (struct page *)data;
-
-   while (freelist) {
-   unsigned long p = (unsigned long)page_address(freelist);
-
-   freelist = freelist->freelist;
-   free_page(p);
-   }
-}
-
 static inline size_t cookie_msi_granule(struct iommu_dma_cookie *cookie)
 {
if (cookie->type == IOMMU_DMA_IOVA_COOKIE)
@@ -324,8 +312,7 @@ int iommu_dma_init_fq(struct iommu_domain *domain)
if (cookie->fq_domain)
return 0;
 
-   ret = init_iova_flush_queue(&cookie->iovad, iommu_dma_flush_iotlb_all,
-   iommu_dma_entry_dtor);
+   ret = init_iova_flush_queue(&cookie->iovad, iommu_dma_flush_iotlb_all);
if (ret) {
pr_warn("iova flush queue initialization failed\n");
return ret;
@@ -479,7 +466,7 @@ static void iommu_dma_free_iova(struct iommu_dma_cookie 
*cookie,
else if (gather && gather->queued)
queue_iova(iovad, iova_pfn(iovad, iova),
size >> iova_shift(iovad),
-   (unsigned long)gather->freelist);
+   gather->freelist);
else
free_iova_fast(iovad, iova_pfn(iovad, iova),
size >> iova_shift(iovad));
diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
index 920fcc27c9a1..280dd0c7fe1b 100644
--- a/drivers/iommu/iova.c
+++ b/drivers/iommu/iova.c
@@ -91,11 +91,9 @@ static void free_iova_flush_queue(struct iova_domain *iovad)
 
iovad->fq = NULL;
iovad->flush_cb   = NULL;
-   iovad->entry_dtor = NULL;
 }
 
-int init_iova_flush_queue(struct iova_domain *iovad,
- iova_flush_cb flush_cb, iova_entry_dtor entry_dtor)
+int init_iova_flush_queue(struct iova_domain *iovad, iova_flush_cb flush_cb)
 {
struct iova_fq __percpu *queue;
int cpu;
@@ -108,7 +106,6 @@ int init_iova_flush_queue(struct iova_domain *iovad,
return -ENOMEM;
 
iovad->flush_cb   = flush_cb;
-   iovad->entry_dtor = entry_dtor;
 
for_each_possible_cpu(cpu) {
struct iova_fq *fq;
@@ -538,6 +535,16 @@ free_iova_fast(struct iova_domain *iovad, unsigned long 
pfn, unsigned long size)
 }
 EXPORT_SYMBOL_GPL(free_iova_fast);
 
+static void fq_entry_dtor(struct page *freelist)
+{
+   while (freelist) {
+   unsigned long p = (unsigned long)page_address(freelist);
+
+   freelist = freelist->freelist;
+   free_page(p);
+   }
+}
+
 #define fq_ring_for_each(i, fq) \
for ((i) = (fq)->head; (i) != (fq)->tail; (i) = ((i) + 1) % 
IOVA_FQ_SIZE)
 
@@ -570,9 +577,7 @@ static void fq_ring_free(struct iova_domain *iovad, struct 
iova_fq *fq)
if (fq->entries[idx].counter >= counter)
break;
 
-   if (iovad->entry_dtor)
-   iovad->entry_dtor(fq->entries[idx].data);
-
+   fq_entry_dtor(fq->entries[idx].freelist);
free_iova_fast(iovad,
   fq->entries[idx].iova_pfn,
   fq->entries[idx].pages);
@@ -597,15 +602,12 @@ static void fq_destroy_all_entries(struct iova_domain 
*iovad)
 * bother to free iovas, just call the entry_dtor on all remaining
 * entries.
 */
-   if (!iovad->entry_dtor)
-   return;
-
for_each_possible_cpu(cpu) {
struct iova_fq *fq = per_cpu_ptr(iovad->fq, cpu);
int idx;
 
fq_ring_for_each(idx, fq)
-   iovad->entry_dtor(fq->entries[idx].data);
+   fq_entry_dtor(fq->entries[idx].freelist);
}
 }
 
@@ -630,7 +632,7 @@ static void fq_flush_timeout(struct timer_list *t)
 
 void queue_iova(struct iova_domain *iovad,
unsigned long pfn, unsigned long pages,
-   unsigned long data)
+  

[PATCH v3 1/9] iommu/iova: Fix race between FQ timeout and teardown

2021-12-17 Thread Robin Murphy
From: Xiongfeng Wang 

It turns out to be possible for hotplugging out a device to reach the
stage of tearing down the device's group and default domain before the
domain's flush queue has drained naturally. At this point, it is then
possible for the timeout to expire just before the del_timer() call
in free_iova_flush_queue(), such that we then proceed to free the FQ
resources while fq_flush_timeout() is still accessing them on another
CPU. Crashes due to this have been observed in the wild while removing
NVMe devices.

Close the race window by using del_timer_sync() to safely wait for any
active timeout handler to finish before we start to free things. We
already avoid any locking in free_iova_flush_queue() since the FQ is
supposed to be inactive anyway, so the potential deadlock scenario does
not apply.
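
For illustration, the shape of the fix is the usual timer teardown pattern
(a minimal sketch with made-up names, not the actual iova code):

#include <linux/slab.h>
#include <linux/timer.h>

struct foo {
	struct timer_list timer;
	void *resources;
};

static void foo_destroy(struct foo *f)
{
	/*
	 * del_timer() only deactivates a pending timer; a handler that has
	 * already started on another CPU keeps running and may still touch
	 * f->resources after we free it. del_timer_sync() additionally waits
	 * for any in-flight handler to return, closing that window.
	 */
	del_timer_sync(&f->timer);
	kfree(f->resources);
	kfree(f);
}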

Fixes: 9a005a800ae8 ("iommu/iova: Add flush timer")
Reviewed-by: John Garry 
Signed-off-by: Xiongfeng Wang 
[ rm: rewrite commit message ]
Signed-off-by: Robin Murphy 
---
 drivers/iommu/iova.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
index 9e8bc802ac05..920fcc27c9a1 100644
--- a/drivers/iommu/iova.c
+++ b/drivers/iommu/iova.c
@@ -83,8 +83,7 @@ static void free_iova_flush_queue(struct iova_domain *iovad)
if (!has_iova_flush_queue(iovad))
return;
 
-   if (timer_pending(&iovad->fq_timer))
-   del_timer(&iovad->fq_timer);
+   del_timer_sync(&iovad->fq_timer);
 
fq_destroy_all_entries(iovad);
 
-- 
2.28.0.dirty

___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


[PATCH v3 0/9] iommu: refactor flush queues into iommu-dma

2021-12-17 Thread Robin Murphy
v1: 
https://lore.kernel.org/linux-iommu/cover.1637671820.git.robin.mur...@arm.com/
v2: 
https://lore.kernel.org/linux-iommu/cover.1639157090.git.robin.mur...@arm.com/

Hi all,

Just another quick update addressing the trivial nits and picking up the
review tags so far. The previous Tegra DRM fixes for the initial kbuild
issues have been picked up, so I've posted a final fix separately[1] for
the others subsequently reported on v2. I've confirmed that arm64
allmodconfig builds cleanly with that, and nothing else jumped out from
a manual audit of iova.h includers. At worst we could hold off applying
the last patch (or the last two, logically), or temporarily reinstate
the dma-iommu.h include, if we're worried about issues in linux-next
until the DRM tree has caught up.

Thanks
Robin.

[1] 
https://lore.kernel.org/linux-iommu/dc81eec74be9064e33247257b1fe439b0f6ec78d.1639664721.git.robin.mur...@arm.com/

Matthew Wilcox (Oracle) (2):
  iommu/amd: Use put_pages_list
  iommu/vt-d: Use put_pages_list

Robin Murphy (6):
  iommu/iova: Squash entry_dtor abstraction
  iommu/iova: Squash flush_cb abstraction
  iommu/amd: Simplify pagetable freeing
  iommu/iova: Consolidate flush queue code
  iommu/iova: Move flush queue code to iommu-dma
  iommu: Move flush queue data into iommu_dma_cookie

Xiongfeng Wang (1):
  iommu/iova: Fix race between FQ timeout and teardown

 drivers/iommu/amd/io_pgtable.c | 120 ++-
 drivers/iommu/dma-iommu.c  | 266 +++--
 drivers/iommu/intel/iommu.c|  89 ---
 drivers/iommu/iova.c   | 200 -
 include/linux/iommu.h  |   3 +-
 include/linux/iova.h   |  69 +
 6 files changed, 297 insertions(+), 450 deletions(-)

-- 
2.28.0.dirty

___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


Re: [PATCH v2 05/11] iommu/iova: Squash flush_cb abstraction

2021-12-17 Thread Robin Murphy

On 2021-12-16 08:02, Christoph Hellwig wrote:

@@ -147,7 +142,7 @@ struct iova *reserve_iova(struct iova_domain *iovad, 
unsigned long pfn_lo,
unsigned long pfn_hi);
  void init_iova_domain(struct iova_domain *iovad, unsigned long granule,
unsigned long start_pfn);
-int init_iova_flush_queue(struct iova_domain *iovad, iova_flush_cb flush_cb);
+int init_iova_flush_queue(struct iova_domain *iovad, struct iommu_domain 
*fq_domain);


Overly long line here.


Fear not, it disappears again in the final patch. In cases like this I 
much prefer to minimise the amount of churn within the series, to keep 
the patches focused on the meaningful changes.



Otherwise looks good:

Reviewed-by: Christoph Hellwig 


Thanks for the reviews!

Robin.
___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


[PATCH] drm/tegra: Add missing DMA API includes

2021-12-16 Thread Robin Murphy
Include dma-mapping.h directly in the remaining places that touch the
DMA API, to avoid imminent breakage from refactoring other headers.

Signed-off-by: Robin Murphy 
---

This makes arm64 allmodconfig build cleanly for me with the IOVA series,
so hopefully is the last of it...

 drivers/gpu/drm/tegra/dc.c| 1 +
 drivers/gpu/drm/tegra/hub.c   | 1 +
 drivers/gpu/drm/tegra/plane.c | 1 +
 3 files changed, 3 insertions(+)

diff --git a/drivers/gpu/drm/tegra/dc.c b/drivers/gpu/drm/tegra/dc.c
index a29d64f87563..c33420e1fb07 100644
--- a/drivers/gpu/drm/tegra/dc.c
+++ b/drivers/gpu/drm/tegra/dc.c
@@ -7,6 +7,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
diff --git a/drivers/gpu/drm/tegra/hub.c b/drivers/gpu/drm/tegra/hub.c
index b910155f80c4..7d52a403334b 100644
--- a/drivers/gpu/drm/tegra/hub.c
+++ b/drivers/gpu/drm/tegra/hub.c
@@ -5,6 +5,7 @@
 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
diff --git a/drivers/gpu/drm/tegra/plane.c b/drivers/gpu/drm/tegra/plane.c
index 16a1cdc28657..75db069dd26f 100644
--- a/drivers/gpu/drm/tegra/plane.c
+++ b/drivers/gpu/drm/tegra/plane.c
@@ -3,6 +3,7 @@
  * Copyright (C) 2017 NVIDIA CORPORATION.  All rights reserved.
  */
 
+#include 
 #include 
 #include 
 
-- 
2.28.0.dirty

___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


Re: [PATCH 2/2] doc: add Arm Juno board documentation

2021-12-15 Thread Robin Murphy

Hi Andre,

On 2021-12-14 17:55, Andre Przywara wrote:

The Juno Arm development board is an open, vendor-neutral, Armv8-A
development platform.
Add documentation that briefly outlines the hardware, and describes
building and installation of U-Boot.

Signed-off-by: Andre Przywara 
---
  doc/board/armltd/index.rst |   1 +
  doc/board/armltd/juno.rst  | 117 +
  2 files changed, 118 insertions(+)
  create mode 100644 doc/board/armltd/juno.rst

diff --git a/doc/board/armltd/index.rst b/doc/board/armltd/index.rst
index caa6fd2bb0..68d938c647 100644
--- a/doc/board/armltd/index.rst
+++ b/doc/board/armltd/index.rst
@@ -8,3 +8,4 @@ ARM Ltd. boards and emulated systems
 :maxdepth: 2
  
 fvp64

+   juno
diff --git a/doc/board/armltd/juno.rst b/doc/board/armltd/juno.rst
new file mode 100644
index 00..f37bc2c78e
--- /dev/null
+++ b/doc/board/armltd/juno.rst
@@ -0,0 +1,117 @@
+.. SPDX-License-Identifier: GPL-2.0
+.. Copyright (C) 2021 Arm Ltd.
+
+Arm Juno board
+==
+
+The `Juno development board`_ is an open, vendor-neutral, Armv8-A


Nit: I believe its official name (or at least the closest thing to one) 
is "Juno ARM Development Platform", although that's admittedly not how 
the hardware product page being cross-referenced here is titled. Maybe 
that's just one for the commit message.



+development platform, made by Arm Ltd. It is based on the former Versatile


Arguably not so much "based on" as "part of" ;)


+Express series.
+There are three revisions of the board:
+
+* Juno r0, with two Cortex-A57 and four Cortex-A53 cores, without PCIe.
+* Juno r1, with two Cortex-A57 and four Cortex-A53 cores, in later silicon
+  revisions, and with PCIe slots, Gigabit Ethernet and two SATA ports.
+* Juno r2, with two Cortex-A72 and four Cortex-A53 cores, otherwise the
+  same as r1.
+
+Among other things, the motherboard contains a management controller (MCP),


Note that "MCP" is a standard name for something else not relevant to 
Juno - the correct name for this thing is "MCC" (Motherboard 
Configuration Controller).



+an FPGA providing I/O interfaces (IOFPGA) and 64MB of NOR flash. The provided
+platform devices resemble the VExpress peripherals.
+The actual SoC also contains a Cortex-M3 based System Control Processor (SCP).
+
+U-Boot build
+
+There is only one defconfig and one binary build that covers all three board
+revisions, so to generate the needed ``u-boot.bin``:
+
+.. code-block:: bash
+
+$ make vexpress_aemv8a_juno_defconfig
+$ make
+
+The automatic distro boot sequence looks for UEFI boot applications and
+``boot.scr`` scripts on various boot media, starting with USB, then on disks
+connected to the two SATA ports, PXE, DHCP and eventually on the NOR flash.
+
+U-Boot installation
+---
+This assumes there is some firmware on the SD card or NOR flash (see below
+for more details). The U-Boot binary is included in the Trusted Firmware
+FIP image, so after building U-Boot, this needs to be repackaged or recompiled.
+
+The NOR flash will be updated by the MCP, based on the content of a micro-SD
+card, which will be exported as a USB mass storage device via the rear USB-B
+socket. So to access that SD card, connect a USB-A->USB-B cable between some
+host computer and the board, and mount the FAT partition on the UMS device.


This sentence feels a little prejudiced against those of us with only 
USB-C host ports :P


If you want to get into this much detail it could be worth noting that 
it's also viable (and often faster with large images) to power off the 
board completely and pop the card out from behind the front panel. But 
for a simple guide I reckon the mere mention of the USB-B socket and 
FAT16-formatted card implies enough already.



+If there is no device, check the upper serial port for a prompt, and
+explicitly enable the USB interface::
+
+Cmd> usb_on
+Enabling debug USB...
+
+Repackaging an existing FIP image
+^
+To prevent problems, it is probably a good idea to backup the existing 
firmware,
+for instance by just copying the entire SOFTWARE directory beforehand.
+
+To just repackage with an updated U-Boot, first extract the current FIP image:


There should be a `fiptool update` command that allows replacing one or 
more components in a FIP image directly, which is probably less daunting 
for the casual user.
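
(Untested, but from memory something along the lines of
"fiptool update --nt-fw u-boot.bin fip.bin" should do the replacement in
place, without the whole unpack/create dance.)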



+
+.. code-block:: bash
+
+$ mkdir /tmp/juno; cd /tmp/juno
+$ fiptool unpack /mnt/juno/SOFTWARE/fip.bin
+
+Then, re-assemble the FIP image, replacing the "``nt-fw``" component with
+your newly compiled ``u-boot.bin``. To find the right command line, look at the
+output of "``fiptool info``", then use the given command line option for each
+file:
+
+.. code-block:: bash
+
+$ fiptool info /mnt/juno/SOFTWARE/fip.bin
+$ fiptool create --scp-fw scp-fw.bin --soc-fw soc-fw.bin \
+  --hw-config hw-config.bin ... --nt-fw 

Re: [PATCH v2 10/11] iommu/iova: Move flush queue code to iommu-dma

2021-12-14 Thread Robin Murphy

On 2021-12-14 17:18, John Garry via iommu wrote:

On 10/12/2021 17:54, Robin Murphy wrote:

+    iovad->fq_domain = fq_domain;
+    iovad->fq = queue;
+
+    timer_setup(&iovad->fq_timer, fq_flush_timeout, 0);
+    atomic_set(&iovad->fq_timer_on, 0);
+
+    return 0;
+}
+
+


nit: a single blank line is standard, I think


Hmm, you're right - I've grown fond of leaving an extra little bit of 
breathing space between logically-independent sections of code, and for 
some reason I thought this file was already in that style, but indeed it 
isn't.


Joerg - let me know if you feel strongly enough that you'd like me to 
change that. I'm going to have one last go at fixing tegra-drm, so I'm 
happy to send a v3 of the whole series later this week if there are any 
other minor tweaks too.


Thanks for all the reviews!

Robin.



Cheers

  static inline size_t cookie_msi_granule(struct iommu_dma_cookie 
*cookie)

  {



___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu

___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu

Re: [PATCH v2 0/8] Host1x context isolation support

2021-12-14 Thread Robin Murphy

On 2021-11-08 10:36, Mikko Perttunen wrote:

On 9/16/21 5:32 PM, Mikko Perttunen wrote:

Hi all,

***
New in v2:

Added support for Tegra194
Use standard iommu-map property instead of custom mechanism
***

this series adds support for Host1x 'context isolation'. Since
when programming engines through Host1x, userspace can program in
any addresses it wants, we need some way to isolate the engines'
memory spaces. Traditionally this has either been done imperfectly
with a single shared IOMMU domain, or by copying and verifying the
programming command stream at submit time (Host1x firewall).

Since Tegra186 there is a privileged (only usable by kernel)
Host1x opcode that allows setting the stream ID sent by the engine
to the SMMU. So, by allocating a number of context banks and stream
IDs for this purpose, and using this opcode at the beginning of
each job, we can implement isolation. Due to the limited number of
context banks only each process gets its own context, and not
each channel.

This feature also allows sharing engines among multiple VMs when
used with Host1x's hardware virtualization support - up to 8 VMs
can be configured with a subset of allowed stream IDs, enforced
at hardware level.

To implement this, this series adds a new host1x context bus, which
will contain the 'struct device's corresponding to each context
bank / stream ID, changes to device tree and SMMU code to allow
registering the devices and using the bus, as well as the Host1x
stream ID programming code and support in TegraDRM.

Device tree bindings are not updated yet pending consensus that the
proposed changes make sense.

Thanks,
Mikko

Mikko Perttunen (8):
   gpu: host1x: Add context bus
   gpu: host1x: Add context device management code
   gpu: host1x: Program context stream ID on submission
   iommu/arm-smmu: Attach to host1x context device bus
   arm64: tegra: Add Host1x context stream IDs on Tegra186+
   drm/tegra: falcon: Set DMACTX field on DMA transactions
   drm/tegra: vic: Implement get_streamid_offset
   drm/tegra: Support context isolation

  arch/arm64/boot/dts/nvidia/tegra186.dtsi  |  12 ++
  arch/arm64/boot/dts/nvidia/tegra194.dtsi  |  12 ++
  drivers/gpu/Makefile  |   3 +-
  drivers/gpu/drm/tegra/drm.h   |   2 +
  drivers/gpu/drm/tegra/falcon.c    |   8 +
  drivers/gpu/drm/tegra/falcon.h    |   1 +
  drivers/gpu/drm/tegra/submit.c    |  13 ++
  drivers/gpu/drm/tegra/uapi.c  |  34 -
  drivers/gpu/drm/tegra/vic.c   |  38 +
  drivers/gpu/host1x/Kconfig    |   5 +
  drivers/gpu/host1x/Makefile   |   2 +
  drivers/gpu/host1x/context.c  | 174 ++
  drivers/gpu/host1x/context.h  |  27 
  drivers/gpu/host1x/context_bus.c  |  31 
  drivers/gpu/host1x/dev.c  |  12 +-
  drivers/gpu/host1x/dev.h  |   2 +
  drivers/gpu/host1x/hw/channel_hw.c    |  52 ++-
  drivers/gpu/host1x/hw/host1x06_hardware.h |  10 ++
  drivers/gpu/host1x/hw/host1x07_hardware.h |  10 ++
  drivers/iommu/arm/arm-smmu/arm-smmu.c |  13 ++
  include/linux/host1x.h    |  21 +++
  include/linux/host1x_context_bus.h    |  15 ++
  22 files changed, 488 insertions(+), 9 deletions(-)
  create mode 100644 drivers/gpu/host1x/context.c
  create mode 100644 drivers/gpu/host1x/context.h
  create mode 100644 drivers/gpu/host1x/context_bus.c
  create mode 100644 include/linux/host1x_context_bus.h



IOMMU/DT folks, any thoughts about this approach? The patches that are 
of interest outside of Host1x/TegraDRM specifics are patches 1, 2, 4, 
and 5.


FWIW it looks fairly innocuous to me. I don't understand host1x - 
neither hardware nor driver abstractions - well enough to meaningfully 
review it all (e.g. maybe it's deliberate that the bus .dma_configure 
method isn't used?), but the SMMU patch seems fine given the Kconfig 
solution to avoid module linkage problems.


Cheers,
Robin.
___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu

Re: [PATCH] Revert "iommu/arm-smmu-v3: Decrease the queue size of evtq and priq"

2021-12-14 Thread Robin Murphy

On 2021-12-14 14:48, Will Deacon wrote:

On Tue, Dec 07, 2021 at 02:32:48PM +0800, Zhou Wang wrote:

The commit f115f3c0d5d8 ("iommu/arm-smmu-v3: Decrease the queue size of
evtq and priq") decreases evtq and priq, which may lead to evtq/priq becoming
full of fault events, e.g. HiSilicon ZIP/SEC/HPRE have a maximum of 1024 queues
in one device, and every queue could be bound to one process and trigger a
fault event. So let's revert f115f3c0d5d8.

In fact, if an implementation of the SMMU really does not need such long evtq
and priq, the values of IDR1_EVTQS and IDR1_PRIQS can be set to proper ones.

Signed-off-by: Zhou Wang 
---
  drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 5 ++---
  1 file changed, 2 insertions(+), 3 deletions(-)


I'd like an Ack from Zhen Lei on this, as the aim of the original patch
was to reduce memory consumption.


I wonder if it's time to start trying to be a bit cleverer than just 
allocating the smallest/largest possible queues, and try to scale them 
to some heuristic of the "size" of the system? Certainly a module 
parameter so that users can manually tune for their system/workload 
might be a reasonable compromise.
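
Something along these lines, say (purely hypothetical parameter name, just to
sketch the idea of a user-tunable cap, not an existing arm-smmu-v3 option):

#include <linux/module.h>
#include <linux/moduleparam.h>

/* Hypothetical knob: cap the event queue at 2^evtq_size_shift entries
   instead of always allocating the maximum that IDR1 advertises. */
static unsigned int evtq_size_shift = 7;
module_param(evtq_size_shift, uint, 0444);
MODULE_PARM_DESC(evtq_size_shift, "log2 of the maximum event queue size");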


I'm also tempted by the idea of trying to dynamically reallocate larger 
queues if they fill up, but I guess that depends on whether it's already 
game over if events or PRI requests are lost...


Robin.
___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


Re: [PATCH v2 01/11] iommu/iova: Fix race between FQ timeout and teardown

2021-12-10 Thread Robin Murphy

On 2021-12-10 18:04, John Garry via iommu wrote:

On 10/12/2021 17:54, Robin Murphy wrote:

From: Xiongfeng Wang

It turns out to be possible for hotplugging out a device to reach the
stage of tearing down the device's group and default domain before the
domain's flush queue has drained naturally. At this point, it is then
possible for the timeout to expire just*before*  the del_timer() call


super nit: "just*before*  the" - needs a whitespace before "before" :)


Weird... the original patch file here and the copy received by lore via 
linux-iommu look fine, gremlins in your MUA or delivery path perhaps?



from free_iova_flush_queue(), such that we then proceed to free the FQ
resources while fq_flush_timeout() is still accessing them on another
CPU. Crashes due to this have been observed in the wild while removing
NVMe devices.

Close the race window by using del_timer_sync() to safely wait for any
active timeout handler to finish before we start to free things. We
already avoid any locking in free_iova_flush_queue() since the FQ is
supposed to be inactive anyway, so the potential deadlock scenario does
not apply.

Fixes: 9a005a800ae8 ("iommu/iova: Add flush timer")
Signed-off-by: Xiongfeng Wang
[ rm: rewrite commit message ]
Signed-off-by: Robin Murphy


FWIW,

Reviewed-by: John Garry 


Thanks John!

Robin.
___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu

[PATCH v2 11/11] iommu: Move flush queue data into iommu_dma_cookie

2021-12-10 Thread Robin Murphy
Complete the move into iommu-dma by refactoring the flush queues
themselves to belong to the DMA cookie rather than the IOVA domain.

The refactoring may as well extend to some minor cosmetic aspects
too, to help us stay one step ahead of the style police.

Signed-off-by: Robin Murphy 
---

v2: Rebase with del_timer_sync() change

 drivers/iommu/dma-iommu.c | 171 +-
 drivers/iommu/iova.c  |   2 -
 include/linux/iova.h  |  44 +-
 3 files changed, 95 insertions(+), 122 deletions(-)

diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index ab8818965b2f..a7cd3a875481 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -9,9 +9,12 @@
  */
 
 #include 
+#include 
+#include 
 #include 
-#include 
+#include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -20,11 +23,10 @@
 #include 
 #include 
 #include 
-#include 
 #include 
+#include 
+#include 
 #include 
-#include 
-#include 
 
 struct iommu_dma_msi_page {
struct list_head        list;
@@ -41,7 +43,19 @@ struct iommu_dma_cookie {
enum iommu_dma_cookie_type  type;
union {
/* Full allocator for IOMMU_DMA_IOVA_COOKIE */
-   struct iova_domain  iovad;
+   struct {
+   struct iova_domain  iovad;
+
+   struct iova_fq __percpu *fq;/* Flush queue */
+   /* Number of TLB flushes that have been started */
+   atomic64_t  fq_flush_start_cnt;
+   /* Number of TLB flushes that have been finished */
+   atomic64_t  fq_flush_finish_cnt;
+   /* Timer to regularily empty the flush queues */
+   struct timer_list   fq_timer;
+   /* 1 when timer is active, 0 when not */
+   atomic_tfq_timer_on;
+   };
/* Trivial linear page allocator for IOMMU_DMA_MSI_COOKIE */
dma_addr_t  msi_iova;
};
@@ -65,6 +79,27 @@ static int __init iommu_dma_forcedac_setup(char *str)
 early_param("iommu.forcedac", iommu_dma_forcedac_setup);
 
 
+/* Number of entries per flush queue */
+#define IOVA_FQ_SIZE   256
+
+/* Timeout (in ms) after which entries are flushed from the queue */
+#define IOVA_FQ_TIMEOUT10
+
+/* Flush queue entry for deferred flushing */
+struct iova_fq_entry {
+   unsigned long iova_pfn;
+   unsigned long pages;
+   struct list_head freelist;
+   u64 counter; /* Flush counter when this entry was added */
+};
+
+/* Per-CPU flush queue structure */
+struct iova_fq {
+   struct iova_fq_entry entries[IOVA_FQ_SIZE];
+   unsigned int head, tail;
+   spinlock_t lock;
+};
+
 #define fq_ring_for_each(i, fq) \
for ((i) = (fq)->head; (i) != (fq)->tail; (i) = ((i) + 1) % 
IOVA_FQ_SIZE)
 
@@ -74,9 +109,9 @@ static inline bool fq_full(struct iova_fq *fq)
return (((fq->tail + 1) % IOVA_FQ_SIZE) == fq->head);
 }
 
-static inline unsigned fq_ring_add(struct iova_fq *fq)
+static inline unsigned int fq_ring_add(struct iova_fq *fq)
 {
-   unsigned idx = fq->tail;
+   unsigned int idx = fq->tail;
 
assert_spin_locked(&fq->lock);
 
@@ -85,10 +120,10 @@ static inline unsigned fq_ring_add(struct iova_fq *fq)
return idx;
 }
 
-static void fq_ring_free(struct iova_domain *iovad, struct iova_fq *fq)
+static void fq_ring_free(struct iommu_dma_cookie *cookie, struct iova_fq *fq)
 {
-   u64 counter = atomic64_read(&iovad->fq_flush_finish_cnt);
-   unsigned idx;
+   u64 counter = atomic64_read(&cookie->fq_flush_finish_cnt);
+   unsigned int idx;

assert_spin_locked(&fq->lock);
 
@@ -98,7 +133,7 @@ static void fq_ring_free(struct iova_domain *iovad, struct 
iova_fq *fq)
break;
 
put_pages_list(&fq->entries[idx].freelist);
-   free_iova_fast(iovad,
+   free_iova_fast(&cookie->iovad,
   fq->entries[idx].iova_pfn,
   fq->entries[idx].pages);
 
@@ -106,50 +141,50 @@ static void fq_ring_free(struct iova_domain *iovad, 
struct iova_fq *fq)
}
 }
 
-static void iova_domain_flush(struct iova_domain *iovad)
+static void fq_flush_iotlb(struct iommu_dma_cookie *cookie)
 {
-   atomic64_inc(&iovad->fq_flush_start_cnt);
-   iovad->fq_domain->ops->flush_iotlb_all(iovad->fq_domain);
-   atomic64_inc(&iovad->fq_flush_finish_cnt);
+   atomic64_inc(&cookie->fq_flush_start_cnt);
+   cookie->fq_domain->ops->flush_iotlb_all(cookie->fq_domain);
+   atomic64_inc(&cookie->fq_flush_finish_cnt);
 }
 
 static void fq_flush_timeout(struct timer_list *t)
 {
-   struct iova_domain *iovad = from_timer(iovad, t, fq_timer);
+   struct iommu_dma_cookie *cookie = from_timer(cookie, t, fq_timer);

[PATCH v2 10/11] iommu/iova: Move flush queue code to iommu-dma

2021-12-10 Thread Robin Murphy
Flush queues are specific to DMA ops, which are now handled exclusively
by iommu-dma. As such, now that the historical artefacts from being
shared directly with drivers have been cleaned up, move the flush queue
code into iommu-dma itself to get it out of the way of other IOVA users.

This is pure code movement with no functional change; refactoring to
clean up the headers and definitions will follow.

Signed-off-by: Robin Murphy 
---

v2: Rebase with del_timer_sync() change

 drivers/iommu/dma-iommu.c | 179 +-
 drivers/iommu/iova.c  | 175 -
 2 files changed, 178 insertions(+), 176 deletions(-)

diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index f139b77caee0..ab8818965b2f 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -64,6 +64,181 @@ static int __init iommu_dma_forcedac_setup(char *str)
 }
 early_param("iommu.forcedac", iommu_dma_forcedac_setup);
 
+
+#define fq_ring_for_each(i, fq) \
+   for ((i) = (fq)->head; (i) != (fq)->tail; (i) = ((i) + 1) % 
IOVA_FQ_SIZE)
+
+static inline bool fq_full(struct iova_fq *fq)
+{
+   assert_spin_locked(&fq->lock);
+   return (((fq->tail + 1) % IOVA_FQ_SIZE) == fq->head);
+}
+
+static inline unsigned fq_ring_add(struct iova_fq *fq)
+{
+   unsigned idx = fq->tail;
+
+   assert_spin_locked(&fq->lock);
+
+   fq->tail = (idx + 1) % IOVA_FQ_SIZE;
+
+   return idx;
+}
+
+static void fq_ring_free(struct iova_domain *iovad, struct iova_fq *fq)
+{
+   u64 counter = atomic64_read(&iovad->fq_flush_finish_cnt);
+   unsigned idx;
+
+   assert_spin_locked(&fq->lock);
+
+   fq_ring_for_each(idx, fq) {
+
+   if (fq->entries[idx].counter >= counter)
+   break;
+
+   put_pages_list(&fq->entries[idx].freelist);
+   free_iova_fast(iovad,
+  fq->entries[idx].iova_pfn,
+  fq->entries[idx].pages);
+
+   fq->head = (fq->head + 1) % IOVA_FQ_SIZE;
+   }
+}
+
+static void iova_domain_flush(struct iova_domain *iovad)
+{
+   atomic64_inc(&iovad->fq_flush_start_cnt);
+   iovad->fq_domain->ops->flush_iotlb_all(iovad->fq_domain);
+   atomic64_inc(&iovad->fq_flush_finish_cnt);
+}
+
+static void fq_flush_timeout(struct timer_list *t)
+{
+   struct iova_domain *iovad = from_timer(iovad, t, fq_timer);
+   int cpu;
+
+   atomic_set(&iovad->fq_timer_on, 0);
+   iova_domain_flush(iovad);
+
+   for_each_possible_cpu(cpu) {
+   unsigned long flags;
+   struct iova_fq *fq;
+
+   fq = per_cpu_ptr(iovad->fq, cpu);
+   spin_lock_irqsave(&fq->lock, flags);
+   fq_ring_free(iovad, fq);
+   spin_unlock_irqrestore(&fq->lock, flags);
+   }
+}
+
+void queue_iova(struct iova_domain *iovad,
+   unsigned long pfn, unsigned long pages,
+   struct list_head *freelist)
+{
+   struct iova_fq *fq;
+   unsigned long flags;
+   unsigned idx;
+
+   /*
+* Order against the IOMMU driver's pagetable update from unmapping
+* @pte, to guarantee that iova_domain_flush() observes that if called
+* from a different CPU before we release the lock below. Full barrier
+* so it also pairs with iommu_dma_init_fq() to avoid seeing partially
+* written fq state here.
+*/
+   smp_mb();
+
+   fq = raw_cpu_ptr(iovad->fq);
+   spin_lock_irqsave(&fq->lock, flags);
+
+   /*
+* First remove all entries from the flush queue that have already been
+* flushed out on another CPU. This makes the fq_full() check below less
+* likely to be true.
+*/
+   fq_ring_free(iovad, fq);
+
+   if (fq_full(fq)) {
+   iova_domain_flush(iovad);
+   fq_ring_free(iovad, fq);
+   }
+
+   idx = fq_ring_add(fq);
+
+   fq->entries[idx].iova_pfn = pfn;
+   fq->entries[idx].pages= pages;
+   fq->entries[idx].counter  = atomic64_read(&iovad->fq_flush_start_cnt);
+   list_splice(freelist, &fq->entries[idx].freelist);
+
+   spin_unlock_irqrestore(&fq->lock, flags);
+
+   /* Avoid false sharing as much as possible. */
+   if (!atomic_read(&iovad->fq_timer_on) &&
+   !atomic_xchg(&iovad->fq_timer_on, 1))
+   mod_timer(&iovad->fq_timer,
+ jiffies + msecs_to_jiffies(IOVA_FQ_TIMEOUT));
+}
+
+static void free_iova_flush_queue(struct iova_domain *iovad)
+{
+   int cpu, idx;
+
+   if (!iovad->fq)
+   return;
+
+   del_timer_sync(&iovad->fq_timer);
+   /*
+* This code runs when the iova_domain is being detroyed, so don't
+* bother to free iovas, just free any remaining pagetable pages.
+*/
+   for_each_possible_cpu(cpu) {
+   struct iova_fq *f

[PATCH v2 09/11] iommu/iova: Consolidate flush queue code

2021-12-10 Thread Robin Murphy
Squash and simplify some of the freeing code, and move the init
and free routines down into the rest of the flush queue code to
obviate the forward declarations.

Signed-off-by: Robin Murphy 
---

v2: Rebase with del_timer_sync() change

 drivers/iommu/iova.c | 131 +++
 1 file changed, 58 insertions(+), 73 deletions(-)

diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
index a0250cebcdcf..c19f9a749070 100644
--- a/drivers/iommu/iova.c
+++ b/drivers/iommu/iova.c
@@ -24,8 +24,6 @@ static unsigned long iova_rcache_get(struct iova_domain 
*iovad,
 static void init_iova_rcaches(struct iova_domain *iovad);
 static void free_cpu_cached_iovas(unsigned int cpu, struct iova_domain *iovad);
 static void free_iova_rcaches(struct iova_domain *iovad);
-static void fq_destroy_all_entries(struct iova_domain *iovad);
-static void fq_flush_timeout(struct timer_list *t);
 
 static int iova_cpuhp_dead(unsigned int cpu, struct hlist_node *node)
 {
@@ -73,60 +71,6 @@ init_iova_domain(struct iova_domain *iovad, unsigned long 
granule,
 }
 EXPORT_SYMBOL_GPL(init_iova_domain);
 
-static bool has_iova_flush_queue(struct iova_domain *iovad)
-{
-   return !!iovad->fq;
-}
-
-static void free_iova_flush_queue(struct iova_domain *iovad)
-{
-   if (!has_iova_flush_queue(iovad))
-   return;
-
-   del_timer_sync(&iovad->fq_timer);
-
-   fq_destroy_all_entries(iovad);
-
-   free_percpu(iovad->fq);
-
-   iovad->fq = NULL;
-   iovad->fq_domain  = NULL;
-}
-
-int init_iova_flush_queue(struct iova_domain *iovad, struct iommu_domain 
*fq_domain)
-{
-   struct iova_fq __percpu *queue;
-   int i, cpu;
-
-   atomic64_set(&iovad->fq_flush_start_cnt,  0);
-   atomic64_set(&iovad->fq_flush_finish_cnt, 0);
-
-   queue = alloc_percpu(struct iova_fq);
-   if (!queue)
-   return -ENOMEM;
-
-   for_each_possible_cpu(cpu) {
-   struct iova_fq *fq;
-
-   fq = per_cpu_ptr(queue, cpu);
-   fq->head = 0;
-   fq->tail = 0;
-
-   spin_lock_init(&fq->lock);
-
-   for (i = 0; i < IOVA_FQ_SIZE; i++)
-   INIT_LIST_HEAD(&fq->entries[i].freelist);
-   }
-
-   iovad->fq_domain = fq_domain;
-   iovad->fq = queue;
-
-   timer_setup(&iovad->fq_timer, fq_flush_timeout, 0);
-   atomic_set(&iovad->fq_timer_on, 0);
-
-   return 0;
-}
-
 static struct rb_node *
 __get_cached_rbnode(struct iova_domain *iovad, unsigned long limit_pfn)
 {
@@ -585,23 +529,6 @@ static void iova_domain_flush(struct iova_domain *iovad)
atomic64_inc(&iovad->fq_flush_finish_cnt);
 }
 
-static void fq_destroy_all_entries(struct iova_domain *iovad)
-{
-   int cpu;
-
-   /*
-* This code runs when the iova_domain is being detroyed, so don't
-* bother to free iovas, just free any remaining pagetable pages.
-*/
-   for_each_possible_cpu(cpu) {
-   struct iova_fq *fq = per_cpu_ptr(iovad->fq, cpu);
-   int idx;
-
-   fq_ring_for_each(idx, fq)
-   put_pages_list(&fq->entries[idx].freelist);
-   }
-}
-
 static void fq_flush_timeout(struct timer_list *t)
 {
struct iova_domain *iovad = from_timer(iovad, t, fq_timer);
@@ -669,6 +596,64 @@ void queue_iova(struct iova_domain *iovad,
  jiffies + msecs_to_jiffies(IOVA_FQ_TIMEOUT));
 }
 
+static void free_iova_flush_queue(struct iova_domain *iovad)
+{
+   int cpu, idx;
+
+   if (!iovad->fq)
+   return;
+
+   del_timer_sync(&iovad->fq_timer);
+   /*
+* This code runs when the iova_domain is being detroyed, so don't
+* bother to free iovas, just free any remaining pagetable pages.
+*/
+   for_each_possible_cpu(cpu) {
+   struct iova_fq *fq = per_cpu_ptr(iovad->fq, cpu);
+
+   fq_ring_for_each(idx, fq)
+   put_pages_list(&fq->entries[idx].freelist);
+   }
+
+   free_percpu(iovad->fq);
+
+   iovad->fq = NULL;
+   iovad->fq_domain = NULL;
+}
+
+int init_iova_flush_queue(struct iova_domain *iovad, struct iommu_domain 
*fq_domain)
+{
+   struct iova_fq __percpu *queue;
+   int i, cpu;
+
+   atomic64_set(&iovad->fq_flush_start_cnt,  0);
+   atomic64_set(&iovad->fq_flush_finish_cnt, 0);
+
+   queue = alloc_percpu(struct iova_fq);
+   if (!queue)
+   return -ENOMEM;
+
+   for_each_possible_cpu(cpu) {
+   struct iova_fq *fq = per_cpu_ptr(queue, cpu);
+
+   fq->head = 0;
+   fq->tail = 0;
+
+   spin_lock_init(&fq->lock);
+
+   for (i = 0; i < IOVA_FQ_SIZE; i++)
+   INIT_LIST_HEAD(&fq->entries[i].freelist);
+   }
+
+   iovad->fq_domain = fq_domain;
+   iovad->fq = queue;
+
+   timer_setup(&iovad->fq_timer, fq_flush_timeout, 0);
+

[PATCH v2 08/11] iommu/vt-d: Use put_pages_list

2021-12-10 Thread Robin Murphy
From: "Matthew Wilcox (Oracle)" 

page->freelist is for the use of slab.  We already have the ability
to free a list of pages in the core mm, but it requires the use of a
list_head and for the pages to be chained together through page->lru.
Switch the Intel IOMMU and IOVA code over to using put_pages_list().

Signed-off-by: Matthew Wilcox (Oracle) 
[rm: split from original patch, cosmetic tweaks, fix fq entries]
Signed-off-by: Robin Murphy 
---

v2: No change

 drivers/iommu/dma-iommu.c   |  2 +-
 drivers/iommu/intel/iommu.c | 89 +
 drivers/iommu/iova.c| 26 ---
 include/linux/iommu.h   |  3 +-
 include/linux/iova.h|  4 +-
 5 files changed, 45 insertions(+), 79 deletions(-)

diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index cde887530549..f139b77caee0 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -455,7 +455,7 @@ static void iommu_dma_free_iova(struct iommu_dma_cookie 
*cookie,
else if (gather && gather->queued)
queue_iova(iovad, iova_pfn(iovad, iova),
size >> iova_shift(iovad),
-   gather->freelist);
+   &gather->freelist);
else
free_iova_fast(iovad, iova_pfn(iovad, iova),
size >> iova_shift(iovad));
diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index b6a8f3282411..17b3d97111f3 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -1303,35 +1303,30 @@ static void dma_pte_free_pagetable(struct dmar_domain 
*domain,
know the hardware page-walk will no longer touch them.
The 'pte' argument is the *parent* PTE, pointing to the page that is to
be freed. */
-static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
-   int level, struct dma_pte *pte,
-   struct page *freelist)
+static void dma_pte_list_pagetables(struct dmar_domain *domain,
+   int level, struct dma_pte *pte,
+   struct list_head *freelist)
 {
struct page *pg;
 
pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
-   pg->freelist = freelist;
-   freelist = pg;
+   list_add_tail(&pg->lru, freelist);
 
if (level == 1)
-   return freelist;
+   return;
 
pte = page_address(pg);
do {
if (dma_pte_present(pte) && !dma_pte_superpage(pte))
-   freelist = dma_pte_list_pagetables(domain, level - 1,
-  pte, freelist);
+   dma_pte_list_pagetables(domain, level - 1, pte, 
freelist);
pte++;
} while (!first_pte_in_page(pte));
-
-   return freelist;
 }
 
-static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
-   struct dma_pte *pte, unsigned long pfn,
-   unsigned long start_pfn,
-   unsigned long last_pfn,
-   struct page *freelist)
+static void dma_pte_clear_level(struct dmar_domain *domain, int level,
+   struct dma_pte *pte, unsigned long pfn,
+   unsigned long start_pfn, unsigned long last_pfn,
+   struct list_head *freelist)
 {
struct dma_pte *first_pte = NULL, *last_pte = NULL;
 
@@ -1350,7 +1345,7 @@ static struct page *dma_pte_clear_level(struct 
dmar_domain *domain, int level,
/* These suborbinate page tables are going away 
entirely. Don't
   bother to clear them; we're just going to *free* 
them. */
if (level > 1 && !dma_pte_superpage(pte))
-   freelist = dma_pte_list_pagetables(domain, 
level - 1, pte, freelist);
+   dma_pte_list_pagetables(domain, level - 1, pte, 
freelist);
 
dma_clear_pte(pte);
if (!first_pte)
@@ -1358,10 +1353,10 @@ static struct page *dma_pte_clear_level(struct 
dmar_domain *domain, int level,
last_pte = pte;
} else if (level > 1) {
/* Recurse down into a level that isn't *entirely* 
obsolete */
-   freelist = dma_pte_clear_level(domain, level - 1,
-  
phys_to_virt(dma_pte_addr(pte)),
-  level_pfn, start_pfn, 
last_pfn,
-  freelist);
+   dma_pte_clear_level(domain, level

<    2   3   4   5   6   7   8   9   10   11   >