[PATCH 00/16] IOMMU memory observability

2023-11-28 Thread Pasha Tatashin
From: Pasha Tatashin 

The IOMMU subsystem may contain gigabytes of state, the majority of which
is IOMMU page tables. Yet there is currently no way to observe how much
memory is actually used by the IOMMU subsystem.

This patch series solves this problem by adding both observability to
all pages that are allocated by IOMMU, and also accountability, so
admins can limit the amount via cgroups.

The system-wide observability is using /proc/meminfo:
SecPageTables:438176 kB

Contains IOMMU and KVM memory.

Per-node observability:
/sys/devices/system/node/nodeN/meminfo
Node N SecPageTables:422204 kB

Contains IOMMU and KVM memory in the given NUMA node.

Per-node IOMMU only observability:
/sys/devices/system/node/nodeN/vmstat
nr_iommu_pages 10

Contains number of pages IOMMU allocated in the given node.

Accountability: using sec_pagetables cgroup-v2 memory.stat entry.

With the change, iova_stress[1] stops as limit is reached:

# ./iova_stress
iova space: 0T  free memory:   497G
iova space: 1T  free memory:   495G
iova space: 2T  free memory:   493G
iova space: 3T  free memory:   491G

stops as limit is reached.

This series incorporates suggestions that came from the discussion
at LPC [2].

[1] https://github.com/soleen/iova_stress
[2] https://lpc.events/event/17/contributions/1466

Pasha Tatashin (16):
  iommu/vt-d: add wrapper functions for page allocations
  iommu/amd: use page allocation function provided by iommu-pages.h
  iommu/io-pgtable-arm: use page allocation function provided by
iommu-pages.h
  iommu/io-pgtable-dart: use page allocation function provided by
iommu-pages.h
  iommu/io-pgtable-arm-v7s: use page allocation function provided by
iommu-pages.h
  iommu/dma: use page allocation function provided by iommu-pages.h
  iommu/exynos: use page allocation function provided by iommu-pages.h
  iommu/fsl: use page allocation function provided by iommu-pages.h
  iommu/iommufd: use page allocation function provided by iommu-pages.h
  iommu/rockchip: use page allocation function provided by iommu-pages.h
  iommu/sun50i: use page allocation function provided by iommu-pages.h
  iommu/tegra-smmu: use page allocation function provided by
iommu-pages.h
  iommu: observability of the IOMMU allocations
  iommu: account IOMMU allocated memory
  vhost-vdpa: account iommu allocations
  vfio: account iommu allocations

 Documentation/admin-guide/cgroup-v2.rst |   2 +-
 Documentation/filesystems/proc.rst  |   4 +-
 drivers/iommu/amd/amd_iommu.h   |   8 -
 drivers/iommu/amd/init.c|  91 +-
 drivers/iommu/amd/io_pgtable.c  |  13 +-
 drivers/iommu/amd/io_pgtable_v2.c   |  20 +-
 drivers/iommu/amd/iommu.c   |  13 +-
 drivers/iommu/dma-iommu.c   |   8 +-
 drivers/iommu/exynos-iommu.c|  14 +-
 drivers/iommu/fsl_pamu.c|   5 +-
 drivers/iommu/intel/dmar.c  |  10 +-
 drivers/iommu/intel/iommu.c |  47 ++---
 drivers/iommu/intel/iommu.h |   2 -
 drivers/iommu/intel/irq_remapping.c |  10 +-
 drivers/iommu/intel/pasid.c |  12 +-
 drivers/iommu/intel/svm.c   |   7 +-
 drivers/iommu/io-pgtable-arm-v7s.c  |   9 +-
 drivers/iommu/io-pgtable-arm.c  |   7 +-
 drivers/iommu/io-pgtable-dart.c |  37 ++--
 drivers/iommu/iommu-pages.h | 231 
 drivers/iommu/iommufd/iova_bitmap.c |   6 +-
 drivers/iommu/rockchip-iommu.c  |  14 +-
 drivers/iommu/sun50i-iommu.c|   7 +-
 drivers/iommu/tegra-smmu.c  |  18 +-
 drivers/vfio/vfio_iommu_type1.c |   8 +-
 drivers/vhost/vdpa.c|   3 +-
 include/linux/mmzone.h  |   5 +-
 mm/vmstat.c |   3 +
 28 files changed, 415 insertions(+), 199 deletions(-)
 create mode 100644 drivers/iommu/iommu-pages.h

-- 
2.43.0.rc2.451.g8631bc7472-goog




[PATCH 03/16] iommu/io-pgtable-arm: use page allocation function provided by iommu-pages.h

2023-11-28 Thread Pasha Tatashin
Convert iommu/io-pgtable-arm.c to use the new page allocation functions
provided in iommu-pages.h.

Signed-off-by: Pasha Tatashin 
---
 drivers/iommu/io-pgtable-arm.c | 7 ---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c
index 72dcdd468cf3..21d315151ad6 100644
--- a/drivers/iommu/io-pgtable-arm.c
+++ b/drivers/iommu/io-pgtable-arm.c
@@ -21,6 +21,7 @@
 #include 
 
 #include "io-pgtable-arm.h"
+#include "iommu-pages.h"
 
 #define ARM_LPAE_MAX_ADDR_BITS 52
 #define ARM_LPAE_S2_MAX_CONCAT_PAGES   16
@@ -197,7 +198,7 @@ static void *__arm_lpae_alloc_pages(size_t size, gfp_t gfp,
void *pages;
 
VM_BUG_ON((gfp & __GFP_HIGHMEM));
-   p = alloc_pages_node(dev_to_node(dev), gfp | __GFP_ZERO, order);
+   p = __iommu_alloc_pages_node(dev_to_node(dev), gfp, order);
if (!p)
return NULL;
 
@@ -221,7 +222,7 @@ static void *__arm_lpae_alloc_pages(size_t size, gfp_t gfp,
dev_err(dev, "Cannot accommodate DMA translation for IOMMU page 
tables\n");
dma_unmap_single(dev, dma, size, DMA_TO_DEVICE);
 out_free:
-   __free_pages(p, order);
+   __iommu_free_pages(p, order);
return NULL;
 }
 
@@ -231,7 +232,7 @@ static void __arm_lpae_free_pages(void *pages, size_t size,
if (!cfg->coherent_walk)
dma_unmap_single(cfg->iommu_dev, __arm_lpae_dma_addr(pages),
 size, DMA_TO_DEVICE);
-   free_pages((unsigned long)pages, get_order(size));
+   iommu_free_pages(pages, get_order(size));
 }
 
 static void __arm_lpae_sync_pte(arm_lpae_iopte *ptep, int num_entries,
-- 
2.43.0.rc2.451.g8631bc7472-goog




[PATCH 01/16] iommu/vt-d: add wrapper functions for page allocations

2023-11-28 Thread Pasha Tatashin
In order to improve observability and accountability of the IOMMU layer, we
must account for the number of pages that are allocated by functions that
call directly into the buddy allocator.

This is achieved by first wrapping the allocation related functions into
separate inline functions in a new file:

drivers/iommu/iommu-pages.h

Convert all page allocation calls under iommu/intel to use these new
functions.

Signed-off-by: Pasha Tatashin 
---
 drivers/iommu/intel/dmar.c  |  10 +-
 drivers/iommu/intel/iommu.c |  47 +++
 drivers/iommu/intel/iommu.h |   2 -
 drivers/iommu/intel/irq_remapping.c |  10 +-
 drivers/iommu/intel/pasid.c |  12 +-
 drivers/iommu/intel/svm.c   |   7 +-
 drivers/iommu/iommu-pages.h | 199 
 7 files changed, 236 insertions(+), 51 deletions(-)
 create mode 100644 drivers/iommu/iommu-pages.h

diff --git a/drivers/iommu/intel/dmar.c b/drivers/iommu/intel/dmar.c
index a3414afe11b0..a6937e1e20a5 100644
--- a/drivers/iommu/intel/dmar.c
+++ b/drivers/iommu/intel/dmar.c
@@ -32,6 +32,7 @@
 
 #include "iommu.h"
 #include "../irq_remapping.h"
+#include "../iommu-pages.h"
 #include "perf.h"
 #include "trace.h"
 #include "perfmon.h"
@@ -1185,7 +1186,7 @@ static void free_iommu(struct intel_iommu *iommu)
}
 
if (iommu->qi) {
-   free_page((unsigned long)iommu->qi->desc);
+   iommu_free_page(iommu->qi->desc);
kfree(iommu->qi->desc_status);
kfree(iommu->qi);
}
@@ -1714,6 +1715,7 @@ int dmar_enable_qi(struct intel_iommu *iommu)
 {
struct q_inval *qi;
struct page *desc_page;
+   int order;
 
if (!ecap_qis(iommu->ecap))
return -ENOENT;
@@ -1734,8 +1736,8 @@ int dmar_enable_qi(struct intel_iommu *iommu)
 * Need two pages to accommodate 256 descriptors of 256 bits each
 * if the remapping hardware supports scalable mode translation.
 */
-   desc_page = alloc_pages_node(iommu->node, GFP_ATOMIC | __GFP_ZERO,
-!!ecap_smts(iommu->ecap));
+   order = ecap_smts(iommu->ecap) ? 1 : 0;
+   desc_page = __iommu_alloc_pages_node(iommu->node, GFP_ATOMIC, order);
if (!desc_page) {
kfree(qi);
iommu->qi = NULL;
@@ -1746,7 +1748,7 @@ int dmar_enable_qi(struct intel_iommu *iommu)
 
qi->desc_status = kcalloc(QI_LENGTH, sizeof(int), GFP_ATOMIC);
if (!qi->desc_status) {
-   free_page((unsigned long) qi->desc);
+   iommu_free_page(qi->desc);
kfree(qi);
iommu->qi = NULL;
return -ENOMEM;
diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index 3531b956556c..04f852175cbe 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -28,6 +28,7 @@
 #include "../dma-iommu.h"
 #include "../irq_remapping.h"
 #include "../iommu-sva.h"
+#include "../iommu-pages.h"
 #include "pasid.h"
 #include "cap_audit.h"
 #include "perfmon.h"
@@ -367,22 +368,6 @@ static int __init intel_iommu_setup(char *str)
 }
 __setup("intel_iommu=", intel_iommu_setup);
 
-void *alloc_pgtable_page(int node, gfp_t gfp)
-{
-   struct page *page;
-   void *vaddr = NULL;
-
-   page = alloc_pages_node(node, gfp | __GFP_ZERO, 0);
-   if (page)
-   vaddr = page_address(page);
-   return vaddr;
-}
-
-void free_pgtable_page(void *vaddr)
-{
-   free_page((unsigned long)vaddr);
-}
-
 static inline int domain_type_is_si(struct dmar_domain *domain)
 {
return domain->domain.type == IOMMU_DOMAIN_IDENTITY;
@@ -617,7 +602,7 @@ struct context_entry *iommu_context_addr(struct intel_iommu 
*iommu, u8 bus,
if (!alloc)
return NULL;
 
-   context = alloc_pgtable_page(iommu->node, GFP_ATOMIC);
+   context = iommu_alloc_page_node(iommu->node, GFP_ATOMIC);
if (!context)
return NULL;
 
@@ -791,17 +776,17 @@ static void free_context_table(struct intel_iommu *iommu)
for (i = 0; i < ROOT_ENTRY_NR; i++) {
context = iommu_context_addr(iommu, i, 0, 0);
if (context)
-   free_pgtable_page(context);
+   iommu_free_page(context);
 
if (!sm_supported(iommu))
continue;
 
context = iommu_context_addr(iommu, i, 0x80, 0);
if (context)
-   free_pgtable_page(context);
+   iommu_free_page(context);
}
 
-   free_pgtable_page(iommu->root_entry);
+   iommu_free_page(iommu->root_entry);
iommu->root_entry = NULL;
 }
 
@@ -939,7 +924,7 @@ static struct dma_pte *pfn_to_dma_pte(struct dmar_domain 
*domain,
if (!dma_pte_present(pte)) {
uint64_t pteval;
 
-   tmp_page = all

[PATCH 02/16] iommu/amd: use page allocation function provided by iommu-pages.h

2023-11-28 Thread Pasha Tatashin
Convert iommu/amd/* files to use the new page allocation functions
provided in iommu-pages.h.

Signed-off-by: Pasha Tatashin 
---
 drivers/iommu/amd/amd_iommu.h |  8 ---
 drivers/iommu/amd/init.c  | 91 ++-
 drivers/iommu/amd/io_pgtable.c| 13 +++--
 drivers/iommu/amd/io_pgtable_v2.c | 20 +++
 drivers/iommu/amd/iommu.c | 13 +++--
 5 files changed, 64 insertions(+), 81 deletions(-)

diff --git a/drivers/iommu/amd/amd_iommu.h b/drivers/iommu/amd/amd_iommu.h
index 86be1edd50ee..bf697d566e0b 100644
--- a/drivers/iommu/amd/amd_iommu.h
+++ b/drivers/iommu/amd/amd_iommu.h
@@ -136,14 +136,6 @@ static inline int get_pci_sbdf_id(struct pci_dev *pdev)
return PCI_SEG_DEVID_TO_SBDF(seg, devid);
 }
 
-static inline void *alloc_pgtable_page(int nid, gfp_t gfp)
-{
-   struct page *page;
-
-   page = alloc_pages_node(nid, gfp | __GFP_ZERO, 0);
-   return page ? page_address(page) : NULL;
-}
-
 bool translation_pre_enabled(struct amd_iommu *iommu);
 bool amd_iommu_is_attach_deferred(struct device *dev);
 int __init add_special_device(u8 type, u8 id, u32 *devid, bool cmd_line);
diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c
index 64bcf3df37ee..5b8a80fc7e50 100644
--- a/drivers/iommu/amd/init.c
+++ b/drivers/iommu/amd/init.c
@@ -35,6 +35,7 @@
 
 #include "amd_iommu.h"
 #include "../irq_remapping.h"
+#include "../iommu-pages.h"
 
 /*
  * definitions for the ACPI scanning code
@@ -648,8 +649,8 @@ static int __init find_last_devid_acpi(struct 
acpi_table_header *table, u16 pci_
 /* Allocate per PCI segment device table */
 static inline int __init alloc_dev_table(struct amd_iommu_pci_seg *pci_seg)
 {
-   pci_seg->dev_table = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO | 
GFP_DMA32,
- 
get_order(pci_seg->dev_table_size));
+   pci_seg->dev_table = iommu_alloc_pages(GFP_KERNEL | GFP_DMA32,
+  
get_order(pci_seg->dev_table_size));
if (!pci_seg->dev_table)
return -ENOMEM;
 
@@ -658,17 +659,16 @@ static inline int __init alloc_dev_table(struct 
amd_iommu_pci_seg *pci_seg)
 
 static inline void free_dev_table(struct amd_iommu_pci_seg *pci_seg)
 {
-   free_pages((unsigned long)pci_seg->dev_table,
-   get_order(pci_seg->dev_table_size));
+   iommu_free_pages(pci_seg->dev_table,
+get_order(pci_seg->dev_table_size));
pci_seg->dev_table = NULL;
 }
 
 /* Allocate per PCI segment IOMMU rlookup table. */
 static inline int __init alloc_rlookup_table(struct amd_iommu_pci_seg *pci_seg)
 {
-   pci_seg->rlookup_table = (void *)__get_free_pages(
-   GFP_KERNEL | __GFP_ZERO,
-   
get_order(pci_seg->rlookup_table_size));
+   pci_seg->rlookup_table = iommu_alloc_pages(GFP_KERNEL,
+  
get_order(pci_seg->rlookup_table_size));
if (pci_seg->rlookup_table == NULL)
return -ENOMEM;
 
@@ -677,16 +677,15 @@ static inline int __init alloc_rlookup_table(struct 
amd_iommu_pci_seg *pci_seg)
 
 static inline void free_rlookup_table(struct amd_iommu_pci_seg *pci_seg)
 {
-   free_pages((unsigned long)pci_seg->rlookup_table,
-  get_order(pci_seg->rlookup_table_size));
+   iommu_free_pages(pci_seg->rlookup_table,
+get_order(pci_seg->rlookup_table_size));
pci_seg->rlookup_table = NULL;
 }
 
 static inline int __init alloc_irq_lookup_table(struct amd_iommu_pci_seg 
*pci_seg)
 {
-   pci_seg->irq_lookup_table = (void *)__get_free_pages(
-GFP_KERNEL | __GFP_ZERO,
-
get_order(pci_seg->rlookup_table_size));
+   pci_seg->irq_lookup_table = iommu_alloc_pages(GFP_KERNEL,
+ 
get_order(pci_seg->rlookup_table_size));
kmemleak_alloc(pci_seg->irq_lookup_table,
   pci_seg->rlookup_table_size, 1, GFP_KERNEL);
if (pci_seg->irq_lookup_table == NULL)
@@ -698,8 +697,8 @@ static inline int __init alloc_irq_lookup_table(struct 
amd_iommu_pci_seg *pci_se
 static inline void free_irq_lookup_table(struct amd_iommu_pci_seg *pci_seg)
 {
kmemleak_free(pci_seg->irq_lookup_table);
-   free_pages((unsigned long)pci_seg->irq_lookup_table,
-  get_order(pci_seg->rlookup_table_size));
+   iommu_free_pages(pci_seg->irq_lookup_table,
+get_order(pci_seg->rlookup_table_size));
pci_seg->irq_lookup_table = NULL;
 }
 
@@ -707,8 +706,8 @@ static int __init alloc_alias_table(struct 
amd_iommu_pci_seg *pci_seg)
 {
int i;
 
-   pci_seg->alias_table = (void *)__get_free_pages(GFP_KERNEL,
-   get_order(pci_seg->alias_table_size));

[PATCH 04/16] iommu/io-pgtable-dart: use page allocation function provided by iommu-pages.h

2023-11-28 Thread Pasha Tatashin
Convert iommu/io-pgtable-dart.c to use the new page allocation functions
provided in iommu-pages.h.

Signed-off-by: Pasha Tatashin 
---
 drivers/iommu/io-pgtable-dart.c | 37 +
 1 file changed, 14 insertions(+), 23 deletions(-)

diff --git a/drivers/iommu/io-pgtable-dart.c b/drivers/iommu/io-pgtable-dart.c
index 74b1ef2b96be..ad28031e1e93 100644
--- a/drivers/iommu/io-pgtable-dart.c
+++ b/drivers/iommu/io-pgtable-dart.c
@@ -23,6 +23,7 @@
 #include 
 
 #include 
+#include "iommu-pages.h"
 
 #define DART1_MAX_ADDR_BITS36
 
@@ -106,18 +107,12 @@ static phys_addr_t iopte_to_paddr(dart_iopte pte,
return paddr;
 }
 
-static void *__dart_alloc_pages(size_t size, gfp_t gfp,
-   struct io_pgtable_cfg *cfg)
+static void *__dart_alloc_pages(size_t size, gfp_t gfp)
 {
int order = get_order(size);
-   struct page *p;
 
VM_BUG_ON((gfp & __GFP_HIGHMEM));
-   p = alloc_pages(gfp | __GFP_ZERO, order);
-   if (!p)
-   return NULL;
-
-   return page_address(p);
+   return iommu_alloc_pages(gfp, order);
 }
 
 static int dart_init_pte(struct dart_io_pgtable *data,
@@ -262,13 +257,13 @@ static int dart_map_pages(struct io_pgtable_ops *ops, 
unsigned long iova,
 
/* no L2 table present */
if (!pte) {
-   cptep = __dart_alloc_pages(tblsz, gfp, cfg);
+   cptep = __dart_alloc_pages(tblsz, gfp);
if (!cptep)
return -ENOMEM;
 
pte = dart_install_table(cptep, ptep, 0, data);
if (pte)
-   free_pages((unsigned long)cptep, get_order(tblsz));
+   iommu_free_pages(cptep, get_order(tblsz));
 
/* L2 table is present (now) */
pte = READ_ONCE(*ptep);
@@ -419,8 +414,7 @@ apple_dart_alloc_pgtable(struct io_pgtable_cfg *cfg, void 
*cookie)
cfg->apple_dart_cfg.n_ttbrs = 1 << data->tbl_bits;
 
for (i = 0; i < cfg->apple_dart_cfg.n_ttbrs; ++i) {
-   data->pgd[i] = __dart_alloc_pages(DART_GRANULE(data), 
GFP_KERNEL,
-  cfg);
+   data->pgd[i] = __dart_alloc_pages(DART_GRANULE(data), 
GFP_KERNEL);
if (!data->pgd[i])
goto out_free_data;
cfg->apple_dart_cfg.ttbr[i] = virt_to_phys(data->pgd[i]);
@@ -429,9 +423,10 @@ apple_dart_alloc_pgtable(struct io_pgtable_cfg *cfg, void 
*cookie)
return &data->iop;
 
 out_free_data:
-   while (--i >= 0)
-   free_pages((unsigned long)data->pgd[i],
-  get_order(DART_GRANULE(data)));
+   while (--i >= 0) {
+   iommu_free_pages(data->pgd[i],
+get_order(DART_GRANULE(data)));
+   }
kfree(data);
return NULL;
 }
@@ -439,6 +434,7 @@ apple_dart_alloc_pgtable(struct io_pgtable_cfg *cfg, void 
*cookie)
 static void apple_dart_free_pgtable(struct io_pgtable *iop)
 {
struct dart_io_pgtable *data = io_pgtable_to_data(iop);
+   int order = get_order(DART_GRANULE(data));
dart_iopte *ptep, *end;
int i;
 
@@ -449,15 +445,10 @@ static void apple_dart_free_pgtable(struct io_pgtable 
*iop)
while (ptep != end) {
dart_iopte pte = *ptep++;
 
-   if (pte) {
-   unsigned long page =
-   (unsigned long)iopte_deref(pte, data);
-
-   free_pages(page, get_order(DART_GRANULE(data)));
-   }
+   if (pte)
+   iommu_free_pages(iopte_deref(pte, data), order);
}
-   free_pages((unsigned long)data->pgd[i],
-  get_order(DART_GRANULE(data)));
+   iommu_free_pages(data->pgd[i], order);
}
 
kfree(data);
-- 
2.43.0.rc2.451.g8631bc7472-goog




[PATCH 05/16] iommu/io-pgtable-arm-v7s: use page allocation function provided by iommu-pages.h

2023-11-28 Thread Pasha Tatashin
Convert iommu/io-pgtable-arm-v7s.c to use the new page allocation functions
provided in iommu-pages.h.

Signed-off-by: Pasha Tatashin 
---
 drivers/iommu/io-pgtable-arm-v7s.c | 9 ++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/drivers/iommu/io-pgtable-arm-v7s.c 
b/drivers/iommu/io-pgtable-arm-v7s.c
index 75f244a3e12d..3d494ca1f671 100644
--- a/drivers/iommu/io-pgtable-arm-v7s.c
+++ b/drivers/iommu/io-pgtable-arm-v7s.c
@@ -34,6 +34,7 @@
 #include 
 
 #include 
+#include "iommu-pages.h"
 
 /* Struct accessors */
 #define io_pgtable_to_data(x)  \
@@ -255,7 +256,7 @@ static void *__arm_v7s_alloc_table(int lvl, gfp_t gfp,
 GFP_KERNEL : ARM_V7S_TABLE_GFP_DMA;
 
if (lvl == 1)
-   table = (void *)__get_free_pages(gfp_l1 | __GFP_ZERO, 
get_order(size));
+   table = iommu_alloc_pages(gfp_l1, get_order(size));
else if (lvl == 2)
table = kmem_cache_zalloc(data->l2_tables, gfp);
 
@@ -283,6 +284,7 @@ static void *__arm_v7s_alloc_table(int lvl, gfp_t gfp,
}
if (lvl == 2)
kmemleak_ignore(table);
+
return table;
 
 out_unmap:
@@ -290,7 +292,7 @@ static void *__arm_v7s_alloc_table(int lvl, gfp_t gfp,
dma_unmap_single(dev, dma, size, DMA_TO_DEVICE);
 out_free:
if (lvl == 1)
-   free_pages((unsigned long)table, get_order(size));
+   iommu_free_pages(table, get_order(size));
else
kmem_cache_free(data->l2_tables, table);
return NULL;
@@ -306,8 +308,9 @@ static void __arm_v7s_free_table(void *table, int lvl,
if (!cfg->coherent_walk)
dma_unmap_single(dev, __arm_v7s_dma_addr(table), size,
 DMA_TO_DEVICE);
+
if (lvl == 1)
-   free_pages((unsigned long)table, get_order(size));
+   iommu_free_pages(table, get_order(size));
else
kmem_cache_free(data->l2_tables, table);
 }
-- 
2.43.0.rc2.451.g8631bc7472-goog




[PATCH 06/16] iommu/dma: use page allocation function provided by iommu-pages.h

2023-11-28 Thread Pasha Tatashin
Convert iommu/dma-iommu.c to use the new page allocation functions
provided in iommu-pages.h.

Signed-off-by: Pasha Tatashin 
---
 drivers/iommu/dma-iommu.c | 8 +---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index 85163a83df2f..822adad464c2 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -31,6 +31,7 @@
 #include 
 
 #include "dma-iommu.h"
+#include "iommu-pages.h"
 
 struct iommu_dma_msi_page {
struct list_headlist;
@@ -874,7 +875,7 @@ static dma_addr_t __iommu_dma_map(struct device *dev, 
phys_addr_t phys,
 static void __iommu_dma_free_pages(struct page **pages, int count)
 {
while (count--)
-   __free_page(pages[count]);
+   __iommu_free_page(pages[count]);
kvfree(pages);
 }
 
@@ -912,7 +913,8 @@ static struct page **__iommu_dma_alloc_pages(struct device 
*dev,
order_size = 1U << order;
if (order_mask > order_size)
alloc_flags |= __GFP_NORETRY;
-   page = alloc_pages_node(nid, alloc_flags, order);
+   page = __iommu_alloc_pages_node(nid, alloc_flags,
+   order);
if (!page)
continue;
if (order)
@@ -1572,7 +1574,7 @@ static void *iommu_dma_alloc_pages(struct device *dev, 
size_t size,
 
page = dma_alloc_contiguous(dev, alloc_size, gfp);
if (!page)
-   page = alloc_pages_node(node, gfp, get_order(alloc_size));
+   page = __iommu_alloc_pages_node(node, gfp, 
get_order(alloc_size));
if (!page)
return NULL;
 
-- 
2.43.0.rc2.451.g8631bc7472-goog




[PATCH 07/16] iommu/exynos: use page allocation function provided by iommu-pages.h

2023-11-28 Thread Pasha Tatashin
Convert iommu/exynos-iommu.c to use the new page allocation functions
provided in iommu-pages.h.

Signed-off-by: Pasha Tatashin 
---
 drivers/iommu/exynos-iommu.c | 14 --
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/drivers/iommu/exynos-iommu.c b/drivers/iommu/exynos-iommu.c
index 2c6e9094f1e9..3eab0ae65a4f 100644
--- a/drivers/iommu/exynos-iommu.c
+++ b/drivers/iommu/exynos-iommu.c
@@ -22,6 +22,8 @@
 #include 
 #include 
 
+#include "iommu-pages.h"
+
 typedef u32 sysmmu_iova_t;
 typedef u32 sysmmu_pte_t;
 static struct iommu_domain exynos_identity_domain;
@@ -900,11 +902,11 @@ static struct iommu_domain 
*exynos_iommu_domain_alloc_paging(struct device *dev)
if (!domain)
return NULL;
 
-   domain->pgtable = (sysmmu_pte_t *)__get_free_pages(GFP_KERNEL, 2);
+   domain->pgtable = iommu_alloc_pages(GFP_KERNEL, 2);
if (!domain->pgtable)
goto err_pgtable;
 
-   domain->lv2entcnt = (short *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, 
1);
+   domain->lv2entcnt = iommu_alloc_pages(GFP_KERNEL, 1);
if (!domain->lv2entcnt)
goto err_counter;
 
@@ -930,9 +932,9 @@ static struct iommu_domain 
*exynos_iommu_domain_alloc_paging(struct device *dev)
return &domain->domain;
 
 err_lv2ent:
-   free_pages((unsigned long)domain->lv2entcnt, 1);
+   iommu_free_pages(domain->lv2entcnt, 1);
 err_counter:
-   free_pages((unsigned long)domain->pgtable, 2);
+   iommu_free_pages(domain->pgtable, 2);
 err_pgtable:
kfree(domain);
return NULL;
@@ -973,8 +975,8 @@ static void exynos_iommu_domain_free(struct iommu_domain 
*iommu_domain)
phys_to_virt(base));
}
 
-   free_pages((unsigned long)domain->pgtable, 2);
-   free_pages((unsigned long)domain->lv2entcnt, 1);
+   iommu_free_pages(domain->pgtable, 2);
+   iommu_free_pages(domain->lv2entcnt, 1);
kfree(domain);
 }
 
-- 
2.43.0.rc2.451.g8631bc7472-goog




[PATCH 08/16] iommu/fsl: use page allocation function provided by iommu-pages.h

2023-11-28 Thread Pasha Tatashin
Convert iommu/fsl_pamu.c to use the new page allocation functions
provided in iommu-pages.h.

Signed-off-by: Pasha Tatashin 
---
 drivers/iommu/fsl_pamu.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/iommu/fsl_pamu.c b/drivers/iommu/fsl_pamu.c
index f37d3b044131..7bfb49940f0c 100644
--- a/drivers/iommu/fsl_pamu.c
+++ b/drivers/iommu/fsl_pamu.c
@@ -16,6 +16,7 @@
 #include 
 
 #include 
+#include "iommu-pages.h"
 
 /* define indexes for each operation mapping scenario */
 #define OMI_QMAN0x00
@@ -828,7 +829,7 @@ static int fsl_pamu_probe(struct platform_device *pdev)
(PAGE_SIZE << get_order(OMT_SIZE));
order = get_order(mem_size);
 
-   p = alloc_pages(GFP_KERNEL | __GFP_ZERO, order);
+   p = __iommu_alloc_pages(GFP_KERNEL, order);
if (!p) {
dev_err(dev, "unable to allocate PAACT/SPAACT/OMT block\n");
ret = -ENOMEM;
@@ -916,7 +917,7 @@ static int fsl_pamu_probe(struct platform_device *pdev)
iounmap(guts_regs);
 
if (ppaact)
-   free_pages((unsigned long)ppaact, order);
+   iommu_free_pages(ppaact, order);
 
ppaact = NULL;
 
-- 
2.43.0.rc2.451.g8631bc7472-goog




[PATCH 09/16] iommu/iommufd: use page allocation function provided by iommu-pages.h

2023-11-28 Thread Pasha Tatashin
Convert iommu/iommufd/* files to use the new page allocation functions
provided in iommu-pages.h.

Signed-off-by: Pasha Tatashin 
---
 drivers/iommu/iommufd/iova_bitmap.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/iommu/iommufd/iova_bitmap.c 
b/drivers/iommu/iommufd/iova_bitmap.c
index 0a92c9eeaf7f..6b8575b93f17 100644
--- a/drivers/iommu/iommufd/iova_bitmap.c
+++ b/drivers/iommu/iommufd/iova_bitmap.c
@@ -253,7 +253,7 @@ struct iova_bitmap *iova_bitmap_alloc(unsigned long iova, 
size_t length,
bitmap->iova = iova;
bitmap->length = length;
mapped->iova = iova;
-   mapped->pages = (struct page **)__get_free_page(GFP_KERNEL);
+   mapped->pages = iommu_alloc_page(GFP_KERNEL);
if (!mapped->pages) {
rc = -ENOMEM;
goto err;
@@ -284,7 +284,7 @@ void iova_bitmap_free(struct iova_bitmap *bitmap)
iova_bitmap_put(bitmap);
 
if (mapped->pages) {
-   free_page((unsigned long)mapped->pages);
+   iommu_free_page(mapped->pages);
mapped->pages = NULL;
}
 
-- 
2.43.0.rc2.451.g8631bc7472-goog




[PATCH 11/16] iommu/sun50i: use page allocation function provided by iommu-pages.h

2023-11-28 Thread Pasha Tatashin
Convert iommu/sun50i-iommu.c to use the new page allocation functions
provided in iommu-pages.h.

Signed-off-by: Pasha Tatashin 
---
 drivers/iommu/sun50i-iommu.c | 7 ---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/drivers/iommu/sun50i-iommu.c b/drivers/iommu/sun50i-iommu.c
index 41484a5a399b..172ddb717eb5 100644
--- a/drivers/iommu/sun50i-iommu.c
+++ b/drivers/iommu/sun50i-iommu.c
@@ -26,6 +26,8 @@
 #include 
 #include 
 
+#include "iommu-pages.h"
+
 #define IOMMU_RESET_REG0x010
 #define IOMMU_RESET_RELEASE_ALL0x
 #define IOMMU_ENABLE_REG   0x020
@@ -679,8 +681,7 @@ sun50i_iommu_domain_alloc_paging(struct device *dev)
if (!sun50i_domain)
return NULL;
 
-   sun50i_domain->dt = (u32 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
-   get_order(DT_SIZE));
+   sun50i_domain->dt = iommu_alloc_pages(GFP_KERNEL, get_order(DT_SIZE));
if (!sun50i_domain->dt)
goto err_free_domain;
 
@@ -702,7 +703,7 @@ static void sun50i_iommu_domain_free(struct iommu_domain 
*domain)
 {
struct sun50i_iommu_domain *sun50i_domain = to_sun50i_domain(domain);
 
-   free_pages((unsigned long)sun50i_domain->dt, get_order(DT_SIZE));
+   iommu_free_pages(sun50i_domain->dt, get_order(DT_SIZE));
sun50i_domain->dt = NULL;
 
kfree(sun50i_domain);
-- 
2.43.0.rc2.451.g8631bc7472-goog




[PATCH 10/16] iommu/rockchip: use page allocation function provided by iommu-pages.h

2023-11-28 Thread Pasha Tatashin
Convert iommu/rockchip-iommu.c to use the new page allocation functions
provided in iommu-pages.h.

Signed-off-by: Pasha Tatashin 
---
 drivers/iommu/iommufd/iova_bitmap.c |  2 ++
 drivers/iommu/rockchip-iommu.c  | 14 --
 2 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/drivers/iommu/iommufd/iova_bitmap.c 
b/drivers/iommu/iommufd/iova_bitmap.c
index 6b8575b93f17..4d5d1be807fe 100644
--- a/drivers/iommu/iommufd/iova_bitmap.c
+++ b/drivers/iommu/iommufd/iova_bitmap.c
@@ -8,6 +8,8 @@
 #include 
 #include 
 
+#include "../iommu-pages.h"
+
 #define BITS_PER_PAGE (PAGE_SIZE * BITS_PER_BYTE)
 
 /*
diff --git a/drivers/iommu/rockchip-iommu.c b/drivers/iommu/rockchip-iommu.c
index 2685861c0a12..e04f22d481d0 100644
--- a/drivers/iommu/rockchip-iommu.c
+++ b/drivers/iommu/rockchip-iommu.c
@@ -26,6 +26,8 @@
 #include 
 #include 
 
+#include "iommu-pages.h"
+
 /** MMU register offsets */
 #define RK_MMU_DTE_ADDR0x00/* Directory table address */
 #define RK_MMU_STATUS  0x04
@@ -727,14 +729,14 @@ static u32 *rk_dte_get_page_table(struct rk_iommu_domain 
*rk_domain,
if (rk_dte_is_pt_valid(dte))
goto done;
 
-   page_table = (u32 *)get_zeroed_page(GFP_ATOMIC | rk_ops->gfp_flags);
+   page_table = iommu_alloc_page(GFP_ATOMIC | rk_ops->gfp_flags);
if (!page_table)
return ERR_PTR(-ENOMEM);
 
pt_dma = dma_map_single(dma_dev, page_table, SPAGE_SIZE, DMA_TO_DEVICE);
if (dma_mapping_error(dma_dev, pt_dma)) {
dev_err(dma_dev, "DMA mapping error while allocating page 
table\n");
-   free_page((unsigned long)page_table);
+   iommu_free_page(page_table);
return ERR_PTR(-ENOMEM);
}
 
@@ -1061,7 +1063,7 @@ static struct iommu_domain 
*rk_iommu_domain_alloc_paging(struct device *dev)
 * Each level1 (dt) and level2 (pt) table has 1024 4-byte entries.
 * Allocate one 4 KiB page for each table.
 */
-   rk_domain->dt = (u32 *)get_zeroed_page(GFP_KERNEL | rk_ops->gfp_flags);
+   rk_domain->dt = iommu_alloc_page(GFP_KERNEL | rk_ops->gfp_flags);
if (!rk_domain->dt)
goto err_free_domain;
 
@@ -1083,7 +1085,7 @@ static struct iommu_domain 
*rk_iommu_domain_alloc_paging(struct device *dev)
return &rk_domain->domain;
 
 err_free_dt:
-   free_page((unsigned long)rk_domain->dt);
+   iommu_free_page(rk_domain->dt);
 err_free_domain:
kfree(rk_domain);
 
@@ -1104,13 +1106,13 @@ static void rk_iommu_domain_free(struct iommu_domain 
*domain)
u32 *page_table = phys_to_virt(pt_phys);
dma_unmap_single(dma_dev, pt_phys,
 SPAGE_SIZE, DMA_TO_DEVICE);
-   free_page((unsigned long)page_table);
+   iommu_free_page(page_table);
}
}
 
dma_unmap_single(dma_dev, rk_domain->dt_dma,
 SPAGE_SIZE, DMA_TO_DEVICE);
-   free_page((unsigned long)rk_domain->dt);
+   iommu_free_page(rk_domain->dt);
 
kfree(rk_domain);
 }
-- 
2.43.0.rc2.451.g8631bc7472-goog




[PATCH 16/16] vfio: account iommu allocations

2023-11-28 Thread Pasha Tatashin
iommu allocations should be accounted in order to allow admins to
monitor and limit the amount of iommu memory.

Signed-off-by: Pasha Tatashin 
---
 drivers/vfio/vfio_iommu_type1.c | 8 +---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index eacd6ec04de5..b2854d7939ce 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -1436,7 +1436,7 @@ static int vfio_iommu_map(struct vfio_iommu *iommu, 
dma_addr_t iova,
list_for_each_entry(d, &iommu->domain_list, next) {
ret = iommu_map(d->domain, iova, (phys_addr_t)pfn << PAGE_SHIFT,
npage << PAGE_SHIFT, prot | IOMMU_CACHE,
-   GFP_KERNEL);
+   GFP_KERNEL_ACCOUNT);
if (ret)
goto unwind;
 
@@ -1750,7 +1750,8 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu,
}
 
ret = iommu_map(domain->domain, iova, phys, size,
-   dma->prot | IOMMU_CACHE, GFP_KERNEL);
+   dma->prot | IOMMU_CACHE,
+   GFP_KERNEL_ACCOUNT);
if (ret) {
if (!dma->iommu_mapped) {
vfio_unpin_pages_remote(dma, iova,
@@ -1845,7 +1846,8 @@ static void vfio_test_domain_fgsp(struct vfio_domain 
*domain, struct list_head *
continue;
 
ret = iommu_map(domain->domain, start, page_to_phys(pages), 
PAGE_SIZE * 2,
-   IOMMU_READ | IOMMU_WRITE | IOMMU_CACHE, 
GFP_KERNEL);
+   IOMMU_READ | IOMMU_WRITE | IOMMU_CACHE,
+   GFP_KERNEL_ACCOUNT);
if (!ret) {
size_t unmapped = iommu_unmap(domain->domain, start, 
PAGE_SIZE);
 
-- 
2.43.0.rc2.451.g8631bc7472-goog




Re: [PATCH 00/16] IOMMU memory observability

2023-11-28 Thread Yosry Ahmed
On Tue, Nov 28, 2023 at 12:49 PM Pasha Tatashin
 wrote:
>
> From: Pasha Tatashin 
>
> IOMMU subsystem may contain state that is in gigabytes. Majority of that
> state is iommu page tables. Yet, there is currently, no way to observe
> how much memory is actually used by the iommu subsystem.
>
> This patch series solves this problem by adding both observability to
> all pages that are allocated by IOMMU, and also accountability, so
> admins can limit the amount if via cgroups.
>
> The system-wide observability is using /proc/meminfo:
> SecPageTables:438176 kB
>
> Contains IOMMU and KVM memory.
>
> Per-node observability:
> /sys/devices/system/node/nodeN/meminfo
> Node N SecPageTables:422204 kB
>
> Contains IOMMU and KVM memory memory in the given NUMA node.
>
> Per-node IOMMU only observability:
> /sys/devices/system/node/nodeN/vmstat
> nr_iommu_pages 10
>
> Contains number of pages IOMMU allocated in the given node.

Does it make sense to have a KVM-only entry there as well?

In that case, if SecPageTables in /proc/meminfo is found to be
suspiciously high, it should be easy to tell which component is
contributing most usage through vmstat. I understand that users can do
the subtraction, but we wouldn't want userspace depending on that, in
case a third class of "secondary" page tables emerges that we want to
add to SecPageTables. The in-kernel implementation can do the
subtraction for now if it makes sense though.

>
> Accountability: using sec_pagetables cgroup-v2 memory.stat entry.
>
> With the change, iova_stress[1] stops as limit is reached:
>
> # ./iova_stress
> iova space: 0T  free memory:   497G
> iova space: 1T  free memory:   495G
> iova space: 2T  free memory:   493G
> iova space: 3T  free memory:   491G
>
> stops as limit is reached.
>
> This series encorporates suggestions that came from the discussion
> at LPC [2].
>
> [1] https://github.com/soleen/iova_stress
> [2] https://lpc.events/event/17/contributions/1466
>
> Pasha Tatashin (16):
>   iommu/vt-d: add wrapper functions for page allocations
>   iommu/amd: use page allocation function provided by iommu-pages.h
>   iommu/io-pgtable-arm: use page allocation function provided by
> iommu-pages.h
>   iommu/io-pgtable-dart: use page allocation function provided by
> iommu-pages.h
>   iommu/io-pgtable-arm-v7s: use page allocation function provided by
> iommu-pages.h
>   iommu/dma: use page allocation function provided by iommu-pages.h
>   iommu/exynos: use page allocation function provided by iommu-pages.h
>   iommu/fsl: use page allocation function provided by iommu-pages.h
>   iommu/iommufd: use page allocation function provided by iommu-pages.h
>   iommu/rockchip: use page allocation function provided by iommu-pages.h
>   iommu/sun50i: use page allocation function provided by iommu-pages.h
>   iommu/tegra-smmu: use page allocation function provided by
> iommu-pages.h
>   iommu: observability of the IOMMU allocations
>   iommu: account IOMMU allocated memory
>   vhost-vdpa: account iommu allocations
>   vfio: account iommu allocations
>
>  Documentation/admin-guide/cgroup-v2.rst |   2 +-
>  Documentation/filesystems/proc.rst  |   4 +-
>  drivers/iommu/amd/amd_iommu.h   |   8 -
>  drivers/iommu/amd/init.c|  91 +-
>  drivers/iommu/amd/io_pgtable.c  |  13 +-
>  drivers/iommu/amd/io_pgtable_v2.c   |  20 +-
>  drivers/iommu/amd/iommu.c   |  13 +-
>  drivers/iommu/dma-iommu.c   |   8 +-
>  drivers/iommu/exynos-iommu.c|  14 +-
>  drivers/iommu/fsl_pamu.c|   5 +-
>  drivers/iommu/intel/dmar.c  |  10 +-
>  drivers/iommu/intel/iommu.c |  47 ++---
>  drivers/iommu/intel/iommu.h |   2 -
>  drivers/iommu/intel/irq_remapping.c |  10 +-
>  drivers/iommu/intel/pasid.c |  12 +-
>  drivers/iommu/intel/svm.c   |   7 +-
>  drivers/iommu/io-pgtable-arm-v7s.c  |   9 +-
>  drivers/iommu/io-pgtable-arm.c  |   7 +-
>  drivers/iommu/io-pgtable-dart.c |  37 ++--
>  drivers/iommu/iommu-pages.h | 231 
>  drivers/iommu/iommufd/iova_bitmap.c |   6 +-
>  drivers/iommu/rockchip-iommu.c  |  14 +-
>  drivers/iommu/sun50i-iommu.c|   7 +-
>  drivers/iommu/tegra-smmu.c  |  18 +-
>  drivers/vfio/vfio_iommu_type1.c |   8 +-
>  drivers/vhost/vdpa.c|   3 +-
>  include/linux/mmzone.h  |   5 +-
>  mm/vmstat.c |   3 +
>  28 files changed, 415 insertions(+), 199 deletions(-)
>  create mode 100644 drivers/iommu/iommu-pages.h
>
> --
> 2.43.0.rc2.451.g8631bc7472-goog
>
>



Re: [PATCH 06/16] iommu/dma: use page allocation function provided by iommu-pages.h

2023-11-28 Thread Pasha Tatashin
On Tue, Nov 28, 2023 at 5:34 PM Robin Murphy  wrote:
>
> On 2023-11-28 8:49 pm, Pasha Tatashin wrote:
> > Convert iommu/dma-iommu.c to use the new page allocation functions
> > provided in iommu-pages.h.
>
> These have nothing to do with IOMMU pagetables, they are DMA buffers and
> they belong to whoever called the corresponding dma_alloc_* function.

Hi Robin,

This is true; however, we want to account and observe the pages
allocated by the IOMMU subsystem for DMA buffers, as they are essentially
unmovable locked pages. Should we separate IOMMU memory from KVM
memory altogether and add another field to /proc/meminfo, something
like "iommu -> iommu pagetable and dma memory", or do we want to
export DMA memory separately from IOMMU page tables?

Since I included DMA memory, I specifically removed mentions of
IOMMU page tables in most places, and only report it as IOMMU
memory. However, since it is still bundled together with SecPageTables,
it can be confusing.

Pasha



Re: [PATCH 05/16] iommu/io-pgtable-arm-v7s: use page allocation function provided by iommu-pages.h

2023-11-28 Thread Pasha Tatashin
> >   kmem_cache_free(data->l2_tables, table);

We only account page allocations, not sub-page allocations. However,
one thing that surprised me about this particular architecture is that
the L2 tables are allocated using kmem. Are the second-level tables on
ARM v7s really sub-page in size?

Pasha



Re: [PATCH 06/16] iommu/dma: use page allocation function provided by iommu-pages.h

2023-11-28 Thread Robin Murphy

On 2023-11-28 10:50 pm, Pasha Tatashin wrote:
> On Tue, Nov 28, 2023 at 5:34 PM Robin Murphy  wrote:
>>
>> On 2023-11-28 8:49 pm, Pasha Tatashin wrote:
>>> Convert iommu/dma-iommu.c to use the new page allocation functions
>>> provided in iommu-pages.h.
>>
>> These have nothing to do with IOMMU pagetables, they are DMA buffers and
>> they belong to whoever called the corresponding dma_alloc_* function.
>
> Hi Robin,
>
> This is true, however, we want to account and observe the pages
> allocated by IOMMU subsystem for DMA buffers, as they are essentially
> unmovable locked pages. Should we separate IOMMU memory from KVM
> memory all together and add another field to /proc/meminfo, something
> like "iommu -> iommu pagetable and dma memory", or do we want to
> export DMA memory separately from IOMMU page tables?

These are not allocated by "the IOMMU subsystem", they are allocated by
the DMA API. Even if you want to claim that a driver pinning memory via
iommu_dma_ops is somehow different from the same driver pinning the same
amount of memory via dma-direct when iommu.passthrough=1, it's still
nonsense because you're failing to account the pages which iommu_dma_ops
gets from CMA, dma_common_alloc_pages(), dynamic SWIOTLB, the various
pools, and so on.

Thanks,
Robin.

> Since, I included DMA memory, I specifically removed mentioning of
> IOMMU page tables in the most of places, and only report it as IOMMU
> memory. However, since it is still bundled together with SecPageTables
> it can be confusing.
>
> Pasha




Re: [PATCH 08/16] iommu/fsl: use page allocation function provided by iommu-pages.h

2023-11-28 Thread Pasha Tatashin
On Tue, Nov 28, 2023 at 5:53 PM Robin Murphy  wrote:
>
> On 2023-11-28 8:49 pm, Pasha Tatashin wrote:
> > Convert iommu/fsl_pamu.c to use the new page allocation functions
> > provided in iommu-pages.h.
>
> Again, this is not a pagetable. This thing doesn't even *have* pagetables.
>
> Similar to patches #1 and #2 where you're lumping in configuration
> tables which belong to the IOMMU driver itself, as opposed to pagetables
> which effectively belong to an IOMMU domain's user. But then there are
> still drivers where you're *not* accounting similar configuration
> structures, so I really struggle to see how this metric is useful when
> it's so completely inconsistent in what it's counting :/

The whole IOMMU subsystem allocates a significant amount of kernel
locked memory that we want to at least observe. The new field in
vmstat does just that: it reports ALL buddy allocator memory that
IOMMU allocates. However, for accounting purposes, I agree, we need to
do better, and separate at least iommu pagetables from the rest.

We can separate the metric into two:
iommu pagetable only
iommu everything

or into three:
iommu pagetable only
iommu dma
iommu everything

What do you think?

Pasha

>
> Thanks,
> Robin.
>
> > Signed-off-by: Pasha Tatashin 
> > ---
> >   drivers/iommu/fsl_pamu.c | 5 +++--
> >   1 file changed, 3 insertions(+), 2 deletions(-)
> >
> > diff --git a/drivers/iommu/fsl_pamu.c b/drivers/iommu/fsl_pamu.c
> > index f37d3b044131..7bfb49940f0c 100644
> > --- a/drivers/iommu/fsl_pamu.c
> > +++ b/drivers/iommu/fsl_pamu.c
> > @@ -16,6 +16,7 @@
> >   #include 
> >
> >   #include 
> > +#include "iommu-pages.h"
> >
> >   /* define indexes for each operation mapping scenario */
> >   #define OMI_QMAN0x00
> > @@ -828,7 +829,7 @@ static int fsl_pamu_probe(struct platform_device *pdev)
> >   (PAGE_SIZE << get_order(OMT_SIZE));
> >   order = get_order(mem_size);
> >
> > - p = alloc_pages(GFP_KERNEL | __GFP_ZERO, order);
> > + p = __iommu_alloc_pages(GFP_KERNEL, order);
> >   if (!p) {
> >   dev_err(dev, "unable to allocate PAACT/SPAACT/OMT block\n");
> >   ret = -ENOMEM;
> > @@ -916,7 +917,7 @@ static int fsl_pamu_probe(struct platform_device *pdev)
> >   iounmap(guts_regs);
> >
> >   if (ppaact)
> > - free_pages((unsigned long)ppaact, order);
> > + iommu_free_pages(ppaact, order);
> >
> >   ppaact = NULL;
> >



Re: [PATCH 06/16] iommu/dma: use page allocation function provided by iommu-pages.h

2023-11-28 Thread Pasha Tatashin
On Tue, Nov 28, 2023 at 5:59 PM Robin Murphy  wrote:
>
> On 2023-11-28 10:50 pm, Pasha Tatashin wrote:
> > On Tue, Nov 28, 2023 at 5:34 PM Robin Murphy  wrote:
> >>
> >> On 2023-11-28 8:49 pm, Pasha Tatashin wrote:
> >>> Convert iommu/dma-iommu.c to use the new page allocation functions
> >>> provided in iommu-pages.h.
> >>
> >> These have nothing to do with IOMMU pagetables, they are DMA buffers and
> >> they belong to whoever called the corresponding dma_alloc_* function.
> >
> > Hi Robin,
> >
> > This is true, however, we want to account and observe the pages
> > allocated by IOMMU subsystem for DMA buffers, as they are essentially
> > unmovable locked pages. Should we separate IOMMU memory from KVM
> > memory all together and add another field to /proc/meminfo, something
> > like "iommu -> iommu pagetable and dma memory", or do we want to
> > export DMA memory separately from IOMMU page tables?
>
> These are not allocated by "the IOMMU subsystem", they are allocated by
> the DMA API. Even if you want to claim that a driver pinning memory via
> iommu_dma_ops is somehow different from the same driver pinning the same
> amount of memory via dma-direct when iommu.passthrough=1, it's still
> nonsense because you're failing to account the pages which iommu_dma_ops
> gets from CMA, dma_common_alloc_pages(), dynamic SWIOTLB, the various
> pools, and so on.
>
> Thanks,
> Robin.
>
> > Since, I included DMA memory, I specifically removed mentioning of
> > IOMMU page tables in the most of places, and only report it as IOMMU
> > memory. However, since it is still bundled together with SecPageTables
> > it can be confusing.
> >
> > Pasha



Re: [PATCH 06/16] iommu/dma: use page allocation function provided by iommu-pages.h

2023-11-28 Thread Pasha Tatashin
> > This is true, however, we want to account and observe the pages
> > allocated by IOMMU subsystem for DMA buffers, as they are essentially
> > unmovable locked pages. Should we separate IOMMU memory from KVM
> > memory all together and add another field to /proc/meminfo, something
> > like "iommu -> iommu pagetable and dma memory", or do we want to
> > export DMA memory separately from IOMMU page tables?
>
> These are not allocated by "the IOMMU subsystem", they are allocated by
> the DMA API. Even if you want to claim that a driver pinning memory via
> iommu_dma_ops is somehow different from the same driver pinning the same
> amount of memory via dma-direct when iommu.passthrough=1, it's still
> nonsense because you're failing to account the pages which iommu_dma_ops
> gets from CMA, dma_common_alloc_pages(), dynamic SWIOTLB, the various
> pools, and so on.

I see, the IOMMU variants are used only for discontiguous allocations, and
the common ones are defined outside of drivers/iommu. Alright, I can
remove the changes for all non-page-table-related IOMMU
allocations.

Pasha



Re: [PATCH 05/16] iommu/io-pgtable-arm-v7s: use page allocation function provided by iommu-pages.h

2023-11-28 Thread Pasha Tatashin
On Tue, Nov 28, 2023 at 6:08 PM Robin Murphy  wrote:
>
> On 2023-11-28 10:55 pm, Pasha Tatashin wrote:
> >>>kmem_cache_free(data->l2_tables, table);
> >
> > We only account page allocations, not subpages, however, this is
> > something I was surprised about this particular architecture of why do
> > we allocate l2 using kmem ? Are the second level tables on arm v7s
> > really sub-page in size?
>
> Yes, L2 tables are 1KB, so the kmem_cache could still quite easily end
> up consuming significantly more memory than the L1 table, which is
> usually 16KB (but could potentially be smaller depending on the config,
> or up to 64KB with the Mediatek hacks).

I am OK with removing support for this architecture, or keeping only the
info for L1. I do not think there is a reason to worry about sub-page
accounting only for v7s.

Pasha

>
> Thanks,
> Robin.



Re: [PATCH 08/16] iommu/fsl: use page allocation function provided by iommu-pages.h

2023-11-28 Thread Jason Gunthorpe
On Tue, Nov 28, 2023 at 06:00:13PM -0500, Pasha Tatashin wrote:
> On Tue, Nov 28, 2023 at 5:53 PM Robin Murphy  wrote:
> >
> > On 2023-11-28 8:49 pm, Pasha Tatashin wrote:
> > > Convert iommu/fsl_pamu.c to use the new page allocation functions
> > > provided in iommu-pages.h.
> >
> > Again, this is not a pagetable. This thing doesn't even *have* pagetables.
> >
> > Similar to patches #1 and #2 where you're lumping in configuration
> > tables which belong to the IOMMU driver itself, as opposed to pagetables
> > which effectively belong to an IOMMU domain's user. But then there are
> > still drivers where you're *not* accounting similar configuration
> > structures, so I really struggle to see how this metric is useful when
> > it's so completely inconsistent in what it's counting :/
> 
> The whole IOMMU subsystem allocates a significant amount of kernel
> locked memory that we want to at least observe. The new field in
> vmstat does just that: it reports ALL buddy allocator memory that
> IOMMU allocates. However, for accounting purposes, I agree, we need to
> do better, and separate at least iommu pagetables from the rest.
> 
> We can separate the metric into two:
> iommu pagetable only
> iommu everything
> 
> or into three:
> iommu pagetable only
> iommu dma
> iommu everything
> 
> What do you think?

I think I said this at LPC - if you want to have fine grained
accounting of memory by owner you need to go talk to the cgroup people
and come up with something generic. Adding ever finer open-coded
category breakdowns just for iommu doesn't make a lot of sense.

You can make some argument that the pagetable memory should be counted
because kvm counts its shadow memory, but I wouldn't go into further
detail than that with hand coded counters..

Jason



Re: [PATCH 09/16] iommu/iommufd: use page allocation function provided by iommu-pages.h

2023-11-28 Thread Jason Gunthorpe
On Tue, Nov 28, 2023 at 08:49:31PM +0000, Pasha Tatashin wrote:
> Convert iommu/iommufd/* files to use the new page allocation functions
> provided in iommu-pages.h.
> 
> Signed-off-by: Pasha Tatashin 
> ---
>  drivers/iommu/iommufd/iova_bitmap.c | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)

This is a short term allocation, it should not be counted, that is why
it is already not using GFP_KERNEL_ACCOUNT.

Jason



Re: [PATCH 16/16] vfio: account iommu allocations

2023-11-28 Thread Jason Gunthorpe
On Tue, Nov 28, 2023 at 08:49:38PM +0000, Pasha Tatashin wrote:
> iommu allocations should be accounted in order to allow admins to
> monitor and limit the amount of iommu memory.
> 
> Signed-off-by: Pasha Tatashin 
> ---
>  drivers/vfio/vfio_iommu_type1.c | 8 +---
>  1 file changed, 5 insertions(+), 3 deletions(-)

You should send this separately and directly to Alex.

Jason



Re: [PATCH 00/16] IOMMU memory observability

2023-11-28 Thread Yosry Ahmed
On Tue, Nov 28, 2023 at 3:52 PM Jason Gunthorpe  wrote:
>
> On Tue, Nov 28, 2023 at 03:03:30PM -0800, Yosry Ahmed wrote:
> > > Yes, another counter for KVM could be added. On the other hand KVM
> > > only can be computed by subtracting one from another as there are only
> > > two types of secondary page tables, KVM and IOMMU:
> > >
> > > /sys/devices/system/node/node0/meminfo
> > > Node 0 SecPageTables:422204 kB
> > >
> > >  /sys/devices/system/node/nodeN/vmstat
> > > nr_iommu_pages 10
> > >
> > > KVM only = SecPageTables - nr_iommu_pages * PAGE_SIZE / 1024
> > >
> >
> > Right, but as I mention above, if userspace starts depending on this
> > equation, we won't be able to add any more classes of "secondary" page
> > tables to SecPageTables. I'd like to avoid that if possible. We can do
> > the subtraction in the kernel.
>
> What Sean had suggested was that SecPageTables was always intended to
> account all the non-primary mmu memory used by page tables. If this is
> the case we shouldn't be trying to break it apart into finer
> counters. These are big picture counters, not detailed allocation by
> owner counters.

Right, I agree with that, but if SecPageTables includes page tables
from multiple sources, and it is observed to be suspiciously high, the
logical next step is to try to find the culprit, right?

>
> Jason



Re: [PATCH 00/16] IOMMU memory observability

2023-11-28 Thread Jason Gunthorpe
On Tue, Nov 28, 2023 at 04:25:03PM -0800, Yosry Ahmed wrote:

> > > Right, but as I mention above, if userspace starts depending on this
> > > equation, we won't be able to add any more classes of "secondary" page
> > > tables to SecPageTables. I'd like to avoid that if possible. We can do
> > > the subtraction in the kernel.
> >
> > What Sean had suggested was that SecPageTables was always intended to
> > account all the non-primary mmu memory used by page tables. If this is
> > the case we shouldn't be trying to break it apart into finer
> > counters. These are big picture counters, not detailed allocation by
> > owner counters.
> 
> Right, I agree with that, but if SecPageTables includes page tables
> from multiple sources, and it is observed to be suspiciously high, the
> logical next step is to try to find the culprit, right?

You can make that case already, if it is high wouldn't you want to
find the exact VMM process that was making it high?

It is a sign of fire, not a detailed debug tool.

Jason



Re: [PATCH 00/16] IOMMU memory observability

2023-11-28 Thread Yosry Ahmed
On Tue, Nov 28, 2023 at 4:28 PM Jason Gunthorpe  wrote:
>
> On Tue, Nov 28, 2023 at 04:25:03PM -0800, Yosry Ahmed wrote:
>
> > > > Right, but as I mention above, if userspace starts depending on this
> > > > equation, we won't be able to add any more classes of "secondary" page
> > > > tables to SecPageTables. I'd like to avoid that if possible. We can do
> > > > the subtraction in the kernel.
> > >
> > > What Sean had suggested was that SecPageTables was always intended to
> > > account all the non-primary mmu memory used by page tables. If this is
> > > the case we shouldn't be trying to break it apart into finer
> > > counters. These are big picture counters, not detailed allocation by
> > > owner counters.
> >
> > Right, I agree with that, but if SecPageTables includes page tables
> > from multiple sources, and it is observed to be suspiciously high, the
> > logical next step is to try to find the culprit, right?
>
> You can make that case already, if it is high wouldn't you want to
> find the exact VMM process that was making it high?
>
> It is a sign of fire, not a detailed debug tool.

Fair enough. We can always add separate counters later if needed,
potentially under KVM stats to get more fine-grained details as you
mentioned.

I am only worried about users subtracting the iommu-only counter to
get a KVM counter. We should at least document that  SecPageTables may
be expanded to include other sources later to avoid that.

>
> Jason



Re: [PATCH 00/16] IOMMU memory observability

2023-11-28 Thread Jason Gunthorpe
On Tue, Nov 28, 2023 at 04:30:27PM -0800, Yosry Ahmed wrote:
> On Tue, Nov 28, 2023 at 4:28 PM Jason Gunthorpe  wrote:
> >
> > On Tue, Nov 28, 2023 at 04:25:03PM -0800, Yosry Ahmed wrote:
> >
> > > > > Right, but as I mention above, if userspace starts depending on this
> > > > > equation, we won't be able to add any more classes of "secondary" page
> > > > > tables to SecPageTables. I'd like to avoid that if possible. We can do
> > > > > the subtraction in the kernel.
> > > >
> > > > What Sean had suggested was that SecPageTables was always intended to
> > > > account all the non-primary mmu memory used by page tables. If this is
> > > > the case we shouldn't be trying to break it apart into finer
> > > > counters. These are big picture counters, not detailed allocation by
> > > > owner counters.
> > >
> > > Right, I agree with that, but if SecPageTables includes page tables
> > > from multiple sources, and it is observed to be suspiciously high, the
> > > logical next step is to try to find the culprit, right?
> >
> > You can make that case already, if it is high wouldn't you want to
> > find the exact VMM process that was making it high?
> >
> > It is a sign of fire, not a detailed debug tool.
> 
> Fair enough. We can always add separate counters later if needed,
> potentially under KVM stats to get more fine-grained details as you
> mentioned.
> 
> I am only worried about users subtracting the iommu-only counter to
> get a KVM counter. We should at least document that  SecPageTables may
> be expanded to include other sources later to avoid that.

Well, we just broke it already, anyone thinking it was only kvm
counters is going to be sad now :) As I understand it was already
described to be more general than kvm so probably nothing to do really

Jason



Re: [PATCH 04/16] iommu/io-pgtable-dart: use page allocation function provided by iommu-pages.h

2023-11-28 Thread Janne Grunau
Hej,

On Tue, Nov 28, 2023, at 21:49, Pasha Tatashin wrote:
> Convert iommu/io-pgtable-dart.c to use the new page allocation functions
> provided in iommu-pages.h.
>
> Signed-off-by: Pasha Tatashin 
> ---
>  drivers/iommu/io-pgtable-dart.c | 37 +
>  1 file changed, 14 insertions(+), 23 deletions(-)
>
> diff --git a/drivers/iommu/io-pgtable-dart.c b/drivers/iommu/io-pgtable-dart.c
> index 74b1ef2b96be..ad28031e1e93 100644
> --- a/drivers/iommu/io-pgtable-dart.c
> +++ b/drivers/iommu/io-pgtable-dart.c
> @@ -23,6 +23,7 @@
>  #include 
> 
>  #include 
> +#include "iommu-pages.h"
> 
>  #define DART1_MAX_ADDR_BITS  36
> 
> @@ -106,18 +107,12 @@ static phys_addr_t iopte_to_paddr(dart_iopte pte,
>   return paddr;
>  }
> 
> -static void *__dart_alloc_pages(size_t size, gfp_t gfp,
> - struct io_pgtable_cfg *cfg)
> +static void *__dart_alloc_pages(size_t size, gfp_t gfp)
>  {
>   int order = get_order(size);
> - struct page *p;
> 
>   VM_BUG_ON((gfp & __GFP_HIGHMEM));
> - p = alloc_pages(gfp | __GFP_ZERO, order);
> - if (!p)
> - return NULL;
> -
> - return page_address(p);
> + return iommu_alloc_pages(gfp, order);
>  }
> 
>  static int dart_init_pte(struct dart_io_pgtable *data,
> @@ -262,13 +257,13 @@ static int dart_map_pages(struct io_pgtable_ops 
> *ops, unsigned long iova,
> 
>   /* no L2 table present */
>   if (!pte) {
> - cptep = __dart_alloc_pages(tblsz, gfp, cfg);
> + cptep = __dart_alloc_pages(tblsz, gfp);
>   if (!cptep)
>   return -ENOMEM;
> 
>   pte = dart_install_table(cptep, ptep, 0, data);
>   if (pte)
> - free_pages((unsigned long)cptep, get_order(tblsz));
> + iommu_free_pages(cptep, get_order(tblsz));
> 
>   /* L2 table is present (now) */
>   pte = READ_ONCE(*ptep);
> @@ -419,8 +414,7 @@ apple_dart_alloc_pgtable(struct io_pgtable_cfg 
> *cfg, void *cookie)
>   cfg->apple_dart_cfg.n_ttbrs = 1 << data->tbl_bits;
> 
>   for (i = 0; i < cfg->apple_dart_cfg.n_ttbrs; ++i) {
> - data->pgd[i] = __dart_alloc_pages(DART_GRANULE(data), 
> GFP_KERNEL,
> -cfg);
> + data->pgd[i] = __dart_alloc_pages(DART_GRANULE(data), 
> GFP_KERNEL);
>   if (!data->pgd[i])
>   goto out_free_data;
>   cfg->apple_dart_cfg.ttbr[i] = virt_to_phys(data->pgd[i]);
> @@ -429,9 +423,10 @@ apple_dart_alloc_pgtable(struct io_pgtable_cfg 
> *cfg, void *cookie)
>   return &data->iop;
> 
>  out_free_data:
> - while (--i >= 0)
> - free_pages((unsigned long)data->pgd[i],
> -get_order(DART_GRANULE(data)));
> + while (--i >= 0) {
> + iommu_free_pages(data->pgd[i],
> +  get_order(DART_GRANULE(data)));
> + }
>   kfree(data);
>   return NULL;
>  }
> @@ -439,6 +434,7 @@ apple_dart_alloc_pgtable(struct io_pgtable_cfg 
> *cfg, void *cookie)
>  static void apple_dart_free_pgtable(struct io_pgtable *iop)
>  {
>   struct dart_io_pgtable *data = io_pgtable_to_data(iop);
> + int order = get_order(DART_GRANULE(data));
>   dart_iopte *ptep, *end;
>   int i;
> 
> @@ -449,15 +445,10 @@ static void apple_dart_free_pgtable(struct 
> io_pgtable *iop)
>   while (ptep != end) {
>   dart_iopte pte = *ptep++;
> 
> - if (pte) {
> - unsigned long page =
> - (unsigned long)iopte_deref(pte, data);
> -
> - free_pages(page, get_order(DART_GRANULE(data)));
> - }
> + if (pte)
> + iommu_free_pages(iopte_deref(pte, data), order);
>   }
> - free_pages((unsigned long)data->pgd[i],
> -get_order(DART_GRANULE(data)));
> + iommu_free_pages(data->pgd[i], order);
>   }
> 
>   kfree(data);

Reviewed-by: Janne Grunau 

Janne