>-----Original Message-----
>From: dri-devel <dri-devel-boun...@lists.freedesktop.org> On Behalf Of
>Rajneesh Bhardwaj
>Sent: Friday, March 22, 2024 3:08 AM
>To: amd-gfx@lists.freedesktop.org; dri-de...@lists.freedesktop.org
>Cc: felix.kuehl...@amd.com; alexander.deuc...@amd.com;
>christian.koe...@amd.com; Rajneesh Bhardwaj
><rajneesh.bhard...@amd.com>; Joe Greathouse
><joseph.greatho...@amd.com>
>Subject: [PATCH] drm/ttm: Implement strict NUMA pool allocations
>
>This change allows TTM the flexibility to honor NUMA-localized
>allocations, which can result in a significant performance improvement
>on a multi-socket NUMA system. On GFXIP 9.4.3 based AMD APUs, we see
>manifold benefits from this change: not only a ~10% performance
>improvement in certain benchmarks, but also more consistent and less
>sporadic results, especially when NUMA balancing is not explicitly
>disabled. In certain scenarios, workloads show run-to-run variability,
>e.g. HPL shows a ~10x performance drop after running back to back 4-5
>times and recovers on a subsequent run. This is seen with other
>memory-intensive workloads too. When CPU caches were dropped, e.g.
>sudo sysctl -w vm.drop_caches=1, the variability reduced but
>performance was still well below that of a good run.
>
>Use of the __GFP_THISNODE flag ensures that during memory allocation
>the kernel prioritizes allocations from the local or closest NUMA node,
>thereby reducing memory access latency. When memory is allocated with
>the __GFP_THISNODE flag, allocations will predominantly be done on the
>local node; consequently, the shrinkers may prioritize reclaiming
>memory from caches associated with the local node to maintain memory
>locality and minimize latency, thereby providing better shrinker
>targeting.
>
>Reduced memory pressure on remote nodes can also indirectly influence
>shrinker behavior by reducing the frequency and intensity of memory
>reclamation operations on those nodes, and could improve overall
>system performance.
>
>While this change could be beneficial in general, i.e. without the use
>of a module parameter, in the absence of widespread testing limit it
>to the AMD GFXIP 9.4.3 based TTM pool initializations only.
>
>
>Cc: Joe Greathouse <joseph.greatho...@amd.com>
>Signed-off-by: Rajneesh Bhardwaj <rajneesh.bhard...@amd.com>
>---
> drivers/gpu/drm/amd/amdgpu/amdgpu.h       |  1 +
> drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c   |  8 ++++++++
> drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c   |  7 ++++++-
> drivers/gpu/drm/ttm/tests/ttm_pool_test.c | 10 +++++-----
> drivers/gpu/drm/ttm/ttm_device.c          |  2 +-
> drivers/gpu/drm/ttm/ttm_pool.c            |  7 ++++++-
> include/drm/ttm/ttm_pool.h                |  4 +++-
> 7 files changed, 30 insertions(+), 9 deletions(-)
>
>diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>index 9c62552bec34..96532cfc6230 100644
>--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>@@ -253,6 +253,7 @@ extern int amdgpu_user_partt_mode;
> extern int amdgpu_agp;
>
> extern int amdgpu_wbrf;
>+extern bool strict_numa_alloc;
>
> #define AMDGPU_VM_MAX_NUM_CTX                 4096
> #define AMDGPU_SG_THRESHOLD                   (256*1024*1024)
>diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>index 80b9642f2bc4..a183a6b4493d 100644
>--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>@@ -781,6 +781,14 @@ int queue_preemption_timeout_ms = 9000;
> module_param(queue_preemption_timeout_ms, int, 0644);
> MODULE_PARM_DESC(queue_preemption_timeout_ms, "queue preemption
>timeout in ms (1 = Minimum, 9000 = default)");
>
>+/**
>+ * DOC: strict_numa_alloc(bool)
>+ * Policy to force NUMA allocation requests from the proximity NUMA domain
>only.
>+ */
>+bool strict_numa_alloc;
>+module_param(strict_numa_alloc, bool, 0444);
>+MODULE_PARM_DESC(strict_numa_alloc, "Force NUMA allocation requests
>to be satisfied from the closest node only (false = default)");
>+
> /**
>  * DOC: debug_evictions(bool)
>  * Enable extra debug messages to help determine the cause of evictions
>diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
>b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
>index b0ed10f4de60..a9f78f85e28c 100644
>--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
>+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
>@@ -1768,6 +1768,7 @@ static int amdgpu_ttm_reserve_tmr(struct
>amdgpu_device *adev)
>
> static int amdgpu_ttm_pools_init(struct amdgpu_device *adev)
> {
>+      bool policy = true;
>       int i;
>
>       if (!adev->gmc.is_app_apu || !adev->gmc.num_mem_partitions)
>@@ -1779,11 +1780,15 @@ static int amdgpu_ttm_pools_init(struct
>amdgpu_device *adev)
>       if (!adev->mman.ttm_pools)
>               return -ENOMEM;
>
>+      /* Policy not only depends on the module param but also on the ASIC
>+       * setting use_strict_numa_alloc as well.
>+       */
>       for (i = 0; i < adev->gmc.num_mem_partitions; i++) {
>               ttm_pool_init(&adev->mman.ttm_pools[i], adev->dev,
>                             adev->gmc.mem_partitions[i].numa.node,
>-                            false, false);
>+                            false, false, policy && strict_numa_alloc);

why not just 'strict_numa_alloc'?

Is 'policy' used somewhere else?  Not sure this adds clarity...

>       }
>+
>       return 0;
> }
>
>diff --git a/drivers/gpu/drm/ttm/tests/ttm_pool_test.c
>b/drivers/gpu/drm/ttm/tests/ttm_pool_test.c
>index 2d9cae8cd984..6ff47aac570a 100644
>--- a/drivers/gpu/drm/ttm/tests/ttm_pool_test.c
>+++ b/drivers/gpu/drm/ttm/tests/ttm_pool_test.c
>@@ -87,7 +87,7 @@ static struct ttm_pool *ttm_pool_pre_populated(struct
>kunit *test,
>       pool = kunit_kzalloc(test, sizeof(*pool), GFP_KERNEL);
>       KUNIT_ASSERT_NOT_NULL(test, pool);
>
>-      ttm_pool_init(pool, devs->dev, NUMA_NO_NODE, true, false);
>+      ttm_pool_init(pool, devs->dev, NUMA_NO_NODE, true, false, false);

Is this really an acceptable interface (three non-named Booleans in a row)?

I have no idea what "true, false, false" means.

Would having a "flags" parameter and then

USE_DMA_ALLOC    BIT(0)
USE_DMA32        BIT(1)
USE_STRICT_NUMA  BIT(2)

Make this more readable?

Or define your Booleans:

USE_DMA_ALLOC    true
USE_DMA32        true
USE_STRICT_NUMA  true

NO_DMA_ALLOC     false
NO_DMA32         false
NO_STRICT_NUMA   false

So at a minimum, we might know what these parameters are?
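To make that concrete, here is a rough sketch of what a flags-based
interface could look like (the flag names below are only illustrative,
not existing TTM defines):

#define TTM_POOL_USE_DMA_ALLOC		BIT(0)
#define TTM_POOL_USE_DMA32		BIT(1)
#define TTM_POOL_USE_STRICT_NUMA	BIT(2)

void ttm_pool_init(struct ttm_pool *pool, struct device *dev,
		   int nid, unsigned int flags);

/* Call sites would then document themselves, e.g.: */
ttm_pool_init(pool, devs->dev, NUMA_NO_NODE, TTM_POOL_USE_DMA_ALLOC);

ttm_pool_init(&adev->mman.ttm_pools[i], adev->dev,
	      adev->gmc.mem_partitions[i].numa.node,
	      strict_numa_alloc ? TTM_POOL_USE_STRICT_NUMA : 0);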

What is the relationship between this feature and the nid value?

Is this value used for the allocations?

If this is not NUMA_NO_NODE, would this do the same thing?
(or is the STRICT flag the only way?)
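
My understanding, assuming ttm_pool.c still allocates through
alloc_pages_node(pool->nid, ...), is that a non-NUMA_NO_NODE nid on its
own is only a preference and the allocator can still fall back to other
nodes, while the strict flag turns it into a hard requirement, roughly:

	/* Preferred node, fallback to other nodes still allowed: */
	p = alloc_pages_node(nid, gfp_flags, order);

	/* Strict: fail (return NULL) rather than fall back to a remote node: */
	p = alloc_pages_node(nid, gfp_flags | __GFP_THISNODE, order);

But please correct me if that has changed.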

Just some thoughts,

Mike

>
>       err = ttm_pool_alloc(pool, tt, &simple_ctx);
>       KUNIT_ASSERT_EQ(test, err, 0);
>@@ -152,7 +152,7 @@ static void ttm_pool_alloc_basic(struct kunit *test)
>       KUNIT_ASSERT_NOT_NULL(test, pool);
>
>       ttm_pool_init(pool, devs->dev, NUMA_NO_NODE, params-
>>use_dma_alloc,
>-                    false);
>+                    false, false);
>
>       KUNIT_ASSERT_PTR_EQ(test, pool->dev, devs->dev);
>       KUNIT_ASSERT_EQ(test, pool->nid, NUMA_NO_NODE);
>@@ -219,7 +219,7 @@ static void ttm_pool_alloc_basic_dma_addr(struct
>kunit *test)
>       pool = kunit_kzalloc(test, sizeof(*pool), GFP_KERNEL);
>       KUNIT_ASSERT_NOT_NULL(test, pool);
>
>-      ttm_pool_init(pool, devs->dev, NUMA_NO_NODE, true, false);
>+      ttm_pool_init(pool, devs->dev, NUMA_NO_NODE, true, false, false);
>
>       err = ttm_pool_alloc(pool, tt, &simple_ctx);
>       KUNIT_ASSERT_EQ(test, err, 0);
>@@ -349,7 +349,7 @@ static void ttm_pool_free_dma_alloc(struct kunit
>*test)
>       pool = kunit_kzalloc(test, sizeof(*pool), GFP_KERNEL);
>       KUNIT_ASSERT_NOT_NULL(test, pool);
>
>-      ttm_pool_init(pool, devs->dev, NUMA_NO_NODE, true, false);
>+      ttm_pool_init(pool, devs->dev, NUMA_NO_NODE, true, false, false);
>       ttm_pool_alloc(pool, tt, &simple_ctx);
>
>       pt = &pool->caching[caching].orders[order];
>@@ -380,7 +380,7 @@ static void ttm_pool_free_no_dma_alloc(struct kunit
>*test)
>       pool = kunit_kzalloc(test, sizeof(*pool), GFP_KERNEL);
>       KUNIT_ASSERT_NOT_NULL(test, pool);
>
>-      ttm_pool_init(pool, devs->dev, NUMA_NO_NODE, false, false);
>+      ttm_pool_init(pool, devs->dev, NUMA_NO_NODE, false, false, false);
>       ttm_pool_alloc(pool, tt, &simple_ctx);
>
>       pt = &pool->caching[caching].orders[order];
>diff --git a/drivers/gpu/drm/ttm/ttm_device.c
>b/drivers/gpu/drm/ttm/ttm_device.c
>index f5187b384ae9..540e8a44f015 100644
>--- a/drivers/gpu/drm/ttm/ttm_device.c
>+++ b/drivers/gpu/drm/ttm/ttm_device.c
>@@ -215,7 +215,7 @@ int ttm_device_init(struct ttm_device *bdev, const
>struct ttm_device_funcs *func
>
>       ttm_sys_man_init(bdev);
>
>-      ttm_pool_init(&bdev->pool, dev, dev_to_node(dev), use_dma_alloc,
>use_dma32);
>+      ttm_pool_init(&bdev->pool, dev, dev_to_node(dev), use_dma_alloc,
>use_dma32, false);
>
>       bdev->vma_manager = vma_manager;
>       spin_lock_init(&bdev->lru_lock);
>diff --git a/drivers/gpu/drm/ttm/ttm_pool.c
>b/drivers/gpu/drm/ttm/ttm_pool.c
>index dbc96984d331..73aafd06c361 100644
>--- a/drivers/gpu/drm/ttm/ttm_pool.c
>+++ b/drivers/gpu/drm/ttm/ttm_pool.c
>@@ -447,6 +447,9 @@ int ttm_pool_alloc(struct ttm_pool *pool, struct
>ttm_tt *tt,
>       else
>               gfp_flags |= GFP_HIGHUSER;
>
>+      if (pool->use_strict_numa_alloc)
>+              gfp_flags |= __GFP_THISNODE;
>+
>       for (order = min_t(unsigned int, MAX_ORDER, __fls(num_pages));
>            num_pages;
>            order = min_t(unsigned int, order, __fls(num_pages))) {
>@@ -555,7 +558,8 @@ EXPORT_SYMBOL(ttm_pool_free);
>  * Initialize the pool and its pool types.
>  */
> void ttm_pool_init(struct ttm_pool *pool, struct device *dev,
>-                 int nid, bool use_dma_alloc, bool use_dma32)
>+                 int nid, bool use_dma_alloc, bool use_dma32,
>+                 bool use_strict_numa_alloc)
> {
>       unsigned int i, j;
>
>@@ -565,6 +569,7 @@ void ttm_pool_init(struct ttm_pool *pool, struct
>device *dev,
>       pool->nid = nid;
>       pool->use_dma_alloc = use_dma_alloc;
>       pool->use_dma32 = use_dma32;
>+      pool->use_strict_numa_alloc = use_strict_numa_alloc;
>
>       if (use_dma_alloc || nid != NUMA_NO_NODE) {
>               for (i = 0; i < TTM_NUM_CACHING_TYPES; ++i)
>diff --git a/include/drm/ttm/ttm_pool.h b/include/drm/ttm/ttm_pool.h
>index 30a347e5aa11..6b7bdc952466 100644
>--- a/include/drm/ttm/ttm_pool.h
>+++ b/include/drm/ttm/ttm_pool.h
>@@ -72,6 +72,7 @@ struct ttm_pool {
>
>       bool use_dma_alloc;
>       bool use_dma32;
>+      bool use_strict_numa_alloc;
>
>       struct {
>               struct ttm_pool_type orders[MAX_ORDER + 1];
>@@ -83,7 +84,8 @@ int ttm_pool_alloc(struct ttm_pool *pool, struct ttm_tt
>*tt,
> void ttm_pool_free(struct ttm_pool *pool, struct ttm_tt *tt);
>
> void ttm_pool_init(struct ttm_pool *pool, struct device *dev,
>-                 int nid, bool use_dma_alloc, bool use_dma32);
>+                 int nid, bool use_dma_alloc, bool use_dma32,
>+                 bool use_strict_numa_alloc);
> void ttm_pool_fini(struct ttm_pool *pool);
>
> int ttm_pool_debugfs(struct ttm_pool *pool, struct seq_file *m);
>--
>2.34.1
