[PATCH 4/7] drm/i915: Track page table backing store usage

2023-09-27 Thread Tvrtko Ursulin
From: Tvrtko Ursulin 

Account page table backing store against the owning client memory usage
stats.

Signed-off-by: Tvrtko Ursulin 
Reviewed-by: Aravind Iddamsetty 
---
 drivers/gpu/drm/i915/gt/intel_gtt.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/drivers/gpu/drm/i915/gt/intel_gtt.c 
b/drivers/gpu/drm/i915/gt/intel_gtt.c
index 13944a14ea2d..c3f2b379 100644
--- a/drivers/gpu/drm/i915/gt/intel_gtt.c
+++ b/drivers/gpu/drm/i915/gt/intel_gtt.c
@@ -58,6 +58,9 @@ struct drm_i915_gem_object *alloc_pt_lmem(struct 
i915_address_space *vm, int sz)
if (!IS_ERR(obj)) {
obj->base.resv = i915_vm_resv_get(vm);
obj->shares_resv_from = vm;
+
+   if (vm->fpriv)
+   i915_drm_client_add_object(vm->fpriv->client, obj);
}
 
return obj;
@@ -79,6 +82,9 @@ struct drm_i915_gem_object *alloc_pt_dma(struct 
i915_address_space *vm, int sz)
if (!IS_ERR(obj)) {
obj->base.resv = i915_vm_resv_get(vm);
obj->shares_resv_from = vm;
+
+   if (vm->fpriv)
+   i915_drm_client_add_object(vm->fpriv->client, obj);
}
 
return obj;
-- 
2.39.2



[PATCH 2/7] drm/i915: Add ability for tracking buffer objects per client

2023-09-27 Thread Tvrtko Ursulin
From: Tvrtko Ursulin 

In order to show per client memory usage lets add some infrastructure
which enables tracking buffer objects owned by clients.

We add a per client list protected by a new per client lock and to support
delayed destruction (post client exit) we make tracked objects hold
references to the owning client.

Also, object memory region teardown is moved to the existing RCU free
callback to allow safe dereference from the fdinfo RCU read section.

Signed-off-by: Tvrtko Ursulin 
Reviewed-by: Aravind Iddamsetty 
---
 drivers/gpu/drm/i915/gem/i915_gem_object.c| 13 +--
 .../gpu/drm/i915/gem/i915_gem_object_types.h  | 12 +++
 drivers/gpu/drm/i915/i915_drm_client.c| 36 +++
 drivers/gpu/drm/i915/i915_drm_client.h| 32 +
 4 files changed, 90 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_object.c 
b/drivers/gpu/drm/i915/gem/i915_gem_object.c
index c26d87555825..25eeeb863209 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_object.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_object.c
@@ -106,6 +106,10 @@ void i915_gem_object_init(struct drm_i915_gem_object *obj,
 
INIT_LIST_HEAD(>mm.link);
 
+#ifdef CONFIG_PROC_FS
+   INIT_LIST_HEAD(>client_link);
+#endif
+
INIT_LIST_HEAD(>lut_list);
spin_lock_init(>lut_lock);
 
@@ -293,6 +297,10 @@ void __i915_gem_free_object_rcu(struct rcu_head *head)
container_of(head, typeof(*obj), rcu);
struct drm_i915_private *i915 = to_i915(obj->base.dev);
 
+   /* We need to keep this alive for RCU read access from fdinfo. */
+   if (obj->mm.n_placements > 1)
+   kfree(obj->mm.placements);
+
i915_gem_object_free(obj);
 
GEM_BUG_ON(!atomic_read(>mm.free_count));
@@ -389,9 +397,6 @@ void __i915_gem_free_object(struct drm_i915_gem_object *obj)
if (obj->ops->release)
obj->ops->release(obj);
 
-   if (obj->mm.n_placements > 1)
-   kfree(obj->mm.placements);
-
if (obj->shares_resv_from)
i915_vm_resv_put(obj->shares_resv_from);
 
@@ -442,6 +447,8 @@ static void i915_gem_free_object(struct drm_gem_object 
*gem_obj)
 
GEM_BUG_ON(i915_gem_object_is_framebuffer(obj));
 
+   i915_drm_client_remove_object(obj);
+
/*
 * Before we free the object, make sure any pure RCU-only
 * read-side critical sections are complete, e.g.
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_object_types.h 
b/drivers/gpu/drm/i915/gem/i915_gem_object_types.h
index 2292404007c8..0c5cdab278b6 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_object_types.h
+++ b/drivers/gpu/drm/i915/gem/i915_gem_object_types.h
@@ -302,6 +302,18 @@ struct drm_i915_gem_object {
 */
struct i915_address_space *shares_resv_from;
 
+#ifdef CONFIG_PROC_FS
+   /**
+* @client: @i915_drm_client which created the object
+*/
+   struct i915_drm_client *client;
+
+   /**
+* @client_link: Link into @i915_drm_client.objects_list
+*/
+   struct list_head client_link;
+#endif
+
union {
struct rcu_head rcu;
struct llist_node freed;
diff --git a/drivers/gpu/drm/i915/i915_drm_client.c 
b/drivers/gpu/drm/i915/i915_drm_client.c
index 2a44b3876cb5..2e5e69edc0f9 100644
--- a/drivers/gpu/drm/i915/i915_drm_client.c
+++ b/drivers/gpu/drm/i915/i915_drm_client.c
@@ -28,6 +28,10 @@ struct i915_drm_client *i915_drm_client_alloc(void)
kref_init(>kref);
spin_lock_init(>ctx_lock);
INIT_LIST_HEAD(>ctx_list);
+#ifdef CONFIG_PROC_FS
+   spin_lock_init(>objects_lock);
+   INIT_LIST_HEAD(>objects_list);
+#endif
 
return client;
 }
@@ -108,4 +112,36 @@ void i915_drm_client_fdinfo(struct drm_printer *p, struct 
drm_file *file)
for (i = 0; i < ARRAY_SIZE(uabi_class_names); i++)
show_client_class(p, i915, file_priv->client, i);
 }
+
+void i915_drm_client_add_object(struct i915_drm_client *client,
+   struct drm_i915_gem_object *obj)
+{
+   unsigned long flags;
+
+   GEM_WARN_ON(obj->client);
+   GEM_WARN_ON(!list_empty(>client_link));
+
+   spin_lock_irqsave(>objects_lock, flags);
+   obj->client = i915_drm_client_get(client);
+   list_add_tail_rcu(>client_link, >objects_list);
+   spin_unlock_irqrestore(>objects_lock, flags);
+}
+
+bool i915_drm_client_remove_object(struct drm_i915_gem_object *obj)
+{
+   struct i915_drm_client *client = fetch_and_zero(>client);
+   unsigned long flags;
+
+   /* Object may not be associated with a client. */
+   if (!client)
+   return false;
+
+   spin_lock_irqsave(>objects_lock, flags);
+   list_del_rcu(>client_link);
+   spin_unlock_irqrestore(>objects_lock, flags);
+
+   i915_drm_client_put(client);
+

[PATCH v8 0/7] fdinfo memory stats

2023-09-27 Thread Tvrtko Ursulin
From: Tvrtko Ursulin 

A short series to enable fdinfo memory stats for i915.

I added tracking of most classes of objects (user objects, page tables, context
state, ring buffers) which contribute to client's memory footprint and am
accouting their memory use along the similar lines as in Rob's msm code, just
that with i915 specific code we can show a memory region breakdown and so
support discrete and multi-tile GPUs properly. And also reflect that our objects
can have multiple allowed backing stores.

The existing helper Rob added is then used to dump the per memory region stats
to fdinfo.

The basic objects-per-client infrastructure can later be extended to cover all
objects and so avoid needing to walk the IDR under the client's file table lock,
which would further avoid disturbing the running clients by parallel fdinfo
readers.

Example fdinfo format:

# cat /proc/1383/fdinfo/8
pos:0
flags:  0212
mnt_id: 21
ino:397
drm-driver: i915
drm-client-id:  18
drm-pdev:   :00:02.0
drm-total-system:   125 MiB
drm-shared-system:  16 MiB
drm-active-system:  110 MiB
drm-resident-system:125 MiB
drm-purgeable-system:   2 MiB
drm-total-stolen-system:0
drm-shared-stolen-system:   0
drm-active-stolen-system:   0
drm-resident-stolen-system: 0
drm-purgeable-stolen-system:0
drm-engine-render:  25662044495 ns
drm-engine-copy:0 ns
drm-engine-video:   0 ns
drm-engine-video-enhance:   0 ns

Example gputop output:

DRM minor 0
 PID SMEM  SMEMRSS   render copy videoNAME
1233 124M 124M |||||||| neverball
1130  59M  59M |█▌  ||||||| Xorg
1207  12M  12M |||||||| xfwm4

Or with Wayland:

DRM minor 0
 PID  MEM  RSSrendercopy videovideo-enhance NAME
2093 191M 191M |▊  ||   ||   ||   | 
gnome-shell
DRM minor 128
 PID  MEM  RSSrendercopy videovideo-enhance NAME
2551  71M  71M |██▉||   ||   ||   | 
neverball
2553  50M  50M |   ||   ||   ||   | 
Xwayland

Example intel_gpu_top output, aggregated mode:

intel-gpu-top: Intel Dg1 (Gen12) @ /dev/dri/card1 -   21/ 577 MHz;  71% RC6
  8 irqs/s

 ENGINES BUSY   MI_SEMA MI_WAIT
   Render/3D2.80% |▉  |  0%  0%
 Blitter0.01% |▏  |  0%  0%
   Video0.00% |   |  0%  0%
VideoEnhance0.00% |   |  0%  0%

  PID  MEM  RSS Render/3D  BlitterVideoNAME
50783 109M 107M |▎   ||||||| neverball

Region breakdown mode (needs more width for best experience):

intel-gpu-top: Intel Dg1 (Gen12) @ /dev/dri/card1 -   18/ 555 MHz;  65% RC6
  8 irqs/s

 ENGINES BUSY   MI_SEMA MI_WAIT
   Render/3D2.52% |▉  |  0%  0%
 Blitter0.00% |   |  0%  0%
   Video0.00% |   |  0%  0%
VideoEnhance0.00% |   |  0%  0%

  PID  RAM  RSS VRAM VRSS Video NAME
50783  34M  32M  75M  75M |▏  ||   ||   ||   | neverball

v2:
 * Now actually per client.

v3:
 * Track imported dma-buf objects.

v4:
 * Rely on DRM GEM handles for tracking user objects.
 * Fix internal object accounting (no placements).

v5:
 * Fixed brain fart of overwriting the loop cursor.
 * Fixed object destruction racing with fdinfo reads.
 * Take reference to GEM context while using it.

v6:
 * Rebase, cover letter update.

v7:
 * New patch in series for making region names consistent and stable.

v8:
 * New patch in series - stop losing accuracy in drm_file.c::print_size().

Test-with: 20230922134437.234888-1-tvrtko.ursu...@linux.intel.com

Tvrtko Ursulin (7):
  drm: Do not round to megabytes for greater than 1MiB sizes in fdinfo
stats
  drm/i915: Add ability for tracking buffer objects per client
  drm/i915: Record which client owns a VM
  drm/i915: Track page table backing store usage
  drm/i915: Account ring buffer and context state storage
  drm/i915: Add stable memory region names
  drm/i915: Implement fdinfo memory stats printing

 drivers/gpu/drm/drm_file.c|   2 +-
 drivers/gpu/drm/i915/gem/i915_gem_context.c   |  11 +-
 .../gpu/drm/i915/gem/i915_gem_context_types.h |   3 +
 drivers/gpu/drm/i915/gem/i915_gem_object.c|  13 ++-
 .../gpu/drm/i915/gem/i915_gem_object_types.h  |  12 ++
 .../gpu/drm/i915/gem/selftests/mock_context.c |   4 +-
 drivers/gpu/drm/i915/gt/intel_context.c   |  14 +++
 drivers

[PATCH 1/7] drm: Do not round to megabytes for greater than 1MiB sizes in fdinfo stats

2023-09-27 Thread Tvrtko Ursulin
From: Tvrtko Ursulin 

It is better not to lose precision and not revert to 1 MiB size
granularity for every size greater than 1 MiB.

Sizes in KiB should not be so troublesome to read (and in fact machine
parsing is I expect the norm here), they align with other api like
/proc/meminfo, and they allow writing tests for the interface without
having to embed drm.ko implementation knowledge into them. (Like knowing
that minimum buffer size one can use for successful verification has to be
1MiB aligned, and on top account for any pre-existing memory utilisation
outside of driver's control.)

But probably even more importantly I think that it is just better to show
the accurate sizes and not arbitrarily lose precision for a little bit of a
stretched use case of eyeballing fdinfo text directly.

Signed-off-by: Tvrtko Ursulin 
Cc: Rob Clark 
Cc: Adrián Larumbe 
Cc: steven.pr...@arm.com
---
 drivers/gpu/drm/drm_file.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/drm_file.c b/drivers/gpu/drm/drm_file.c
index e692770ef6d3..ecb5038009e7 100644
--- a/drivers/gpu/drm/drm_file.c
+++ b/drivers/gpu/drm/drm_file.c
@@ -913,7 +913,7 @@ static void print_size(struct drm_printer *p, const char 
*stat,
unsigned u;
 
for (u = 0; u < ARRAY_SIZE(units) - 1; u++) {
-   if (sz < SZ_1K)
+   if (sz == 0 || !IS_ALIGNED(sz, SZ_1K))
break;
sz = div_u64(sz, SZ_1K);
}
-- 
2.39.2



Re: [PATCH 6/6] drm/i915: Implement fdinfo memory stats printing

2023-09-27 Thread Tvrtko Ursulin



On 27/09/2023 14:23, Tvrtko Ursulin wrote:


On 27/09/2023 07:54, Andi Shyti wrote:

Hi Tvrtko,


Use the newly added drm_print_memory_stats helper to show memory
utilisation of our objects in drm/driver specific fdinfo output.

To collect the stats we walk the per memory regions object lists
and accumulate object size into the respective drm_memory_stats
categories.

v2:
  * Only account against the active region.
  * Use DMA_RESV_USAGE_BOOKKEEP when testing for active. (Tejas)

v3:
  * Update commit text. (Aravind)
  * Update to use memory regions uabi names.

Signed-off-by: Tvrtko Ursulin 
Cc: Aravind Iddamsetty 
Cc: Rob Clark 
Cc: Andi Shyti 
Cc: Tejas Upadhyay 
Reviewed-by: Andi Shyti  # v1
Reviewed-by: Aravind Iddamsetty  # v2


Reviewed-by: Andi Shyti 


Thanks guys, just the IGTs remaining now. I've just sent a respin of one 
patch in that series which will hopefully fix things up.


Actually no, I forgot that decided I will respin the i915 series with 
yet one more patch. Stay tuned please.


Regards,

Tvrtko


* https://patchwork.freedesktop.org/series/124118/

First two patches is what we need to merge the kernel side, while the 
rest are intel_gpu_top fixes followed by per client memory support.


Regards,

Tvrtko



Re: [PATCH 6/6] drm/i915: Implement fdinfo memory stats printing

2023-09-27 Thread Tvrtko Ursulin



On 27/09/2023 07:54, Andi Shyti wrote:

Hi Tvrtko,


Use the newly added drm_print_memory_stats helper to show memory
utilisation of our objects in drm/driver specific fdinfo output.

To collect the stats we walk the per memory regions object lists
and accumulate object size into the respective drm_memory_stats
categories.

v2:
  * Only account against the active region.
  * Use DMA_RESV_USAGE_BOOKKEEP when testing for active. (Tejas)

v3:
  * Update commit text. (Aravind)
  * Update to use memory regions uabi names.

Signed-off-by: Tvrtko Ursulin 
Cc: Aravind Iddamsetty 
Cc: Rob Clark 
Cc: Andi Shyti 
Cc: Tejas Upadhyay 
Reviewed-by: Andi Shyti  # v1
Reviewed-by: Aravind Iddamsetty  # v2


Reviewed-by: Andi Shyti 


Thanks guys, just the IGTs remaining now. I've just sent a respin of one 
patch in that series which will hopefully fix things up.


* https://patchwork.freedesktop.org/series/124118/

First two patches is what we need to merge the kernel side, while the 
rest are intel_gpu_top fixes followed by per client memory support.


Regards,

Tvrtko



Re: [PATCH v2] drm/i915: Do not disable preemption for resets

2023-09-27 Thread Tvrtko Ursulin



On 26/09/2023 11:26, Andi Shyti wrote:

Hi Tvrtko,

On Tue, Sep 26, 2023 at 11:08:55AM +0100, Tvrtko Ursulin wrote:

From: Tvrtko Ursulin 

Commit ade8a0f59844 ("drm/i915: Make all GPU resets atomic") added a
preempt disable section over the hardware reset callback to prepare the
driver for being able to reset from atomic contexts.

In retrospect I can see that the work item at a time was about removing
the struct mutex from the reset path. Code base also briefly entertained
the idea of doing the reset under stop_machine in order to serialize
userspace mmap and temporary glitch in the fence registers (see
eb8d0f5af4ec ("drm/i915: Remove GPU reset dependence on struct_mutex"),
but that never materialized and was soon removed in 2caffbf11762
("drm/i915: Revoke mmaps and prevent access to fence registers across
reset") and replaced with a SRCU based solution.

As such, as far as I can see, today we still have a requirement that
resets must not sleep (invoked from submission tasklets), but no need to
support invoking them from a truly atomic context.

Given that the preemption section is problematic on RT kernels, since the
uncore lock becomes a sleeping lock and so is invalid in such section,
lets try and remove it. Potential downside is that our short waits on GPU
to complete the reset may get extended if CPU scheduling interferes, but
in practice that probably isn't a deal breaker.

In terms of mechanics, since the preemption disabled block is being
removed we just need to replace a few of the wait_for_atomic macros into
busy looping versions which will work (and not complain) when called from
non-atomic sections.

v2:
  * Fix timeouts which are now in us. (Andi)
  * Update one comment as a drive by. (Andi)

Signed-off-by: Tvrtko Ursulin 
Cc: Chris Wilson 
Cc: Paul Gortmaker 
Cc: Sebastian Andrzej Siewior 
Cc: Andi Shyti 


Reviewed-by: Andi Shyti 


Thank you, pushed to drm-intel-gt-next!

Regards,

Tvrtko


Re: [Intel-gfx] [Patch v1] drm/i915: Add uAPI to query micro-controller FW version

2023-09-27 Thread Tvrtko Ursulin



On 27/09/2023 05:14, Balasubrawmanian, Vivaik wrote:
Due to a bug in GuC firmware, Mesa can't enable by default the usage of 
compute engines in DG2 and newer.



A new GuC firmware fixed the issue but until now there was no way

for Mesa to know if KMD was running with the fixed GuC version or not,

so this uAPI is required.


Is the firmware bug making the ccs engines generally useless, or just 
not suitable for this specific Mesa use case?



It may be expanded in future to query other firmware versions too.

More information: 
https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/23661


Mesa usage: https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/25233


Cc: John Harrison 

Cc: Daniele Ceraolo Spurio 

Cc: José Roberto de Souza 

Signed-off-by: Vivaik Balasubrawmanian 
---
  drivers/gpu/drm/i915/i915_query.c | 47 +++
  include/uapi/drm/i915_drm.h   | 32 +
  2 files changed, 79 insertions(+)

diff --git a/drivers/gpu/drm/i915/i915_query.c 
b/drivers/gpu/drm/i915/i915_query.c

index 00871ef99792..7f22a49faae7 100644
--- a/drivers/gpu/drm/i915/i915_query.c
+++ b/drivers/gpu/drm/i915/i915_query.c
@@ -551,6 +551,52 @@ static int query_hwconfig_blob(struct 
drm_i915_private *i915,

  return hwconfig->size;
  }

+static int
+query_uc_fw_version(struct drm_i915_private *i915, struct 
drm_i915_query_item *query)

+{
+    struct drm_i915_query_uc_fw_version __user *query_ptr = 
u64_to_user_ptr(query->data_ptr);

+    size_t size = sizeof(struct drm_i915_query_uc_fw_version);
+    struct drm_i915_query_uc_fw_version resp;
+
+    if (query->length == 0) {
+        query->length = size;
+        return 0;
+    } else if (query->length != size) {
+        drm_dbg(>drm,
+            "Invalid uc_fw_version query item size=%u expected=%zu\n",
+            query->length,    size);
+        return -EINVAL;
+    }
+
+    if (copy_from_user(, query_ptr, size))
+        return -EFAULT;


The above can probably be replaced by using the copy_query_item() helper 
and it would work a bit better even since no reason to reject a buffer 
too large.



+
+    if (resp.pad || resp.pad2 || resp.reserved) {
+        drm_dbg(>drm,
+            "Invalid input fw version query structure parameters 
received");

+        return -EINVAL;
+    }
+
+    switch (resp.uc_type) {
+    case I915_QUERY_UC_TYPE_GUC: {
+        struct intel_guc *guc = >gt0.uc.guc;
+
+        resp.major_ver = guc->submission_version.major;
+        resp.minor_ver = guc->submission_version.minor;
+        resp.patch_ver = guc->submission_version.patch;


Submission version is not the same as fw version, right? So 
DRM_I915_QUERY_UC_FW_VERSION and uapi kerneldoc is misleading.


Name the query type I915_QUERY_UC_TYPE_GUC*_SUBMISSION* and make it clear?

Regards,

Tvrtko


+        resp.branch_ver = 0;
+        break;
+    }
+    default:
+        return -EINVAL;
+    }
+
+    if (copy_to_user(query_ptr, , size))
+        return -EFAULT;
+
+    return 0;
+}
+
  static int (* const i915_query_funcs[])(struct drm_i915_private 
*dev_priv,

                  struct drm_i915_query_item *query_item) = {
  query_topology_info,
@@ -559,6 +605,7 @@ static int (* const i915_query_funcs[])(struct 
drm_i915_private *dev_priv,

  query_memregion_info,
  query_hwconfig_blob,
  query_geometry_subslices,
+    query_uc_fw_version,
  };

  int i915_query_ioctl(struct drm_device *dev, void *data, struct 
drm_file *file)

diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index 7000e5910a1d..9be241fb77d8 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -3013,6 +3013,7 @@ struct drm_i915_query_item {
   *  - %DRM_I915_QUERY_MEMORY_REGIONS (see struct 
drm_i915_query_memory_regions)

   *  - %DRM_I915_QUERY_HWCONFIG_BLOB (see `GuC HWCONFIG blob uAPI`)
   *  - %DRM_I915_QUERY_GEOMETRY_SUBSLICES (see struct 
drm_i915_query_topology_info)
+     *  - %DRM_I915_QUERY_UC_FW_VERSION (see struct 
drm_i915_query_uc_fw_version)

   */
  __u64 query_id;
  #define DRM_I915_QUERY_TOPOLOGY_INFO        1
@@ -3021,6 +3022,7 @@ struct drm_i915_query_item {
  #define DRM_I915_QUERY_MEMORY_REGIONS        4
  #define DRM_I915_QUERY_HWCONFIG_BLOB        5
  #define DRM_I915_QUERY_GEOMETRY_SUBSLICES    6
+#define DRM_I915_QUERY_UC_FW_VERSION    7
  /* Must be kept compact -- no holes and well documented */

  /**
@@ -3213,6 +3215,36 @@ struct drm_i915_query_topology_info {
  __u8 data[];
  };

+/**
+* struct drm_i915_query_uc_fw_version - query a micro-controller 
firmware version

+*
+* Given a uc_type this will return the major, minor, patch and branch 
version

+* of the micro-controller firmware.
+*/
+struct drm_i915_query_uc_fw_version {
+    /** @uc: The micro-controller type to query firmware version */
+#define I915_QUERY_UC_TYPE_GUC 0
+    __u16 uc_type;
+
+    /** @pad: MBZ */
+    __u16 pad;
+
+    /* @major_ver: major uc fw 

Re: [Intel-gfx] [PATCH v4 3/3] drm/i915/gt: Timeout when waiting for idle in suspending

2023-09-27 Thread Tvrtko Ursulin



On 26/09/2023 20:05, Alan Previn wrote:

When suspending, add a timeout when calling
intel_gt_pm_wait_for_idle else if we have a lost
G2H event that holds a wakeref (which would be
indicative of a bug elsewhere in the driver),
driver will at least complete the suspend-resume
cycle, (albeit not hitting all the targets for
low power hw counters), instead of hanging in the kernel.

Signed-off-by: Alan Previn 
Reviewed-by: Rodrigo Vivi 
Tested-by: Mousumi Jana 
---
  drivers/gpu/drm/i915/gt/intel_engine_cs.c |  2 +-
  drivers/gpu/drm/i915/gt/intel_gt_pm.c |  6 +-
  drivers/gpu/drm/i915/gt/intel_gt_pm.h |  7 ++-
  drivers/gpu/drm/i915/intel_wakeref.c  | 14 ++
  drivers/gpu/drm/i915/intel_wakeref.h  |  6 --
  5 files changed, 26 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c 
b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
index 84a75c95f3f7..9c6151b78e1d 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
@@ -687,7 +687,7 @@ void intel_engines_release(struct intel_gt *gt)
if (!engine->release)
continue;
  
-		intel_wakeref_wait_for_idle(>wakeref);

+   intel_wakeref_wait_for_idle(>wakeref, 0);
GEM_BUG_ON(intel_engine_pm_is_awake(engine));
  
  		engine->release(engine);

diff --git a/drivers/gpu/drm/i915/gt/intel_gt_pm.c 
b/drivers/gpu/drm/i915/gt/intel_gt_pm.c
index 59b5658a17fb..820217c06dc7 100644
--- a/drivers/gpu/drm/i915/gt/intel_gt_pm.c
+++ b/drivers/gpu/drm/i915/gt/intel_gt_pm.c
@@ -289,6 +289,7 @@ int intel_gt_resume(struct intel_gt *gt)
  
  static void wait_for_suspend(struct intel_gt *gt)

  {
+   int timeout_ms = CONFIG_DRM_I915_MAX_REQUEST_BUSYWAIT ? : 1;


CONFIG_DRM_I915_MAX_REQUEST_BUSYWAIT is in ns so assigning it to _ms is 
a bit to arbitrary.


Why not the existing I915_GT_SUSPEND_IDLE_TIMEOUT for instance?


/*
 * On rare occasions, we've observed the fence completion trigger
 * free_engines asynchronously via rcu_call. Ensure those are done.
@@ -308,7 +309,10 @@ static void wait_for_suspend(struct intel_gt *gt)
intel_gt_retire_requests(gt);
}
  
-	intel_gt_pm_wait_for_idle(gt);

+   /* we are suspending, so we shouldn't be waiting forever */
+   if (intel_gt_pm_wait_timeout_for_idle(gt, timeout_ms) == -ETIMEDOUT)
+   gt_warn(gt, "bailing from %s after %d milisec timeout\n",
+   __func__, timeout_ms);


Does the timeout in intel_gt_pm_wait_timeout_for_idle always comes in 
pair with the timeout first in intel_gt_wait_for_idle?


Also, is the timeout here hit from the intel_gt_suspend_prepare, 
intel_gt_suspend_late, or can be both?


Main concern is that we need to be sure there are no possible 
ill-effects, like letting the GPU/GuC scribble on some memory we 
unmapped (or will unmap), having let the suspend continue after timing 
out, and not perhaps doing the forced wedge like wait_for_suspend() does 
on the existing timeout path.


Would it be possible to handle the lost G2H events directly in the 
respective component instead of here? Like apply the timeout during the 
step which explicitly idles the CT for suspend (presumably that 
exists?), and so cleanup from there once declared a lost event.


Regards,

Tvrtko


  }
  
  void intel_gt_suspend_prepare(struct intel_gt *gt)

diff --git a/drivers/gpu/drm/i915/gt/intel_gt_pm.h 
b/drivers/gpu/drm/i915/gt/intel_gt_pm.h
index 6c9a46452364..5358acc2b5b1 100644
--- a/drivers/gpu/drm/i915/gt/intel_gt_pm.h
+++ b/drivers/gpu/drm/i915/gt/intel_gt_pm.h
@@ -68,7 +68,12 @@ static inline void intel_gt_pm_might_put(struct intel_gt *gt)
  
  static inline int intel_gt_pm_wait_for_idle(struct intel_gt *gt)

  {
-   return intel_wakeref_wait_for_idle(>wakeref);
+   return intel_wakeref_wait_for_idle(>wakeref, 0);
+}
+
+static inline int intel_gt_pm_wait_timeout_for_idle(struct intel_gt *gt, int 
timeout_ms)
+{
+   return intel_wakeref_wait_for_idle(>wakeref, timeout_ms);
  }
  
  void intel_gt_pm_init_early(struct intel_gt *gt);

diff --git a/drivers/gpu/drm/i915/intel_wakeref.c 
b/drivers/gpu/drm/i915/intel_wakeref.c
index 718f2f1b6174..383a37521415 100644
--- a/drivers/gpu/drm/i915/intel_wakeref.c
+++ b/drivers/gpu/drm/i915/intel_wakeref.c
@@ -111,14 +111,20 @@ void __intel_wakeref_init(struct intel_wakeref *wf,
 "wakeref.work", >work, 0);
  }
  
-int intel_wakeref_wait_for_idle(struct intel_wakeref *wf)

+int intel_wakeref_wait_for_idle(struct intel_wakeref *wf, int timeout_ms)
  {
-   int err;
+   int err = 0;
  
  	might_sleep();
  
-	err = wait_var_event_killable(>wakeref,

- !intel_wakeref_is_active(wf));
+   if (!timeout_ms)
+   err = wait_var_event_killable(>wakeref,
+ !intel_wakeref_is_active(wf));
+   else if 

Re: [PATCH 5/6] drm/i915: Add stable memory region names

2023-09-26 Thread Tvrtko Ursulin



On 26/09/2023 16:29, Iddamsetty, Aravind wrote:

On 22-09-2023 19:16, Tvrtko Ursulin wrote:

From: Tvrtko Ursulin 

At the moment memory region names are a bit too varied and too
inconsistent to be used for ABI purposes, like for upcoming fdinfo
memory stats.

System memory can be either system or system-ttm. Local memory has the
instance number appended, others do not. Not only inconsistent but this
kind of implementation detail is uninteresting for intended users of
fdinfo memory stats.

Add a stable name always formed as $type$instance. Could have chosen a
different stable scheme, but I think any consistent and stable scheme
should do just fine.

Signed-off-by: Tvrtko Ursulin 
---
  drivers/gpu/drm/i915/intel_memory_region.c | 19 +++
  drivers/gpu/drm/i915/intel_memory_region.h |  1 +
  2 files changed, 20 insertions(+)

diff --git a/drivers/gpu/drm/i915/intel_memory_region.c 
b/drivers/gpu/drm/i915/intel_memory_region.c
index 3d1fdea9811d..60a03340bbd4 100644
--- a/drivers/gpu/drm/i915/intel_memory_region.c
+++ b/drivers/gpu/drm/i915/intel_memory_region.c
@@ -216,6 +216,22 @@ static int intel_memory_region_memtest(struct 
intel_memory_region *mem,
return err;
  }
  
+static const char *region_type_str(u16 type)

+{
+   switch (type) {
+   case INTEL_MEMORY_SYSTEM:
+   return "system";
+   case INTEL_MEMORY_LOCAL:
+   return "local";
+   case INTEL_MEMORY_STOLEN_LOCAL:
+   return "stolen-local";
+   case INTEL_MEMORY_STOLEN_SYSTEM:
+   return "stolen-system";
+   default:
+   return "unknown";
+   }
+}
+
  struct intel_memory_region *
  intel_memory_region_create(struct drm_i915_private *i915,
   resource_size_t start,
@@ -244,6 +260,9 @@ intel_memory_region_create(struct drm_i915_private *i915,
mem->type = type;
mem->instance = instance;
  
+	snprintf(mem->uabi_name, sizeof(mem->uabi_name), "%s%u",

+region_type_str(type), instance);
+
mutex_init(>objects.lock);
INIT_LIST_HEAD(>objects.list);
  
diff --git a/drivers/gpu/drm/i915/intel_memory_region.h b/drivers/gpu/drm/i915/intel_memory_region.h

index 2953ed5c3248..9ba36454e51b 100644
--- a/drivers/gpu/drm/i915/intel_memory_region.h
+++ b/drivers/gpu/drm/i915/intel_memory_region.h
@@ -80,6 +80,7 @@ struct intel_memory_region {
u16 instance;
enum intel_region_id id;
char name[16];
+   char uabi_name[16];


Just a thought instead of creating a new field, can't we derive this
with name and instance?


I'd rather not snprintf on every fdinfo read - for every pid and every 
drm fd versus 2-3 strings kept around.


I did briefly wonder if mr->name could be dropped, that is renamed to 
mr->uabi_name, but I guess there is some value to print the internal 
name in some log messages, to leave a trace of what underlying 
implementation is used. Although I am not too sure about the value of 
that either since it is implied from the kernel version.


Then on top the usage in i915_gem_create/repr_name I could replace with 
mr->uabi_name and simplify. If there is any value in printing the name 
there, versus just uabi type:instance integers. Dunno. All I know is 
fdinfo should have stable names and not confuse with implementation 
details so I need something..


Regards,

Tvrtko


Re: [Intel-gfx] [PATCH] drm/i915: Do not disable preemption for resets

2023-09-26 Thread Tvrtko Ursulin



On 26/09/2023 10:18, Andi Shyti wrote:

Hi Tvrtko,


Commit ade8a0f59844 ("drm/i915: Make all GPU resets atomic") added a
preempt disable section over the hardware reset callback to prepare the
driver for being able to reset from atomic contexts.

In retrospect I can see that the work item at a time was about removing
the struct mutex from the reset path. Code base also briefly entertained
the idea of doing the reset under stop_machine in order to serialize
userspace mmap and temporary glitch in the fence registers (see
eb8d0f5af4ec ("drm/i915: Remove GPU reset dependence on struct_mutex"),
but that never materialized and was soon removed in 2caffbf11762
("drm/i915: Revoke mmaps and prevent access to fence registers across
reset") and replaced with a SRCU based solution.

As such, as far as I can see, today we still have a requirement that
resets must not sleep (invoked from submission tasklets), but no need to
support invoking them from a truly atomic context.

Given that the preemption section is problematic on RT kernels, since the
uncore lock becomes a sleeping lock and so is invalid in such section,
lets try and remove it. Potential downside is that our short waits on GPU
to complete the reset may get extended if CPU scheduling interferes, but
in practice that probably isn't a deal breaker.

In terms of mechanics, since the preemption disabled block is being
removed we just need to replace a few of the wait_for_atomic macros into
busy looping versions which will work (and not complain) when called from
non-atomic sections.


looks reasonable, few unrelated questions


---
  drivers/gpu/drm/i915/gt/intel_reset.c | 12 +---
  1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c 
b/drivers/gpu/drm/i915/gt/intel_reset.c
index e2152f75ba2e..6916eba3bd33 100644
--- a/drivers/gpu/drm/i915/gt/intel_reset.c
+++ b/drivers/gpu/drm/i915/gt/intel_reset.c
@@ -167,13 +167,13 @@ static int i915_do_reset(struct intel_gt *gt,
/* Assert reset for at least 20 usec, and wait for acknowledgement. */


is this /20/50/ ?


Unrelated change but okay.




pci_write_config_byte(pdev, I915_GDRST, GRDOM_RESET_ENABLE);
udelay(50);
-   err = wait_for_atomic(i915_in_reset(pdev), 50);
+   err = _wait_for_atomic(i915_in_reset(pdev), 50, 0);


wait_for_atomic() waits in milliseconds, while _wait_for_atomic()
waits in microseconds, I think you need to update the timer.


Ah.. well spotted!


Do you think we might need a wait_for_atomic_preempt() macro?

err = wait_for_atomic_preempt(i915_in_reset(pdev), 50);


I don't see what it would do? _wait_for_atomic when ATOMIC == 0 already 
enables preemption. To allow passing in milliseconds? I fear one more 
macro would create more confusion.


Regards,

Tvrtko


[PATCH v2] drm/i915: Do not disable preemption for resets

2023-09-26 Thread Tvrtko Ursulin
From: Tvrtko Ursulin 

Commit ade8a0f59844 ("drm/i915: Make all GPU resets atomic") added a
preempt disable section over the hardware reset callback to prepare the
driver for being able to reset from atomic contexts.

In retrospect I can see that the work item at the time was about removing
the struct mutex from the reset path. Code base also briefly entertained
the idea of doing the reset under stop_machine in order to serialize
userspace mmap and temporary glitch in the fence registers (see
eb8d0f5af4ec ("drm/i915: Remove GPU reset dependence on struct_mutex"),
but that never materialized and was soon removed in 2caffbf11762
("drm/i915: Revoke mmaps and prevent access to fence registers across
reset") and replaced with a SRCU based solution.

As such, as far as I can see, today we still have a requirement that
resets must not sleep (invoked from submission tasklets), but no need to
support invoking them from a truly atomic context.

Given that the preemption section is problematic on RT kernels, since the
uncore lock becomes a sleeping lock and so is invalid in such section,
lets try and remove it. Potential downside is that our short waits on GPU
to complete the reset may get extended if CPU scheduling interferes, but
in practice that probably isn't a deal breaker.

In terms of mechanics, since the preemption disabled block is being
removed we just need to replace a few of the wait_for_atomic macros into
busy looping versions which will work (and not complain) when called from
non-atomic sections.

v2:
 * Fix timeouts which are now in us. (Andi)
 * Update one comment as a drive by. (Andi)

Signed-off-by: Tvrtko Ursulin 
Cc: Chris Wilson 
Cc: Paul Gortmaker 
Cc: Sebastian Andrzej Siewior 
Cc: Andi Shyti 
---
 drivers/gpu/drm/i915/gt/intel_reset.c | 14 ++
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c 
b/drivers/gpu/drm/i915/gt/intel_reset.c
index 98575d79c446..a21e939fdbf6 100644
--- a/drivers/gpu/drm/i915/gt/intel_reset.c
+++ b/drivers/gpu/drm/i915/gt/intel_reset.c
@@ -161,16 +161,16 @@ static int i915_do_reset(struct intel_gt *gt,
struct pci_dev *pdev = to_pci_dev(gt->i915->drm.dev);
int err;
 
-   /* Assert reset for at least 20 usec, and wait for acknowledgement. */
+   /* Assert reset for at least 50 usec, and wait for acknowledgement. */
pci_write_config_byte(pdev, I915_GDRST, GRDOM_RESET_ENABLE);
udelay(50);
-   err = wait_for_atomic(i915_in_reset(pdev), 50);
+   err = _wait_for_atomic(i915_in_reset(pdev), 5, 0);
 
/* Clear the reset request. */
pci_write_config_byte(pdev, I915_GDRST, 0);
udelay(50);
if (!err)
-   err = wait_for_atomic(!i915_in_reset(pdev), 50);
+   err = _wait_for_atomic(!i915_in_reset(pdev), 5, 0);
 
return err;
 }
@@ -190,7 +190,7 @@ static int g33_do_reset(struct intel_gt *gt,
struct pci_dev *pdev = to_pci_dev(gt->i915->drm.dev);
 
pci_write_config_byte(pdev, I915_GDRST, GRDOM_RESET_ENABLE);
-   return wait_for_atomic(g4x_reset_complete(pdev), 50);
+   return _wait_for_atomic(g4x_reset_complete(pdev), 5, 0);
 }
 
 static int g4x_do_reset(struct intel_gt *gt,
@@ -207,7 +207,7 @@ static int g4x_do_reset(struct intel_gt *gt,
 
pci_write_config_byte(pdev, I915_GDRST,
  GRDOM_MEDIA | GRDOM_RESET_ENABLE);
-   ret =  wait_for_atomic(g4x_reset_complete(pdev), 50);
+   ret =  _wait_for_atomic(g4x_reset_complete(pdev), 5, 0);
if (ret) {
GT_TRACE(gt, "Wait for media reset failed\n");
goto out;
@@ -215,7 +215,7 @@ static int g4x_do_reset(struct intel_gt *gt,
 
pci_write_config_byte(pdev, I915_GDRST,
  GRDOM_RENDER | GRDOM_RESET_ENABLE);
-   ret =  wait_for_atomic(g4x_reset_complete(pdev), 50);
+   ret =  _wait_for_atomic(g4x_reset_complete(pdev), 5, 0);
if (ret) {
GT_TRACE(gt, "Wait for render reset failed\n");
goto out;
@@ -785,9 +785,7 @@ int __intel_gt_reset(struct intel_gt *gt, 
intel_engine_mask_t engine_mask)
reset_mask = wa_14015076503_start(gt, engine_mask, !retry);
 
GT_TRACE(gt, "engine_mask=%x\n", reset_mask);
-   preempt_disable();
ret = reset(gt, reset_mask, retry);
-   preempt_enable();
 
wa_14015076503_end(gt, reset_mask);
}
-- 
2.39.2



Re: [Intel-gfx] [PATCH] drm/i915: Zap some empty lines

2023-09-25 Thread Tvrtko Ursulin



On 25/09/2023 15:14, Andi Shyti wrote:

Hi Tvrtko,

On Wed, Sep 20, 2023 at 09:57:15AM +0100, Tvrtko Ursulin wrote:

From: Tvrtko Ursulin 

Recent refactoring left an unsightly block of empty lines. Remove them.

Signed-off-by: Tvrtko Ursulin 
Cc: Dnyaneshwar Bhadane 
Cc: Anusha Srivatsa 
Cc: Radhakrishna Sripada 


as this isn't merged yet:

Reviewed-by: Andi Shyti 


Thanks, I am catching up with things and this wasn't so important. If 
you have a spare moment feel free to push it?


Regards,

Tvrtko


[RFC] drm/i915: Allow dmabuf mmap forwarding

2023-09-25 Thread Tvrtko Ursulin
From: Tvrtko Ursulin 

Allow mmap forwarding for imported buffers in order to allow minigbm mmap
to work on aperture-less platforms such as Meteorlake.

So far i915 did not allow mmap on imported buffers but from the minigbm
perspective that worked because the DRM_IOCTL_I915_GEM_MMAP_GTT fall-
back would then be attempted, and would be successful.

This stops working on Meteorlake since there is no aperture.

Allow i915 to mmap imported buffers using forwarding via dma_buf_mmap(),
which allows the primary minigbm path of DRM_IOCTL_I915_GEM_MMAP_OFFSET /
I915_MMAP_OFFSET_WB to work.

Signed-off-by: Tvrtko Ursulin 
Cc: Daniel Vetter 
Cc: Christian König 
Cc: Matthew Auld 
Cc: Nirmoy Das 
---
1)
It is unclear to me if any real userspace depends on this, but there are
certainly compliance suites which fail.

2)
It is also a bit unclear to me if dma_buf_mmap() is exactly intended for
this kind of use. It seems that it is, but I also found some old mailing
list discussions suggesting there might be some unresolved questions
around VMA revocation.

1 + 2 = RFC for now.

Daniel and Christian were involved in 2) in the past so comments would
be appreciated.

Test-with: 20230925131539.32743-1-tvrtko.ursu...@linux.intel.com

---
 drivers/gpu/drm/i915/gem/i915_gem_mman.c  | 78 +++
 .../gpu/drm/i915/gem/i915_gem_object_types.h  |  1 +
 2 files changed, 65 insertions(+), 14 deletions(-)

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_mman.c 
b/drivers/gpu/drm/i915/gem/i915_gem_mman.c
index aa4d842d4c5a..78c84c0a8b08 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_mman.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_mman.c
@@ -5,6 +5,7 @@
  */
 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -664,6 +665,7 @@ insert_mmo(struct drm_i915_gem_object *obj, struct 
i915_mmap_offset *mmo)
 static struct i915_mmap_offset *
 mmap_offset_attach(struct drm_i915_gem_object *obj,
   enum i915_mmap_type mmap_type,
+  bool forward_mmap,
   struct drm_file *file)
 {
struct drm_i915_private *i915 = to_i915(obj->base.dev);
@@ -682,6 +684,7 @@ mmap_offset_attach(struct drm_i915_gem_object *obj,
 
mmo->obj = obj;
mmo->mmap_type = mmap_type;
+   mmo->forward_mmap = forward_mmap;
drm_vma_node_reset(>vma_node);
 
err = drm_vma_offset_add(obj->base.dev->vma_offset_manager,
@@ -714,12 +717,25 @@ mmap_offset_attach(struct drm_i915_gem_object *obj,
return ERR_PTR(err);
 }
 
+static bool
+should_forward_mmap(struct drm_i915_gem_object *obj,
+   enum i915_mmap_type mmap_type)
+{
+   if (!obj->base.import_attach)
+   return false;
+
+   return mmap_type == I915_MMAP_TYPE_WB ||
+  mmap_type == I915_MMAP_TYPE_WC ||
+  mmap_type == I915_MMAP_TYPE_UC;
+}
+
 static int
 __assign_mmap_offset(struct drm_i915_gem_object *obj,
 enum i915_mmap_type mmap_type,
 u64 *offset, struct drm_file *file)
 {
struct i915_mmap_offset *mmo;
+   bool should_forward;
 
if (i915_gem_object_never_mmap(obj))
return -ENODEV;
@@ -735,12 +751,15 @@ __assign_mmap_offset(struct drm_i915_gem_object *obj,
if (mmap_type == I915_MMAP_TYPE_FIXED)
return -ENODEV;
 
+   should_forward = should_forward_mmap(obj, mmap_type);
+
if (mmap_type != I915_MMAP_TYPE_GTT &&
!i915_gem_object_has_struct_page(obj) &&
-   !i915_gem_object_has_iomem(obj))
+   !i915_gem_object_has_iomem(obj) &&
+   !should_forward)
return -ENODEV;
 
-   mmo = mmap_offset_attach(obj, mmap_type, file);
+   mmo = mmap_offset_attach(obj, mmap_type, should_forward, file);
if (IS_ERR(mmo))
return PTR_ERR(mmo);
 
@@ -936,6 +955,32 @@ static struct file *mmap_singleton(struct drm_i915_private 
*i915)
return file;
 }
 
+static void
+__vma_mmap_pgprot(struct vm_area_struct *vma, enum i915_mmap_type mmap_type)
+{
+   const pgprot_t pgprot =vm_get_page_prot(vma->vm_flags);
+
+   switch (mmap_type) {
+   case I915_MMAP_TYPE_WC:
+   vma->vm_page_prot = pgprot_writecombine(pgprot);
+   break;
+   case I915_MMAP_TYPE_FIXED:
+   GEM_WARN_ON(1);
+   fallthrough;
+   case I915_MMAP_TYPE_WB:
+   vma->vm_page_prot = pgprot;
+   break;
+   case I915_MMAP_TYPE_UC:
+   vma->vm_page_prot = pgprot_noncached(pgprot);
+   break;
+   case I915_MMAP_TYPE_GTT:
+   vma->vm_page_prot = pgprot_writecombine(pgprot);
+   break;
+   }
+
+   vma->vm_page_prot = pgprot_decrypted(vma->vm_page_prot);
+}
+
 static int
 i915_gem_object_mmap(struct drm_i915_gem_object *obj,
 struct i915_mmap_offset *mmo,
@@ -953,6 +998,20 @@ i915_gem_object_mma

Re: [PATCH v6 2/6] drm/panfrost: Add fdinfo support GPU load metrics

2023-09-25 Thread Tvrtko Ursulin



On 22/09/2023 16:23, Steven Price wrote:

On 22/09/2023 14:53, Tvrtko Ursulin wrote:


On 22/09/2023 11:57, Adrián Larumbe wrote:

On 20.09.2023 16:40, Tvrtko Ursulin wrote:

On 20/09/2023 00:34, Adrián Larumbe wrote:

The drm-stats fdinfo tags made available to user space are drm-engine,
drm-cycles, drm-max-freq and drm-curfreq, one per job slot.

This deviates from standard practice in other DRM drivers, where a
single
set of key:value pairs is provided for the whole render engine.
However,
Panfrost has separate queues for fragment and vertex/tiler jobs, so a
decision was made to calculate bus cycles and workload times
separately.

Maximum operating frequency is calculated at devfreq initialisation
time.
Current frequency is made available to user space because nvtop uses it
when performing engine usage calculations.

It is important to bear in mind that both GPU cycle and kernel time
numbers
provided are at best rough estimations, and always reported in
excess from
the actual figure because of two reasons:
    - Excess time because of the delay between the end of a job
processing,
  the subsequent job IRQ and the actual time of the sample.
    - Time spent in the engine queue waiting for the GPU to pick up
the next
  job.

To avoid race conditions during enablement/disabling, a reference
counting
mechanism was introduced, and a job flag that tells us whether a
given job
increased the refcount. This is necessary, because user space can
toggle
cycle counting through a debugfs file, and a given job might have
been in
flight by the time cycle counting was disabled.

The main goal of the debugfs cycle counter knob is letting tools
like nvtop
or IGT's gputop switch it at any time, to avoid power waste in case no
engine usage measuring is necessary.

Signed-off-by: Adrián Larumbe 
Reviewed-by: Boris Brezillon 
Reviewed-by: Steven Price 
---
    drivers/gpu/drm/panfrost/Makefile   |  2 +
    drivers/gpu/drm/panfrost/panfrost_debugfs.c | 20 
    drivers/gpu/drm/panfrost/panfrost_debugfs.h | 13 +
    drivers/gpu/drm/panfrost/panfrost_devfreq.c |  8 +++
    drivers/gpu/drm/panfrost/panfrost_devfreq.h |  3 ++
    drivers/gpu/drm/panfrost/panfrost_device.c  |  2 +
    drivers/gpu/drm/panfrost/panfrost_device.h  | 13 +
    drivers/gpu/drm/panfrost/panfrost_drv.c | 57
-
    drivers/gpu/drm/panfrost/panfrost_gpu.c | 41 +++
    drivers/gpu/drm/panfrost/panfrost_gpu.h |  4 ++
    drivers/gpu/drm/panfrost/panfrost_job.c | 24 +
    drivers/gpu/drm/panfrost/panfrost_job.h |  5 ++
    12 files changed, 191 insertions(+), 1 deletion(-)
    create mode 100644 drivers/gpu/drm/panfrost/panfrost_debugfs.c
    create mode 100644 drivers/gpu/drm/panfrost/panfrost_debugfs.h

diff --git a/drivers/gpu/drm/panfrost/Makefile
b/drivers/gpu/drm/panfrost/Makefile
index 7da2b3f02ed9..2c01c1e7523e 100644
--- a/drivers/gpu/drm/panfrost/Makefile
+++ b/drivers/gpu/drm/panfrost/Makefile
@@ -12,4 +12,6 @@ panfrost-y := \
    panfrost_perfcnt.o \
    panfrost_dump.o
+panfrost-$(CONFIG_DEBUG_FS) += panfrost_debugfs.o
+
    obj-$(CONFIG_DRM_PANFROST) += panfrost.o
diff --git a/drivers/gpu/drm/panfrost/panfrost_debugfs.c
b/drivers/gpu/drm/panfrost/panfrost_debugfs.c
new file mode 100644
index ..cc14eccba206
--- /dev/null
+++ b/drivers/gpu/drm/panfrost/panfrost_debugfs.c
@@ -0,0 +1,20 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright 2023 Collabora ltd. */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "panfrost_device.h"
+#include "panfrost_gpu.h"
+#include "panfrost_debugfs.h"
+
+void panfrost_debugfs_init(struct drm_minor *minor)
+{
+    struct drm_device *dev = minor->dev;
+    struct panfrost_device *pfdev =
platform_get_drvdata(to_platform_device(dev->dev));
+
+    debugfs_create_atomic_t("profile", 0600, minor->debugfs_root,
>profile_mode);
+}
diff --git a/drivers/gpu/drm/panfrost/panfrost_debugfs.h
b/drivers/gpu/drm/panfrost/panfrost_debugfs.h
new file mode 100644
index ..db1c158bcf2f
--- /dev/null
+++ b/drivers/gpu/drm/panfrost/panfrost_debugfs.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright 2023 Collabora ltd.
+ */
+
+#ifndef PANFROST_DEBUGFS_H
+#define PANFROST_DEBUGFS_H
+
+#ifdef CONFIG_DEBUG_FS
+void panfrost_debugfs_init(struct drm_minor *minor);
+#endif
+
+#endif  /* PANFROST_DEBUGFS_H */
diff --git a/drivers/gpu/drm/panfrost/panfrost_devfreq.c
b/drivers/gpu/drm/panfrost/panfrost_devfreq.c
index 58dfb15a8757..28caffc689e2 100644
--- a/drivers/gpu/drm/panfrost/panfrost_devfreq.c
+++ b/drivers/gpu/drm/panfrost/panfrost_devfreq.c
@@ -58,6 +58,7 @@ static int panfrost_devfreq_get_dev_status(struct
device *dev,
    spin_lock_irqsave(>lock, irqflags);
    panfrost_devfreq_update_utilization(pfdevfreq);
+    pfdevfreq->current_frequency = status->current_frequency;
    st

Re: [Intel-gfx] [PATCH 2/3] drm/i915/mtl: Add a PMU counter for total active ticks

2023-09-25 Thread Tvrtko Ursulin



On 22/09/2023 23:25, john.c.harri...@intel.com wrote:

From: Umesh Nerlige Ramappa 

Current engine busyness interface exposed by GuC has a few issues:

- The busyness of active engine is calculated using 2 values provided by
   GuC and is prone to race between CPU reading those values and GuC
   updating them. Any sort of HW synchronization would be at the cost of
   scheduling latencies.

- GuC provides only 32 bit values for busyness and KMD has to run a
   worker to extend the values to 64 bit. In addition KMD also needs to
   extend the GT timestamp to 64 bits so that it can be used to calculate
   active busyness for an engine.

To address these issues, GuC provides a new interface to calculate
engine busyness. GuC accumulates the busyness ticks in a 64 bit value
and also internally updates the busyness for an active context using a
periodic timer. This simplifies the KMD implementation such that KMD
only needs to relay the busyness value to the user.

In addition to fixing the interface, GuC also provides a periodically
total active ticks that the GT has been running for. This counter is
exposed to the user so that the % busyness can be calculated as follows:

busyness % = (engine active ticks/total active ticks) * 100.


AFAIU I915_PMU_TOTAL_ACTIVE_TICKS only runs when GT is awake, right?

So if the GT is awake 10% of the time, and the engine is busy 100% of that
time, which is 10% of the real/wall time, the busyness by this formula
comes up as 100%. Which wouldn't be useful for intel_gpu_top and alike.


How to scale it back to wall time? Again AFAIU there is no info about 
tick frequency, so how does one know what a delta in total active ticks 
means?


Going back on the higher level, I am not convinced we need to add a new 
uapi just for MTL. If the tick period is known internally we could just 
use v2 internally and expose the current uapi using it.


Any timebase conversion error is unlikely to be relevant because 
userspace only looks at deltas over relatively short periods (seconds). 
Ie. I don't think that the clock drift error would accumulate so it 
would need to be really huge to be relevant over short sampling periods.


Regards,

Tvrtko



Implement the new interface and start by adding a new counter for total
active ticks.

Signed-off-by: Umesh Nerlige Ramappa 
Signed-off-by: John Harrison 
---
  .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 24 +++
  .../gpu/drm/i915/gt/uc/intel_guc_submission.h |  1 +
  drivers/gpu/drm/i915/i915_pmu.c   |  6 +
  include/uapi/drm/i915_drm.h   |  2 ++
  4 files changed, 33 insertions(+)

diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c 
b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
index 88465d701c278..0c1fee5360777 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
@@ -1607,6 +1607,30 @@ static ktime_t busy_v2_guc_engine_busyness(struct 
intel_engine_cs *engine, ktime
return ns_to_ktime(total);
  }
  
+static u64 busy_v1_intel_guc_total_active_ticks(struct intel_guc *guc)

+{
+   return guc->busy.v1.gt_stamp;
+}
+
+static u64 busy_v2_intel_guc_total_active_ticks(struct intel_guc *guc)
+{
+   u64 ticks_gt;
+
+   __busy_v2_get_engine_usage_record(guc, NULL, NULL, NULL, _gt);
+
+   return ticks_gt;
+}
+
+u64 intel_guc_total_active_ticks(struct intel_gt *gt)
+{
+   struct intel_guc *guc = >uc.guc;
+
+   if (GUC_SUBMIT_VER(guc) < MAKE_GUC_VER(1, 3, 1))
+   return busy_v1_intel_guc_total_active_ticks(guc);
+   else
+   return busy_v2_intel_guc_total_active_ticks(guc);
+}
+
  static int busy_v2_guc_action_enable_usage_stats_device(struct intel_guc *guc)
  {
u32 offset = guc_engine_usage_offset_v2_device(guc);
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.h 
b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.h
index c57b29cdb1a64..f6d42838825f2 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.h
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.h
@@ -30,6 +30,7 @@ void intel_guc_dump_active_requests(struct intel_engine_cs 
*engine,
struct drm_printer *m);
  void intel_guc_busyness_park(struct intel_gt *gt);
  void intel_guc_busyness_unpark(struct intel_gt *gt);
+u64 intel_guc_total_active_ticks(struct intel_gt *gt);
  
  bool intel_guc_virtual_engine_has_heartbeat(const struct intel_engine_cs *ve);
  
diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c

index d35973b411863..4f52636eb4a80 100644
--- a/drivers/gpu/drm/i915/i915_pmu.c
+++ b/drivers/gpu/drm/i915/i915_pmu.c
@@ -563,6 +563,8 @@ config_status(struct drm_i915_private *i915, u64 config)
break;
case I915_PMU_SOFTWARE_GT_AWAKE_TIME:
break;
+   case I915_PMU_TOTAL_ACTIVE_TICKS:
+   break;
default:
return -ENOENT;
}
@@ 

Re: [Intel-gfx] [PATCH 3/3] drm/i915/mtl: Add counters for engine busyness ticks

2023-09-25 Thread Tvrtko Ursulin



On 22/09/2023 23:25, john.c.harri...@intel.com wrote:

From: Umesh Nerlige Ramappa 

In new version of GuC engine busyness, GuC provides engine busyness
ticks as a 64 bit counter. Add a new counter to relay this value to the
user as is.

Signed-off-by: Umesh Nerlige Ramappa 
Signed-off-by: John Harrison 
---
  drivers/gpu/drm/i915/gt/intel_engine.h|  1 +
  drivers/gpu/drm/i915/gt/intel_engine_cs.c | 16 +
  drivers/gpu/drm/i915/gt/intel_engine_types.h  | 12 
  drivers/gpu/drm/i915/gt/intel_engine_user.c   |  1 +
  .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 67 ++-
  drivers/gpu/drm/i915/i915_pmu.c   | 25 ++-
  drivers/gpu/drm/i915/i915_pmu.h   |  2 +-
  include/uapi/drm/i915_drm.h   | 13 +++-
  8 files changed, 116 insertions(+), 21 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_engine.h 
b/drivers/gpu/drm/i915/gt/intel_engine.h
index b58c30ac8ef02..57af7ec8ecd82 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine.h
+++ b/drivers/gpu/drm/i915/gt/intel_engine.h
@@ -249,6 +249,7 @@ void intel_engine_dump_active_requests(struct list_head 
*requests,
  
  ktime_t intel_engine_get_busy_time(struct intel_engine_cs *engine,

   ktime_t *now);
+u64 intel_engine_get_busy_ticks(struct intel_engine_cs *engine);
  
  void intel_engine_get_hung_entity(struct intel_engine_cs *engine,

  struct intel_context **ce, struct 
i915_request **rq);
diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c 
b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
index 84a75c95f3f7d..1c9ffb1ae9889 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
@@ -2426,6 +2426,22 @@ ktime_t intel_engine_get_busy_time(struct 
intel_engine_cs *engine, ktime_t *now)
return engine->busyness(engine, now);
  }
  
+/**

+ * intel_engine_get_busy_ticks() - Return current accumulated engine busyness
+ * ticks
+ * @engine: engine to report on
+ *
+ * Returns accumulated ticks @engine was busy since engine stats were enabled.
+ */
+u64 intel_engine_get_busy_ticks(struct intel_engine_cs *engine)
+{
+   if (!engine->busyness_ticks ||
+   !(engine->flags & I915_ENGINE_SUPPORTS_TICKS_STATS))
+   return 0;
+
+   return engine->busyness_ticks(engine);
+}
+
  struct intel_context *
  intel_engine_create_virtual(struct intel_engine_cs **siblings,
unsigned int count, unsigned long flags)
diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h 
b/drivers/gpu/drm/i915/gt/intel_engine_types.h
index 40fd8f984d64b..a88d40c74d604 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_types.h
+++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h
@@ -548,6 +548,11 @@ struct intel_engine_cs {
ktime_t (*busyness)(struct intel_engine_cs *engine,
ktime_t *now);
  
+	/*

+* Get engine busyness ticks
+*/
+   u64 (*busyness_ticks)(struct intel_engine_cs *engine);
+
struct intel_engine_execlists execlists;
  
  	/*

@@ -574,6 +579,7 @@ struct intel_engine_cs {
  #define I915_ENGINE_HAS_EU_PRIORITYBIT(10)
  #define I915_ENGINE_FIRST_RENDER_COMPUTE BIT(11)
  #define I915_ENGINE_USES_WA_HOLD_CCS_SWITCHOUT BIT(12)
+#define I915_ENGINE_SUPPORTS_TICKS_STATS   BIT(13)
unsigned int flags;
  
  	/*

@@ -649,6 +655,12 @@ intel_engine_supports_stats(const struct intel_engine_cs 
*engine)
return engine->flags & I915_ENGINE_SUPPORTS_STATS;
  }
  
+static inline bool

+intel_engine_supports_tick_stats(const struct intel_engine_cs *engine)
+{
+   return engine->flags & I915_ENGINE_SUPPORTS_TICKS_STATS;
+}
+
  static inline bool
  intel_engine_has_preemption(const struct intel_engine_cs *engine)
  {
diff --git a/drivers/gpu/drm/i915/gt/intel_engine_user.c 
b/drivers/gpu/drm/i915/gt/intel_engine_user.c
index dcedff41a825f..69eb610b5ab0a 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_user.c
+++ b/drivers/gpu/drm/i915/gt/intel_engine_user.c
@@ -100,6 +100,7 @@ static void set_scheduler_caps(struct drm_i915_private 
*i915)
MAP(HAS_PREEMPTION, PREEMPTION),
MAP(HAS_SEMAPHORES, SEMAPHORES),
MAP(SUPPORTS_STATS, ENGINE_BUSY_STATS),
+   MAP(SUPPORTS_TICKS_STATS, ENGINE_BUSY_TICKS_STATS),
  #undef MAP
};
struct intel_engine_cs *engine;
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c 
b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
index 0c1fee5360777..71749fb9ad35b 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
@@ -1289,12 +1289,7 @@ static void busy_v1_guc_update_pm_timestamp(struct 
intel_guc *guc, ktime_t *now)
guc->busy.v1.gt_stamp = ((u64)gt_stamp_hi << 32) | gt_stamp_lo;
  }
  
-/*

- * Unlike the execlist mode of submission total and active times are in terms 

Re: [PATCH v6 6/6] drm/drm-file: Show finer-grained BO sizes in drm_show_memory_stats

2023-09-22 Thread Tvrtko Ursulin



On 22/09/2023 12:03, Adrián Larumbe wrote:

On 21.09.2023 11:14, Tvrtko Ursulin wrote:


On 20/09/2023 16:32, Tvrtko Ursulin wrote:


On 20/09/2023 00:34, Adrián Larumbe wrote:

The current implementation will try to pick the highest available size
display unit as soon as the BO size exceeds that of the previous
multiplier. That can lead to loss of precision in contexts of low memory
usage.

The new selection criteria try to preserve precision, whilst also
increasing the display unit selection threshold to render more accurate
values.

Signed-off-by: Adrián Larumbe 
Reviewed-by: Boris Brezillon 
Reviewed-by: Steven Price 
---
   drivers/gpu/drm/drm_file.c | 5 -
   1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/drm_file.c b/drivers/gpu/drm/drm_file.c
index 762965e3d503..34cfa128ffe5 100644
--- a/drivers/gpu/drm/drm_file.c
+++ b/drivers/gpu/drm/drm_file.c
@@ -872,6 +872,8 @@ void drm_send_event(struct drm_device *dev, struct
drm_pending_event *e)
   }
   EXPORT_SYMBOL(drm_send_event);
+#define UPPER_UNIT_THRESHOLD 100
+
   static void print_size(struct drm_printer *p, const char *stat,
  const char *region, u64 sz)
   {
@@ -879,7 +881,8 @@ static void print_size(struct drm_printer *p,
const char *stat,
   unsigned u;
   for (u = 0; u < ARRAY_SIZE(units) - 1; u++) {
-    if (sz < SZ_1K)
+    if ((sz & (SZ_1K - 1)) &&


IS_ALIGNED worth it at all?


+    sz < UPPER_UNIT_THRESHOLD * SZ_1K)
   break;


Excuse me for a late comment (I was away). I did not get what what is
special about a ~10% threshold? Sounds to me just going with the lower
unit, when size is not aligned to the higher one, would be better than
sometimes precision-sometimes-not.


FWIW both current and the threshold option make testing the feature very
annoying.


How so?


I have to build in the knowledge of implementation details of 
print_size() into my IGT in order to use the right size BOs, so test is 
able to verify stats move as expected. It just feels wrong.



So I'd really propose we simply use smaller unit when unaligned.


Like I said in the previous reply, for drm files whose overall BO size sum is 
enormous
but not a multiple of a MiB, this would render huge number representations in 
KiB.
I don't find this particularly comfortable to read, and then this extra 
precision
would mean nothing to nvtop or gputop, which would have to scale the size to 
their
available screen dimensions when plotting them.


I don't think numbers in KiB are so huge.

And I don't think people will end up reading them manually a lot anyway, 
since you have to hunt down the pid, the fd, etc. It is much more realistic 
that some tool like gputop will be used.


And I don't think consistency of units across drivers or whatever 
matters. Even better to keep userspace parsers on their toes and make 
them follow drm-usage-stats.rst and not any implementations, at some 
point in time.


Regards,

Tvrtko


Re: [PATCH v6 2/6] drm/panfrost: Add fdinfo support GPU load metrics

2023-09-22 Thread Tvrtko Ursulin



On 22/09/2023 11:57, Adrián Larumbe wrote:

On 20.09.2023 16:40, Tvrtko Ursulin wrote:

On 20/09/2023 00:34, Adrián Larumbe wrote:

The drm-stats fdinfo tags made available to user space are drm-engine,
drm-cycles, drm-max-freq and drm-curfreq, one per job slot.

This deviates from standard practice in other DRM drivers, where a single
set of key:value pairs is provided for the whole render engine. However,
Panfrost has separate queues for fragment and vertex/tiler jobs, so a
decision was made to calculate bus cycles and workload times separately.

Maximum operating frequency is calculated at devfreq initialisation time.
Current frequency is made available to user space because nvtop uses it
when performing engine usage calculations.

It is important to bear in mind that both GPU cycle and kernel time numbers
provided are at best rough estimations, and always reported in excess from
the actual figure because of two reasons:
   - Excess time because of the delay between the end of a job processing,
 the subsequent job IRQ and the actual time of the sample.
   - Time spent in the engine queue waiting for the GPU to pick up the next
 job.

To avoid race conditions during enablement/disabling, a reference counting
mechanism was introduced, and a job flag that tells us whether a given job
increased the refcount. This is necessary, because user space can toggle
cycle counting through a debugfs file, and a given job might have been in
flight by the time cycle counting was disabled.

The main goal of the debugfs cycle counter knob is letting tools like nvtop
or IGT's gputop switch it at any time, to avoid power waste in case no
engine usage measuring is necessary.

Signed-off-by: Adrián Larumbe 
Reviewed-by: Boris Brezillon 
Reviewed-by: Steven Price 
---
   drivers/gpu/drm/panfrost/Makefile   |  2 +
   drivers/gpu/drm/panfrost/panfrost_debugfs.c | 20 
   drivers/gpu/drm/panfrost/panfrost_debugfs.h | 13 +
   drivers/gpu/drm/panfrost/panfrost_devfreq.c |  8 +++
   drivers/gpu/drm/panfrost/panfrost_devfreq.h |  3 ++
   drivers/gpu/drm/panfrost/panfrost_device.c  |  2 +
   drivers/gpu/drm/panfrost/panfrost_device.h  | 13 +
   drivers/gpu/drm/panfrost/panfrost_drv.c | 57 -
   drivers/gpu/drm/panfrost/panfrost_gpu.c | 41 +++
   drivers/gpu/drm/panfrost/panfrost_gpu.h |  4 ++
   drivers/gpu/drm/panfrost/panfrost_job.c | 24 +
   drivers/gpu/drm/panfrost/panfrost_job.h |  5 ++
   12 files changed, 191 insertions(+), 1 deletion(-)
   create mode 100644 drivers/gpu/drm/panfrost/panfrost_debugfs.c
   create mode 100644 drivers/gpu/drm/panfrost/panfrost_debugfs.h

diff --git a/drivers/gpu/drm/panfrost/Makefile 
b/drivers/gpu/drm/panfrost/Makefile
index 7da2b3f02ed9..2c01c1e7523e 100644
--- a/drivers/gpu/drm/panfrost/Makefile
+++ b/drivers/gpu/drm/panfrost/Makefile
@@ -12,4 +12,6 @@ panfrost-y := \
panfrost_perfcnt.o \
panfrost_dump.o
+panfrost-$(CONFIG_DEBUG_FS) += panfrost_debugfs.o
+
   obj-$(CONFIG_DRM_PANFROST) += panfrost.o
diff --git a/drivers/gpu/drm/panfrost/panfrost_debugfs.c 
b/drivers/gpu/drm/panfrost/panfrost_debugfs.c
new file mode 100644
index ..cc14eccba206
--- /dev/null
+++ b/drivers/gpu/drm/panfrost/panfrost_debugfs.c
@@ -0,0 +1,20 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright 2023 Collabora ltd. */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "panfrost_device.h"
+#include "panfrost_gpu.h"
+#include "panfrost_debugfs.h"
+
+void panfrost_debugfs_init(struct drm_minor *minor)
+{
+   struct drm_device *dev = minor->dev;
+   struct panfrost_device *pfdev = 
platform_get_drvdata(to_platform_device(dev->dev));
+
+   debugfs_create_atomic_t("profile", 0600, minor->debugfs_root, 
>profile_mode);
+}
diff --git a/drivers/gpu/drm/panfrost/panfrost_debugfs.h 
b/drivers/gpu/drm/panfrost/panfrost_debugfs.h
new file mode 100644
index ..db1c158bcf2f
--- /dev/null
+++ b/drivers/gpu/drm/panfrost/panfrost_debugfs.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright 2023 Collabora ltd.
+ */
+
+#ifndef PANFROST_DEBUGFS_H
+#define PANFROST_DEBUGFS_H
+
+#ifdef CONFIG_DEBUG_FS
+void panfrost_debugfs_init(struct drm_minor *minor);
+#endif
+
+#endif  /* PANFROST_DEBUGFS_H */
diff --git a/drivers/gpu/drm/panfrost/panfrost_devfreq.c 
b/drivers/gpu/drm/panfrost/panfrost_devfreq.c
index 58dfb15a8757..28caffc689e2 100644
--- a/drivers/gpu/drm/panfrost/panfrost_devfreq.c
+++ b/drivers/gpu/drm/panfrost/panfrost_devfreq.c
@@ -58,6 +58,7 @@ static int panfrost_devfreq_get_dev_status(struct device *dev,
spin_lock_irqsave(>lock, irqflags);
panfrost_devfreq_update_utilization(pfdevfreq);
+   pfdevfreq->current_frequency = status->current_frequency;
status->total_time = ktime_to_ns(ktime_add(pfdevfreq->busy_time,
   

[PATCH 6/6] drm/i915: Implement fdinfo memory stats printing

2023-09-22 Thread Tvrtko Ursulin
From: Tvrtko Ursulin 

Use the newly added drm_print_memory_stats helper to show memory
utilisation of our objects in drm/driver specific fdinfo output.

To collect the stats we walk the per memory regions object lists
and accumulate object size into the respective drm_memory_stats
categories.

v2:
 * Only account against the active region.
 * Use DMA_RESV_USAGE_BOOKKEEP when testing for active. (Tejas)

v3:
 * Update commit text. (Aravind)
 * Update to use memory regions uabi names.

Signed-off-by: Tvrtko Ursulin 
Cc: Aravind Iddamsetty 
Cc: Rob Clark 
Cc: Andi Shyti 
Cc: Tejas Upadhyay 
Reviewed-by: Andi Shyti  # v1
Reviewed-by: Aravind Iddamsetty  # v2
---
 drivers/gpu/drm/i915/i915_drm_client.c | 64 ++
 1 file changed, 64 insertions(+)

diff --git a/drivers/gpu/drm/i915/i915_drm_client.c 
b/drivers/gpu/drm/i915/i915_drm_client.c
index a61356012df8..7efffdaa508d 100644
--- a/drivers/gpu/drm/i915/i915_drm_client.c
+++ b/drivers/gpu/drm/i915/i915_drm_client.c
@@ -45,6 +45,68 @@ void __i915_drm_client_free(struct kref *kref)
 }
 
 #ifdef CONFIG_PROC_FS
+static void
+obj_meminfo(struct drm_i915_gem_object *obj,
+   struct drm_memory_stats stats[INTEL_REGION_UNKNOWN])
+{
+   const enum intel_region_id id = obj->mm.region ?
+   obj->mm.region->id : INTEL_REGION_SMEM;
+   const u64 sz = obj->base.size;
+
+   if (obj->base.handle_count > 1)
+   stats[id].shared += sz;
+   else
+   stats[id].private += sz;
+
+   if (i915_gem_object_has_pages(obj)) {
+   stats[id].resident += sz;
+
+   if (!dma_resv_test_signaled(obj->base.resv,
+   DMA_RESV_USAGE_BOOKKEEP))
+   stats[id].active += sz;
+   else if (i915_gem_object_is_shrinkable(obj) &&
+obj->mm.madv == I915_MADV_DONTNEED)
+   stats[id].purgeable += sz;
+   }
+}
+
+static void show_meminfo(struct drm_printer *p, struct drm_file *file)
+{
+   struct drm_memory_stats stats[INTEL_REGION_UNKNOWN] = {};
+   struct drm_i915_file_private *fpriv = file->driver_priv;
+   struct i915_drm_client *client = fpriv->client;
+   struct drm_i915_private *i915 = fpriv->i915;
+   struct drm_i915_gem_object *obj;
+   struct intel_memory_region *mr;
+   struct list_head *pos;
+   unsigned int id;
+
+   /* Public objects. */
+   spin_lock(>table_lock);
+   idr_for_each_entry(>object_idr, obj, id)
+   obj_meminfo(obj, stats);
+   spin_unlock(>table_lock);
+
+   /* Internal objects. */
+   rcu_read_lock();
+   list_for_each_rcu(pos, >objects_list) {
+   obj = i915_gem_object_get_rcu(list_entry(pos, typeof(*obj),
+client_link));
+   if (!obj)
+   continue;
+   obj_meminfo(obj, stats);
+   i915_gem_object_put(obj);
+   }
+   rcu_read_unlock();
+
+   for_each_memory_region(mr, i915, id)
+   drm_print_memory_stats(p,
+  [id],
+  DRM_GEM_OBJECT_RESIDENT |
+  DRM_GEM_OBJECT_PURGEABLE,
+  mr->uabi_name);
+}
+
 static const char * const uabi_class_names[] = {
[I915_ENGINE_CLASS_RENDER] = "render",
[I915_ENGINE_CLASS_COPY] = "copy",
@@ -106,6 +168,8 @@ void i915_drm_client_fdinfo(struct drm_printer *p, struct 
drm_file *file)
 * **
 */
 
+   show_meminfo(p, file);
+
if (GRAPHICS_VER(i915) < 8)
return;
 
-- 
2.39.2



[PATCH 2/6] drm/i915: Record which client owns a VM

2023-09-22 Thread Tvrtko Ursulin
From: Tvrtko Ursulin 

To enable accounting of indirect client memory usage (such as page tables)
in the following patch, lets start recording the creator of each PPGTT.

Signed-off-by: Tvrtko Ursulin 
Reviewed-by: Aravind Iddamsetty 
---
 drivers/gpu/drm/i915/gem/i915_gem_context.c   | 11 ---
 drivers/gpu/drm/i915/gem/i915_gem_context_types.h |  3 +++
 drivers/gpu/drm/i915/gem/selftests/mock_context.c |  4 ++--
 drivers/gpu/drm/i915/gt/intel_gtt.h   |  1 +
 4 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c 
b/drivers/gpu/drm/i915/gem/i915_gem_context.c
index 9a9ff84c90d7..35cf6608180e 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c
@@ -279,7 +279,8 @@ static int proto_context_set_protected(struct 
drm_i915_private *i915,
 }
 
 static struct i915_gem_proto_context *
-proto_context_create(struct drm_i915_private *i915, unsigned int flags)
+proto_context_create(struct drm_i915_file_private *fpriv,
+struct drm_i915_private *i915, unsigned int flags)
 {
struct i915_gem_proto_context *pc, *err;
 
@@ -287,6 +288,7 @@ proto_context_create(struct drm_i915_private *i915, 
unsigned int flags)
if (!pc)
return ERR_PTR(-ENOMEM);
 
+   pc->fpriv = fpriv;
pc->num_user_engines = -1;
pc->user_engines = NULL;
pc->user_flags = BIT(UCONTEXT_BANNABLE) |
@@ -1621,6 +1623,7 @@ i915_gem_create_context(struct drm_i915_private *i915,
err = PTR_ERR(ppgtt);
goto err_ctx;
}
+   ppgtt->vm.fpriv = pc->fpriv;
	vm = &ppgtt->vm;
}
if (vm)
@@ -1740,7 +1743,7 @@ int i915_gem_context_open(struct drm_i915_private *i915,
/* 0 reserved for invalid/unassigned ppgtt */
	xa_init_flags(&file_priv->vm_xa, XA_FLAGS_ALLOC1);
 
-   pc = proto_context_create(i915, 0);
+   pc = proto_context_create(file_priv, i915, 0);
if (IS_ERR(pc)) {
err = PTR_ERR(pc);
goto err;
@@ -1822,6 +1825,7 @@ int i915_gem_vm_create_ioctl(struct drm_device *dev, void 
*data,
 
GEM_BUG_ON(id == 0); /* reserved for invalid/unassigned ppgtt */
args->vm_id = id;
+   ppgtt->vm.fpriv = file_priv;
return 0;
 
 err_put:
@@ -2284,7 +2288,8 @@ int i915_gem_context_create_ioctl(struct drm_device *dev, 
void *data,
return -EIO;
}
 
-   ext_data.pc = proto_context_create(i915, args->flags);
+   ext_data.pc = proto_context_create(file->driver_priv, i915,
+  args->flags);
if (IS_ERR(ext_data.pc))
return PTR_ERR(ext_data.pc);
 
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context_types.h 
b/drivers/gpu/drm/i915/gem/i915_gem_context_types.h
index cb78214a7dcd..c573c067779f 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_context_types.h
+++ b/drivers/gpu/drm/i915/gem/i915_gem_context_types.h
@@ -188,6 +188,9 @@ struct i915_gem_proto_engine {
  * CONTEXT_CREATE_SET_PARAM during GEM_CONTEXT_CREATE.
  */
 struct i915_gem_proto_context {
+   /** @fpriv: Client which creates the context */
+   struct drm_i915_file_private *fpriv;
+
/** @vm: See _gem_context.vm */
struct i915_address_space *vm;
 
diff --git a/drivers/gpu/drm/i915/gem/selftests/mock_context.c 
b/drivers/gpu/drm/i915/gem/selftests/mock_context.c
index 8ac6726ec16b..125584ada282 100644
--- a/drivers/gpu/drm/i915/gem/selftests/mock_context.c
+++ b/drivers/gpu/drm/i915/gem/selftests/mock_context.c
@@ -83,7 +83,7 @@ live_context(struct drm_i915_private *i915, struct file *file)
int err;
u32 id;
 
-   pc = proto_context_create(i915, 0);
+   pc = proto_context_create(fpriv, i915, 0);
if (IS_ERR(pc))
return ERR_CAST(pc);
 
@@ -152,7 +152,7 @@ kernel_context(struct drm_i915_private *i915,
struct i915_gem_context *ctx;
struct i915_gem_proto_context *pc;
 
-   pc = proto_context_create(i915, 0);
+   pc = proto_context_create(NULL, i915, 0);
if (IS_ERR(pc))
return ERR_CAST(pc);
 
diff --git a/drivers/gpu/drm/i915/gt/intel_gtt.h 
b/drivers/gpu/drm/i915/gt/intel_gtt.h
index 346ec8ec2edd..8cf62f5134a9 100644
--- a/drivers/gpu/drm/i915/gt/intel_gtt.h
+++ b/drivers/gpu/drm/i915/gt/intel_gtt.h
@@ -248,6 +248,7 @@ struct i915_address_space {
struct drm_mm mm;
struct intel_gt *gt;
struct drm_i915_private *i915;
+   struct drm_i915_file_private *fpriv;
struct device *dma;
u64 total;  /* size addr space maps (ex. 2GB for ggtt) */
u64 reserved;   /* size addr space reserved */
-- 
2.39.2



[PATCH 5/6] drm/i915: Add stable memory region names

2023-09-22 Thread Tvrtko Ursulin
From: Tvrtko Ursulin 

At the moment memory region names are a bit too varied and too
inconsistent to be used for ABI purposes, like for upcoming fdinfo
memory stats.

System memory can be either system or system-ttm. Local memory has the
instance number appended, others do not. Not only inconsistent, but this
kind of implementation detail is uninteresting for intended users of
fdinfo memory stats.

Add a stable name always formed as $type$instance. Could have chosen a
different stable scheme, but I think any consistent and stable scheme
should do just fine.

Signed-off-by: Tvrtko Ursulin 
---
 drivers/gpu/drm/i915/intel_memory_region.c | 19 +++
 drivers/gpu/drm/i915/intel_memory_region.h |  1 +
 2 files changed, 20 insertions(+)

diff --git a/drivers/gpu/drm/i915/intel_memory_region.c 
b/drivers/gpu/drm/i915/intel_memory_region.c
index 3d1fdea9811d..60a03340bbd4 100644
--- a/drivers/gpu/drm/i915/intel_memory_region.c
+++ b/drivers/gpu/drm/i915/intel_memory_region.c
@@ -216,6 +216,22 @@ static int intel_memory_region_memtest(struct 
intel_memory_region *mem,
return err;
 }
 
+static const char *region_type_str(u16 type)
+{
+   switch (type) {
+   case INTEL_MEMORY_SYSTEM:
+   return "system";
+   case INTEL_MEMORY_LOCAL:
+   return "local";
+   case INTEL_MEMORY_STOLEN_LOCAL:
+   return "stolen-local";
+   case INTEL_MEMORY_STOLEN_SYSTEM:
+   return "stolen-system";
+   default:
+   return "unknown";
+   }
+}
+
 struct intel_memory_region *
 intel_memory_region_create(struct drm_i915_private *i915,
   resource_size_t start,
@@ -244,6 +260,9 @@ intel_memory_region_create(struct drm_i915_private *i915,
mem->type = type;
mem->instance = instance;
 
+   snprintf(mem->uabi_name, sizeof(mem->uabi_name), "%s%u",
+region_type_str(type), instance);
+
	mutex_init(&mem->objects.lock);
	INIT_LIST_HEAD(&mem->objects.list);
 
diff --git a/drivers/gpu/drm/i915/intel_memory_region.h 
b/drivers/gpu/drm/i915/intel_memory_region.h
index 2953ed5c3248..9ba36454e51b 100644
--- a/drivers/gpu/drm/i915/intel_memory_region.h
+++ b/drivers/gpu/drm/i915/intel_memory_region.h
@@ -80,6 +80,7 @@ struct intel_memory_region {
u16 instance;
enum intel_region_id id;
char name[16];
+   char uabi_name[16];
bool private; /* not for userspace */
 
struct {
-- 
2.39.2



[PATCH 3/6] drm/i915: Track page table backing store usage

2023-09-22 Thread Tvrtko Ursulin
From: Tvrtko Ursulin 

Account page table backing store against the owning client memory usage
stats.

Signed-off-by: Tvrtko Ursulin 
Reviewed-by: Aravind Iddamsetty 
---
 drivers/gpu/drm/i915/gt/intel_gtt.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/drivers/gpu/drm/i915/gt/intel_gtt.c 
b/drivers/gpu/drm/i915/gt/intel_gtt.c
index 13944a14ea2d..c3f2b379 100644
--- a/drivers/gpu/drm/i915/gt/intel_gtt.c
+++ b/drivers/gpu/drm/i915/gt/intel_gtt.c
@@ -58,6 +58,9 @@ struct drm_i915_gem_object *alloc_pt_lmem(struct 
i915_address_space *vm, int sz)
if (!IS_ERR(obj)) {
obj->base.resv = i915_vm_resv_get(vm);
obj->shares_resv_from = vm;
+
+   if (vm->fpriv)
+   i915_drm_client_add_object(vm->fpriv->client, obj);
}
 
return obj;
@@ -79,6 +82,9 @@ struct drm_i915_gem_object *alloc_pt_dma(struct 
i915_address_space *vm, int sz)
if (!IS_ERR(obj)) {
obj->base.resv = i915_vm_resv_get(vm);
obj->shares_resv_from = vm;
+
+   if (vm->fpriv)
+   i915_drm_client_add_object(vm->fpriv->client, obj);
}
 
return obj;
-- 
2.39.2



[PATCH 1/6] drm/i915: Add ability for tracking buffer objects per client

2023-09-22 Thread Tvrtko Ursulin
From: Tvrtko Ursulin 

In order to show per client memory usage lets add some infrastructure
which enables tracking buffer objects owned by clients.

We add a per client list protected by a new per client lock and to support
delayed destruction (post client exit) we make tracked objects hold
references to the owning client.

Also, object memory region teardown is moved to the existing RCU free
callback to allow safe dereference from the fdinfo RCU read section.

Signed-off-by: Tvrtko Ursulin 
Reviewed-by: Aravind Iddamsetty 
---
 drivers/gpu/drm/i915/gem/i915_gem_object.c| 13 +--
 .../gpu/drm/i915/gem/i915_gem_object_types.h  | 12 +++
 drivers/gpu/drm/i915/i915_drm_client.c| 36 +++
 drivers/gpu/drm/i915/i915_drm_client.h| 32 +
 4 files changed, 90 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_object.c 
b/drivers/gpu/drm/i915/gem/i915_gem_object.c
index c26d87555825..25eeeb863209 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_object.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_object.c
@@ -106,6 +106,10 @@ void i915_gem_object_init(struct drm_i915_gem_object *obj,
 
INIT_LIST_HEAD(>mm.link);
 
+#ifdef CONFIG_PROC_FS
+   INIT_LIST_HEAD(>client_link);
+#endif
+
INIT_LIST_HEAD(>lut_list);
spin_lock_init(>lut_lock);
 
@@ -293,6 +297,10 @@ void __i915_gem_free_object_rcu(struct rcu_head *head)
container_of(head, typeof(*obj), rcu);
struct drm_i915_private *i915 = to_i915(obj->base.dev);
 
+   /* We need to keep this alive for RCU read access from fdinfo. */
+   if (obj->mm.n_placements > 1)
+   kfree(obj->mm.placements);
+
i915_gem_object_free(obj);
 
GEM_BUG_ON(!atomic_read(>mm.free_count));
@@ -389,9 +397,6 @@ void __i915_gem_free_object(struct drm_i915_gem_object *obj)
if (obj->ops->release)
obj->ops->release(obj);
 
-   if (obj->mm.n_placements > 1)
-   kfree(obj->mm.placements);
-
if (obj->shares_resv_from)
i915_vm_resv_put(obj->shares_resv_from);
 
@@ -442,6 +447,8 @@ static void i915_gem_free_object(struct drm_gem_object 
*gem_obj)
 
GEM_BUG_ON(i915_gem_object_is_framebuffer(obj));
 
+   i915_drm_client_remove_object(obj);
+
/*
 * Before we free the object, make sure any pure RCU-only
 * read-side critical sections are complete, e.g.
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_object_types.h 
b/drivers/gpu/drm/i915/gem/i915_gem_object_types.h
index 2292404007c8..0c5cdab278b6 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_object_types.h
+++ b/drivers/gpu/drm/i915/gem/i915_gem_object_types.h
@@ -302,6 +302,18 @@ struct drm_i915_gem_object {
 */
struct i915_address_space *shares_resv_from;
 
+#ifdef CONFIG_PROC_FS
+   /**
+* @client: @i915_drm_client which created the object
+*/
+   struct i915_drm_client *client;
+
+   /**
+* @client_link: Link into @i915_drm_client.objects_list
+*/
+   struct list_head client_link;
+#endif
+
union {
struct rcu_head rcu;
struct llist_node freed;
diff --git a/drivers/gpu/drm/i915/i915_drm_client.c 
b/drivers/gpu/drm/i915/i915_drm_client.c
index 2a44b3876cb5..2e5e69edc0f9 100644
--- a/drivers/gpu/drm/i915/i915_drm_client.c
+++ b/drivers/gpu/drm/i915/i915_drm_client.c
@@ -28,6 +28,10 @@ struct i915_drm_client *i915_drm_client_alloc(void)
	kref_init(&client->kref);
	spin_lock_init(&client->ctx_lock);
	INIT_LIST_HEAD(&client->ctx_list);
+#ifdef CONFIG_PROC_FS
+   spin_lock_init(&client->objects_lock);
+   INIT_LIST_HEAD(&client->objects_list);
+#endif
 
return client;
 }
@@ -108,4 +112,36 @@ void i915_drm_client_fdinfo(struct drm_printer *p, struct 
drm_file *file)
for (i = 0; i < ARRAY_SIZE(uabi_class_names); i++)
show_client_class(p, i915, file_priv->client, i);
 }
+
+void i915_drm_client_add_object(struct i915_drm_client *client,
+   struct drm_i915_gem_object *obj)
+{
+   unsigned long flags;
+
+   GEM_WARN_ON(obj->client);
+   GEM_WARN_ON(!list_empty(&obj->client_link));
+
+   spin_lock_irqsave(&client->objects_lock, flags);
+   obj->client = i915_drm_client_get(client);
+   list_add_tail_rcu(&obj->client_link, &client->objects_list);
+   spin_unlock_irqrestore(&client->objects_lock, flags);
+}
+
+bool i915_drm_client_remove_object(struct drm_i915_gem_object *obj)
+{
+   struct i915_drm_client *client = fetch_and_zero(&obj->client);
+   unsigned long flags;
+
+   /* Object may not be associated with a client. */
+   if (!client)
+   return false;
+
+   spin_lock_irqsave(&client->objects_lock, flags);
+   list_del_rcu(&obj->client_link);
+   spin_unlock_irqrestore(&client->objects_lock, flags);
+
+   i915_drm_client_put(client);
+

[PATCH 4/6] drm/i915: Account ring buffer and context state storage

2023-09-22 Thread Tvrtko Ursulin
From: Tvrtko Ursulin 

Account ring buffers and logical context space against the owning client
memory usage stats.

Signed-off-by: Tvrtko Ursulin 
Reviewed-by: Aravind Iddamsetty 
---
 drivers/gpu/drm/i915/gt/intel_context.c | 14 ++
 drivers/gpu/drm/i915/i915_drm_client.c  | 10 ++
 drivers/gpu/drm/i915/i915_drm_client.h  |  9 +
 3 files changed, 33 insertions(+)

diff --git a/drivers/gpu/drm/i915/gt/intel_context.c 
b/drivers/gpu/drm/i915/gt/intel_context.c
index a53b26178f0a..a2f1245741bb 100644
--- a/drivers/gpu/drm/i915/gt/intel_context.c
+++ b/drivers/gpu/drm/i915/gt/intel_context.c
@@ -6,6 +6,7 @@
 #include "gem/i915_gem_context.h"
 #include "gem/i915_gem_pm.h"
 
+#include "i915_drm_client.h"
 #include "i915_drv.h"
 #include "i915_trace.h"
 
@@ -50,6 +51,7 @@ intel_context_create(struct intel_engine_cs *engine)
 
 int intel_context_alloc_state(struct intel_context *ce)
 {
+   struct i915_gem_context *ctx;
int err = 0;
 
	if (mutex_lock_interruptible(&ce->pin_mutex))
@@ -66,6 +68,18 @@ int intel_context_alloc_state(struct intel_context *ce)
goto unlock;
 
	set_bit(CONTEXT_ALLOC_BIT, &ce->flags);
+
+   rcu_read_lock();
+   ctx = rcu_dereference(ce->gem_context);
+   if (ctx && !kref_get_unless_zero(&ctx->ref))
+   ctx = NULL;
+   rcu_read_unlock();
+   if (ctx) {
+   if (ctx->client)
+   i915_drm_client_add_context_objects(ctx->client,
+   ce);
+   i915_gem_context_put(ctx);
+   }
}
 
 unlock:
diff --git a/drivers/gpu/drm/i915/i915_drm_client.c 
b/drivers/gpu/drm/i915/i915_drm_client.c
index 2e5e69edc0f9..a61356012df8 100644
--- a/drivers/gpu/drm/i915/i915_drm_client.c
+++ b/drivers/gpu/drm/i915/i915_drm_client.c
@@ -144,4 +144,14 @@ bool i915_drm_client_remove_object(struct 
drm_i915_gem_object *obj)
 
return true;
 }
+
+void i915_drm_client_add_context_objects(struct i915_drm_client *client,
+struct intel_context *ce)
+{
+   if (ce->state)
+   i915_drm_client_add_object(client, ce->state->obj);
+
+   if (ce->ring != ce->engine->legacy.ring && ce->ring->vma)
+   i915_drm_client_add_object(client, ce->ring->vma->obj);
+}
 #endif
diff --git a/drivers/gpu/drm/i915/i915_drm_client.h 
b/drivers/gpu/drm/i915/i915_drm_client.h
index 5f58fdf7dcb8..69cedfcd3d69 100644
--- a/drivers/gpu/drm/i915/i915_drm_client.h
+++ b/drivers/gpu/drm/i915/i915_drm_client.h
@@ -14,6 +14,7 @@
 
 #include "i915_file_private.h"
 #include "gem/i915_gem_object_types.h"
+#include "gt/intel_context_types.h"
 
 #define I915_LAST_UABI_ENGINE_CLASS I915_ENGINE_CLASS_COMPUTE
 
@@ -70,6 +71,8 @@ void i915_drm_client_fdinfo(struct drm_printer *p, struct 
drm_file *file);
 void i915_drm_client_add_object(struct i915_drm_client *client,
struct drm_i915_gem_object *obj);
 bool i915_drm_client_remove_object(struct drm_i915_gem_object *obj);
+void i915_drm_client_add_context_objects(struct i915_drm_client *client,
+struct intel_context *ce);
 #else
 static inline void i915_drm_client_add_object(struct i915_drm_client *client,
  struct drm_i915_gem_object *obj)
@@ -79,6 +82,12 @@ static inline void i915_drm_client_add_object(struct 
i915_drm_client *client,
 static inline bool i915_drm_client_remove_object(struct drm_i915_gem_object 
*obj)
 {
 }
+
+static inline void
+i915_drm_client_add_context_objects(struct i915_drm_client *client,
+   struct intel_context *ce)
+{
+}
 #endif
 
 #endif /* !__I915_DRM_CLIENT_H__ */
-- 
2.39.2



[PATCH v7 0/6] fdinfo memory stats

2023-09-22 Thread Tvrtko Ursulin
From: Tvrtko Ursulin 

A short series to enable fdinfo memory stats for i915.

I added tracking of most classes of objects (user objects, page tables, context
state, ring buffers) which contribute to client's memory footprint and am
accounting their memory use along similar lines as in Rob's msm code, just
that with i915 specific code we can show a memory region breakdown and so
support discrete and multi-tile GPUs properly. And also reflect that our objects
can have multiple allowed backing stores.

The existing helper Rob added is then used to dump the per memory region stats
to fdinfo.

The basic objects-per-client infrastructure can later be extended to cover all
objects and so avoid needing to walk the IDR under the client's file table lock,
which would further avoid disturbing the running clients by parallel fdinfo
readers.

Example fdinfo format:

# cat /proc/1383/fdinfo/8
pos:0
flags:  0212
mnt_id: 21
ino:397
drm-driver: i915
drm-client-id:  18
drm-pdev:   :00:02.0
drm-total-system:   125 MiB
drm-shared-system:  16 MiB
drm-active-system:  110 MiB
drm-resident-system:125 MiB
drm-purgeable-system:   2 MiB
drm-total-stolen-system:0
drm-shared-stolen-system:   0
drm-active-stolen-system:   0
drm-resident-stolen-system: 0
drm-purgeable-stolen-system:0
drm-engine-render:  25662044495 ns
drm-engine-copy:0 ns
drm-engine-video:   0 ns
drm-engine-video-enhance:   0 ns

Example gputop output:

DRM minor 0
 PID SMEM  SMEMRSS   render copy videoNAME
1233 124M 124M |||||||| neverball
1130  59M  59M |█▌  ||||||| Xorg
1207  12M  12M |||||||| xfwm4

Or with Wayland:

DRM minor 0
 PID  MEM  RSSrendercopy videovideo-enhance NAME
2093 191M 191M |▊  ||   ||   ||   | 
gnome-shell
DRM minor 128
 PID  MEM  RSSrendercopy videovideo-enhance NAME
2551  71M  71M |██▉||   ||   ||   | 
neverball
2553  50M  50M |   ||   ||   ||   | 
Xwayland

Example intel_gpu_top output, aggregated mode:

intel-gpu-top: Intel Dg1 (Gen12) @ /dev/dri/card1 -   21/ 577 MHz;  71% RC6
  8 irqs/s

 ENGINES BUSY   MI_SEMA MI_WAIT
   Render/3D2.80% |▉  |  0%  0%
 Blitter0.01% |▏  |  0%  0%
   Video0.00% |   |  0%  0%
VideoEnhance0.00% |   |  0%  0%

  PID  MEM  RSS Render/3D  BlitterVideoNAME
50783 109M 107M |▎   ||||||| neverball

Region breakdown mode (needs more width for best experience):

intel-gpu-top: Intel Dg1 (Gen12) @ /dev/dri/card1 -   18/ 555 MHz;  65% RC6
  8 irqs/s

 ENGINES BUSY   MI_SEMA MI_WAIT
   Render/3D2.52% |▉  |  0%  0%
 Blitter0.00% |   |  0%  0%
   Video0.00% |   |  0%  0%
VideoEnhance0.00% |   |  0%  0%

  PID  RAM  RSS VRAM VRSS Video NAME
50783  34M  32M  75M  75M |▏  ||   ||   ||   | neverball

v2:
 * Now actually per client.

v3:
 * Track imported dma-buf objects.

v4:
 * Rely on DRM GEM handles for tracking user objects.
 * Fix internal object accounting (no placements).

v5:
 * Fixed brain fart of overwriting the loop cursor.
 * Fixed object destruction racing with fdinfo reads.
 * Take reference to GEM context while using it.

v6:
 * Rebase, cover letter update.

v7:
 * New patch in series for making region names consistent and stable.

Test-with: 20230922134437.234888-1-tvrtko.ursu...@linux.intel.com

Tvrtko Ursulin (6):
  drm/i915: Add ability for tracking buffer objects per client
  drm/i915: Record which client owns a VM
  drm/i915: Track page table backing store usage
  drm/i915: Account ring buffer and context state storage
  drm/i915: Add stable memory region names
  drm/i915: Implement fdinfo memory stats printing

 drivers/gpu/drm/i915/gem/i915_gem_context.c   |  11 +-
 .../gpu/drm/i915/gem/i915_gem_context_types.h |   3 +
 drivers/gpu/drm/i915/gem/i915_gem_object.c|  13 ++-
 .../gpu/drm/i915/gem/i915_gem_object_types.h  |  12 ++
 .../gpu/drm/i915/gem/selftests/mock_context.c |   4 +-
 drivers/gpu/drm/i915/gt/intel_context.c   |  14 +++
 drivers/gpu/drm/i915/gt/intel_gtt.c   |   6 +
 drivers/gpu/drm/i915/gt/intel_gtt.h   |   1 +
 drivers/gpu/drm/i915/i915_drm_client.c| 110 ++
 drivers/gpu/drm/i915/i915_drm_client.h

Re: [PATCH 5/5] drm/i915: Implement fdinfo memory stats printing

2023-09-22 Thread Tvrtko Ursulin



On 22/09/2023 09:48, Iddamsetty, Aravind wrote:



On 21-09-2023 17:18, Tvrtko Ursulin wrote:

From: Tvrtko Ursulin 

Use the newly added drm_print_memory_stats helper to show memory
utilisation of our objects in drm/driver specific fdinfo output.

To collect the stats we walk the per memory regions object lists
and accumulate object size into the respective drm_memory_stats
categories.

Objects with multiple possible placements are reported in multiple
regions for total and shared sizes, while other categories are


I guess you forgot to correct this.


Ah yes, will fix.




counted only for the currently active region.

v2:
  * Only account against the active region.
  * Use DMA_RESV_USAGE_BOOKKEEP when testing for active. (Tejas)

Signed-off-by: Tvrtko Ursulin 
Cc: Aravind Iddamsetty 
Cc: Rob Clark 
Cc: Andi Shyti 
Cc: Tejas Upadhyay 
Reviewed-by: Andi Shyti  # v1
---
  drivers/gpu/drm/i915/i915_drm_client.c | 64 ++
  1 file changed, 64 insertions(+)

diff --git a/drivers/gpu/drm/i915/i915_drm_client.c 
b/drivers/gpu/drm/i915/i915_drm_client.c
index a61356012df8..94abc2fb2ea6 100644
--- a/drivers/gpu/drm/i915/i915_drm_client.c
+++ b/drivers/gpu/drm/i915/i915_drm_client.c
@@ -45,6 +45,68 @@ void __i915_drm_client_free(struct kref *kref)
  }
  
  #ifdef CONFIG_PROC_FS

+static void
+obj_meminfo(struct drm_i915_gem_object *obj,
+   struct drm_memory_stats stats[INTEL_REGION_UNKNOWN])
+{
+   const enum intel_region_id id = obj->mm.region ?
+   obj->mm.region->id : INTEL_REGION_SMEM;
+   const u64 sz = obj->base.size;
+
+   if (obj->base.handle_count > 1)
+   stats[id].shared += sz;
+   else
+   stats[id].private += sz;
+
+   if (i915_gem_object_has_pages(obj)) {
+   stats[id].resident += sz;
+
+   if (!dma_resv_test_signaled(obj->base.resv,
+   DMA_RESV_USAGE_BOOKKEEP))
+   stats[id].active += sz;
+   else if (i915_gem_object_is_shrinkable(obj) &&
+obj->mm.madv == I915_MADV_DONTNEED)
+   stats[id].purgeable += sz;
+   }
+}
+
+static void show_meminfo(struct drm_printer *p, struct drm_file *file)
+{
+   struct drm_memory_stats stats[INTEL_REGION_UNKNOWN] = {};
+   struct drm_i915_file_private *fpriv = file->driver_priv;
+   struct i915_drm_client *client = fpriv->client;
+   struct drm_i915_private *i915 = fpriv->i915;
+   struct drm_i915_gem_object *obj;
+   struct intel_memory_region *mr;
+   struct list_head *pos;
+   unsigned int id;
+
+   /* Public objects. */
+   spin_lock(&file->table_lock);
+   idr_for_each_entry(&file->object_idr, obj, id)
+   obj_meminfo(obj, stats);
+   spin_unlock(&file->table_lock);
+
+   /* Internal objects. */
+   rcu_read_lock();
+   list_for_each_rcu(pos, &client->objects_list) {
+   obj = i915_gem_object_get_rcu(list_entry(pos, typeof(*obj),
+client_link));
+   if (!obj)
+   continue;
+   obj_meminfo(obj, stats);
+   i915_gem_object_put(obj);
+   }
+   rcu_read_unlock();
+
+   for_each_memory_region(mr, i915, id)
+   drm_print_memory_stats(p,
+  &stats[id],
+  DRM_GEM_OBJECT_RESIDENT |
+  DRM_GEM_OBJECT_PURGEABLE,
+  mr->name);
+}
+
  static const char * const uabi_class_names[] = {
[I915_ENGINE_CLASS_RENDER] = "render",
[I915_ENGINE_CLASS_COPY] = "copy",
@@ -106,6 +168,8 @@ void i915_drm_client_fdinfo(struct drm_printer *p, struct 
drm_file *file)
 * **
 */
  
+	show_meminfo(p, file);

+
if (GRAPHICS_VER(i915) < 8)
return;
  


Reviewed-by: Aravind Iddamsetty 


Thank you! Would you be able to also look at the IGTs I posted yesterday?

Regards,

Tvrtko


[PATCH 5/5] drm/i915: Implement fdinfo memory stats printing

2023-09-21 Thread Tvrtko Ursulin
From: Tvrtko Ursulin 

Use the newly added drm_print_memory_stats helper to show memory
utilisation of our objects in drm/driver specific fdinfo output.

To collect the stats we walk the per memory regions object lists
and accumulate object size into the respective drm_memory_stats
categories.

Objects with multiple possible placements are reported in multiple
regions for total and shared sizes, while other categories are
counted only for the currently active region.

v2:
 * Only account against the active region.
 * Use DMA_RESV_USAGE_BOOKKEEP when testing for active. (Tejas)

Signed-off-by: Tvrtko Ursulin 
Cc: Aravind Iddamsetty 
Cc: Rob Clark 
Cc: Andi Shyti 
Cc: Tejas Upadhyay 
Reviewed-by: Andi Shyti  # v1
---
 drivers/gpu/drm/i915/i915_drm_client.c | 64 ++
 1 file changed, 64 insertions(+)

diff --git a/drivers/gpu/drm/i915/i915_drm_client.c 
b/drivers/gpu/drm/i915/i915_drm_client.c
index a61356012df8..94abc2fb2ea6 100644
--- a/drivers/gpu/drm/i915/i915_drm_client.c
+++ b/drivers/gpu/drm/i915/i915_drm_client.c
@@ -45,6 +45,68 @@ void __i915_drm_client_free(struct kref *kref)
 }
 
 #ifdef CONFIG_PROC_FS
+static void
+obj_meminfo(struct drm_i915_gem_object *obj,
+   struct drm_memory_stats stats[INTEL_REGION_UNKNOWN])
+{
+   const enum intel_region_id id = obj->mm.region ?
+   obj->mm.region->id : INTEL_REGION_SMEM;
+   const u64 sz = obj->base.size;
+
+   if (obj->base.handle_count > 1)
+   stats[id].shared += sz;
+   else
+   stats[id].private += sz;
+
+   if (i915_gem_object_has_pages(obj)) {
+   stats[id].resident += sz;
+
+   if (!dma_resv_test_signaled(obj->base.resv,
+   DMA_RESV_USAGE_BOOKKEEP))
+   stats[id].active += sz;
+   else if (i915_gem_object_is_shrinkable(obj) &&
+obj->mm.madv == I915_MADV_DONTNEED)
+   stats[id].purgeable += sz;
+   }
+}
+
+static void show_meminfo(struct drm_printer *p, struct drm_file *file)
+{
+   struct drm_memory_stats stats[INTEL_REGION_UNKNOWN] = {};
+   struct drm_i915_file_private *fpriv = file->driver_priv;
+   struct i915_drm_client *client = fpriv->client;
+   struct drm_i915_private *i915 = fpriv->i915;
+   struct drm_i915_gem_object *obj;
+   struct intel_memory_region *mr;
+   struct list_head *pos;
+   unsigned int id;
+
+   /* Public objects. */
+   spin_lock(&file->table_lock);
+   idr_for_each_entry(&file->object_idr, obj, id)
+   obj_meminfo(obj, stats);
+   spin_unlock(&file->table_lock);
+
+   /* Internal objects. */
+   rcu_read_lock();
+   list_for_each_rcu(pos, &client->objects_list) {
+   obj = i915_gem_object_get_rcu(list_entry(pos, typeof(*obj),
+client_link));
+   if (!obj)
+   continue;
+   obj_meminfo(obj, stats);
+   i915_gem_object_put(obj);
+   }
+   rcu_read_unlock();
+
+   for_each_memory_region(mr, i915, id)
+   drm_print_memory_stats(p,
+  &stats[id],
+  DRM_GEM_OBJECT_RESIDENT |
+  DRM_GEM_OBJECT_PURGEABLE,
+  mr->name);
+}
+
 static const char * const uabi_class_names[] = {
[I915_ENGINE_CLASS_RENDER] = "render",
[I915_ENGINE_CLASS_COPY] = "copy",
@@ -106,6 +168,8 @@ void i915_drm_client_fdinfo(struct drm_printer *p, struct 
drm_file *file)
 * **
 */
 
+   show_meminfo(p, file);
+
if (GRAPHICS_VER(i915) < 8)
return;
 
-- 
2.39.2



[PATCH 2/5] drm/i915: Record which client owns a VM

2023-09-21 Thread Tvrtko Ursulin
From: Tvrtko Ursulin 

To enable accounting of indirect client memory usage (such as page tables)
in the following patch, lets start recording the creator of each PPGTT.

Signed-off-by: Tvrtko Ursulin 
Reviewed-by: Aravind Iddamsetty 
---
 drivers/gpu/drm/i915/gem/i915_gem_context.c   | 11 ---
 drivers/gpu/drm/i915/gem/i915_gem_context_types.h |  3 +++
 drivers/gpu/drm/i915/gem/selftests/mock_context.c |  4 ++--
 drivers/gpu/drm/i915/gt/intel_gtt.h   |  1 +
 4 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c 
b/drivers/gpu/drm/i915/gem/i915_gem_context.c
index 9a9ff84c90d7..35cf6608180e 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c
@@ -279,7 +279,8 @@ static int proto_context_set_protected(struct 
drm_i915_private *i915,
 }
 
 static struct i915_gem_proto_context *
-proto_context_create(struct drm_i915_private *i915, unsigned int flags)
+proto_context_create(struct drm_i915_file_private *fpriv,
+struct drm_i915_private *i915, unsigned int flags)
 {
struct i915_gem_proto_context *pc, *err;
 
@@ -287,6 +288,7 @@ proto_context_create(struct drm_i915_private *i915, 
unsigned int flags)
if (!pc)
return ERR_PTR(-ENOMEM);
 
+   pc->fpriv = fpriv;
pc->num_user_engines = -1;
pc->user_engines = NULL;
pc->user_flags = BIT(UCONTEXT_BANNABLE) |
@@ -1621,6 +1623,7 @@ i915_gem_create_context(struct drm_i915_private *i915,
err = PTR_ERR(ppgtt);
goto err_ctx;
}
+   ppgtt->vm.fpriv = pc->fpriv;
	vm = &ppgtt->vm;
}
if (vm)
@@ -1740,7 +1743,7 @@ int i915_gem_context_open(struct drm_i915_private *i915,
/* 0 reserved for invalid/unassigned ppgtt */
	xa_init_flags(&file_priv->vm_xa, XA_FLAGS_ALLOC1);
 
-   pc = proto_context_create(i915, 0);
+   pc = proto_context_create(file_priv, i915, 0);
if (IS_ERR(pc)) {
err = PTR_ERR(pc);
goto err;
@@ -1822,6 +1825,7 @@ int i915_gem_vm_create_ioctl(struct drm_device *dev, void 
*data,
 
GEM_BUG_ON(id == 0); /* reserved for invalid/unassigned ppgtt */
args->vm_id = id;
+   ppgtt->vm.fpriv = file_priv;
return 0;
 
 err_put:
@@ -2284,7 +2288,8 @@ int i915_gem_context_create_ioctl(struct drm_device *dev, 
void *data,
return -EIO;
}
 
-   ext_data.pc = proto_context_create(i915, args->flags);
+   ext_data.pc = proto_context_create(file->driver_priv, i915,
+  args->flags);
if (IS_ERR(ext_data.pc))
return PTR_ERR(ext_data.pc);
 
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context_types.h 
b/drivers/gpu/drm/i915/gem/i915_gem_context_types.h
index cb78214a7dcd..c573c067779f 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_context_types.h
+++ b/drivers/gpu/drm/i915/gem/i915_gem_context_types.h
@@ -188,6 +188,9 @@ struct i915_gem_proto_engine {
  * CONTEXT_CREATE_SET_PARAM during GEM_CONTEXT_CREATE.
  */
 struct i915_gem_proto_context {
+   /** @fpriv: Client which creates the context */
+   struct drm_i915_file_private *fpriv;
+
/** @vm: See _gem_context.vm */
struct i915_address_space *vm;
 
diff --git a/drivers/gpu/drm/i915/gem/selftests/mock_context.c 
b/drivers/gpu/drm/i915/gem/selftests/mock_context.c
index 8ac6726ec16b..125584ada282 100644
--- a/drivers/gpu/drm/i915/gem/selftests/mock_context.c
+++ b/drivers/gpu/drm/i915/gem/selftests/mock_context.c
@@ -83,7 +83,7 @@ live_context(struct drm_i915_private *i915, struct file *file)
int err;
u32 id;
 
-   pc = proto_context_create(i915, 0);
+   pc = proto_context_create(fpriv, i915, 0);
if (IS_ERR(pc))
return ERR_CAST(pc);
 
@@ -152,7 +152,7 @@ kernel_context(struct drm_i915_private *i915,
struct i915_gem_context *ctx;
struct i915_gem_proto_context *pc;
 
-   pc = proto_context_create(i915, 0);
+   pc = proto_context_create(NULL, i915, 0);
if (IS_ERR(pc))
return ERR_CAST(pc);
 
diff --git a/drivers/gpu/drm/i915/gt/intel_gtt.h 
b/drivers/gpu/drm/i915/gt/intel_gtt.h
index 346ec8ec2edd..8cf62f5134a9 100644
--- a/drivers/gpu/drm/i915/gt/intel_gtt.h
+++ b/drivers/gpu/drm/i915/gt/intel_gtt.h
@@ -248,6 +248,7 @@ struct i915_address_space {
struct drm_mm mm;
struct intel_gt *gt;
struct drm_i915_private *i915;
+   struct drm_i915_file_private *fpriv;
struct device *dma;
u64 total;  /* size addr space maps (ex. 2GB for ggtt) */
u64 reserved;   /* size addr space reserved */
-- 
2.39.2



[PATCH 4/5] drm/i915: Account ring buffer and context state storage

2023-09-21 Thread Tvrtko Ursulin
From: Tvrtko Ursulin 

Account ring buffers and logical context space against the owning client
memory usage stats.

Signed-off-by: Tvrtko Ursulin 
Reviewed-by: Aravind Iddamsetty 
---
 drivers/gpu/drm/i915/gt/intel_context.c | 14 ++
 drivers/gpu/drm/i915/i915_drm_client.c  | 10 ++
 drivers/gpu/drm/i915/i915_drm_client.h  |  9 +
 3 files changed, 33 insertions(+)

diff --git a/drivers/gpu/drm/i915/gt/intel_context.c 
b/drivers/gpu/drm/i915/gt/intel_context.c
index a53b26178f0a..a2f1245741bb 100644
--- a/drivers/gpu/drm/i915/gt/intel_context.c
+++ b/drivers/gpu/drm/i915/gt/intel_context.c
@@ -6,6 +6,7 @@
 #include "gem/i915_gem_context.h"
 #include "gem/i915_gem_pm.h"
 
+#include "i915_drm_client.h"
 #include "i915_drv.h"
 #include "i915_trace.h"
 
@@ -50,6 +51,7 @@ intel_context_create(struct intel_engine_cs *engine)
 
 int intel_context_alloc_state(struct intel_context *ce)
 {
+   struct i915_gem_context *ctx;
int err = 0;
 
if (mutex_lock_interruptible(>pin_mutex))
@@ -66,6 +68,18 @@ int intel_context_alloc_state(struct intel_context *ce)
goto unlock;
 
set_bit(CONTEXT_ALLOC_BIT, >flags);
+
+   rcu_read_lock();
+   ctx = rcu_dereference(ce->gem_context);
+   if (ctx && !kref_get_unless_zero(>ref))
+   ctx = NULL;
+   rcu_read_unlock();
+   if (ctx) {
+   if (ctx->client)
+   i915_drm_client_add_context_objects(ctx->client,
+   ce);
+   i915_gem_context_put(ctx);
+   }
}
 
 unlock:
diff --git a/drivers/gpu/drm/i915/i915_drm_client.c 
b/drivers/gpu/drm/i915/i915_drm_client.c
index 2e5e69edc0f9..a61356012df8 100644
--- a/drivers/gpu/drm/i915/i915_drm_client.c
+++ b/drivers/gpu/drm/i915/i915_drm_client.c
@@ -144,4 +144,14 @@ bool i915_drm_client_remove_object(struct 
drm_i915_gem_object *obj)
 
return true;
 }
+
+void i915_drm_client_add_context_objects(struct i915_drm_client *client,
+struct intel_context *ce)
+{
+   if (ce->state)
+   i915_drm_client_add_object(client, ce->state->obj);
+
+   if (ce->ring != ce->engine->legacy.ring && ce->ring->vma)
+   i915_drm_client_add_object(client, ce->ring->vma->obj);
+}
 #endif
diff --git a/drivers/gpu/drm/i915/i915_drm_client.h 
b/drivers/gpu/drm/i915/i915_drm_client.h
index 5f58fdf7dcb8..69cedfcd3d69 100644
--- a/drivers/gpu/drm/i915/i915_drm_client.h
+++ b/drivers/gpu/drm/i915/i915_drm_client.h
@@ -14,6 +14,7 @@
 
 #include "i915_file_private.h"
 #include "gem/i915_gem_object_types.h"
+#include "gt/intel_context_types.h"
 
 #define I915_LAST_UABI_ENGINE_CLASS I915_ENGINE_CLASS_COMPUTE
 
@@ -70,6 +71,8 @@ void i915_drm_client_fdinfo(struct drm_printer *p, struct 
drm_file *file);
 void i915_drm_client_add_object(struct i915_drm_client *client,
struct drm_i915_gem_object *obj);
 bool i915_drm_client_remove_object(struct drm_i915_gem_object *obj);
+void i915_drm_client_add_context_objects(struct i915_drm_client *client,
+struct intel_context *ce);
 #else
 static inline void i915_drm_client_add_object(struct i915_drm_client *client,
  struct drm_i915_gem_object *obj)
@@ -79,6 +82,12 @@ static inline void i915_drm_client_add_object(struct 
i915_drm_client *client,
 static inline bool i915_drm_client_remove_object(struct drm_i915_gem_object 
*obj)
 {
 }
+
+static inline void
+i915_drm_client_add_context_objects(struct i915_drm_client *client,
+   struct intel_context *ce)
+{
+}
 #endif
 
 #endif /* !__I915_DRM_CLIENT_H__ */
-- 
2.39.2



[PATCH 3/5] drm/i915: Track page table backing store usage

2023-09-21 Thread Tvrtko Ursulin
From: Tvrtko Ursulin 

Account page table backing store against the owning client memory usage
stats.

Signed-off-by: Tvrtko Ursulin 
Reviewed-by: Aravind Iddamsetty 
---
 drivers/gpu/drm/i915/gt/intel_gtt.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/drivers/gpu/drm/i915/gt/intel_gtt.c 
b/drivers/gpu/drm/i915/gt/intel_gtt.c
index 13944a14ea2d..c3f2b379 100644
--- a/drivers/gpu/drm/i915/gt/intel_gtt.c
+++ b/drivers/gpu/drm/i915/gt/intel_gtt.c
@@ -58,6 +58,9 @@ struct drm_i915_gem_object *alloc_pt_lmem(struct 
i915_address_space *vm, int sz)
if (!IS_ERR(obj)) {
obj->base.resv = i915_vm_resv_get(vm);
obj->shares_resv_from = vm;
+
+   if (vm->fpriv)
+   i915_drm_client_add_object(vm->fpriv->client, obj);
}
 
return obj;
@@ -79,6 +82,9 @@ struct drm_i915_gem_object *alloc_pt_dma(struct 
i915_address_space *vm, int sz)
if (!IS_ERR(obj)) {
obj->base.resv = i915_vm_resv_get(vm);
obj->shares_resv_from = vm;
+
+   if (vm->fpriv)
+   i915_drm_client_add_object(vm->fpriv->client, obj);
}
 
return obj;
-- 
2.39.2



[PATCH 1/5] drm/i915: Add ability for tracking buffer objects per client

2023-09-21 Thread Tvrtko Ursulin
From: Tvrtko Ursulin 

In order to show per client memory usage lets add some infrastructure
which enables tracking buffer objects owned by clients.

We add a per client list protected by a new per client lock and to support
delayed destruction (post client exit) we make tracked objects hold
references to the owning client.

Also, object memory region teardown is moved to the existing RCU free
callback to allow safe dereference from the fdinfo RCU read section.

Signed-off-by: Tvrtko Ursulin 
Reviewed-by: Aravind Iddamsetty 
---
 drivers/gpu/drm/i915/gem/i915_gem_object.c| 13 +--
 .../gpu/drm/i915/gem/i915_gem_object_types.h  | 12 +++
 drivers/gpu/drm/i915/i915_drm_client.c| 36 +++
 drivers/gpu/drm/i915/i915_drm_client.h| 32 +
 4 files changed, 90 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_object.c 
b/drivers/gpu/drm/i915/gem/i915_gem_object.c
index c26d87555825..25eeeb863209 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_object.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_object.c
@@ -106,6 +106,10 @@ void i915_gem_object_init(struct drm_i915_gem_object *obj,
 
INIT_LIST_HEAD(>mm.link);
 
+#ifdef CONFIG_PROC_FS
+   INIT_LIST_HEAD(>client_link);
+#endif
+
INIT_LIST_HEAD(>lut_list);
spin_lock_init(>lut_lock);
 
@@ -293,6 +297,10 @@ void __i915_gem_free_object_rcu(struct rcu_head *head)
container_of(head, typeof(*obj), rcu);
struct drm_i915_private *i915 = to_i915(obj->base.dev);
 
+   /* We need to keep this alive for RCU read access from fdinfo. */
+   if (obj->mm.n_placements > 1)
+   kfree(obj->mm.placements);
+
i915_gem_object_free(obj);
 
GEM_BUG_ON(!atomic_read(>mm.free_count));
@@ -389,9 +397,6 @@ void __i915_gem_free_object(struct drm_i915_gem_object *obj)
if (obj->ops->release)
obj->ops->release(obj);
 
-   if (obj->mm.n_placements > 1)
-   kfree(obj->mm.placements);
-
if (obj->shares_resv_from)
i915_vm_resv_put(obj->shares_resv_from);
 
@@ -442,6 +447,8 @@ static void i915_gem_free_object(struct drm_gem_object 
*gem_obj)
 
GEM_BUG_ON(i915_gem_object_is_framebuffer(obj));
 
+   i915_drm_client_remove_object(obj);
+
/*
 * Before we free the object, make sure any pure RCU-only
 * read-side critical sections are complete, e.g.
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_object_types.h 
b/drivers/gpu/drm/i915/gem/i915_gem_object_types.h
index 2292404007c8..0c5cdab278b6 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_object_types.h
+++ b/drivers/gpu/drm/i915/gem/i915_gem_object_types.h
@@ -302,6 +302,18 @@ struct drm_i915_gem_object {
 */
struct i915_address_space *shares_resv_from;
 
+#ifdef CONFIG_PROC_FS
+   /**
+* @client: @i915_drm_client which created the object
+*/
+   struct i915_drm_client *client;
+
+   /**
+* @client_link: Link into @i915_drm_client.objects_list
+*/
+   struct list_head client_link;
+#endif
+
union {
struct rcu_head rcu;
struct llist_node freed;
diff --git a/drivers/gpu/drm/i915/i915_drm_client.c 
b/drivers/gpu/drm/i915/i915_drm_client.c
index 2a44b3876cb5..2e5e69edc0f9 100644
--- a/drivers/gpu/drm/i915/i915_drm_client.c
+++ b/drivers/gpu/drm/i915/i915_drm_client.c
@@ -28,6 +28,10 @@ struct i915_drm_client *i915_drm_client_alloc(void)
kref_init(>kref);
spin_lock_init(>ctx_lock);
INIT_LIST_HEAD(>ctx_list);
+#ifdef CONFIG_PROC_FS
+   spin_lock_init(>objects_lock);
+   INIT_LIST_HEAD(>objects_list);
+#endif
 
return client;
 }
@@ -108,4 +112,36 @@ void i915_drm_client_fdinfo(struct drm_printer *p, struct 
drm_file *file)
for (i = 0; i < ARRAY_SIZE(uabi_class_names); i++)
show_client_class(p, i915, file_priv->client, i);
 }
+
+void i915_drm_client_add_object(struct i915_drm_client *client,
+   struct drm_i915_gem_object *obj)
+{
+   unsigned long flags;
+
+   GEM_WARN_ON(obj->client);
+   GEM_WARN_ON(!list_empty(>client_link));
+
+   spin_lock_irqsave(>objects_lock, flags);
+   obj->client = i915_drm_client_get(client);
+   list_add_tail_rcu(>client_link, >objects_list);
+   spin_unlock_irqrestore(>objects_lock, flags);
+}
+
+bool i915_drm_client_remove_object(struct drm_i915_gem_object *obj)
+{
+   struct i915_drm_client *client = fetch_and_zero(>client);
+   unsigned long flags;
+
+   /* Object may not be associated with a client. */
+   if (!client)
+   return false;
+
+   spin_lock_irqsave(>objects_lock, flags);
+   list_del_rcu(>client_link);
+   spin_unlock_irqrestore(>objects_lock, flags);
+
+   i915_drm_client_put(client);
+

[PATCH v7 0/5] fdinfo memory stats

2023-09-21 Thread Tvrtko Ursulin
From: Tvrtko Ursulin 

A short series to enable fdinfo memory stats for i915.

I added tracking of most classes of objects (user objects, page tables, context
state, ring buffers) which contribute to client's memory footprint and am
accounting their memory use along similar lines as in Rob's msm code, just
that with i915 specific code we can show a memory region breakdown and so
support discrete and multi-tile GPUs properly. And also reflect that our objects
can have multiple allowed backing stores.

The existing helper Rob added is then used to dump the per memory region stats
to fdinfo.

The basic objects-per-client infrastructure can later be extended to cover all
objects and so avoid needing to walk the IDR under the client's file table lock,
which would further avoid disturbing the running clients by parallel fdinfo
readers.

Example fdinfo format:

# cat /proc/1383/fdinfo/8
pos:0
flags:  0212
mnt_id: 21
ino:397
drm-driver: i915
drm-client-id:  18
drm-pdev:   :00:02.0
drm-total-system:   125 MiB
drm-shared-system:  16 MiB
drm-active-system:  110 MiB
drm-resident-system:125 MiB
drm-purgeable-system:   2 MiB
drm-total-stolen-system:0
drm-shared-stolen-system:   0
drm-active-stolen-system:   0
drm-resident-stolen-system: 0
drm-purgeable-stolen-system:0
drm-engine-render:  25662044495 ns
drm-engine-copy:0 ns
drm-engine-video:   0 ns
drm-engine-video-enhance:   0 ns

Example gputop output:

DRM minor 0
 PID SMEM  SMEMRSS   render copy videoNAME
1233 124M 124M |||||||| neverball
1130  59M  59M |█▌  ||||||| Xorg
1207  12M  12M |||||||| xfwm4

Or with Wayland:

DRM minor 0
 PID  MEM  RSSrendercopy videovideo-enhance NAME
2093 191M 191M |▊  ||   ||   ||   | 
gnome-shell
DRM minor 128
 PID  MEM  RSSrendercopy videovideo-enhance NAME
2551  71M  71M |██▉||   ||   ||   | 
neverball
2553  50M  50M |   ||   ||   ||   | 
Xwayland

v2:
 * Now actually per client.

v3:
 * Track imported dma-buf objects.

v4:
 * Rely on DRM GEM handles for tracking user objects.
 * Fix internal object accounting (no placements).

v5:
 * Fixed brain fart of overwriting the loop cursor.
 * Fixed object destruction racing with fdinfo reads.
 * Take reference to GEM context while using it.

v6:
 * Rebase, cover letter update.

v7:
 * Account against active region only.
 * Cover all dma_resv usage when testing for activity.

Test-with: 20230921114557.192629-1-tvrtko.ursu...@linux.intel.com

Tvrtko Ursulin (5):
  drm/i915: Add ability for tracking buffer objects per client
  drm/i915: Record which client owns a VM
  drm/i915: Track page table backing store usage
  drm/i915: Account ring buffer and context state storage
  drm/i915: Implement fdinfo memory stats printing

 drivers/gpu/drm/i915/gem/i915_gem_context.c   |  11 +-
 .../gpu/drm/i915/gem/i915_gem_context_types.h |   3 +
 drivers/gpu/drm/i915/gem/i915_gem_object.c|  13 ++-
 .../gpu/drm/i915/gem/i915_gem_object_types.h  |  12 ++
 .../gpu/drm/i915/gem/selftests/mock_context.c |   4 +-
 drivers/gpu/drm/i915/gt/intel_context.c   |  14 +++
 drivers/gpu/drm/i915/gt/intel_gtt.c   |   6 +
 drivers/gpu/drm/i915/gt/intel_gtt.h   |   1 +
 drivers/gpu/drm/i915/i915_drm_client.c| 110 ++
 drivers/gpu/drm/i915/i915_drm_client.h|  41 +++
 10 files changed, 207 insertions(+), 8 deletions(-)

-- 
2.39.2



Re: [Intel-gfx] [PATCH] drm/i915/gem: Allow users to disable waitboost

2023-09-21 Thread Tvrtko Ursulin



On 20/09/2023 22:56, Vinay Belgaumkar wrote:

Provide a bit to disable waitboost while waiting on a gem object.
Waitboost results in increased power consumption by requesting RP0
while waiting for the request to complete. Add a bit in the gem_wait()
IOCTL where this can be disabled.

This is related to the libva API change here -
Link: 
https://github.com/XinfengZhang/libva/commit/3d90d18c67609a73121bb71b20ee4776b54b61a7


This link does not appear to lead to userspace code using this uapi?



Cc: Rodrigo Vivi 
Signed-off-by: Vinay Belgaumkar 
---
  drivers/gpu/drm/i915/gem/i915_gem_wait.c | 9 ++---
  drivers/gpu/drm/i915/i915_request.c  | 3 ++-
  drivers/gpu/drm/i915/i915_request.h  | 1 +
  include/uapi/drm/i915_drm.h  | 1 +
  4 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_wait.c 
b/drivers/gpu/drm/i915/gem/i915_gem_wait.c
index d4b918fb11ce..955885ec859d 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_wait.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_wait.c
@@ -72,7 +72,8 @@ i915_gem_object_wait_reservation(struct dma_resv *resv,
struct dma_fence *fence;
long ret = timeout ?: 1;
  
-	i915_gem_object_boost(resv, flags);

+   if (!(flags & I915_WAITBOOST_DISABLE))
+   i915_gem_object_boost(resv, flags);
  
  	dma_resv_iter_begin(, resv,

dma_resv_usage_rw(flags & I915_WAIT_ALL));
@@ -236,7 +237,7 @@ i915_gem_wait_ioctl(struct drm_device *dev, void *data, 
struct drm_file *file)
ktime_t start;
long ret;
  
-	if (args->flags != 0)

+   if (args->flags != 0 || args->flags != I915_GEM_WAITBOOST_DISABLE)
return -EINVAL;
  
  	obj = i915_gem_object_lookup(file, args->bo_handle);

@@ -248,7 +249,9 @@ i915_gem_wait_ioctl(struct drm_device *dev, void *data, 
struct drm_file *file)
ret = i915_gem_object_wait(obj,
   I915_WAIT_INTERRUPTIBLE |
   I915_WAIT_PRIORITY |
-  I915_WAIT_ALL,
+  I915_WAIT_ALL |
+  (args->flags & I915_GEM_WAITBOOST_DISABLE ?
+   I915_WAITBOOST_DISABLE : 0),
   to_wait_timeout(args->timeout_ns));
  
  	if (args->timeout_ns > 0) {

diff --git a/drivers/gpu/drm/i915/i915_request.c 
b/drivers/gpu/drm/i915/i915_request.c
index f59081066a19..2957409b4b2a 100644
--- a/drivers/gpu/drm/i915/i915_request.c
+++ b/drivers/gpu/drm/i915/i915_request.c
@@ -2044,7 +2044,8 @@ long i915_request_wait_timeout(struct i915_request *rq,
 * but at a cost of spending more power processing the workload
 * (bad for battery).
 */
-   if (flags & I915_WAIT_PRIORITY && !i915_request_started(rq))
+   if (!(flags & I915_WAITBOOST_DISABLE) && (flags & I915_WAIT_PRIORITY) &&
+   !i915_request_started(rq))
intel_rps_boost(rq);
  
  	wait.tsk = current;

diff --git a/drivers/gpu/drm/i915/i915_request.h 
b/drivers/gpu/drm/i915/i915_request.h
index 0ac55b2e4223..3cc00e8254dc 100644
--- a/drivers/gpu/drm/i915/i915_request.h
+++ b/drivers/gpu/drm/i915/i915_request.h
@@ -445,6 +445,7 @@ long i915_request_wait(struct i915_request *rq,
  #define I915_WAIT_INTERRUPTIBLE   BIT(0)
  #define I915_WAIT_PRIORITYBIT(1) /* small priority bump for the request */
  #define I915_WAIT_ALL BIT(2) /* used by i915_gem_object_wait() */
+#define I915_WAITBOOST_DISABLE BIT(3) /* used by i915_gem_object_wait() */
  
  void i915_request_show(struct drm_printer *m,

   const struct i915_request *rq,
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index 7000e5910a1d..4adee70e39cf 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -1928,6 +1928,7 @@ struct drm_i915_gem_wait {
/** Handle of BO we shall wait on */
__u32 bo_handle;
__u32 flags;
+#define I915_GEM_WAITBOOST_DISABLE  (1u<<0)


Probably would be good to avoid mentioning waitboost in the uapi since 
so far it wasn't an explicit feature/contract. Something like 
I915_GEM_WAIT_BACKGROUND_PRIORITY? Low priority?


I also wonder if there could be a possible angle to help Rob (+cc) 
upstream the syncobj/fence deadline code if our media driver might make 
use of that somehow.


Like if either we could wire up the deadline into GEM_WAIT (in a 
backward compatible manner), or if media could use sync fd wait instead. 
Assuming they have an out fence already, which may not be true.


Regards,

Tvrtko


/** Number of nanoseconds to wait, Returns time remaining. */
__s64 timeout_ns;
  };


Re: [PATCH v6 6/6] drm/drm-file: Show finer-grained BO sizes in drm_show_memory_stats

2023-09-21 Thread Tvrtko Ursulin



On 20/09/2023 16:32, Tvrtko Ursulin wrote:


On 20/09/2023 00:34, Adrián Larumbe wrote:

The current implementation will try to pick the highest available size
display unit as soon as the BO size exceeds that of the previous
multiplier. That can lead to loss of precision in contexts of low memory
usage.

The new selection criteria try to preserve precision, whilst also
increasing the display unit selection threshold to render more accurate
values.

Signed-off-by: Adrián Larumbe 
Reviewed-by: Boris Brezillon 
Reviewed-by: Steven Price 
---
  drivers/gpu/drm/drm_file.c | 5 -
  1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/drm_file.c b/drivers/gpu/drm/drm_file.c
index 762965e3d503..34cfa128ffe5 100644
--- a/drivers/gpu/drm/drm_file.c
+++ b/drivers/gpu/drm/drm_file.c
@@ -872,6 +872,8 @@ void drm_send_event(struct drm_device *dev, struct 
drm_pending_event *e)

  }
  EXPORT_SYMBOL(drm_send_event);
+#define UPPER_UNIT_THRESHOLD 100
+
  static void print_size(struct drm_printer *p, const char *stat,
 const char *region, u64 sz)
  {
@@ -879,7 +881,8 @@ static void print_size(struct drm_printer *p, 
const char *stat,

  unsigned u;
  for (u = 0; u < ARRAY_SIZE(units) - 1; u++) {
-    if (sz < SZ_1K)
+    if ((sz & (SZ_1K - 1)) &&


IS_ALIGNED worth it at all?


+    sz < UPPER_UNIT_THRESHOLD * SZ_1K)
  break;


Excuse me for a late comment (I was away). I did not get what is 
special about a ~10% threshold? Sounds to me that just going with the lower 
unit, when the size is not aligned to the higher one, would be better than 
sometimes-precision-sometimes-not.


FWIW both current and the threshold option make testing the feature very 
annoying.


So I'd really propose we simply use smaller unit when unaligned.

Regards,

Tvrtko


Re: [PATCH v6 4/6] drm/drm_file: Add DRM obj's RSS reporting function for fdinfo

2023-09-20 Thread Tvrtko Ursulin



On 20/09/2023 00:34, Adrián Larumbe wrote:

Some BO's might be mapped onto physical memory chunkwise and on demand,
like Panfrost's tiler heap. In this case, even though the
drm_gem_shmem_object page array might already be allocated, only a very
small fraction of the BO is currently backed by system memory, but
drm_show_memory_stats will then proceed to add its entire virtual size to
the file's total resident size regardless.

This led to very unrealistic RSS sizes being reckoned for Panfrost, where
said tiler heap buffer is initially allocated with a virtual size of 128
MiB, but only a small part of it will eventually be backed by system memory
after successive GPU page faults.

Provide a new DRM object generic function that would allow drivers to
return a more accurate RSS size for their BOs.

Signed-off-by: Adrián Larumbe 
Reviewed-by: Boris Brezillon 
Reviewed-by: Steven Price 
---
  drivers/gpu/drm/drm_file.c | 5 -
  include/drm/drm_gem.h  | 9 +
  2 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/drm_file.c b/drivers/gpu/drm/drm_file.c
index 883d83bc0e3d..762965e3d503 100644
--- a/drivers/gpu/drm/drm_file.c
+++ b/drivers/gpu/drm/drm_file.c
@@ -944,7 +944,10 @@ void drm_show_memory_stats(struct drm_printer *p, struct 
drm_file *file)
}
  
  		if (s & DRM_GEM_OBJECT_RESIDENT) {

-   status.resident += obj->size;
+   if (obj->funcs && obj->funcs->rss)
+   status.resident += obj->funcs->rss(obj);
+   else
+   status.resident += obj->size;


Presumably you'd want the same smaller size in both active and 
purgeable? Or you can end up with more in those two than in rss which 
would look odd.


Also, alternative to adding a new callback could be adding multiple 
output parameters to the existing obj->func->status() which maybe ends 
up simpler due fewer callbacks?


Like:

 s = obj->funcs->status(obj, _status, )

And adjust the code flow to pick up the rss if driver signaled it 
supports reporting it.


Regards,

Tvrtko


} else {
/* If already purged or not yet backed by pages, don't
 * count it as purgeable:
diff --git a/include/drm/drm_gem.h b/include/drm/drm_gem.h
index bc9f6aa2f3fe..16364487fde9 100644
--- a/include/drm/drm_gem.h
+++ b/include/drm/drm_gem.h
@@ -208,6 +208,15 @@ struct drm_gem_object_funcs {
 */
enum drm_gem_object_status (*status)(struct drm_gem_object *obj);
  
+	/**

+* @rss:
+*
+* Return resident size of the object in physical memory.
+*
+* Called by drm_show_memory_stats().
+*/
+   size_t (*rss)(struct drm_gem_object *obj);
+
/**
 * @vm_ops:
 *


Re: [PATCH v6 2/6] drm/panfrost: Add fdinfo support GPU load metrics

2023-09-20 Thread Tvrtko Ursulin



On 20/09/2023 00:34, Adrián Larumbe wrote:

The drm-stats fdinfo tags made available to user space are drm-engine,
drm-cycles, drm-max-freq and drm-curfreq, one per job slot.

This deviates from standard practice in other DRM drivers, where a single
set of key:value pairs is provided for the whole render engine. However,
Panfrost has separate queues for fragment and vertex/tiler jobs, so a
decision was made to calculate bus cycles and workload times separately.

Maximum operating frequency is calculated at devfreq initialisation time.
Current frequency is made available to user space because nvtop uses it
when performing engine usage calculations.

It is important to bear in mind that both GPU cycle and kernel time numbers
provided are at best rough estimations, and always reported in excess from
the actual figure because of two reasons:
  - Excess time because of the delay between the end of a job processing,
the subsequent job IRQ and the actual time of the sample.
  - Time spent in the engine queue waiting for the GPU to pick up the next
job.

To avoid race conditions during enablement/disabling, a reference counting
mechanism was introduced, and a job flag that tells us whether a given job
increased the refcount. This is necessary, because user space can toggle
cycle counting through a debugfs file, and a given job might have been in
flight by the time cycle counting was disabled.

The main goal of the debugfs cycle counter knob is letting tools like nvtop
or IGT's gputop switch it at any time, to avoid power waste in case no
engine usage measuring is necessary.

Signed-off-by: Adrián Larumbe 
Reviewed-by: Boris Brezillon 
Reviewed-by: Steven Price 
---
  drivers/gpu/drm/panfrost/Makefile   |  2 +
  drivers/gpu/drm/panfrost/panfrost_debugfs.c | 20 
  drivers/gpu/drm/panfrost/panfrost_debugfs.h | 13 +
  drivers/gpu/drm/panfrost/panfrost_devfreq.c |  8 +++
  drivers/gpu/drm/panfrost/panfrost_devfreq.h |  3 ++
  drivers/gpu/drm/panfrost/panfrost_device.c  |  2 +
  drivers/gpu/drm/panfrost/panfrost_device.h  | 13 +
  drivers/gpu/drm/panfrost/panfrost_drv.c | 57 -
  drivers/gpu/drm/panfrost/panfrost_gpu.c | 41 +++
  drivers/gpu/drm/panfrost/panfrost_gpu.h |  4 ++
  drivers/gpu/drm/panfrost/panfrost_job.c | 24 +
  drivers/gpu/drm/panfrost/panfrost_job.h |  5 ++
  12 files changed, 191 insertions(+), 1 deletion(-)
  create mode 100644 drivers/gpu/drm/panfrost/panfrost_debugfs.c
  create mode 100644 drivers/gpu/drm/panfrost/panfrost_debugfs.h

diff --git a/drivers/gpu/drm/panfrost/Makefile 
b/drivers/gpu/drm/panfrost/Makefile
index 7da2b3f02ed9..2c01c1e7523e 100644
--- a/drivers/gpu/drm/panfrost/Makefile
+++ b/drivers/gpu/drm/panfrost/Makefile
@@ -12,4 +12,6 @@ panfrost-y := \
panfrost_perfcnt.o \
panfrost_dump.o
  
+panfrost-$(CONFIG_DEBUG_FS) += panfrost_debugfs.o

+
  obj-$(CONFIG_DRM_PANFROST) += panfrost.o
diff --git a/drivers/gpu/drm/panfrost/panfrost_debugfs.c 
b/drivers/gpu/drm/panfrost/panfrost_debugfs.c
new file mode 100644
index ..cc14eccba206
--- /dev/null
+++ b/drivers/gpu/drm/panfrost/panfrost_debugfs.c
@@ -0,0 +1,20 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright 2023 Collabora ltd. */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "panfrost_device.h"
+#include "panfrost_gpu.h"
+#include "panfrost_debugfs.h"
+
+void panfrost_debugfs_init(struct drm_minor *minor)
+{
+   struct drm_device *dev = minor->dev;
+   struct panfrost_device *pfdev = 
platform_get_drvdata(to_platform_device(dev->dev));
+
+   debugfs_create_atomic_t("profile", 0600, minor->debugfs_root, 
>profile_mode);
+}
diff --git a/drivers/gpu/drm/panfrost/panfrost_debugfs.h 
b/drivers/gpu/drm/panfrost/panfrost_debugfs.h
new file mode 100644
index ..db1c158bcf2f
--- /dev/null
+++ b/drivers/gpu/drm/panfrost/panfrost_debugfs.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright 2023 Collabora ltd.
+ */
+
+#ifndef PANFROST_DEBUGFS_H
+#define PANFROST_DEBUGFS_H
+
+#ifdef CONFIG_DEBUG_FS
+void panfrost_debugfs_init(struct drm_minor *minor);
+#endif
+
+#endif  /* PANFROST_DEBUGFS_H */
diff --git a/drivers/gpu/drm/panfrost/panfrost_devfreq.c 
b/drivers/gpu/drm/panfrost/panfrost_devfreq.c
index 58dfb15a8757..28caffc689e2 100644
--- a/drivers/gpu/drm/panfrost/panfrost_devfreq.c
+++ b/drivers/gpu/drm/panfrost/panfrost_devfreq.c
@@ -58,6 +58,7 @@ static int panfrost_devfreq_get_dev_status(struct device *dev,
spin_lock_irqsave(>lock, irqflags);
  
  	panfrost_devfreq_update_utilization(pfdevfreq);

+   pfdevfreq->current_frequency = status->current_frequency;
  
  	status->total_time = ktime_to_ns(ktime_add(pfdevfreq->busy_time,

   pfdevfreq->idle_time));
@@ -117,6 +118,7 @@ int panfrost_devfreq_init(struct panfrost_device *pfdev)
struct devfreq *devfreq;
struct 

Re: [PATCH v6 6/6] drm/drm-file: Show finer-grained BO sizes in drm_show_memory_stats

2023-09-20 Thread Tvrtko Ursulin



On 20/09/2023 00:34, Adrián Larumbe wrote:

The current implementation will try to pick the highest available size
display unit as soon as the BO size exceeds that of the previous
multiplier. That can lead to loss of precision in contexts of low memory
usage.

The new selection criteria try to preserve precision, whilst also
increasing the display unit selection threshold to render more accurate
values.

Signed-off-by: Adrián Larumbe 
Reviewed-by: Boris Brezillon 
Reviewed-by: Steven Price 
---
  drivers/gpu/drm/drm_file.c | 5 -
  1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/drm_file.c b/drivers/gpu/drm/drm_file.c
index 762965e3d503..34cfa128ffe5 100644
--- a/drivers/gpu/drm/drm_file.c
+++ b/drivers/gpu/drm/drm_file.c
@@ -872,6 +872,8 @@ void drm_send_event(struct drm_device *dev, struct 
drm_pending_event *e)
  }
  EXPORT_SYMBOL(drm_send_event);
  
+#define UPPER_UNIT_THRESHOLD 100

+
  static void print_size(struct drm_printer *p, const char *stat,
   const char *region, u64 sz)
  {
@@ -879,7 +881,8 @@ static void print_size(struct drm_printer *p, const char 
*stat,
unsigned u;
  
  	for (u = 0; u < ARRAY_SIZE(units) - 1; u++) {

-   if (sz < SZ_1K)
+   if ((sz & (SZ_1K - 1)) &&


IS_ALIGNED worth it at all?


+   sz < UPPER_UNIT_THRESHOLD * SZ_1K)
break;


Excuse me for a late comment (I was away). I did not get what is 
special about a ~10% threshold? Sounds to me that just going with the lower 
unit, when the size is not aligned to the higher one, would be better than 
sometimes-precision-sometimes-not.


Regards,

Tvrtko


sz = div_u64(sz, SZ_1K);
}


Re: [Intel-gfx] [PATCH 5/5] drm/i915: Implement fdinfo memory stats printing

2023-09-20 Thread Tvrtko Ursulin



On 24/08/2023 12:35, Upadhyay, Tejas wrote:

-Original Message-
From: Intel-gfx  On Behalf Of Tvrtko
Ursulin
Sent: Friday, July 7, 2023 6:32 PM
To: intel-...@lists.freedesktop.org; dri-devel@lists.freedesktop.org
Subject: [Intel-gfx] [PATCH 5/5] drm/i915: Implement fdinfo memory stats
printing

From: Tvrtko Ursulin 

Use the newly added drm_print_memory_stats helper to show memory
utilisation of our objects in drm/driver specific fdinfo output.

To collect the stats we walk the per memory regions object lists and
accumulate object size into the respective drm_memory_stats categories.

Objects with multiple possible placements are reported in multiple regions for
total and shared sizes, while other categories are counted only for the
currently active region.

Signed-off-by: Tvrtko Ursulin 
Cc: Aravind Iddamsetty 
Cc: Rob Clark 
---
  drivers/gpu/drm/i915/i915_drm_client.c | 85 ++
  1 file changed, 85 insertions(+)

diff --git a/drivers/gpu/drm/i915/i915_drm_client.c
b/drivers/gpu/drm/i915/i915_drm_client.c
index ffccb6239789..5c77d6987d90 100644
--- a/drivers/gpu/drm/i915/i915_drm_client.c
+++ b/drivers/gpu/drm/i915/i915_drm_client.c
@@ -45,6 +45,89 @@ void __i915_drm_client_free(struct kref *kref)  }

  #ifdef CONFIG_PROC_FS
+static void
+obj_meminfo(struct drm_i915_gem_object *obj,
+   struct drm_memory_stats stats[INTEL_REGION_UNKNOWN]) {
+   struct intel_memory_region *mr;
+   u64 sz = obj->base.size;
+   enum intel_region_id id;
+   unsigned int i;
+
+   /* Attribute size and shared to all possible memory regions. */
+   for (i = 0; i < obj->mm.n_placements; i++) {
+   mr = obj->mm.placements[i];
+   id = mr->id;
+
+   if (obj->base.handle_count > 1)
+   stats[id].shared += sz;
+   else
+   stats[id].private += sz;
+   }
+
+   /* Attribute other categories to only the current region. */
+   mr = obj->mm.region;
+   if (mr)
+   id = mr->id;
+   else
+   id = INTEL_REGION_SMEM;
+
+   if (!obj->mm.n_placements) {
+   if (obj->base.handle_count > 1)
+   stats[id].shared += sz;
+   else
+   stats[id].private += sz;
+   }
+
+   if (i915_gem_object_has_pages(obj)) {
+   stats[id].resident += sz;
+
+   if (!dma_resv_test_signaled(obj->base.resv,
+   dma_resv_usage_rw(true)))


Should not DMA_RESV_USAGE_BOOKKEEP also considered active (why only "rw")? Some 
app is syncing with syncjobs and has added dma_fence with DMA_RESV_USAGE_BOOKKEEP during 
execbuf while that BO is busy on waiting on work!


Hmm do we have a path which adds DMA_RESV_USAGE_BOOKKEEP usage in execbuf?

Rob, any comments here? Given how I basically lifted the logic from 
686b21b5f6ca ("drm: Add fdinfo memory stats"), does it sound plausible 
to upgrade the test against all fences?


Regards,

Tvrtko


+   stats[id].active += sz;
+   else if (i915_gem_object_is_shrinkable(obj) &&
+obj->mm.madv == I915_MADV_DONTNEED)
+   stats[id].purgeable += sz;
+   }
+}
+
+static void show_meminfo(struct drm_printer *p, struct drm_file *file)
+{
+   struct drm_memory_stats stats[INTEL_REGION_UNKNOWN] = {};
+   struct drm_i915_file_private *fpriv = file->driver_priv;
+   struct i915_drm_client *client = fpriv->client;
+   struct drm_i915_private *i915 = fpriv->i915;
+   struct drm_i915_gem_object *obj;
+   struct intel_memory_region *mr;
+   struct list_head *pos;
+   unsigned int id;
+
+   /* Public objects. */
+   spin_lock(>table_lock);
+   idr_for_each_entry (>object_idr, obj, id)
+   obj_meminfo(obj, stats);
+   spin_unlock(>table_lock);
+
+   /* Internal objects. */
+   rcu_read_lock();
+   list_for_each_rcu(pos, >objects_list) {
+   obj = i915_gem_object_get_rcu(list_entry(pos, typeof(*obj),
+client_link));
+   if (!obj)
+   continue;
+   obj_meminfo(obj, stats);
+   i915_gem_object_put(obj);
+   }
+   rcu_read_unlock();
+
+   for_each_memory_region(mr, i915, id)
+   drm_print_memory_stats(p,
+  [id],
+  DRM_GEM_OBJECT_RESIDENT |
+  DRM_GEM_OBJECT_PURGEABLE,
+  mr->name);
+}
+
  static const char * const uabi_class_names[] = {
[I915_ENGINE_CLASS_RENDER] = "render",
[I915_ENGINE_CLASS_COPY

Re: [PATCH v2] drm: Update file owner during use

2023-09-20 Thread Tvrtko Ursulin



On 28/08/2023 20:58, Rob Clark wrote:

On Wed, Jun 21, 2023 at 2:48 AM Tvrtko Ursulin
 wrote:


From: Tvrtko Ursulin 

With the typical model where the display server opens the file descriptor
and then hands it over to the client(*), we were showing stale data in
debugfs.

Fix it by updating the drm_file->pid on ioctl access from a different
process.

The field is also made RCU protected to allow for lockless readers. Update
side is protected with dev->filelist_mutex.

Before:

$ cat /sys/kernel/debug/dri/0/clients
  command   pid dev master a   uid  magic
 Xorg  2344   0   yy 0  0
 Xorg  2344   0   ny 0  2
 Xorg  2344   0   ny 0  3
 Xorg  2344   0   ny 0  4

After:

$ cat /sys/kernel/debug/dri/0/clients
  command  tgid dev master a   uid  magic
 Xorg   830   0   yy 0  0
xfce4-session   880   0   ny 0  1
xfwm4   943   0   ny 0  2
neverball  1095   0   ny 0  3

*)
More detailed and historically accurate description of various handover
implementation kindly provided by Emil Velikov:

"""
The traditional model, the server was the orchestrator managing the
primary device node. From the fd, to the master status and
authentication. But looking at the fd alone, this has varied across
the years.

IIRC in the DRI1 days, Xorg (libdrm really) would have a list of open
fd(s) and reuse those whenever needed, DRI2 the client was responsible
for open() themselves and with DRI3 the fd was passed to the client.

Around the inception of DRI3 and systemd-logind, the latter became
another possible orchestrator. Whereby Xorg and Wayland compositors
could ask it for the fd. For various reasons (hysterical and genuine
ones) Xorg has a fallback path going the open(), whereas Wayland
compositors are moving to solely relying on logind... some never had
fallback even.

Over the past few years, more projects have emerged which provide
functionality similar (be that on API level, Dbus, or otherwise) to
systemd-logind.
"""

v2:
  * Fixed typo in commit text and added a fine historical explanation
    from Emil.

Signed-off-by: Tvrtko Ursulin 
Cc: "Christian König" 
Cc: Daniel Vetter 
Acked-by: Christian König 
Reviewed-by: Emil Velikov 


Reviewed-by: Rob Clark 
Tested-by: Rob Clark 


Thanks. If everyone else is happy with this approach I don't have the 
commit rights for drm-misc.


Regards,

Tvrtko




---
  drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c |  6 ++--
  drivers/gpu/drm/drm_auth.c  |  3 +-
  drivers/gpu/drm/drm_debugfs.c   | 10 ---
  drivers/gpu/drm/drm_file.c  | 40 +++--
  drivers/gpu/drm/drm_ioctl.c |  3 ++
  drivers/gpu/drm/nouveau/nouveau_drm.c   |  5 +++-
  drivers/gpu/drm/vmwgfx/vmwgfx_gem.c |  6 ++--
  include/drm/drm_file.h  | 13 ++--
  8 files changed, 71 insertions(+), 15 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
index 74055cba3dc9..849097dff02b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
@@ -963,6 +963,7 @@ static int amdgpu_debugfs_gem_info_show(struct seq_file *m, 
void *unused)
 list_for_each_entry(file, &dev->filelist, lhead) {
 struct task_struct *task;
 struct drm_gem_object *gobj;
+   struct pid *pid;
 int id;

 /*
@@ -972,8 +973,9 @@ static int amdgpu_debugfs_gem_info_show(struct seq_file *m, 
void *unused)
  * Therefore, we need to protect this ->comm access using RCU.
  */
 rcu_read_lock();
-   task = pid_task(file->pid, PIDTYPE_TGID);
-   seq_printf(m, "pid %8d command %s:\n", pid_nr(file->pid),
+   pid = rcu_dereference(file->pid);
+   task = pid_task(pid, PIDTYPE_TGID);
+   seq_printf(m, "pid %8d command %s:\n", pid_nr(pid),
task ? task->comm : "");
 rcu_read_unlock();

diff --git a/drivers/gpu/drm/drm_auth.c b/drivers/gpu/drm/drm_auth.c
index cf92a9ae8034..2ed2585ded37 100644
--- a/drivers/gpu/drm/drm_auth.c
+++ b/drivers/gpu/drm/drm_auth.c
@@ -235,7 +235,8 @@ static int drm_new_set_master(struct drm_device *dev, 
struct drm_file *fpriv)
  static int
  drm_master_check_perm(struct drm_device *dev, struct drm_file *file_priv)
  {
-   if (file_priv->pid == task_pid(current) && file_priv->was_master)
+   if (file_priv->was_master &&
+   rcu_access_pointer(file_priv->pid) == task_pid(current))
 return 0;

 if (!capable(CAP_SYS_

Re: [Intel-gfx] [PATCH] drm/i915: Do not disable preemption for resets

2023-09-20 Thread Tvrtko Ursulin



On 13/09/2023 18:04, Valentin Schneider wrote:

On Wed, 13 Sept 2023 at 18:48, Sebastian Andrzej Siewior
 wrote:


On 2023-07-05 10:30:25 [+0100], Tvrtko Ursulin wrote:

From: Tvrtko Ursulin 

Commit ade8a0f59844 ("drm/i915: Make all GPU resets atomic") added a
preempt disable section over the hardware reset callback to prepare the
driver for being able to reset from atomic contexts.

…

This missed the v6.6 merge window. Has this been dropped for some reason
or just missed by chance? Can this be still applied, please?



Just an FYI, but I happened to be looking at an internal bug report
for exactly this
error site, so +1 here :)


It looks like I failed to collect an r-b before the summer break and so it 
fell off my radar. Definitely want to merge it so I will try again.


Regards,

Tvrtko


[PATCH] drm/i915: Zap some empty lines

2023-09-20 Thread Tvrtko Ursulin
From: Tvrtko Ursulin 

Recent refactoring left an unsightly block of empty lines. Remove them.

Signed-off-by: Tvrtko Ursulin 
Cc: Dnyaneshwar Bhadane 
Cc: Anusha Srivatsa 
Cc: Radhakrishna Sripada 
---
 drivers/gpu/drm/i915/i915_drv.h | 7 ---
 1 file changed, 7 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 87ffc477c3b1..511eba3bbdba 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -646,13 +646,6 @@ IS_SUBPLATFORM(const struct drm_i915_private *i915,
 #define IS_TIGERLAKE_UY(i915) \
IS_SUBPLATFORM(i915, INTEL_TIGERLAKE, INTEL_SUBPLATFORM_UY)
 
-
-
-
-
-
-
-
 #define IS_XEHPSDV_GRAPHICS_STEP(__i915, since, until) \
(IS_XEHPSDV(__i915) && IS_GRAPHICS_STEP(__i915, since, until))
 
-- 
2.39.2



Re: [PATCH 1/8] drm/sched: Convert drm scheduler to use a work queue rather than kthread

2023-08-03 Thread Tvrtko Ursulin



On 03/08/2023 15:43, Matthew Brost wrote:

On Thu, Aug 03, 2023 at 11:11:13AM +0100, Tvrtko Ursulin wrote:


On 01/08/2023 21:50, Matthew Brost wrote:

In XE, the new Intel GPU driver, a choice has made to have a 1 to 1
mapping between a drm_gpu_scheduler and drm_sched_entity. At first this
seems a bit odd but let us explain the reasoning below.

1. In XE the submission order from multiple drm_sched_entity is not
guaranteed to be the same completion even if targeting the same hardware
engine. This is because in XE we have a firmware scheduler, the GuC,
which allowed to reorder, timeslice, and preempt submissions. If a using
shared drm_gpu_scheduler across multiple drm_sched_entity, the TDR falls
apart as the TDR expects submission order == completion order. Using a
dedicated drm_gpu_scheduler per drm_sched_entity solve this problem.

2. In XE submissions are done via programming a ring buffer (circular
buffer), a drm_gpu_scheduler provides a limit on number of jobs, if the
limit of number jobs is set to RING_SIZE / MAX_SIZE_PER_JOB we get flow
control on the ring for free.

A problem with this design is currently a drm_gpu_scheduler uses a
kthread for submission / job cleanup. This doesn't scale if a large
number of drm_gpu_scheduler are used. To work around the scaling issue,
use a worker rather than kthread for submission / job cleanup.

v2:
- (Rob Clark) Fix msm build
- Pass in run work queue
v3:
- (Boris) don't have loop in worker

Signed-off-by: Matthew Brost 
---
   drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c |  14 +-
   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c  |  14 +-
   drivers/gpu/drm/etnaviv/etnaviv_sched.c |   2 +-
   drivers/gpu/drm/lima/lima_sched.c   |   2 +-
   drivers/gpu/drm/msm/adreno/adreno_device.c  |   6 +-
   drivers/gpu/drm/msm/msm_ringbuffer.c|   2 +-
   drivers/gpu/drm/panfrost/panfrost_job.c |   2 +-
   drivers/gpu/drm/scheduler/sched_main.c  | 136 +++-
   drivers/gpu/drm/v3d/v3d_sched.c |  10 +-
   include/drm/gpu_scheduler.h |  14 +-
   10 files changed, 113 insertions(+), 89 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
index f60753f97ac5..9c2a10aeb0b3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -1489,9 +1489,9 @@ static int amdgpu_debugfs_test_ib_show(struct seq_file 
*m, void *unused)
for (i = 0; i < AMDGPU_MAX_RINGS; i++) {
struct amdgpu_ring *ring = adev->rings[i];
-   if (!ring || !ring->sched.thread)
+   if (!ring || !ring->sched.ready)
continue;
-   kthread_park(ring->sched.thread);
+   drm_sched_run_wq_stop(&ring->sched);


It would be good to split out adding of these wrappers (including adding one
for ring->sched.thread/ready) to a standalong preceding patch. That way at
least some mechanical changes to various drivers would be separated from
functional changes.



Sure.
  

Also, perhaps do not have the wq in the name if it is not really needed to
be verbose with the underlying implementation like that? Like would
drm_sched_run/pause. Or even __drm_sched_start/stop, dunno, just an idea.



Sure.
  

}
seq_printf(m, "run ib test:\n");
@@ -1505,9 +1505,9 @@ static int amdgpu_debugfs_test_ib_show(struct seq_file 
*m, void *unused)
for (i = 0; i < AMDGPU_MAX_RINGS; i++) {
struct amdgpu_ring *ring = adev->rings[i];
-   if (!ring || !ring->sched.thread)
+   if (!ring || !ring->sched.ready)
continue;
-   kthread_unpark(ring->sched.thread);
+   drm_sched_run_wq_start(&ring->sched);
}
 up_write(&adev->reset_domain->sem);
@@ -1727,7 +1727,7 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 val)
ring = adev->rings[val];
-   if (!ring || !ring->funcs->preempt_ib || !ring->sched.thread)
+   if (!ring || !ring->funcs->preempt_ib || !ring->sched.ready)
return -EINVAL;
/* the last preemption failed */
@@ -1745,7 +1745,7 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 val)
goto pro_end;
/* stop the scheduler */
-   kthread_park(ring->sched.thread);
+   drm_sched_run_wq_stop(&ring->sched);
/* preempt the IB */
r = amdgpu_ring_preempt_ib(ring);
@@ -1779,7 +1779,7 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 val)
   failure:
/* restart the scheduler */
-   kthread_unpark(ring->sched.thread);
+   drm_sched_run_wq_start(&ring->sched);
 up_read(&adev->reset_domain->sem);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index fac9312b1695..00c9c03c8f94 100644
--- a/drivers/gpu/drm/amd/amdgpu/amd

Re: [PATCH 1/8] drm/sched: Convert drm scheduler to use a work queue rather than kthread

2023-08-03 Thread Tvrtko Ursulin



On 03/08/2023 15:56, Christian König wrote:

Am 03.08.23 um 16:43 schrieb Matthew Brost:

On Thu, Aug 03, 2023 at 11:11:13AM +0100, Tvrtko Ursulin wrote:

On 01/08/2023 21:50, Matthew Brost wrote:
[SNIP]

   sched->ops = ops;
   sched->hw_submission_limit = hw_submission;
   sched->name = name;
+    sched->run_wq = run_wq ? : system_wq;
I still think it is not nice to implicitly move everyone over to the 
shared
system wq. Maybe even more so with now one at a time execution, since 
effect

on latency can be even greater.


No one that has a stake in this has pushed back that I can recall. Open
to feedback stakeholders (maintainers of drivers that use the drm
scheduler).

>
No objections to using the system_wq here. Drivers can still pass in 
their own or simply use system_highpri_wq instead.


Additional to that the system_wq isn't single threaded, it will create 
as much threads as needed to fully utilize all CPUs.



  The i915 doesn't use the DRM scheduler last time I looked.
Has that changed?
Have you considered kthread_work as a backend? Maybe it would work to 
have
callers pass in a kthread_worker they create, or provide a drm_sched 
helper

to create one, which would then be passed to drm_sched_init.

That would enable per driver kthread_worker, or per device, or whatever
granularity each driver would want/need/desire.

driver init:
struct drm_sched_worker = drm_sched_create_worker(...);

queue/whatever init:
drm_sched_init(.., worker, ...);


This idea doesn't seem to work for varitey of reasons. Will type it out
if needed but not going to spend time on this unless someone with a
stake raises this as an issue.


Agree completely. kthread_work is for real time workers IIRC.


AFAIK it is indicated if one needs to tweak the kthread priority, but 
that is not the only use case.


I am curious to know why the idea does not work for variety of reasons.


You could create one inside drm_sched_init if not passed in, which would
keep the behaviour for existing drivers more similar - they would 
still have

a 1:1 kthread context for their exclusive use.


Part of the idea of a work queue is so a user can't directly create a
kthread via an IOCTL (XE_EXEC_QUEUE_CREATE). What you suggesting exposes
this issue.


Yeah, prevent that is indeed a very good idea.


Nope, I wasn't suggesting that at all.

I was suggesting as many kthread_workers (these are threads) as the 
implementation wants. Xe can create one per device. Someone else can 
create one per hw engine, whatever.


One kthread_*work* per entity does not mean one thread per 
XE_EXEC_QUEUE_CREATE. Kthread_work is just a unit of work executed by 
the kthread_worker thread. Same in that conceptual relationship as 
workqueue and workitem.


Difference is it may work better for single-shot re-arming design if 
regression in submission latency concerns any stakeholders.


And I *think* self-re-arming would be less problematic latency wise 
since
kthread_worker consumes everything queued without relinquishing 
control and
execution context would be guaranteed not to be shared with random 
system

stuff.


So this is essentially so we can use a loop? Seems like a lot effort for
what is pure speculation. Again if a stakeholder raises an issue we can
address then.


Instead of a loop what you usually do in the worker is to submit one 
item (if possible) and then re-queue yourself if there is more work to do.


This way you give others chance to run as well and/or cancel the work 
etc...


Yeah I was pointing out loop in the worker was bad months ago (or more) 
so it is not about that. Here my point is whether it can be done better 
than silently convert everyone to system_wq.


Hence my proposal is to *keep* closer to the thread semantics for 
everyone and at the same time _allow_ the option of custom 
workqueue/whatever.


Where is the problem there?

Regards,

Tvrtko


Re: [PATCH 1/8] drm/sched: Convert drm scheduler to use a work queue rather than kthread

2023-08-03 Thread Tvrtko Ursulin



On 01/08/2023 21:50, Matthew Brost wrote:

In XE, the new Intel GPU driver, a choice has made to have a 1 to 1
mapping between a drm_gpu_scheduler and drm_sched_entity. At first this
seems a bit odd but let us explain the reasoning below.

1. In XE the submission order from multiple drm_sched_entity is not
guaranteed to be the same completion even if targeting the same hardware
engine. This is because in XE we have a firmware scheduler, the GuC,
which allowed to reorder, timeslice, and preempt submissions. If a using
shared drm_gpu_scheduler across multiple drm_sched_entity, the TDR falls
apart as the TDR expects submission order == completion order. Using a
dedicated drm_gpu_scheduler per drm_sched_entity solve this problem.

2. In XE submissions are done via programming a ring buffer (circular
buffer), a drm_gpu_scheduler provides a limit on number of jobs, if the
limit of number jobs is set to RING_SIZE / MAX_SIZE_PER_JOB we get flow
control on the ring for free.

A problem with this design is currently a drm_gpu_scheduler uses a
kthread for submission / job cleanup. This doesn't scale if a large
number of drm_gpu_scheduler are used. To work around the scaling issue,
use a worker rather than kthread for submission / job cleanup.

v2:
   - (Rob Clark) Fix msm build
   - Pass in run work queue
v3:
   - (Boris) don't have loop in worker

Signed-off-by: Matthew Brost 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c |  14 +-
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c  |  14 +-
  drivers/gpu/drm/etnaviv/etnaviv_sched.c |   2 +-
  drivers/gpu/drm/lima/lima_sched.c   |   2 +-
  drivers/gpu/drm/msm/adreno/adreno_device.c  |   6 +-
  drivers/gpu/drm/msm/msm_ringbuffer.c|   2 +-
  drivers/gpu/drm/panfrost/panfrost_job.c |   2 +-
  drivers/gpu/drm/scheduler/sched_main.c  | 136 +++-
  drivers/gpu/drm/v3d/v3d_sched.c |  10 +-
  include/drm/gpu_scheduler.h |  14 +-
  10 files changed, 113 insertions(+), 89 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
index f60753f97ac5..9c2a10aeb0b3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -1489,9 +1489,9 @@ static int amdgpu_debugfs_test_ib_show(struct seq_file 
*m, void *unused)
for (i = 0; i < AMDGPU_MAX_RINGS; i++) {
struct amdgpu_ring *ring = adev->rings[i];
  
-		if (!ring || !ring->sched.thread)

+   if (!ring || !ring->sched.ready)
continue;
-   kthread_park(ring->sched.thread);
+   drm_sched_run_wq_stop(&ring->sched);


It would be good to split out adding of these wrappers (including adding 
one for ring->sched.thread/ready) to a standalong preceding patch. That 
way at least some mechanical changes to various drivers would be 
separated from functional changes.


Also, perhaps do not have the wq in the name if it is not really needed 
to be verbose with the underlying implementation like that? Like would 
drm_sched_run/pause. Or even __drm_sched_start/stop, dunno, just an idea.



}
  
  	seq_printf(m, "run ib test:\n");

@@ -1505,9 +1505,9 @@ static int amdgpu_debugfs_test_ib_show(struct seq_file 
*m, void *unused)
for (i = 0; i < AMDGPU_MAX_RINGS; i++) {
struct amdgpu_ring *ring = adev->rings[i];
  
-		if (!ring || !ring->sched.thread)

+   if (!ring || !ring->sched.ready)
continue;
-   kthread_unpark(ring->sched.thread);
+   drm_sched_run_wq_start(&ring->sched);
}
  
   	up_write(&adev->reset_domain->sem);

@@ -1727,7 +1727,7 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 val)
  
  	ring = adev->rings[val];
  
-	if (!ring || !ring->funcs->preempt_ib || !ring->sched.thread)

+   if (!ring || !ring->funcs->preempt_ib || !ring->sched.ready)
return -EINVAL;
  
  	/* the last preemption failed */

@@ -1745,7 +1745,7 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 val)
goto pro_end;
  
  	/* stop the scheduler */

-   kthread_park(ring->sched.thread);
+   drm_sched_run_wq_stop(&ring->sched);
  
  	/* preempt the IB */

r = amdgpu_ring_preempt_ib(ring);
@@ -1779,7 +1779,7 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 val)
  
  failure:

/* restart the scheduler */
-   kthread_unpark(ring->sched.thread);
+   drm_sched_run_wq_start(&ring->sched);
  
   	up_read(&adev->reset_domain->sem);
  
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

index fac9312b1695..00c9c03c8f94 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2364,7 +2364,7 @@ static int amdgpu_device_init_schedulers(struct 
amdgpu_device *adev)
break;
}
  
-		r = drm_sched_init(&ring->sched, &amdgpu_sched_ops,

+   

Re: [PATCH 5/5] drm/i915: Implement fdinfo memory stats printing

2023-08-03 Thread Tvrtko Ursulin



On 03/08/2023 06:15, Iddamsetty, Aravind wrote:

On 27-07-2023 15:43, Tvrtko Ursulin wrote:

From: Tvrtko Ursulin 

Use the newly added drm_print_memory_stats helper to show memory
utilisation of our objects in drm/driver specific fdinfo output.

To collect the stats we walk the per memory regions object lists
and accumulate object size into the respective drm_memory_stats
categories.

Objects with multiple possible placements are reported in multiple
regions for total and shared sizes, while other categories are
counted only for the currently active region.

Signed-off-by: Tvrtko Ursulin 
Cc: Aravind Iddamsetty 
Cc: Rob Clark 
---
  drivers/gpu/drm/i915/i915_drm_client.c | 85 ++
  1 file changed, 85 insertions(+)

diff --git a/drivers/gpu/drm/i915/i915_drm_client.c 
b/drivers/gpu/drm/i915/i915_drm_client.c
index a61356012df8..9e7a6075ee25 100644
--- a/drivers/gpu/drm/i915/i915_drm_client.c
+++ b/drivers/gpu/drm/i915/i915_drm_client.c
@@ -45,6 +45,89 @@ void __i915_drm_client_free(struct kref *kref)
  }
  
  #ifdef CONFIG_PROC_FS

+static void
+obj_meminfo(struct drm_i915_gem_object *obj,
+   struct drm_memory_stats stats[INTEL_REGION_UNKNOWN])
+{
+   struct intel_memory_region *mr;
+   u64 sz = obj->base.size;
+   enum intel_region_id id;
+   unsigned int i;
+
+   /* Attribute size and shared to all possible memory regions. */
+   for (i = 0; i < obj->mm.n_placements; i++) {
+   mr = obj->mm.placements[i];
+   id = mr->id;
+
+   if (obj->base.handle_count > 1)
+   stats[id].shared += sz;
+   else
+   stats[id].private += sz;
+   }
+
+   /* Attribute other categories to only the current region. */
+   mr = obj->mm.region;
+   if (mr)
+   id = mr->id;
+   else
+   id = INTEL_REGION_SMEM;
+
+   if (!obj->mm.n_placements) {


I guess we do not expect to have n_placements set to public objects, is
that right?


I think they are the only ones which can have placements. It is via 
I915_GEM_CREATE_EXT_MEMORY_REGIONS userspace is able to create them.


My main conundrum in this patch is a few lines above, the loop which 
adds shared and private.


Question is, if an object can be either smem or lmem, how do we want to 
report it? This patch adds the size for all possible regions and 
resident and active only to the currently active. But perhaps that is 
wrong. Maybe I should change it is only against the active region and 
multiple regions are just ignored. Then if object is migrated do access 
patterns or memory pressure, the total size would migrate too.


I think I was trying to achieve something here (have more visibility on 
what kind of backing store clients are allocating) which maybe does not 
work too well with the current categories.


Namely if userspace allocates say one 1MiB object with placement in 
either smem or lmem, and it is currently resident in lmem, I wanted it 
to show as:


 total-smem: 1 MiB
 resident-smem: 0
 total-lmem: 1 MiB
 resident-lmem: 1 MiB

To constantly show how in theory client could be using memory from 
either region. Maybe that is misleading and should instead be:


 total-smem: 0
 resident-smem: 0
 total-lmem: 1 MiB
 resident-lmem: 1 MiB

?

And then if/when the same object gets migrated to smem it changes to 
(lets assume it is also not resident any more but got swapped out):


 total-smem: 1 MiB
 resident-smem: 0
 total-lmem: 0
 resident-lmem: 0

Regards,

Tvrtko


+   if (obj->base.handle_count > 1)
+   stats[id].shared += sz;
+   else
+   stats[id].private += sz;
+   }
+
+   if (i915_gem_object_has_pages(obj)) {
+   stats[id].resident += sz;
+
+   if (!dma_resv_test_signaled(obj->base.resv,
+   dma_resv_usage_rw(true)))
+   stats[id].active += sz;
+   else if (i915_gem_object_is_shrinkable(obj) &&
+obj->mm.madv == I915_MADV_DONTNEED)
+   stats[id].purgeable += sz;
+   }
+}
+
+static void show_meminfo(struct drm_printer *p, struct drm_file *file)
+{
+   struct drm_memory_stats stats[INTEL_REGION_UNKNOWN] = {};
+   struct drm_i915_file_private *fpriv = file->driver_priv;
+   struct i915_drm_client *client = fpriv->client;
+   struct drm_i915_private *i915 = fpriv->i915;
+   struct drm_i915_gem_object *obj;
+   struct intel_memory_region *mr;
+   struct list_head *pos;
+   unsigned int id;
+
+   /* Public objects. */
+   spin_lock(&file->table_lock);
+   idr_for_each_entry(&file->object_idr, obj, id)
+   obj_meminfo(obj, stats);
+   spin_unlock(&file->table_lock);
+
+   /* Internal objects. */
+   rcu_read_lock();
+   list_for_each_rcu(pos, &g

[PULL] drm-intel-fixes

2023-08-03 Thread Tvrtko Ursulin
Hi Dave, Daniel,

Some fixes for the 6.5 RC this week: one for GVT display I2C handling,
which came via gvt-fixes merge, one for premature freeing of request
memory, and finally one fix for Gen12 AUX invalidatation flow to correctly
align it with the documented sequence.

Regards,

Tvrtko

drm-intel-fixes-2023-08-03:
- Fix bug in getting msg length in AUX CH registers handler [gvt] (Yan Zhao)
- Gen12 AUX invalidation fixes [gt] (Andi Shyti, Jonathan Cavitt)
- Fix premature release of request's reusable memory (Janusz Krzysztofik)

- Merge tag 'gvt-fixes-2023-08-02' of https://github.com/intel/gvt-linux into 
drm-intel-fixes (Tvrtko Ursulin)
The following changes since commit 5d0c230f1de8c7515b6567d9afba1f196fb4e2f4:

  Linux 6.5-rc4 (2023-07-30 13:23:47 -0700)

are available in the Git repository at:

  git://anongit.freedesktop.org/drm/drm-intel tags/drm-intel-fixes-2023-08-03

for you to fetch changes up to 0bc057eae2610c275361766a064a23cc2758f3ff:

  Merge tag 'gvt-fixes-2023-08-02' of https://github.com/intel/gvt-linux into 
drm-intel-fixes (2023-08-02 08:14:57 +0100)


- Fix bug in getting msg length in AUX CH registers handler [gvt] (Yan Zhao)
- Gen12 AUX invalidation fixes [gt] (Andi Shyti, Jonathan Cavitt)
- Fix premature release of request's reusable memory (Janusz Krzysztofik)

- Merge tag 'gvt-fixes-2023-08-02' of https://github.com/intel/gvt-linux into 
drm-intel-fixes (Tvrtko Ursulin)


Andi Shyti (5):
  drm/i915/gt: Cleanup aux invalidation registers
  drm/i915: Add the gen12_needs_ccs_aux_inv helper
  drm/i915/gt: Rename flags with bit_group_X according to the datasheet
  drm/i915/gt: Enable the CCS_FLUSH bit in the pipe control and in the CS
  drm/i915/gt: Support aux invalidation on all engines

Janusz Krzysztofik (1):
  drm/i915: Fix premature release of request's reusable memory

Jonathan Cavitt (2):
  drm/i915/gt: Ensure memory quiesced before invalidation
  drm/i915/gt: Poll aux invalidation register bit on invalidation

Tvrtko Ursulin (1):
  Merge tag 'gvt-fixes-2023-08-02' of https://github.com/intel/gvt-linux 
into drm-intel-fixes

Yan Zhao (1):
  drm/i915/gvt: Fix bug in getting msg length in AUX CH registers handler

 drivers/gpu/drm/i915/gt/gen8_engine_cs.c | 140 ++-
 drivers/gpu/drm/i915/gt/gen8_engine_cs.h |  21 ++--
 drivers/gpu/drm/i915/gt/intel_gpu_commands.h |   2 +
 drivers/gpu/drm/i915/gt/intel_gt_regs.h  |  16 +--
 drivers/gpu/drm/i915/gt/intel_lrc.c  |  17 +---
 drivers/gpu/drm/i915/gvt/edid.c  |   2 +-
 drivers/gpu/drm/i915/i915_active.c   |  99 +--
 drivers/gpu/drm/i915/i915_request.c  |  11 +++
 8 files changed, 199 insertions(+), 109 deletions(-)


Re: [PATCH 16/17] cgroup/drm: Expose memory stats

2023-07-28 Thread Tvrtko Ursulin



One additional thought on one sub-topic:

On 27/07/2023 18:08, Tvrtko Ursulin wrote:

[snip]

For something like this,  you would probably want it to work inside 
the drm scheduler first. Presumably, this can be done by setting a 
weight on each runqueue, and perhaps adding a callback to update one 
for a running queue. Calculating the weights hierarchically might be 
fun..


It is not needed to work in drm scheduler first. In fact drm 
scheduler based drivers can plug into what I have since it already 
has the notion of scheduling priorities.


They would only need to implement a hook which allow the cgroup 
controller to query client GPU utilisation and another to received 
the over budget signal.


Amdgpu and msm AFAIK could be easy candidates because they both 
support per client utilisation and priorities.


Looks like I need to put all this info back into the cover letter.

Also, hierarchic weights and time budgets are all already there. What 
could be done later is make this all smarter and respect the time 
budget with more precision. That would however, in many cases 
including Intel, require co-operation with the firmware. In any case 
it is only work in the implementation, while the cgroup control 
interface remains the same.


I have taken a look at how the rest of cgroup controllers change 
ownership when moved to a different cgroup, and the answer was: not 
at all. If we attempt to create the scheduler controls only on the 
first time the fd is used, you could probably get rid of all the 
tracking.


Can you send a CPU file descriptor from process A to process B and 
have CPU usage belonging to process B show up in process' A cgroup, 
or vice-versa? Nope, I am not making any sense, am I? My point being 
it is not like-to-like, model is different.


No ownership transfer would mean in wide deployments all GPU 
utilisation would be assigned to Xorg and so there is no point to any 
of this. No way to throttle a cgroup with un-important GPU clients 
for instance.
If you just grab the current process' cgroup when a drm_sched_entity 
is created, you don't have everything charged to X.org. No need for 
complicated ownership tracking in drm_file. The same equivalent should 
be done in i915 as well when a context is created as it's not using 
the drm scheduler.


Okay so essentially nuking the concept of DRM clients belongs to one 
cgroup and instead tracking at the context level. That is an interesting 
idea. I suspect implementation could require somewhat generalizing the 
concept of an "execution context", or at least expressing it via the DRM 
cgroup controller.


I can give this a spin, or at least some more detailed thought, once we 
close on a few more details regarding charging in general.


I didn't get much time to brainstorm this just yet, only one downside 
randomly came to mind later - with this approach for i915 we wouldn't 
correctly attribute any GPU activity done in the receiving process 
against our default contexts. Those would still be accounted to the 
sending process.


How much problem in practice that would be remains to be investigated, 
including if it applies to other drivers too. If there is a good amount 
of deployed userspace which use the default context, then it would be a 
bit messy.


Regards,

Tvrtko

*) For non DRM and non i915 people, default context is a GPU submission 
context implicitly created during the device node open. It always 
remains valid, including in the receiving process if SCM_RIGHTS is used.


Re: [RFC 4/8] drm/i915: Refactor PAT/object cache handling

2023-07-28 Thread Tvrtko Ursulin



On 28/07/2023 08:14, Yang, Fei wrote:

[snip]

@@ -41,14 +42,17 @@ static bool gpu_write_needs_clflush(struct 
drm_i915_gem_object *obj)
   return false;

   /*
-  * For objects created by userspace through GEM_CREATE with pat_index
-  * set by set_pat extension, i915_gem_object_has_cache_level() will
-  * always return true, because the coherency of such object is managed


i915_gem_object_has_cache_level() always return true means this function
always return false.


-  * by userspace. Othereise the call here would fall back to checking
-  * whether the object is un-cached or write-through.
+  * Always flush cache for UMD objects with PAT index set.


(obj->pat_set_by_user == true) indicates UMD knows how to handle the coherency,
forcing clflush in KMD would be redundant.


For Meteorlake I made gpu_write_needs_clflush() always return false anyway.

Could you please submit a patch with kerneldoc for i915_drm.h explaining 
what the set domain ioctl is expected to do when set pat extension is 
used? With the focus on the use cases of how userspace is managing 
coherency using it, or it isn't, or what.



*/
- return !(i915_gem_object_has_cache_level(obj, I915_CACHE_NONE) ||
-  i915_gem_object_has_cache_level(obj, I915_CACHE_WT));
+ if (obj->pat_set_by_user)
+ return true;


return false;


Oops, thank you! I did warn in the cover letter I was getting confused 
by boolean logic conversions, cross-referencing three versions, and 
extracting the pat_set_by_user to call sites. :)



+
+ /*
+  * Fully coherent cached access may end up with data in the CPU cache
+  * which hasn't hit memory yet.
+  */
+ return i915_gem_object_has_cache_mode(obj, I915_CACHE_MODE_WB) &&
+i915_gem_object_has_cache_flag(obj, I915_CACHE_FLAG_COH2W);


Why checking COH2W here? The logic was, if UC or WT return false, otherwise
return true. So, as long as cache_mode is WB, it's sufficient to say true
here, right?


I was trying to penetrate the reason behind the check.

Original code was:

   return !(obj->cache_level == I915_CACHE_NONE ||
obj->cache_level == I915_CACHE_WT);

Which is equivalent to "is it WB", right? (Since it matches on both old 
LLC flavours.)


Which I thought, in the context of this function, is supposed to answer 
the question of "can there be data in the shared cache written by the 
GPU but not committed to RAM yet".


And then I thought that can only ever happen with 2-way coherency. 
Otherwise GPU writes never end up in the CPU cache.


Did I get that wrong? Maybe I have..

Regards,

Tvrtko


Re: [RFC 7/8] drm/i915: Lift the user PAT restriction from use_cpu_reloc

2023-07-28 Thread Tvrtko Ursulin



On 28/07/2023 01:09, Matt Roper wrote:

On Thu, Jul 27, 2023 at 03:55:03PM +0100, Tvrtko Ursulin wrote:

From: Tvrtko Ursulin 

Now that i915 understands the caching modes behind PAT indices, we can
refine the check in use_cpu_reloc() to not reject the uncached PAT if it
was set by userspace.

Instead it can decide based on the presence of full coherency which
should be functionally equivalent on legacy platforms. We can ignore WT
since it is only used by the display, and we can ignore Meteorlake since
it will fail on the existing "has_llc" condition before the object cache
mode check.

Signed-off-by: Tvrtko Ursulin 
Cc: Fei Yang 
Cc: Matt Roper 
---
  drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c | 9 +
  1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c 
b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
index 9d6e49c8a4c6..f74b33670bad 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
@@ -640,16 +640,9 @@ static inline int use_cpu_reloc(const struct reloc_cache 
*cache,
if (DBG_FORCE_RELOC == FORCE_GTT_RELOC)
return false;
  
-	/*

-* For objects created by userspace through GEM_CREATE with pat_index
-* set by set_pat extension, i915_gem_object_has_cache_level() always
-* return true, otherwise the call would fall back to checking whether
-* the object is un-cached.
-*/
return (cache->has_llc ||
obj->cache_dirty ||
-   !(obj->pat_set_by_user ||
- i915_gem_object_has_cache_mode(obj, I915_CACHE_MODE_UC)));
+   i915_gem_object_has_cache_flag(obj, I915_CACHE_FLAG_COH2W));


My understanding of relocations is minimal, but does 2W actually matter
here (CPU snooping GPU caches)?  I would have expected only 1W coherency
to be necessary (GPU snooping CPU caches)?


I struggled with this one. Original code was:

return (cache->has_llc ||
obj->cache_dirty ||
obj->cache_level != I915_CACHE_NONE);

And I struggled to figure out the intent. It is not "don't do CPU 
relocations for uncached" because it will do them when LLC or dirty 
regardless.


You could be right.. can we interpret it as any mode apart from uncached 
was viewed as coherent for CPU writes being seen by the GPU?


In which case should/could it be based on I915_BO_CACHE_COHERENT_FOR_WRITE?

Regards,

Tvrtko




Matt


  }
  
  static int eb_reserve_vma(struct i915_execbuffer *eb,

--
2.39.2





Re: [RFC 4/8] drm/i915: Refactor PAT/object cache handling

2023-07-28 Thread Tvrtko Ursulin



Forgot one part of your reply:

On 28/07/2023 00:57, Matt Roper wrote:

On Thu, Jul 27, 2023 at 03:55:00PM +0100, Tvrtko Ursulin wrote:

From: Tvrtko Ursulin 

Commit 9275277d5324 ("drm/i915: use pat_index instead of cache_level") has
introduced PAT indices to i915 internal APIs, partially replacing the
usage of driver internal cache_level, but has also added a few sub-
optimal design decisions which this patch tries to improve upon.

Principal change here is to invert the per platform cache level to PAT
index table which was added by the referenced commit, and by doing so
enable i915 to understand the cache mode between PAT indices, changing
them from opaque to transparent.

Once we have the inverted table we are able to remove the hidden false
"return true" from i915_gem_object_has_cache_level and make the involved
code path clearer.

To achieve this we replace the enum i915_cache_level with i915_cache_t,
composed of a more detailed representation of each cache mode (base mode
plus flags).

In this way we are able to express the differences between different
write-back mode coherency settings on Meteorlake, which in turn enables us
to map the i915 "cached" mode to the correct Meteorlake PAT index.

We can also replace the platform dependent cache mode to string code in
debugfs and elsewhere by the single implementation based on i915_cache_t.

v2:
  * Fix PAT-to-cache-mode table for PVC. (Fei)
  * Cache display caching mode too. (Fei)
  * Improve and document criteria in i915_gem_object_can_bypass_llc() (Matt)

v3:
  * Checkpatch issues.
  * Cache mode flags check fixed.

v4:
  * Fix intel_device_info->cache_modes array size. (Matt)
  * Boolean cache mode and flags query. (Matt)
  * Reduce number of cache macros with some macro magic.
  * One more checkpatch fix.
  * Tweak tables to show legacy and Gen12 WB is fully coherent.

Signed-off-by: Tvrtko Ursulin 
References: 9275277d5324 ("drm/i915: use pat_index instead of cache_level")
Cc: Chris Wilson 
Cc: Fei Yang 
Cc: Andi Shyti 
Cc: Matt Roper 
---
  drivers/gpu/drm/i915/gem/i915_gem_domain.c|  60 +
  drivers/gpu/drm/i915/gem/i915_gem_domain.h|   5 +-
  .../gpu/drm/i915/gem/i915_gem_execbuffer.c|   3 +-
  drivers/gpu/drm/i915/gem/i915_gem_internal.c  |   2 +-
  drivers/gpu/drm/i915/gem/i915_gem_mman.c  |   4 +-
  drivers/gpu/drm/i915/gem/i915_gem_object.c| 117 ++
  drivers/gpu/drm/i915/gem/i915_gem_object.h|  11 +-
  .../gpu/drm/i915/gem/i915_gem_object_types.h  | 116 +
  drivers/gpu/drm/i915/gem/i915_gem_shmem.c |   8 +-
  drivers/gpu/drm/i915/gem/i915_gem_stolen.c|   2 +-
  drivers/gpu/drm/i915/gem/i915_gem_ttm_move.c  |  20 +--
  drivers/gpu/drm/i915/gem/i915_gem_userptr.c   |   2 +-
  .../drm/i915/gem/selftests/huge_gem_object.c  |   2 +-
  .../gpu/drm/i915/gem/selftests/huge_pages.c   |   3 +-
  drivers/gpu/drm/i915/gt/gen8_ppgtt.c  |  10 +-
  drivers/gpu/drm/i915/gt/intel_engine_cs.c |   2 +-
  drivers/gpu/drm/i915/gt/intel_ggtt.c  |  25 ++--
  drivers/gpu/drm/i915/gt/intel_ggtt_gmch.c |   4 +-
  drivers/gpu/drm/i915/gt/intel_gtt.c   |   2 +-
  drivers/gpu/drm/i915/gt/intel_gtt.h   |   3 +-
  drivers/gpu/drm/i915/gt/intel_ppgtt.c |   6 +-
  .../gpu/drm/i915/gt/intel_ring_submission.c   |   4 +-
  drivers/gpu/drm/i915/gt/intel_timeline.c  |   2 +-
  drivers/gpu/drm/i915/gt/selftest_hangcheck.c  |   2 +-
  .../gpu/drm/i915/gt/selftest_workarounds.c|   2 +-
  drivers/gpu/drm/i915/i915_cache.c |  89 +++--
  drivers/gpu/drm/i915/i915_cache.h |  70 ++-
  drivers/gpu/drm/i915/i915_debugfs.c   |  53 ++--
  drivers/gpu/drm/i915/i915_driver.c|   4 +-
  drivers/gpu/drm/i915/i915_gem.c   |  13 --
  drivers/gpu/drm/i915/i915_pci.c   |  84 +++--
  drivers/gpu/drm/i915/i915_perf.c  |   2 +-
  drivers/gpu/drm/i915/intel_device_info.h  |   6 +-
  .../gpu/drm/i915/selftests/i915_gem_evict.c   |   4 +-
  drivers/gpu/drm/i915/selftests/igt_spinner.c  |   2 +-
  .../gpu/drm/i915/selftests/mock_gem_device.c  |  14 +--
  36 files changed, 391 insertions(+), 367 deletions(-)

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_domain.c 
b/drivers/gpu/drm/i915/gem/i915_gem_domain.c
index 57db9c581bf6..c15f83de33af 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_domain.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_domain.c
@@ -8,6 +8,7 @@
  #include "display/intel_frontbuffer.h"
  #include "gt/intel_gt.h"
  
+#include "i915_cache.h"

  #include "i915_drv.h"
  #include "i915_gem_clflush.h"
  #include "i915_gem_domain.h"
@@ -41,14 +42,17 @@ static bool gpu_write_needs_clflush(struct 
drm_i915_gem_object *obj)
return false;
  
  	/*

-* For objects created by userspace through GEM_CREATE with pat_index
-* 

Re: [RFC 4/8] drm/i915: Refactor PAT/object cache handling

2023-07-28 Thread Tvrtko Ursulin



On 28/07/2023 01:17, Matt Roper wrote:

On Thu, Jul 27, 2023 at 04:57:53PM -0700, Matt Roper wrote:

On Thu, Jul 27, 2023 at 03:55:00PM +0100, Tvrtko Ursulin wrote:

From: Tvrtko Ursulin 

Commit 9275277d5324 ("drm/i915: use pat_index instead of cache_level") has
introduced PAT indices to i915 internal APIs, partially replacing the
usage of driver internal cache_level, but has also added a few sub-
optimal design decisions which this patch tries to improve upon.

Principal change here is to invert the per platform cache level to PAT
index table which was added by the referenced commit, and by doing so
enable i915 to understand the cache mode between PAT indices, changing
them from opaque to transparent.

Once we have the inverted table we are able to remove the hidden false
"return true" from i915_gem_object_has_cache_level and make the involved
code path clearer.

To achieve this we replace the enum i915_cache_level with i915_cache_t,
composed of a more detailed representation of each cache mode (base mode
plus flags).

In this way we are able to express the differences between different
write-back mode coherency settings on Meteorlake, which in turn enables us
to map the i915 "cached" mode to the correct Meteorlake PAT index.

We can also replace the platform dependent cache mode to string code in
debugfs and elsewhere by the single implementation based on i915_cache_t.

v2:
  * Fix PAT-to-cache-mode table for PVC. (Fei)
  * Cache display caching mode too. (Fei)
  * Improve and document criteria in i915_gem_object_can_bypass_llc() (Matt)

v3:
  * Checkpatch issues.
  * Cache mode flags check fixed.

v4:
  * Fix intel_device_info->cache_modes array size. (Matt)
  * Boolean cache mode and flags query. (Matt)
  * Reduce number of cache macros with some macro magic.
  * One more checkpatch fix.
  * Tweak tables to show legacy and Gen12 WB is fully coherent.

Signed-off-by: Tvrtko Ursulin 
References: 9275277d5324 ("drm/i915: use pat_index instead of cache_level")
Cc: Chris Wilson 
Cc: Fei Yang 
Cc: Andi Shyti 
Cc: Matt Roper 
---
  drivers/gpu/drm/i915/gem/i915_gem_domain.c|  60 +
  drivers/gpu/drm/i915/gem/i915_gem_domain.h|   5 +-
  .../gpu/drm/i915/gem/i915_gem_execbuffer.c|   3 +-
  drivers/gpu/drm/i915/gem/i915_gem_internal.c  |   2 +-
  drivers/gpu/drm/i915/gem/i915_gem_mman.c  |   4 +-
  drivers/gpu/drm/i915/gem/i915_gem_object.c| 117 ++
  drivers/gpu/drm/i915/gem/i915_gem_object.h|  11 +-
  .../gpu/drm/i915/gem/i915_gem_object_types.h  | 116 +
  drivers/gpu/drm/i915/gem/i915_gem_shmem.c |   8 +-
  drivers/gpu/drm/i915/gem/i915_gem_stolen.c|   2 +-
  drivers/gpu/drm/i915/gem/i915_gem_ttm_move.c  |  20 +--
  drivers/gpu/drm/i915/gem/i915_gem_userptr.c   |   2 +-
  .../drm/i915/gem/selftests/huge_gem_object.c  |   2 +-
  .../gpu/drm/i915/gem/selftests/huge_pages.c   |   3 +-
  drivers/gpu/drm/i915/gt/gen8_ppgtt.c  |  10 +-
  drivers/gpu/drm/i915/gt/intel_engine_cs.c |   2 +-
  drivers/gpu/drm/i915/gt/intel_ggtt.c  |  25 ++--
  drivers/gpu/drm/i915/gt/intel_ggtt_gmch.c |   4 +-
  drivers/gpu/drm/i915/gt/intel_gtt.c   |   2 +-
  drivers/gpu/drm/i915/gt/intel_gtt.h   |   3 +-
  drivers/gpu/drm/i915/gt/intel_ppgtt.c |   6 +-
  .../gpu/drm/i915/gt/intel_ring_submission.c   |   4 +-
  drivers/gpu/drm/i915/gt/intel_timeline.c  |   2 +-
  drivers/gpu/drm/i915/gt/selftest_hangcheck.c  |   2 +-
  .../gpu/drm/i915/gt/selftest_workarounds.c|   2 +-
  drivers/gpu/drm/i915/i915_cache.c |  89 +++--
  drivers/gpu/drm/i915/i915_cache.h |  70 ++-
  drivers/gpu/drm/i915/i915_debugfs.c   |  53 ++--
  drivers/gpu/drm/i915/i915_driver.c|   4 +-
  drivers/gpu/drm/i915/i915_gem.c   |  13 --
  drivers/gpu/drm/i915/i915_pci.c   |  84 +++--
  drivers/gpu/drm/i915/i915_perf.c  |   2 +-
  drivers/gpu/drm/i915/intel_device_info.h  |   6 +-
  .../gpu/drm/i915/selftests/i915_gem_evict.c   |   4 +-
  drivers/gpu/drm/i915/selftests/igt_spinner.c  |   2 +-
  .../gpu/drm/i915/selftests/mock_gem_device.c  |  14 +--
  36 files changed, 391 insertions(+), 367 deletions(-)

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_domain.c 
b/drivers/gpu/drm/i915/gem/i915_gem_domain.c
index 57db9c581bf6..c15f83de33af 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_domain.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_domain.c
@@ -8,6 +8,7 @@
  #include "display/intel_frontbuffer.h"
  #include "gt/intel_gt.h"
  
+#include "i915_cache.h"

  #include "i915_drv.h"
  #include "i915_gem_clflush.h"
  #include "i915_gem_domain.h"
@@ -41,14 +42,17 @@ static bool gpu_write_needs_clflush(struct 
drm_i915_gem_object *obj)
return false;
  
  	/*

-* For objects created by userspace through GEM_CREATE with 

Re: [RFC 5/8] drm/i915: Improve the vm_fault_gtt user PAT index restriction

2023-07-28 Thread Tvrtko Ursulin



On 28/07/2023 01:04, Matt Roper wrote:

On Thu, Jul 27, 2023 at 03:55:01PM +0100, Tvrtko Ursulin wrote:

From: Tvrtko Ursulin 

Now that i915 understands the caching modes behind PAT indices, we can
refine the check in vm_fault_gtt() to not reject the uncached PAT if it
was set by userspace on a snoopable platform.

Signed-off-by: Tvrtko Ursulin 
Cc: Fei Yang 
Cc: Matt Roper 
---
  drivers/gpu/drm/i915/gem/i915_gem_mman.c | 14 +++---
  1 file changed, 3 insertions(+), 11 deletions(-)

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_mman.c 
b/drivers/gpu/drm/i915/gem/i915_gem_mman.c
index cd7f8ded0d6f..9aa6ecf68432 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_mman.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_mman.c
@@ -382,17 +382,9 @@ static vm_fault_t vm_fault_gtt(struct vm_fault *vmf)
goto err_reset;
}
  
-	/*

-* For objects created by userspace through GEM_CREATE with pat_index
-* set by set_pat extension, coherency is managed by userspace, make
-* sure we don't fail handling the vm fault by calling
-* i915_gem_object_has_cache_level() which always return true for such
-* objects. Otherwise this helper function would fall back to checking
-* whether the object is un-cached.
-*/
-   if (!((obj->pat_set_by_user ||
-  i915_gem_object_has_cache_mode(obj, I915_CACHE_MODE_UC)) ||
- HAS_LLC(i915))) {
+   /* Access to snoopable pages through the GTT is incoherent. */


This comment was removed in the previous patch, but now it came back
here.  Should we have just left it be in the previous patch?


Oops yes, fumble when splitting the single patch into this series.


I'm not really clear on what it means either.  Are we using "GTT" as
shorthand to refer to the aperture here?


It is about CPU mmap access so I think so.

Original code was:

/* Access to snoopable pages through the GTT is incoherent. */
if (obj->cache_level != I915_CACHE_NONE && !HAS_LLC(i915)) {
ret = -EFAULT;
goto err_unpin;
}

Which was disallowing anything not uncached on snoopable platforms. So I 
made it equivalent to that:


/* Access to snoopable pages through the GTT is incoherent. */
if (!i915_gem_object_has_cache_mode(obj, I915_CACHE_MODE_UC) &&
!HAS_LLC(i915)) {
ret = -EFAULT;
goto err_unpin;
}

Should be like-for-like assuming PAT-to-cache-mode tables are all good.

On Meteorlake it is no change in behaviour either way due !HAS_LLC.

Regards,

Tvrtko




Matt


+   if (!i915_gem_object_has_cache_mode(obj, I915_CACHE_MODE_UC) &&
+   !HAS_LLC(i915)) {
ret = -EFAULT;
goto err_unpin;
}
--
2.39.2





Re: [RFC 4/8] drm/i915: Refactor PAT/object cache handling

2023-07-28 Thread Tvrtko Ursulin



On 28/07/2023 00:57, Matt Roper wrote:

On Thu, Jul 27, 2023 at 03:55:00PM +0100, Tvrtko Ursulin wrote:

From: Tvrtko Ursulin 

Commit 9275277d5324 ("drm/i915: use pat_index instead of cache_level") has
introduced PAT indices to i915 internal APIs, partially replacing the
usage of driver internal cache_level, but has also added a few sub-
optimal design decisions which this patch tries to improve upon.

Principal change here is to invert the per platform cache level to PAT
index table which was added by the referenced commit, and by doing so
enable i915 to understand the cache mode between PAT indices, changing
them from opaque to transparent.

Once we have the inverted table we are able to remove the hidden false
"return true" from i915_gem_object_has_cache_level and make the involved
code path clearer.

To achieve this we replace the enum i915_cache_level with i915_cache_t,
composed of a more detailed representation of each cache mode (base mode
plus flags).

In this way we are able to express the differences between different
write-back mode coherency settings on Meteorlake, which in turn enables us
to map the i915 "cached" mode to the correct Meteorlake PAT index.

We can also replace the platform dependent cache mode to string code in
debugfs and elsewhere by the single implementation based on i915_cache_t.

v2:
  * Fix PAT-to-cache-mode table for PVC. (Fei)
  * Cache display caching mode too. (Fei)
  * Improve and document criteria in i915_gem_object_can_bypass_llc() (Matt)

v3:
  * Checkpatch issues.
  * Cache mode flags check fixed.

v4:
  * Fix intel_device_info->cache_modes array size. (Matt)
  * Boolean cache mode and flags query. (Matt)
  * Reduce number of cache macros with some macro magic.
  * One more checkpatch fix.
  * Tweak tables to show legacy and Gen12 WB is fully coherent.

Signed-off-by: Tvrtko Ursulin 
References: 9275277d5324 ("drm/i915: use pat_index instead of cache_level")
Cc: Chris Wilson 
Cc: Fei Yang 
Cc: Andi Shyti 
Cc: Matt Roper 
---
  drivers/gpu/drm/i915/gem/i915_gem_domain.c|  60 +
  drivers/gpu/drm/i915/gem/i915_gem_domain.h|   5 +-
  .../gpu/drm/i915/gem/i915_gem_execbuffer.c|   3 +-
  drivers/gpu/drm/i915/gem/i915_gem_internal.c  |   2 +-
  drivers/gpu/drm/i915/gem/i915_gem_mman.c  |   4 +-
  drivers/gpu/drm/i915/gem/i915_gem_object.c| 117 ++
  drivers/gpu/drm/i915/gem/i915_gem_object.h|  11 +-
  .../gpu/drm/i915/gem/i915_gem_object_types.h  | 116 +
  drivers/gpu/drm/i915/gem/i915_gem_shmem.c |   8 +-
  drivers/gpu/drm/i915/gem/i915_gem_stolen.c|   2 +-
  drivers/gpu/drm/i915/gem/i915_gem_ttm_move.c  |  20 +--
  drivers/gpu/drm/i915/gem/i915_gem_userptr.c   |   2 +-
  .../drm/i915/gem/selftests/huge_gem_object.c  |   2 +-
  .../gpu/drm/i915/gem/selftests/huge_pages.c   |   3 +-
  drivers/gpu/drm/i915/gt/gen8_ppgtt.c  |  10 +-
  drivers/gpu/drm/i915/gt/intel_engine_cs.c |   2 +-
  drivers/gpu/drm/i915/gt/intel_ggtt.c  |  25 ++--
  drivers/gpu/drm/i915/gt/intel_ggtt_gmch.c |   4 +-
  drivers/gpu/drm/i915/gt/intel_gtt.c   |   2 +-
  drivers/gpu/drm/i915/gt/intel_gtt.h   |   3 +-
  drivers/gpu/drm/i915/gt/intel_ppgtt.c |   6 +-
  .../gpu/drm/i915/gt/intel_ring_submission.c   |   4 +-
  drivers/gpu/drm/i915/gt/intel_timeline.c  |   2 +-
  drivers/gpu/drm/i915/gt/selftest_hangcheck.c  |   2 +-
  .../gpu/drm/i915/gt/selftest_workarounds.c|   2 +-
  drivers/gpu/drm/i915/i915_cache.c |  89 +++--
  drivers/gpu/drm/i915/i915_cache.h |  70 ++-
  drivers/gpu/drm/i915/i915_debugfs.c   |  53 ++--
  drivers/gpu/drm/i915/i915_driver.c|   4 +-
  drivers/gpu/drm/i915/i915_gem.c   |  13 --
  drivers/gpu/drm/i915/i915_pci.c   |  84 +++--
  drivers/gpu/drm/i915/i915_perf.c  |   2 +-
  drivers/gpu/drm/i915/intel_device_info.h  |   6 +-
  .../gpu/drm/i915/selftests/i915_gem_evict.c   |   4 +-
  drivers/gpu/drm/i915/selftests/igt_spinner.c  |   2 +-
  .../gpu/drm/i915/selftests/mock_gem_device.c  |  14 +--
  36 files changed, 391 insertions(+), 367 deletions(-)

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_domain.c 
b/drivers/gpu/drm/i915/gem/i915_gem_domain.c
index 57db9c581bf6..c15f83de33af 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_domain.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_domain.c
@@ -8,6 +8,7 @@
  #include "display/intel_frontbuffer.h"
  #include "gt/intel_gt.h"
  
+#include "i915_cache.h"

  #include "i915_drv.h"
  #include "i915_gem_clflush.h"
  #include "i915_gem_domain.h"
@@ -41,14 +42,17 @@ static bool gpu_write_needs_clflush(struct 
drm_i915_gem_object *obj)
return false;
  
  	/*

-* For objects created by userspace through GEM_CREATE with pat_index
-* set by set_pat extension, i915_gem_objec

Re: [RFC 3/8] drm/i915: Cache PAT index used by the driver

2023-07-28 Thread Tvrtko Ursulin



On 27/07/2023 23:44, Matt Roper wrote:

On Thu, Jul 27, 2023 at 03:54:59PM +0100, Tvrtko Ursulin wrote:

From: Tvrtko Ursulin 

Eliminate a bunch of runtime calls to i915_gem_get_pat_index() by caching
the interesting PAT indices in struct drm_i915_private. They are static
per platform so no need to consult a function every time.

Signed-off-by: Tvrtko Ursulin 
Cc: Matt Roper 
Cc: Fei Yang 
---
  drivers/gpu/drm/i915/Makefile |  1 +
  .../gpu/drm/i915/gem/i915_gem_execbuffer.c|  3 +--
  drivers/gpu/drm/i915/gem/i915_gem_stolen.c|  7 ++---
  drivers/gpu/drm/i915/gem/i915_gem_ttm_move.c  | 26 ---
  .../gpu/drm/i915/gem/selftests/huge_pages.c   |  2 +-
  drivers/gpu/drm/i915/gt/gen6_ppgtt.c  |  4 +--
  drivers/gpu/drm/i915/gt/gen8_ppgtt.c  |  4 +--
  drivers/gpu/drm/i915/gt/intel_ggtt.c  |  8 ++
  drivers/gpu/drm/i915/gt/intel_migrate.c   | 11 +++-
  drivers/gpu/drm/i915/gt/selftest_migrate.c|  9 +++
  drivers/gpu/drm/i915/gt/selftest_reset.c  | 14 +++---
  drivers/gpu/drm/i915/gt/selftest_tlb.c|  5 ++--
  drivers/gpu/drm/i915/gt/uc/intel_uc_fw.c  |  8 ++
  drivers/gpu/drm/i915/i915_cache.c | 18 +
  drivers/gpu/drm/i915/i915_cache.h | 13 ++
  drivers/gpu/drm/i915/i915_driver.c|  3 +++
  drivers/gpu/drm/i915/i915_drv.h   |  2 ++
  drivers/gpu/drm/i915/i915_gem.c   |  8 ++
  drivers/gpu/drm/i915/i915_gpu_error.c |  8 ++
  drivers/gpu/drm/i915/selftests/i915_gem.c |  5 +---
  .../gpu/drm/i915/selftests/i915_gem_evict.c   |  4 +--
  drivers/gpu/drm/i915/selftests/i915_gem_gtt.c | 11 +++-
  .../drm/i915/selftests/intel_memory_region.c  |  4 +--
  .../gpu/drm/i915/selftests/mock_gem_device.c  |  2 ++
  24 files changed, 89 insertions(+), 91 deletions(-)
  create mode 100644 drivers/gpu/drm/i915/i915_cache.c
  create mode 100644 drivers/gpu/drm/i915/i915_cache.h

diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile
index c5fc91cd58e7..905a51a16588 100644
--- a/drivers/gpu/drm/i915/Makefile
+++ b/drivers/gpu/drm/i915/Makefile
@@ -35,6 +35,7 @@ subdir-ccflags-y += -I$(srctree)/$(src)
  # core driver code
  i915-y += i915_driver.o \
  i915_drm_client.o \
+ i915_cache.o \
  i915_config.o \
  i915_getparam.o \
  i915_ioctl.o \
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c 
b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
index 5a687a3686bd..0a1d40220020 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
@@ -1330,8 +1330,7 @@ static void *reloc_iomap(struct i915_vma *batch,
	ggtt->vm.insert_page(&ggtt->vm,
 i915_gem_object_get_dma_address(obj, page),
 offset,
-i915_gem_get_pat_index(ggtt->vm.i915,
-   I915_CACHE_NONE),
+eb->i915->pat_uc,
 0);
} else {
offset += page << PAGE_SHIFT;
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_stolen.c 
b/drivers/gpu/drm/i915/gem/i915_gem_stolen.c
index 5b0a5cf9a98a..1c8eb806b7d3 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_stolen.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_stolen.c
@@ -563,11 +563,8 @@ static void dbg_poison(struct i915_ggtt *ggtt,
while (size) {
void __iomem *s;
  
-		ggtt->vm.insert_page(&ggtt->vm, addr,

-ggtt->error_capture.start,
-i915_gem_get_pat_index(ggtt->vm.i915,
-   I915_CACHE_NONE),
-0);
+   ggtt->vm.insert_page(&ggtt->vm, addr, ggtt->error_capture.start,
+ggtt->vm.i915->pat_uc, 0);
mb();
  
  		s = io_mapping_map_wc(&ggtt->iomap,

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_ttm_move.c 
b/drivers/gpu/drm/i915/gem/i915_gem_ttm_move.c
index 7078af2f8f79..6bd6c239f4ac 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_ttm_move.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_ttm_move.c
@@ -58,6 +58,16 @@ i915_ttm_cache_level(struct drm_i915_private *i915, struct 
ttm_resource *res,
I915_CACHE_NONE;
  }
  
+static unsigned int

+i915_ttm_cache_pat(struct drm_i915_private *i915, struct ttm_resource *res,
+  struct ttm_tt *ttm)
+{
+   return ((HAS_LLC(i915) || HAS_SNOOP(i915)) &&
+   !i915_ttm_gtt_binds_lmem(res) &&


This matches the existing logic of i915_ttm_cache_level(), but do you
know why LMEM buffers are always set to uncached?  I don't understand
that part.


I am not sure - was thinking about that myself - li

Re: [PATCH 2/2] drm/v3d: Expose the total GPU usage stats on debugfs

2023-07-28 Thread Tvrtko Ursulin



On 28/07/2023 12:25, Maira Canal wrote:

Hi,

On 7/28/23 07:16, Tvrtko Ursulin wrote:


Hi,

On 27/07/2023 15:23, Maíra Canal wrote:

The previous patch exposed the accumulated amount of active time per
client for each V3D queue. But this doesn't provide a global notion of
the GPU usage.

Therefore, provide the accumulated amount of active time for each V3D
queue (BIN, RENDER, CSD, TFU and CACHE_CLEAN), considering all the jobs
submitted to the queue, independent of the client.

This data is exposed through the debugfs interface, so that if the
interface is queried at two different points of time the usage 
percentage

of each of the queues can be calculated.


Just passing observation - I've noticed a mismatch between fdinfo and 
debugfs in terms of ABI stability and production availability.


Not sure if it matters for your intended use cases, just saying that 
if you plan to have an user facing tool similar to what we have in 
intel_gpu_top, debugfs may not be the best choice.


Do you have a suggestion of a better interface that could be used to
expose this data?

It would be nice to have something generic, similar to fdinfo, to expose
global GPU stats. This way we could expose global GPU stats on gputop,
which would be great.


I think there is at least two options.

With i915 we use perf/PMU, drawback (or not, depends on the view point) 
is that it requires CAP_SYS_PERFMON. Fits well for exposing global GPU 
hardware counters.


You could go the sysfs route, which would be ABI stable and available in 
production. This could either be attempted to be somewhat DRM 
standardized (ala fdinfo), or driver specific.


Maybe someone has more ideas.

Regards,

Tvrtko



Best Regards,
- Maíra



Regards,

Tvrtko


Co-developed-by: Jose Maria Casanova Crespo 
Signed-off-by: Jose Maria Casanova Crespo 
Signed-off-by: Maíra Canal 
---
  drivers/gpu/drm/v3d/v3d_debugfs.c | 27 +++
  drivers/gpu/drm/v3d/v3d_drv.h |  3 +++
  drivers/gpu/drm/v3d/v3d_gem.c |  5 -
  drivers/gpu/drm/v3d/v3d_irq.c | 24 
  drivers/gpu/drm/v3d/v3d_sched.c   | 13 -
  5 files changed, 66 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/v3d/v3d_debugfs.c 
b/drivers/gpu/drm/v3d/v3d_debugfs.c

index 330669f51fa7..3b7329343649 100644
--- a/drivers/gpu/drm/v3d/v3d_debugfs.c
+++ b/drivers/gpu/drm/v3d/v3d_debugfs.c
@@ -4,6 +4,7 @@
  #include 
  #include 
  #include 
+#include 
  #include 
  #include 

@@ -236,11 +237,37 @@ static int v3d_measure_clock(struct seq_file 
*m, void *unused)

  return 0;
  }

+static int v3d_debugfs_gpu_usage(struct seq_file *m, void *unused)
+{
+    struct drm_debugfs_entry *entry = m->private;
+    struct drm_device *dev = entry->dev;
+    struct v3d_dev *v3d = to_v3d_dev(dev);
+    enum v3d_queue queue;
+    u64 timestamp = local_clock();
+    u64 active_runtime;
+
+    seq_printf(m, "timestamp: %llu\n", timestamp);
+
+    for (queue = 0; queue < V3D_MAX_QUEUES; queue++) {
+    if (v3d->queue[queue].start_ns)
+    active_runtime = timestamp - v3d->queue[queue].start_ns;
+    else
+    active_runtime = 0;
+
+    seq_printf(m, "%s: %llu ns\n",
+   v3d_queue_to_string(queue),
+   v3d->queue[queue].enabled_ns + active_runtime);
+    }
+
+    return 0;
+}
+
  static const struct drm_debugfs_info v3d_debugfs_list[] = {
  {"v3d_ident", v3d_v3d_debugfs_ident, 0},
  {"v3d_regs", v3d_v3d_debugfs_regs, 0},
  {"measure_clock", v3d_measure_clock, 0},
  {"bo_stats", v3d_debugfs_bo_stats, 0},
+    {"gpu_usage", v3d_debugfs_gpu_usage, 0},
  };

  void
diff --git a/drivers/gpu/drm/v3d/v3d_drv.h 
b/drivers/gpu/drm/v3d/v3d_drv.h

index ee5e12d0db1c..b41b32ecd991 100644
--- a/drivers/gpu/drm/v3d/v3d_drv.h
+++ b/drivers/gpu/drm/v3d/v3d_drv.h
@@ -38,6 +38,9 @@ struct v3d_queue_state {

  u64 fence_context;
  u64 emit_seqno;
+
+    u64 start_ns;
+    u64 enabled_ns;
  };

  /* Performance monitor object. The perform lifetime is controlled 
by userspace
diff --git a/drivers/gpu/drm/v3d/v3d_gem.c 
b/drivers/gpu/drm/v3d/v3d_gem.c

index 40ed0c7c3fad..630ea2db8f8f 100644
--- a/drivers/gpu/drm/v3d/v3d_gem.c
+++ b/drivers/gpu/drm/v3d/v3d_gem.c
@@ -1014,8 +1014,11 @@ v3d_gem_init(struct drm_device *dev)
  u32 pt_size = 4096 * 1024;
  int ret, i;

-    for (i = 0; i < V3D_MAX_QUEUES; i++)
+    for (i = 0; i < V3D_MAX_QUEUES; i++) {
  v3d->queue[i].fence_context = dma_fence_context_alloc(1);
+    v3d->queue[i].start_ns = 0;
+    v3d->queue[i].enabled_ns = 0;
+    }

  spin_lock_init(&v3d->mm_lock);
  spin_lock_init(&v3d->job_lock);
diff --git a/drivers/gpu/drm/v3d/v3d_irq.c 
b/drivers/gpu/drm/v3d/v3d_irq.c

index c898800ae9c2..be4ff7559309 100644
--- a/drivers/gpu/drm/v3d/v3d_irq.c
+++ b/drivers/gpu/drm/v3d/v3d_irq.c
@@ -102,9 +102,13 @@ v3d_irq(in

Re: [PATCH 2/2] drm/v3d: Expose the total GPU usage stats on debugfs

2023-07-28 Thread Tvrtko Ursulin



Hi,

On 27/07/2023 15:23, Maíra Canal wrote:

The previous patch exposed the accumulated amount of active time per
client for each V3D queue. But this doesn't provide a global notion of
the GPU usage.

Therefore, provide the accumulated amount of active time for each V3D
queue (BIN, RENDER, CSD, TFU and CACHE_CLEAN), considering all the jobs
submitted to the queue, independent of the client.

This data is exposed through the debugfs interface, so that if the
interface is queried at two different points of time the usage percentage
of each of the queues can be calculated.


Just passing observation - I've noticed a mismatch between fdinfo and 
debugfs in terms of ABI stability and production availability.


Not sure if it matters for your intended use cases, just saying that if 
you plan to have an user facing tool similar to what we have in 
intel_gpu_top, debugfs may not be the best choice.


Regards,

Tvrtko


Co-developed-by: Jose Maria Casanova Crespo 
Signed-off-by: Jose Maria Casanova Crespo 
Signed-off-by: Maíra Canal 
---
  drivers/gpu/drm/v3d/v3d_debugfs.c | 27 +++
  drivers/gpu/drm/v3d/v3d_drv.h |  3 +++
  drivers/gpu/drm/v3d/v3d_gem.c |  5 -
  drivers/gpu/drm/v3d/v3d_irq.c | 24 
  drivers/gpu/drm/v3d/v3d_sched.c   | 13 -
  5 files changed, 66 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/v3d/v3d_debugfs.c 
b/drivers/gpu/drm/v3d/v3d_debugfs.c
index 330669f51fa7..3b7329343649 100644
--- a/drivers/gpu/drm/v3d/v3d_debugfs.c
+++ b/drivers/gpu/drm/v3d/v3d_debugfs.c
@@ -4,6 +4,7 @@
  #include 
  #include 
  #include 
+#include 
  #include 
  #include 

@@ -236,11 +237,37 @@ static int v3d_measure_clock(struct seq_file *m, void 
*unused)
return 0;
  }

+static int v3d_debugfs_gpu_usage(struct seq_file *m, void *unused)
+{
+   struct drm_debugfs_entry *entry = m->private;
+   struct drm_device *dev = entry->dev;
+   struct v3d_dev *v3d = to_v3d_dev(dev);
+   enum v3d_queue queue;
+   u64 timestamp = local_clock();
+   u64 active_runtime;
+
+   seq_printf(m, "timestamp: %llu\n", timestamp);
+
+   for (queue = 0; queue < V3D_MAX_QUEUES; queue++) {
+   if (v3d->queue[queue].start_ns)
+   active_runtime = timestamp - v3d->queue[queue].start_ns;
+   else
+   active_runtime = 0;
+
+   seq_printf(m, "%s: %llu ns\n",
+  v3d_queue_to_string(queue),
+  v3d->queue[queue].enabled_ns + active_runtime);
+   }
+
+   return 0;
+}
+
  static const struct drm_debugfs_info v3d_debugfs_list[] = {
{"v3d_ident", v3d_v3d_debugfs_ident, 0},
{"v3d_regs", v3d_v3d_debugfs_regs, 0},
{"measure_clock", v3d_measure_clock, 0},
{"bo_stats", v3d_debugfs_bo_stats, 0},
+   {"gpu_usage", v3d_debugfs_gpu_usage, 0},
  };

  void
diff --git a/drivers/gpu/drm/v3d/v3d_drv.h b/drivers/gpu/drm/v3d/v3d_drv.h
index ee5e12d0db1c..b41b32ecd991 100644
--- a/drivers/gpu/drm/v3d/v3d_drv.h
+++ b/drivers/gpu/drm/v3d/v3d_drv.h
@@ -38,6 +38,9 @@ struct v3d_queue_state {

u64 fence_context;
u64 emit_seqno;
+
+   u64 start_ns;
+   u64 enabled_ns;
  };

  /* Performance monitor object. The perform lifetime is controlled by userspace
diff --git a/drivers/gpu/drm/v3d/v3d_gem.c b/drivers/gpu/drm/v3d/v3d_gem.c
index 40ed0c7c3fad..630ea2db8f8f 100644
--- a/drivers/gpu/drm/v3d/v3d_gem.c
+++ b/drivers/gpu/drm/v3d/v3d_gem.c
@@ -1014,8 +1014,11 @@ v3d_gem_init(struct drm_device *dev)
u32 pt_size = 4096 * 1024;
int ret, i;

-   for (i = 0; i < V3D_MAX_QUEUES; i++)
+   for (i = 0; i < V3D_MAX_QUEUES; i++) {
v3d->queue[i].fence_context = dma_fence_context_alloc(1);
+   v3d->queue[i].start_ns = 0;
+   v3d->queue[i].enabled_ns = 0;
+   }

 	spin_lock_init(&v3d->mm_lock);
 	spin_lock_init(&v3d->job_lock);
diff --git a/drivers/gpu/drm/v3d/v3d_irq.c b/drivers/gpu/drm/v3d/v3d_irq.c
index c898800ae9c2..be4ff7559309 100644
--- a/drivers/gpu/drm/v3d/v3d_irq.c
+++ b/drivers/gpu/drm/v3d/v3d_irq.c
@@ -102,9 +102,13 @@ v3d_irq(int irq, void *arg)
struct v3d_fence *fence =
to_v3d_fence(v3d->bin_job->base.irq_fence);
struct v3d_file_priv *file = 
v3d->bin_job->base.file->driver_priv;
+   u64 runtime = local_clock() - file->start_ns[V3D_BIN];

-   file->enabled_ns[V3D_BIN] += local_clock() - 
file->start_ns[V3D_BIN];
file->start_ns[V3D_BIN] = 0;
+   v3d->queue[V3D_BIN].start_ns = 0;
+
+   file->enabled_ns[V3D_BIN] += runtime;
+   v3d->queue[V3D_BIN].enabled_ns += runtime;

trace_v3d_bcl_irq(>drm, fence->seqno);
dma_fence_signal(>base);
@@ -115,9 +119,13 @@ v3d_irq(int irq, void *arg)
struct v3d_fence *fence =
   

Re: CPU overhead for drm fdinfo stats

2023-07-28 Thread Tvrtko Ursulin



On 27/07/2023 21:58, Alex Deucher wrote:

We have a number of customers using these stats, but the issue that
keeps coming up is the CPU overhead to gather them, particularly on
systems with hundreds of processes using the GPU.  Has anyone given
any thought to having a single interface to get this information for
the entire GPU in one place?


Could I have a framed told you so certificate please? :D

Well at least it depends on how much CPU overhead would your users be 
happy to eliminate and how much to keep. So maybe no need for that 
certificate just yet.


I was raising the issue of exponential complexity of walking "total 
number of processes" x "total number of file descriptors" on a system 
from the inception of fdinfo.


So for that issue the idea was to perhaps expose a list of pids with DRM 
fds open somewhere, maybe sysfs.


That would eliminate walking _all_ processes and trying to parse any of 
their file descriptors.


But it would still require walking all file descriptors belonging to 
processes with DRM fds open.


If that wouldn't be enough of a saving for your users then no, I am not 
aware it was discussed. Assuming at least you were suggesting something 
like "read all fdinfo for all clients" in one blob. Also in sysfs? I 
think it would be doable by walking the dev->filelist and invoking 
drm_show_fdinfo() on them.


Out of curiosity are they using the fdinfo parsing code from IGT or 
something of their own?


Regards,

Tvrtko


Re: [Intel-gfx] [RFC 2/8] drm/i915: Split PTE encode between Gen12 and Meteorlake

2023-07-28 Thread Tvrtko Ursulin



On 27/07/2023 23:25, Matt Roper wrote:

On Thu, Jul 27, 2023 at 03:54:58PM +0100, Tvrtko Ursulin wrote:

From: Tvrtko Ursulin 

No need to run extra instructions which will never trigger on platforms
before Meteorlake.

Signed-off-by: Tvrtko Ursulin 
---
  drivers/gpu/drm/i915/gt/gen8_ppgtt.c | 26 ++
  1 file changed, 26 insertions(+)

diff --git a/drivers/gpu/drm/i915/gt/gen8_ppgtt.c 
b/drivers/gpu/drm/i915/gt/gen8_ppgtt.c
index c8568e5d1147..862ac1d2de25 100644
--- a/drivers/gpu/drm/i915/gt/gen8_ppgtt.c
+++ b/drivers/gpu/drm/i915/gt/gen8_ppgtt.c
@@ -63,6 +63,30 @@ static u64 gen12_pte_encode(dma_addr_t addr,
  {
gen8_pte_t pte = addr | GEN8_PAGE_PRESENT | GEN8_PAGE_RW;
  
+	if (unlikely(flags & PTE_READ_ONLY))

+   pte &= ~GEN8_PAGE_RW;
+
+   if (flags & PTE_LM)
+   pte |= GEN12_PPGTT_PTE_LM;
+
+   if (pat_index & BIT(0))
+   pte |= GEN12_PPGTT_PTE_PAT0;
+
+   if (pat_index & BIT(1))
+   pte |= GEN12_PPGTT_PTE_PAT1;
+
+   if (pat_index & BIT(2))
+   pte |= GEN12_PPGTT_PTE_PAT2;
+
+   return pte;
+}
+
+static u64 mtl_pte_encode(dma_addr_t addr,
+ unsigned int pat_index,
+ u32 flags)
+{
+   gen8_pte_t pte = addr | GEN8_PAGE_PRESENT | GEN8_PAGE_RW;
+


Would it be more readable to start with

 gen8_pte_t pte = gen12_pte_encode(addr, pat_index, flags);

and then |-in only the MTL-specific bit(s) as appropriate?


if (unlikely(flags & PTE_READ_ONLY))
pte &= ~GEN8_PAGE_RW;
  
@@ -995,6 +1019,8 @@ struct i915_ppgtt *gen8_ppgtt_create(struct intel_gt *gt,

 */
ppgtt->vm.alloc_scratch_dma = alloc_pt_dma;
  
+	if (GRAPHICS_VER_FULL(gt->i915) >= IP_VER(12, 70))

+   ppgtt->vm.pte_encode = mtl_pte_encode;
if (GRAPHICS_VER(gt->i915) >= 12)
ppgtt->vm.pte_encode = gen12_pte_encode;


I think you wanted 'else if' here.  Otherwise you clobber the MTL
function pointer.


Doh this was a strong fail.. Yes and yes.. I even had it like you 
suggest in that patch I mentioned to you earlier.. 
https://patchwork.freedesktop.org/patch/546013/?series=120341=2.


Do you have an opinion on that one perhaps?

Thanks,

Tvrtko


Re: [PATCH] drm/i915/gem: Add check for bitmap_zalloc()

2023-07-28 Thread Tvrtko Ursulin



Hi,

On 28/07/2023 02:58, Jiasheng Jiang wrote:

Add the check for the return value of bitmap_zalloc() in order to
guarantee the success of the allocation.

Fixes: e9b73c67390a ("drm/i915: Reduce memory pressure during shrinker by 
preallocating swizzle pages")
Signed-off-by: Jiasheng Jiang 
---
  drivers/gpu/drm/i915/gem/i915_gem_tiling.c | 5 +
  1 file changed, 5 insertions(+)

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_tiling.c 
b/drivers/gpu/drm/i915/gem/i915_gem_tiling.c
index a049ca0b7980..e9cf99d95966 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_tiling.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_tiling.c
@@ -311,6 +311,11 @@ i915_gem_object_set_tiling(struct drm_i915_gem_object *obj,
if (!obj->bit_17) {
obj->bit_17 = bitmap_zalloc(obj->base.size >> 
PAGE_SHIFT,
GFP_KERNEL);
+   if (!obj->bit_17) {
+   i915_gem_object_unlock(obj);
+   i915_gem_object_release_mmap_gtt(obj);
+   return -ENOMEM;
+   }


Hm the comment few lines above says:

/* Try to preallocate memory required to save swizzling on put-pages */

Lets emphasis the *try* for now. Then once the obj->bit_17 is attempted to be 
used we have this:

i915_gem_object_save_bit_17_swizzle(..)
{
...
if (obj->bit_17 == NULL) {
obj->bit_17 = bitmap_zalloc(page_count, GFP_KERNEL);
if (obj->bit_17 == NULL) {
drm_err(obj->base.dev,
"Failed to allocate memory for bit 17 
record\n");
return;
}
}

So despite this area of the driver being a bit before my time, I'd say it quite 
possibly works as designed - only *tries* to preallocate but does not have to 
and can cope with a later failure.

Good question might be why wouldn't it be better to do what you suggest. Trade 
off would be between failing the ioctl and possibly crashing the application, 
versus visual corruption if at use time allocation fails.

The whole swizzling thing also only applies to old GPUs, stuff before 
Broadwell, which itself was released in 2014. So it is tempting to err on the 
side of caution and leave it as is. I'll mull it over in the background, or 
maybe someone else will have an opinion too.

Regards,

Tvrtko


}
} else {
bitmap_free(obj->bit_17);


Re: [PATCH 16/17] cgroup/drm: Expose memory stats

2023-07-27 Thread Tvrtko Ursulin



On 27/07/2023 12:54, Maarten Lankhorst wrote:

Hey,

On 2023-07-26 13:41, Tvrtko Ursulin wrote:


On 26/07/2023 11:14, Maarten Lankhorst wrote:

Hey,

On 2023-07-22 00:21, Tejun Heo wrote:

On Wed, Jul 12, 2023 at 12:46:04PM +0100, Tvrtko Ursulin wrote:

   $ cat drm.memory.stat
   card0 region=system total=12898304 shared=0 active=0 
resident=12111872 purgeable=167936
   card0 region=stolen-system total=0 shared=0 active=0 resident=0 
purgeable=0


Data is generated on demand for simplicty of implementation ie. no 
running

totals are kept or accounted during migrations and such. Various
optimisations such as cheaper collection of data are possible but
deliberately left out for now.

Overall, the feature is deemed to be useful to container orchestration
software (and manual management).

Limits, either soft or hard, are not envisaged to be implemented on 
top of

this approach due on demand nature of collecting the stats.


So, yeah, if you want to add memory controls, we better think 
through how

the fd ownership migration should work.
I've taken a look at the series, since I have been working on cgroup 
memory eviction.


The scheduling stuff will work for i915, since it has a purely 
software execlist scheduler, but I don't think it will work for GuC 
(firmware) scheduling or other drivers that use the generic drm 
scheduler.


It actually works - I used to have a blurb in the cover letter about 
it but apparently I dropped it. Just a bit less well with many 
clients, since there are fewer priority levels.


All that the design requires from the invididual drivers is some way 
to react to the "you are over budget by this much" signal. The rest is 
driver and backend specific.


What I mean is that this signal may not be applicable since the drm 
scheduler just schedules jobs that run. Adding a weight might be done in 
hardware, since it's responsible for  scheduling which context gets to 
run. The over budget signal is useless in that case, and you just need 
to set a scheduling priority for the hardware instead.


The over budget callback lets the driver know its assigned budget and 
its current utilisation. Already with that data drivers could implement 
something smarter than what I did in my RFC. So I don't think callback 
is completely useless even for some smarter implementation which 
potentially ties into firmware scheduling.


Anyway, I maintain this is implementation details.

For something like this,  you would probably want it to work inside 
the drm scheduler first. Presumably, this can be done by setting a 
weight on each runqueue, and perhaps adding a callback to update one 
for a running queue. Calculating the weights hierarchically might be 
fun..


It is not needed to work in drm scheduler first. In fact drm scheduler 
based drivers can plug into what I have since it already has the 
notion of scheduling priorities.


They would only need to implement a hook which allow the cgroup 
controller to query client GPU utilisation and another to received the 
over budget signal.


Amdgpu and msm AFAIK could be easy candidates because they both 
support per client utilisation and priorities.


Looks like I need to put all this info back into the cover letter.

Also, hierarchic weights and time budgets are all already there. What 
could be done later is make this all smarter and respect the time 
budget with more precision. That would however, in many cases 
including Intel, require co-operation with the firmware. In any case 
it is only work in the implementation, while the cgroup control 
interface remains the same.


I have taken a look at how the rest of cgroup controllers change 
ownership when moved to a different cgroup, and the answer was: not 
at all. If we attempt to create the scheduler controls only on the 
first time the fd is used, you could probably get rid of all the 
tracking.


Can you send a CPU file descriptor from process A to process B and 
have CPU usage belonging to process B show up in process' A cgroup, or 
vice-versa? Nope, I am not making any sense, am I? My point being it 
is not like-to-like, model is different.


No ownership transfer would mean in wide deployments all GPU 
utilisation would be assigned to Xorg and so there is no point to any 
of this. No way to throttle a cgroup with un-important GPU clients for 
instance.
If you just grab the current process' cgroup when a drm_sched_entity is 
created, you don't have everything charged to X.org. No need for 
complicated ownership tracking in drm_file. The same equivalent should 
be done in i915 as well when a context is created as it's not using the 
drm scheduler.


Okay so essentially nuking the concept of DRM clients belongs to one 
cgroup and instead tracking at the context level. That is an interesting 
idea. I suspect implementation could require somewhat generalizing the 
concept of an "execution context", or at least expressing it via the DRM 
cgroup controller.


I 

Re: [PATCH 16/17] cgroup/drm: Expose memory stats

2023-07-27 Thread Tvrtko Ursulin



On 27/07/2023 14:42, Maarten Lankhorst wrote:

On 2023-07-26 21:44, Tejun Heo wrote:

Hello,

On Wed, Jul 26, 2023 at 12:14:24PM +0200, Maarten Lankhorst wrote:
So, yeah, if you want to add memory controls, we better think 
through how

the fd ownership migration should work.


I've taken a look at the series, since I have been working on cgroup 
memory

eviction.

The scheduling stuff will work for i915, since it has a purely software
execlist scheduler, but I don't think it will work for GuC (firmware)
scheduling or other drivers that use the generic drm scheduler.

For something like this,  you would probably want it to work inside 
the drm
scheduler first. Presumably, this can be done by setting a weight on 
each
runqueue, and perhaps adding a callback to update one for a running 
queue.

Calculating the weights hierarchically might be fun..


I don't have any idea on this front. The basic idea of making high level
distribution decisions in core code and letting individual drivers 
enforce

that in a way which fits them the best makes sense to me but I don't know
enough to have an opinion here.

I have taken a look at how the rest of cgroup controllers change 
ownership

when moved to a different cgroup, and the answer was: not at all. If we


For persistent resources, that's the general rule. Whoever instantiates a
resource gets to own it until the resource gets freed. There is an 
exception

with the pid controller and there are discussions around whether we want
some sort of migration behavior with memcg but yes by and large 
instantiator

being the owner is the general model cgroup follows.

attempt to create the scheduler controls only on the first time the 
fd is

used, you could probably get rid of all the tracking.
This can be done very easily with the drm scheduler.

WRT memory, I think the consensus is to track system memory like normal
memory. Stolen memory doesn't need to be tracked. It's kernel only 
memory,

used for internal bookkeeping  only.

The only time userspace can directly manipulate stolen memory, is by 
mapping
the pinned initial framebuffer to its own address space. The only 
allocation
it can do is when a framebuffer is displayed, and framebuffer 
compression

creates some stolen memory. Userspace is not
aware of this though, and has no way to manipulate those contents.


So, my dumb understanding:

* Ownership of an fd can be established on the first ioctl call and 
doesn't

   need to be migrated afterwards. There are no persistent resources to
   migration on the first call.


Yes, keyword is "can". Trouble is migration may or may not happen.

One may choose "Plasma X.org" session type in your login manager and all 
DRM fds would be under Xorg if not migrated. Or one may choose "Plasma 
Wayland" and migration wouldn't matter. But former is I think has a huge 
deployed base so that not supporting implicit migration would be a 
significant asterisk next to the controller.


* Memory then can be tracked in a similar way to memcg. Memory gets 
charged

   to the initial instantiator and doesn't need to be moved around
   afterwards. There may be some discrepancies around stolen memory 
but the
   magnitude of inaccuracy introduced that way is limited and bound 
and can

   be safely ignored.

Is that correct?


Hey,

Yeah mostly, I think we can stop tracking stolen memory. I stopped doing 
that for Xe, there is literally nothing to control for userspace in there.


Right, but for reporting, stolen is a red herring. In this RFC I simply 
report on all memory regions known by the driver. As I said in the other 
reply, imagine the keys are 'system' and 'vram0'. Point was just to 
illustrate multiplicity of regions.


Regards,

Tvrtko


[RFC 8/8] drm/i915: Refine the caching check in i915_gem_object_can_bypass_llc

2023-07-27 Thread Tvrtko Ursulin
From: Tvrtko Ursulin 

Now that i915 understands the caching modes behind PAT indices, we can
refine the check in i915_gem_object_can_bypass_llc() to stop assuming any
user PAT can bypass the shared cache (if there is any).

Instead we can use the absence of I915_BO_CACHE_COHERENT_FOR_WRITE as the
criteria, which is set for all caching modes where writes from the CPU
side (in this case buffer clears before handing buffers over to userspace)
are fully coherent with respect to reads from the GPU.

Signed-off-by: Tvrtko Ursulin 
Cc: Fei Yang 
Cc: Matt Roper 
---
 drivers/gpu/drm/i915/gem/i915_gem_object.c | 18 +++---
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_object.c 
b/drivers/gpu/drm/i915/gem/i915_gem_object.c
index ec1f0be43d0d..8c4b54bd3911 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_object.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_object.c
@@ -221,12 +221,6 @@ bool i915_gem_object_can_bypass_llc(struct 
drm_i915_gem_object *obj)
if (!(obj->flags & I915_BO_ALLOC_USER))
return false;
 
-   /*
-* Always flush cache for UMD objects at creation time.
-*/
-   if (obj->pat_set_by_user)
-   return true;
-
/*
 * EHL and JSL add the 'Bypass LLC' MOCS entry, which should make it
 * possible for userspace to bypass the GTT caching bits set by the
@@ -239,7 +233,17 @@ bool i915_gem_object_can_bypass_llc(struct 
drm_i915_gem_object *obj)
 * it, but since i915 takes the stance of always zeroing memory before
 * handing it to userspace, we need to prevent this.
 */
-   return IS_JSL_EHL(i915);
+   if (IS_JSL_EHL(i915))
+   return true;
+
+   /*
+* Any caching mode where writes via CPU cache are not coherent with
+* the GPU needs explicit flushing to ensure GPU can not see stale data.
+*/
+   if (!(obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_WRITE))
+   return true;
+
+   return false;
 }
 
 static void i915_gem_close_object(struct drm_gem_object *gem, struct drm_file 
*file)
-- 
2.39.2



[RFC 7/8] drm/i915: Lift the user PAT restriction from use_cpu_reloc

2023-07-27 Thread Tvrtko Ursulin
From: Tvrtko Ursulin 

Now that i915 understands the caching modes behind PAT indices, we can
refine the check in use_cpu_reloc() to not reject the uncached PAT if it
was set by userspace.

Instead it can decide based on the presence of full coherency which
should be functionally equivalent on legacy platforms. We can ignore WT
since it is only used by the display, and we can ignore Meteorlake since
it will fail on the existing "has_llc" condition before the object cache
mode check.

Signed-off-by: Tvrtko Ursulin 
Cc: Fei Yang 
Cc: Matt Roper 
---
 drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c | 9 +
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c 
b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
index 9d6e49c8a4c6..f74b33670bad 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
@@ -640,16 +640,9 @@ static inline int use_cpu_reloc(const struct reloc_cache 
*cache,
if (DBG_FORCE_RELOC == FORCE_GTT_RELOC)
return false;
 
-   /*
-* For objects created by userspace through GEM_CREATE with pat_index
-* set by set_pat extension, i915_gem_object_has_cache_level() always
-* return true, otherwise the call would fall back to checking whether
-* the object is un-cached.
-*/
return (cache->has_llc ||
obj->cache_dirty ||
-   !(obj->pat_set_by_user ||
- i915_gem_object_has_cache_mode(obj, I915_CACHE_MODE_UC)));
+   i915_gem_object_has_cache_flag(obj, I915_CACHE_FLAG_COH2W));
 }
 
 static int eb_reserve_vma(struct i915_execbuffer *eb,
-- 
2.39.2



[RFC 6/8] drm/i915: Lift the user PAT restriction from gpu_write_needs_clflush

2023-07-27 Thread Tvrtko Ursulin
From: Tvrtko Ursulin 

Now that i915 understands the caching modes behind PAT indices, and having
also special cased the Meteorlake snooping fully coherent mode, we can
remove the user PAT check from gpu_write_needs_clflush().

Signed-off-by: Tvrtko Ursulin 
Cc: Fei Yang 
Cc: Matt Roper 
---
 drivers/gpu/drm/i915/gem/i915_gem_domain.c | 6 --
 1 file changed, 6 deletions(-)

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_domain.c 
b/drivers/gpu/drm/i915/gem/i915_gem_domain.c
index c15f83de33af..bf3a2fa0e539 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_domain.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_domain.c
@@ -41,12 +41,6 @@ static bool gpu_write_needs_clflush(struct 
drm_i915_gem_object *obj)
if (IS_METEORLAKE(i915))
return false;
 
-   /*
-* Always flush cache for UMD objects with PAT index set.
-*/
-   if (obj->pat_set_by_user)
-   return true;
-
/*
 * Fully coherent cached access may end up with data in the CPU cache
 * which hasn't hit memory yet.
-- 
2.39.2



[RFC 5/8] drm/i915: Improve the vm_fault_gtt user PAT index restriction

2023-07-27 Thread Tvrtko Ursulin
From: Tvrtko Ursulin 

Now that i915 understands the caching modes behind PAT indices, we can
refine the check in vm_fault_gtt() to not reject the uncached PAT if it
was set by userspace on a snoopable platform.

Signed-off-by: Tvrtko Ursulin 
Cc: Fei Yang 
Cc: Matt Roper 
---
 drivers/gpu/drm/i915/gem/i915_gem_mman.c | 14 +++---
 1 file changed, 3 insertions(+), 11 deletions(-)

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_mman.c 
b/drivers/gpu/drm/i915/gem/i915_gem_mman.c
index cd7f8ded0d6f..9aa6ecf68432 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_mman.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_mman.c
@@ -382,17 +382,9 @@ static vm_fault_t vm_fault_gtt(struct vm_fault *vmf)
goto err_reset;
}
 
-   /*
-* For objects created by userspace through GEM_CREATE with pat_index
-* set by set_pat extension, coherency is managed by userspace, make
-* sure we don't fail handling the vm fault by calling
-* i915_gem_object_has_cache_level() which always return true for such
-* objects. Otherwise this helper function would fall back to checking
-* whether the object is un-cached.
-*/
-   if (!((obj->pat_set_by_user ||
-  i915_gem_object_has_cache_mode(obj, I915_CACHE_MODE_UC)) ||
- HAS_LLC(i915))) {
+   /* Access to snoopable pages through the GTT is incoherent. */
+   if (!i915_gem_object_has_cache_mode(obj, I915_CACHE_MODE_UC) &&
+   !HAS_LLC(i915)) {
ret = -EFAULT;
goto err_unpin;
}
-- 
2.39.2



[RFC 4/8] drm/i915: Refactor PAT/object cache handling

2023-07-27 Thread Tvrtko Ursulin
From: Tvrtko Ursulin 

Commit 9275277d5324 ("drm/i915: use pat_index instead of cache_level") has
introduced PAT indices to i915 internal APIs, partially replacing the
usage of driver internal cache_level, but has also added a few sub-
optimal design decisions which this patch tries to improve upon.

Principal change here is to invert the per platform cache level to PAT
index table which was added by the referenced commit, and by doing so
enable i915 to understand the cache mode between PAT indices, changing
them from opaque to transparent.

Once we have the inverted table we are able to remove the hidden false
"return true" from i915_gem_object_has_cache_level and make the involved
code path clearer.

To achieve this we replace the enum i915_cache_level with i915_cache_t,
composed of a more detailed representation of each cache mode (base mode
plus flags).

In this way we are able to express the differences between different
write-back mode coherency settings on Meteorlake, which in turn enables us
to map the i915 "cached" mode to the correct Meteorlake PAT index.

We can also replace the platform dependent cache mode to string code in
debugfs and elsewhere by the single implementation based on i915_cache_t.

v2:
 * Fix PAT-to-cache-mode table for PVC. (Fei)
 * Cache display caching mode too. (Fei)
 * Improve and document criteria in i915_gem_object_can_bypass_llc() (Matt)

v3:
 * Checkpath issues.
 * Cache mode flags check fixed.

v4:
 * Fix intel_device_info->cache_modes array size. (Matt)
 * Boolean cache mode and flags query. (Matt)
 * Reduce number of cache macros with some macro magic.
 * One more checkpatch fix.
 * Tweak tables to show legacy and Gen12 WB is fully coherent.

Signed-off-by: Tvrtko Ursulin 
References: 9275277d5324 ("drm/i915: use pat_index instead of cache_level")
Cc: Chris Wilson 
Cc: Fei Yang 
Cc: Andi Shyti 
Cc: Matt Roper 
---
 drivers/gpu/drm/i915/gem/i915_gem_domain.c|  60 +
 drivers/gpu/drm/i915/gem/i915_gem_domain.h|   5 +-
 .../gpu/drm/i915/gem/i915_gem_execbuffer.c|   3 +-
 drivers/gpu/drm/i915/gem/i915_gem_internal.c  |   2 +-
 drivers/gpu/drm/i915/gem/i915_gem_mman.c  |   4 +-
 drivers/gpu/drm/i915/gem/i915_gem_object.c| 117 ++
 drivers/gpu/drm/i915/gem/i915_gem_object.h|  11 +-
 .../gpu/drm/i915/gem/i915_gem_object_types.h  | 116 +
 drivers/gpu/drm/i915/gem/i915_gem_shmem.c |   8 +-
 drivers/gpu/drm/i915/gem/i915_gem_stolen.c|   2 +-
 drivers/gpu/drm/i915/gem/i915_gem_ttm_move.c  |  20 +--
 drivers/gpu/drm/i915/gem/i915_gem_userptr.c   |   2 +-
 .../drm/i915/gem/selftests/huge_gem_object.c  |   2 +-
 .../gpu/drm/i915/gem/selftests/huge_pages.c   |   3 +-
 drivers/gpu/drm/i915/gt/gen8_ppgtt.c  |  10 +-
 drivers/gpu/drm/i915/gt/intel_engine_cs.c |   2 +-
 drivers/gpu/drm/i915/gt/intel_ggtt.c  |  25 ++--
 drivers/gpu/drm/i915/gt/intel_ggtt_gmch.c |   4 +-
 drivers/gpu/drm/i915/gt/intel_gtt.c   |   2 +-
 drivers/gpu/drm/i915/gt/intel_gtt.h   |   3 +-
 drivers/gpu/drm/i915/gt/intel_ppgtt.c |   6 +-
 .../gpu/drm/i915/gt/intel_ring_submission.c   |   4 +-
 drivers/gpu/drm/i915/gt/intel_timeline.c  |   2 +-
 drivers/gpu/drm/i915/gt/selftest_hangcheck.c  |   2 +-
 .../gpu/drm/i915/gt/selftest_workarounds.c|   2 +-
 drivers/gpu/drm/i915/i915_cache.c |  89 +++--
 drivers/gpu/drm/i915/i915_cache.h |  70 ++-
 drivers/gpu/drm/i915/i915_debugfs.c   |  53 ++--
 drivers/gpu/drm/i915/i915_driver.c|   4 +-
 drivers/gpu/drm/i915/i915_gem.c   |  13 --
 drivers/gpu/drm/i915/i915_pci.c   |  84 +++--
 drivers/gpu/drm/i915/i915_perf.c  |   2 +-
 drivers/gpu/drm/i915/intel_device_info.h  |   6 +-
 .../gpu/drm/i915/selftests/i915_gem_evict.c   |   4 +-
 drivers/gpu/drm/i915/selftests/igt_spinner.c  |   2 +-
 .../gpu/drm/i915/selftests/mock_gem_device.c  |  14 +--
 36 files changed, 391 insertions(+), 367 deletions(-)

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_domain.c 
b/drivers/gpu/drm/i915/gem/i915_gem_domain.c
index 57db9c581bf6..c15f83de33af 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_domain.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_domain.c
@@ -8,6 +8,7 @@
 #include "display/intel_frontbuffer.h"
 #include "gt/intel_gt.h"
 
+#include "i915_cache.h"
 #include "i915_drv.h"
 #include "i915_gem_clflush.h"
 #include "i915_gem_domain.h"
@@ -41,14 +42,17 @@ static bool gpu_write_needs_clflush(struct 
drm_i915_gem_object *obj)
return false;
 
/*
-* For objects created by userspace through GEM_CREATE with pat_index
-* set by set_pat extension, i915_gem_object_has_cache_level() will
-* always return true, because the coherency of such object is managed
-* by userspace. Othereise the call here would fal

[RFC 1/8] drm/i915: Skip clflush after GPU writes on Meteorlake

2023-07-27 Thread Tvrtko Ursulin
From: Tvrtko Ursulin 

On Meteorlake CPU cache will not contain stale data after GPU access since
write-invalidate protocol is used, which means there is no need to flush
before potentially transitioning the buffer to a non-coherent domain.

Use the opportunity to document the situation on discrete too.

Signed-off-by: Tvrtko Ursulin 
Cc: Matt Roper 
Cc: Fei Yang 
Cc: Matthew Auld 
Cc: Thomas Hellström 
---
 drivers/gpu/drm/i915/gem/i915_gem_domain.c | 13 +
 1 file changed, 13 insertions(+)

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_domain.c 
b/drivers/gpu/drm/i915/gem/i915_gem_domain.c
index ffddec1d2a76..57db9c581bf6 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_domain.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_domain.c
@@ -24,9 +24,22 @@ static bool gpu_write_needs_clflush(struct 
drm_i915_gem_object *obj)
 {
struct drm_i915_private *i915 = to_i915(obj->base.dev);
 
+   /*
+* Discrete GPUs never dirty the CPU cache.
+*/
if (IS_DGFX(i915))
return false;
 
+   /*
+* Cache snooping on Meteorlake is using write-invalidate so GPU writes
+* never end up in the CPU cache.
+*
+* QQQ: Do other snooping platforms behave identicaly and could we
+*  therefore write this as "if !HAS_LLC(i915) && HAS_SNOOP(i915)"?
+*/
+   if (IS_METEORLAKE(i915))
+   return false;
+
/*
 * For objects created by userspace through GEM_CREATE with pat_index
 * set by set_pat extension, i915_gem_object_has_cache_level() will
-- 
2.39.2



[RFC 2/8] drm/i915: Split PTE encode between Gen12 and Meteorlake

2023-07-27 Thread Tvrtko Ursulin
From: Tvrtko Ursulin 

No need to run extra instructions which will never trigger on platforms
before Meteorlake.

Signed-off-by: Tvrtko Ursulin 
---
 drivers/gpu/drm/i915/gt/gen8_ppgtt.c | 26 ++
 1 file changed, 26 insertions(+)

diff --git a/drivers/gpu/drm/i915/gt/gen8_ppgtt.c 
b/drivers/gpu/drm/i915/gt/gen8_ppgtt.c
index c8568e5d1147..862ac1d2de25 100644
--- a/drivers/gpu/drm/i915/gt/gen8_ppgtt.c
+++ b/drivers/gpu/drm/i915/gt/gen8_ppgtt.c
@@ -63,6 +63,30 @@ static u64 gen12_pte_encode(dma_addr_t addr,
 {
gen8_pte_t pte = addr | GEN8_PAGE_PRESENT | GEN8_PAGE_RW;
 
+   if (unlikely(flags & PTE_READ_ONLY))
+   pte &= ~GEN8_PAGE_RW;
+
+   if (flags & PTE_LM)
+   pte |= GEN12_PPGTT_PTE_LM;
+
+   if (pat_index & BIT(0))
+   pte |= GEN12_PPGTT_PTE_PAT0;
+
+   if (pat_index & BIT(1))
+   pte |= GEN12_PPGTT_PTE_PAT1;
+
+   if (pat_index & BIT(2))
+   pte |= GEN12_PPGTT_PTE_PAT2;
+
+   return pte;
+}
+
+static u64 mtl_pte_encode(dma_addr_t addr,
+ unsigned int pat_index,
+ u32 flags)
+{
+   gen8_pte_t pte = addr | GEN8_PAGE_PRESENT | GEN8_PAGE_RW;
+
if (unlikely(flags & PTE_READ_ONLY))
pte &= ~GEN8_PAGE_RW;
 
@@ -995,6 +1019,8 @@ struct i915_ppgtt *gen8_ppgtt_create(struct intel_gt *gt,
 */
ppgtt->vm.alloc_scratch_dma = alloc_pt_dma;
 
+   if (GRAPHICS_VER_FULL(gt->i915) >= IP_VER(12, 70))
+   ppgtt->vm.pte_encode = mtl_pte_encode;
if (GRAPHICS_VER(gt->i915) >= 12)
ppgtt->vm.pte_encode = gen12_pte_encode;
else
-- 
2.39.2



[RFC 3/8] drm/i915: Cache PAT index used by the driver

2023-07-27 Thread Tvrtko Ursulin
From: Tvrtko Ursulin 

Eliminate a bunch of runtime calls to i915_gem_get_pat_index() by caching
the interesting PAT indices in struct drm_i915_private. They are static
per platform so no need to consult a function every time.

Signed-off-by: Tvrtko Ursulin 
Cc: Matt Roper 
Cc: Fei Yang 
---
 drivers/gpu/drm/i915/Makefile |  1 +
 .../gpu/drm/i915/gem/i915_gem_execbuffer.c|  3 +--
 drivers/gpu/drm/i915/gem/i915_gem_stolen.c|  7 ++---
 drivers/gpu/drm/i915/gem/i915_gem_ttm_move.c  | 26 ---
 .../gpu/drm/i915/gem/selftests/huge_pages.c   |  2 +-
 drivers/gpu/drm/i915/gt/gen6_ppgtt.c  |  4 +--
 drivers/gpu/drm/i915/gt/gen8_ppgtt.c  |  4 +--
 drivers/gpu/drm/i915/gt/intel_ggtt.c  |  8 ++
 drivers/gpu/drm/i915/gt/intel_migrate.c   | 11 +++-
 drivers/gpu/drm/i915/gt/selftest_migrate.c|  9 +++
 drivers/gpu/drm/i915/gt/selftest_reset.c  | 14 +++---
 drivers/gpu/drm/i915/gt/selftest_tlb.c|  5 ++--
 drivers/gpu/drm/i915/gt/uc/intel_uc_fw.c  |  8 ++
 drivers/gpu/drm/i915/i915_cache.c | 18 +
 drivers/gpu/drm/i915/i915_cache.h | 13 ++
 drivers/gpu/drm/i915/i915_driver.c|  3 +++
 drivers/gpu/drm/i915/i915_drv.h   |  2 ++
 drivers/gpu/drm/i915/i915_gem.c   |  8 ++
 drivers/gpu/drm/i915/i915_gpu_error.c |  8 ++
 drivers/gpu/drm/i915/selftests/i915_gem.c |  5 +---
 .../gpu/drm/i915/selftests/i915_gem_evict.c   |  4 +--
 drivers/gpu/drm/i915/selftests/i915_gem_gtt.c | 11 +++-
 .../drm/i915/selftests/intel_memory_region.c  |  4 +--
 .../gpu/drm/i915/selftests/mock_gem_device.c  |  2 ++
 24 files changed, 89 insertions(+), 91 deletions(-)
 create mode 100644 drivers/gpu/drm/i915/i915_cache.c
 create mode 100644 drivers/gpu/drm/i915/i915_cache.h

diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile
index c5fc91cd58e7..905a51a16588 100644
--- a/drivers/gpu/drm/i915/Makefile
+++ b/drivers/gpu/drm/i915/Makefile
@@ -35,6 +35,7 @@ subdir-ccflags-y += -I$(srctree)/$(src)
 # core driver code
 i915-y += i915_driver.o \
  i915_drm_client.o \
+ i915_cache.o \
  i915_config.o \
  i915_getparam.o \
  i915_ioctl.o \
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c 
b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
index 5a687a3686bd..0a1d40220020 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
@@ -1330,8 +1330,7 @@ static void *reloc_iomap(struct i915_vma *batch,
ggtt->vm.insert_page(>vm,
 i915_gem_object_get_dma_address(obj, page),
 offset,
-i915_gem_get_pat_index(ggtt->vm.i915,
-   I915_CACHE_NONE),
+eb->i915->pat_uc,
 0);
} else {
offset += page << PAGE_SHIFT;
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_stolen.c 
b/drivers/gpu/drm/i915/gem/i915_gem_stolen.c
index 5b0a5cf9a98a..1c8eb806b7d3 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_stolen.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_stolen.c
@@ -563,11 +563,8 @@ static void dbg_poison(struct i915_ggtt *ggtt,
while (size) {
void __iomem *s;
 
-   ggtt->vm.insert_page(>vm, addr,
-ggtt->error_capture.start,
-i915_gem_get_pat_index(ggtt->vm.i915,
-   I915_CACHE_NONE),
-0);
+   ggtt->vm.insert_page(>vm, addr, ggtt->error_capture.start,
+ggtt->vm.i915->pat_uc, 0);
mb();
 
s = io_mapping_map_wc(>iomap,
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_ttm_move.c 
b/drivers/gpu/drm/i915/gem/i915_gem_ttm_move.c
index 7078af2f8f79..6bd6c239f4ac 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_ttm_move.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_ttm_move.c
@@ -58,6 +58,16 @@ i915_ttm_cache_level(struct drm_i915_private *i915, struct 
ttm_resource *res,
I915_CACHE_NONE;
 }
 
+static unsigned int
+i915_ttm_cache_pat(struct drm_i915_private *i915, struct ttm_resource *res,
+  struct ttm_tt *ttm)
+{
+   return ((HAS_LLC(i915) || HAS_SNOOP(i915)) &&
+   !i915_ttm_gtt_binds_lmem(res) &&
+   ttm->caching == ttm_cached) ? i915->pat_wb :
+   i915->pat_uc;
+}
+
 static struct intel_memory_region *
 i915_ttm_region(struct ttm_device *bdev, int ttm_mem_type)
 {
@@ -196,7 +206,7 @@ static struct dma_fence *i915_ttm_accel_move(struct 
ttm_buffer_object *bo,
struct drm

[RFC 0/8] Another take on PAT/object cache mode refactoring

2023-07-27 Thread Tvrtko Ursulin
From: Tvrtko Ursulin 

Good news is that I realized series can be split after all. Bad news is that it
is still a lot to go through.

  drm/i915: Skip clflush after GPU writes on Meteorlake

This is based on what Fei found out from hardware architects. If we agree on
the function this helper should achieve, a follow up is checking if other
snoopable platforms are the same.

  drm/i915: Split PTE encode between Gen12 and Meteorlake

Not that much related but I feel we don't need to run impossible code on
platforms before Meteorlake. Shouldn't be controversial.

  drm/i915: Cache PAT index used by the driver

This one shouldn't be controversial either. Just eliminates a pile of calls to
i915_gem_get_pat_index().

  drm/i915: Refactor PAT/object cache handling

This is most of the code and the "table reversal" logic which makes i915
understand the caching modes behind PAT indices.

Review for taste and general "does it make sense" is needed here. Oh and extra
care about boolean logic conversion as I was pulling out obj->user_pat_set from
inside i915_gem_object_has_cache_level to the call sites.

All magic "if user PAT is set assume the worst" are still left in with this
patch.

  drm/i915: Improve the vm_fault_gtt user PAT index restriction
  drm/i915: Lift the user PAT restriction from gpu_write_needs_clflush
  drm/i915: Lift the user PAT restriction from use_cpu_reloc
  drm/i915: Refine the caching check in i915_gem_object_can_bypass_llc

This bunch is what removes the "user PAT set special casing".

Each of them probably have different reasons why the original cache level check
was in them so as many extra pair of eyes as possible are needed to verify both
that I have correctly understood what the underlying reasons why each were
there, and that I haven't fumbled the logic on the rudimentary level. Or perhaps
that it is possible to simplify this further. By maybe using more of
I915_BO_CACHE_COHERENT_FOR_... flags, or something.

Overall, a lot of scrutiny is needed for most of the series since it is
complicated and I am juggling multiple things.

Cc: Fei Yang 
Cc: Matt Roper 

Tvrtko Ursulin (8):
  drm/i915: Skip clflush after GPU writes on Meteorlake
  drm/i915: Split PTE encode between Gen12 and Meteorlake
  drm/i915: Cache PAT index used by the driver
  drm/i915: Refactor PAT/object cache handling
  drm/i915: Improve the vm_fault_gtt user PAT index restriction
  drm/i915: Lift the user PAT restriction from gpu_write_needs_clflush
  drm/i915: Lift the user PAT restriction from use_cpu_reloc
  drm/i915: Refine the caching check in i915_gem_object_can_bypass_llc

 drivers/gpu/drm/i915/Makefile |   1 +
 drivers/gpu/drm/i915/gem/i915_gem_domain.c|  67 ++---
 drivers/gpu/drm/i915/gem/i915_gem_domain.h|   5 +-
 .../gpu/drm/i915/gem/i915_gem_execbuffer.c|  11 +-
 drivers/gpu/drm/i915/gem/i915_gem_internal.c  |   2 +-
 drivers/gpu/drm/i915/gem/i915_gem_mman.c  |  12 +-
 drivers/gpu/drm/i915/gem/i915_gem_object.c| 135 ++
 drivers/gpu/drm/i915/gem/i915_gem_object.h|  11 +-
 .../gpu/drm/i915/gem/i915_gem_object_types.h  | 116 +--
 drivers/gpu/drm/i915/gem/i915_gem_shmem.c |   8 +-
 drivers/gpu/drm/i915/gem/i915_gem_stolen.c|   9 +-
 drivers/gpu/drm/i915/gem/i915_gem_ttm_move.c  |  46 +++---
 drivers/gpu/drm/i915/gem/i915_gem_userptr.c   |   2 +-
 .../drm/i915/gem/selftests/huge_gem_object.c  |   2 +-
 .../gpu/drm/i915/gem/selftests/huge_pages.c   |   5 +-
 drivers/gpu/drm/i915/gt/gen6_ppgtt.c  |   4 +-
 drivers/gpu/drm/i915/gt/gen8_ppgtt.c  |  40 --
 drivers/gpu/drm/i915/gt/intel_engine_cs.c |   2 +-
 drivers/gpu/drm/i915/gt/intel_ggtt.c  |  33 ++---
 drivers/gpu/drm/i915/gt/intel_ggtt_gmch.c |   4 +-
 drivers/gpu/drm/i915/gt/intel_gtt.c   |   2 +-
 drivers/gpu/drm/i915/gt/intel_gtt.h   |   3 +-
 drivers/gpu/drm/i915/gt/intel_migrate.c   |  11 +-
 drivers/gpu/drm/i915/gt/intel_ppgtt.c |   6 +-
 .../gpu/drm/i915/gt/intel_ring_submission.c   |   4 +-
 drivers/gpu/drm/i915/gt/intel_timeline.c  |   2 +-
 drivers/gpu/drm/i915/gt/selftest_hangcheck.c  |   2 +-
 drivers/gpu/drm/i915/gt/selftest_migrate.c|   9 +-
 drivers/gpu/drm/i915/gt/selftest_reset.c  |  14 +-
 drivers/gpu/drm/i915/gt/selftest_tlb.c|   5 +-
 .../gpu/drm/i915/gt/selftest_workarounds.c|   2 +-
 drivers/gpu/drm/i915/gt/uc/intel_uc_fw.c  |   8 +-
 drivers/gpu/drm/i915/i915_cache.c |  93 
 drivers/gpu/drm/i915/i915_cache.h |  81 +++
 drivers/gpu/drm/i915/i915_debugfs.c   |  53 +--
 drivers/gpu/drm/i915/i915_driver.c|   5 +
 drivers/gpu/drm/i915/i915_drv.h   |   2 +
 drivers/gpu/drm/i915/i915_gem.c   |  21 +--
 drivers/gpu/drm/i915/i915_gpu_error.c |   8 +-
 drivers/gpu/drm/i915/i915_pci.c   |  84 ++-
 drivers/gpu/drm/i915/i915_perf.c 

[PATCH 5/5] drm/i915: Implement fdinfo memory stats printing

2023-07-27 Thread Tvrtko Ursulin
From: Tvrtko Ursulin 

Use the newly added drm_print_memory_stats helper to show memory
utilisation of our objects in drm/driver specific fdinfo output.

To collect the stats we walk the per memory regions object lists
and accumulate object size into the respective drm_memory_stats
categories.

Objects with multiple possible placements are reported in multiple
regions for total and shared sizes, while other categories are
counted only for the currently active region.

Signed-off-by: Tvrtko Ursulin 
Cc: Aravind Iddamsetty 
Cc: Rob Clark 
---
 drivers/gpu/drm/i915/i915_drm_client.c | 85 ++
 1 file changed, 85 insertions(+)

diff --git a/drivers/gpu/drm/i915/i915_drm_client.c 
b/drivers/gpu/drm/i915/i915_drm_client.c
index a61356012df8..9e7a6075ee25 100644
--- a/drivers/gpu/drm/i915/i915_drm_client.c
+++ b/drivers/gpu/drm/i915/i915_drm_client.c
@@ -45,6 +45,89 @@ void __i915_drm_client_free(struct kref *kref)
 }
 
 #ifdef CONFIG_PROC_FS
+static void
+obj_meminfo(struct drm_i915_gem_object *obj,
+   struct drm_memory_stats stats[INTEL_REGION_UNKNOWN])
+{
+   struct intel_memory_region *mr;
+   u64 sz = obj->base.size;
+   enum intel_region_id id;
+   unsigned int i;
+
+   /* Attribute size and shared to all possible memory regions. */
+   for (i = 0; i < obj->mm.n_placements; i++) {
+   mr = obj->mm.placements[i];
+   id = mr->id;
+
+   if (obj->base.handle_count > 1)
+   stats[id].shared += sz;
+   else
+   stats[id].private += sz;
+   }
+
+   /* Attribute other categories to only the current region. */
+   mr = obj->mm.region;
+   if (mr)
+   id = mr->id;
+   else
+   id = INTEL_REGION_SMEM;
+
+   if (!obj->mm.n_placements) {
+   if (obj->base.handle_count > 1)
+   stats[id].shared += sz;
+   else
+   stats[id].private += sz;
+   }
+
+   if (i915_gem_object_has_pages(obj)) {
+   stats[id].resident += sz;
+
+   if (!dma_resv_test_signaled(obj->base.resv,
+   dma_resv_usage_rw(true)))
+   stats[id].active += sz;
+   else if (i915_gem_object_is_shrinkable(obj) &&
+obj->mm.madv == I915_MADV_DONTNEED)
+   stats[id].purgeable += sz;
+   }
+}
+
+static void show_meminfo(struct drm_printer *p, struct drm_file *file)
+{
+   struct drm_memory_stats stats[INTEL_REGION_UNKNOWN] = {};
+   struct drm_i915_file_private *fpriv = file->driver_priv;
+   struct i915_drm_client *client = fpriv->client;
+   struct drm_i915_private *i915 = fpriv->i915;
+   struct drm_i915_gem_object *obj;
+   struct intel_memory_region *mr;
+   struct list_head *pos;
+   unsigned int id;
+
+   /* Public objects. */
+   spin_lock(>table_lock);
+   idr_for_each_entry(>object_idr, obj, id)
+   obj_meminfo(obj, stats);
+   spin_unlock(>table_lock);
+
+   /* Internal objects. */
+   rcu_read_lock();
+   list_for_each_rcu(pos, >objects_list) {
+   obj = i915_gem_object_get_rcu(list_entry(pos, typeof(*obj),
+client_link));
+   if (!obj)
+   continue;
+   obj_meminfo(obj, stats);
+   i915_gem_object_put(obj);
+   }
+   rcu_read_unlock();
+
+   for_each_memory_region(mr, i915, id)
+   drm_print_memory_stats(p,
+  [id],
+  DRM_GEM_OBJECT_RESIDENT |
+  DRM_GEM_OBJECT_PURGEABLE,
+  mr->name);
+}
+
 static const char * const uabi_class_names[] = {
[I915_ENGINE_CLASS_RENDER] = "render",
[I915_ENGINE_CLASS_COPY] = "copy",
@@ -106,6 +189,8 @@ void i915_drm_client_fdinfo(struct drm_printer *p, struct 
drm_file *file)
 * **
 */
 
+   show_meminfo(p, file);
+
if (GRAPHICS_VER(i915) < 8)
return;
 
-- 
2.39.2



[PATCH v6 0/5] fdinfo memory stats

2023-07-27 Thread Tvrtko Ursulin
From: Tvrtko Ursulin 

A short series to enable fdinfo memory stats for i915.

I added tracking of most classes of objects (user objects, page tables, context
state, ring buffers) which contribute to a client's memory footprint and am
accounting their memory use along similar lines as in Rob's msm code, just
that with i915 specific code we can show a memory region breakdown and so
support discrete and multi-tile GPUs properly. And also reflect that our objects
can have multiple allowed backing stores.

The existing helper Rob added is then used to dump the per memory region stats
to fdinfo.

The basic objects-per-client infrastructure can later be extended to cover all
objects and so avoid needing to walk the IDR under the client's file table lock,
which would further avoid disturbing the running clients by parallel fdinfo
readers.

Example fdinfo format:

# cat /proc/1383/fdinfo/8
pos:0
flags:  0212
mnt_id: 21
ino:397
drm-driver: i915
drm-client-id:  18
drm-pdev:   :00:02.0
drm-total-system:   125 MiB
drm-shared-system:  16 MiB
drm-active-system:  110 MiB
drm-resident-system:125 MiB
drm-purgeable-system:   2 MiB
drm-total-stolen-system:0
drm-shared-stolen-system:   0
drm-active-stolen-system:   0
drm-resident-stolen-system: 0
drm-purgeable-stolen-system:0
drm-engine-render:  25662044495 ns
drm-engine-copy:0 ns
drm-engine-video:   0 ns
drm-engine-video-enhance:   0 ns

Example gputop output:

DRM minor 0
 PID SMEM  SMEMRSS   render copy videoNAME
1233 124M 124M |||||||| neverball
1130  59M  59M |█▌  ||||||| Xorg
1207  12M  12M |||||||| xfwm4

Or with Wayland:

DRM minor 0
 PID  MEM  RSSrendercopy videovideo-enhance NAME
2093 191M 191M |▊  ||   ||   ||   | 
gnome-shell
DRM minor 128
 PID  MEM  RSSrendercopy videovideo-enhance NAME
2551  71M  71M |██▉||   ||   ||   | 
neverball
2553  50M  50M |   ||   ||   ||   | 
Xwayland

v2:
 * Now actually per client.

v3:
 * Track imported dma-buf objects.

v4:
 * Rely on DRM GEM handles for tracking user objects.
 * Fix internal object accounting (no placements).

v5:
 * Fixed brain fart of overwriting the loop cursor.
 * Fixed object destruction racing with fdinfo reads.
 * Take reference to GEM context while using it.

v6:
 * Rebase, cover letter update.

Tvrtko Ursulin (5):
  drm/i915: Add ability for tracking buffer objects per client
  drm/i915: Record which client owns a VM
  drm/i915: Track page table backing store usage
  drm/i915: Account ring buffer and context state storage
  drm/i915: Implement fdinfo memory stats printing

 drivers/gpu/drm/i915/gem/i915_gem_context.c   |  11 +-
 .../gpu/drm/i915/gem/i915_gem_context_types.h |   3 +
 drivers/gpu/drm/i915/gem/i915_gem_object.c|  13 +-
 .../gpu/drm/i915/gem/i915_gem_object_types.h  |  12 ++
 .../gpu/drm/i915/gem/selftests/mock_context.c |   4 +-
 drivers/gpu/drm/i915/gt/intel_context.c   |  14 ++
 drivers/gpu/drm/i915/gt/intel_gtt.c   |   6 +
 drivers/gpu/drm/i915/gt/intel_gtt.h   |   1 +
 drivers/gpu/drm/i915/i915_drm_client.c| 131 ++
 drivers/gpu/drm/i915/i915_drm_client.h|  41 ++
 10 files changed, 228 insertions(+), 8 deletions(-)

-- 
2.39.2



[PATCH 1/5] drm/i915: Add ability for tracking buffer objects per client

2023-07-27 Thread Tvrtko Ursulin
From: Tvrtko Ursulin 

In order to show per client memory usage lets add some infrastructure
which enables tracking buffer objects owned by clients.

We add a per client list protected by a new per client lock and to support
delayed destruction (post client exit) we make tracked objects hold
references to the owning client.

Also, object memory region teardown is moved to the existing RCU free
callback to allow safe dereference from the fdinfo RCU read section.

Signed-off-by: Tvrtko Ursulin 
---
 drivers/gpu/drm/i915/gem/i915_gem_object.c| 13 +--
 .../gpu/drm/i915/gem/i915_gem_object_types.h  | 12 +++
 drivers/gpu/drm/i915/i915_drm_client.c| 36 +++
 drivers/gpu/drm/i915/i915_drm_client.h| 32 +
 4 files changed, 90 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_object.c 
b/drivers/gpu/drm/i915/gem/i915_gem_object.c
index 97ac6fb37958..3dc4fbb67d2b 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_object.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_object.c
@@ -105,6 +105,10 @@ void i915_gem_object_init(struct drm_i915_gem_object *obj,
 
INIT_LIST_HEAD(>mm.link);
 
+#ifdef CONFIG_PROC_FS
+   INIT_LIST_HEAD(>client_link);
+#endif
+
INIT_LIST_HEAD(>lut_list);
spin_lock_init(>lut_lock);
 
@@ -292,6 +296,10 @@ void __i915_gem_free_object_rcu(struct rcu_head *head)
container_of(head, typeof(*obj), rcu);
struct drm_i915_private *i915 = to_i915(obj->base.dev);
 
+   /* We need to keep this alive for RCU read access from fdinfo. */
+   if (obj->mm.n_placements > 1)
+   kfree(obj->mm.placements);
+
i915_gem_object_free(obj);
 
GEM_BUG_ON(!atomic_read(>mm.free_count));
@@ -388,9 +396,6 @@ void __i915_gem_free_object(struct drm_i915_gem_object *obj)
if (obj->ops->release)
obj->ops->release(obj);
 
-   if (obj->mm.n_placements > 1)
-   kfree(obj->mm.placements);
-
if (obj->shares_resv_from)
i915_vm_resv_put(obj->shares_resv_from);
 
@@ -441,6 +446,8 @@ static void i915_gem_free_object(struct drm_gem_object 
*gem_obj)
 
GEM_BUG_ON(i915_gem_object_is_framebuffer(obj));
 
+   i915_drm_client_remove_object(obj);
+
/*
 * Before we free the object, make sure any pure RCU-only
 * read-side critical sections are complete, e.g.
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_object_types.h 
b/drivers/gpu/drm/i915/gem/i915_gem_object_types.h
index e72c57716bee..8de2b91b3edf 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_object_types.h
+++ b/drivers/gpu/drm/i915/gem/i915_gem_object_types.h
@@ -300,6 +300,18 @@ struct drm_i915_gem_object {
 */
struct i915_address_space *shares_resv_from;
 
+#ifdef CONFIG_PROC_FS
+   /**
+* @client: @i915_drm_client which created the object
+*/
+   struct i915_drm_client *client;
+
+   /**
+* @client_link: Link into @i915_drm_client.objects_list
+*/
+   struct list_head client_link;
+#endif
+
union {
struct rcu_head rcu;
struct llist_node freed;
diff --git a/drivers/gpu/drm/i915/i915_drm_client.c 
b/drivers/gpu/drm/i915/i915_drm_client.c
index 2a44b3876cb5..2e5e69edc0f9 100644
--- a/drivers/gpu/drm/i915/i915_drm_client.c
+++ b/drivers/gpu/drm/i915/i915_drm_client.c
@@ -28,6 +28,10 @@ struct i915_drm_client *i915_drm_client_alloc(void)
kref_init(>kref);
spin_lock_init(>ctx_lock);
INIT_LIST_HEAD(>ctx_list);
+#ifdef CONFIG_PROC_FS
+   spin_lock_init(>objects_lock);
+   INIT_LIST_HEAD(>objects_list);
+#endif
 
return client;
 }
@@ -108,4 +112,36 @@ void i915_drm_client_fdinfo(struct drm_printer *p, struct 
drm_file *file)
for (i = 0; i < ARRAY_SIZE(uabi_class_names); i++)
show_client_class(p, i915, file_priv->client, i);
 }
+
+void i915_drm_client_add_object(struct i915_drm_client *client,
+   struct drm_i915_gem_object *obj)
+{
+   unsigned long flags;
+
+   GEM_WARN_ON(obj->client);
+   GEM_WARN_ON(!list_empty(>client_link));
+
+   spin_lock_irqsave(>objects_lock, flags);
+   obj->client = i915_drm_client_get(client);
+   list_add_tail_rcu(>client_link, >objects_list);
+   spin_unlock_irqrestore(>objects_lock, flags);
+}
+
+bool i915_drm_client_remove_object(struct drm_i915_gem_object *obj)
+{
+   struct i915_drm_client *client = fetch_and_zero(>client);
+   unsigned long flags;
+
+   /* Object may not be associated with a client. */
+   if (!client)
+   return false;
+
+   spin_lock_irqsave(>objects_lock, flags);
+   list_del_rcu(>client_link);
+   spin_unlock_irqrestore(>objects_lock, flags);
+
+   i915_drm_client_put(client);
+
+   return true;
+}

[PATCH 4/5] drm/i915: Account ring buffer and context state storage

2023-07-27 Thread Tvrtko Ursulin
From: Tvrtko Ursulin 

Account ring buffers and logical context space against the owning client
memory usage stats.

Signed-off-by: Tvrtko Ursulin 
Reviewed-by: Aravind Iddamsetty 
---
 drivers/gpu/drm/i915/gt/intel_context.c | 14 ++
 drivers/gpu/drm/i915/i915_drm_client.c  | 10 ++
 drivers/gpu/drm/i915/i915_drm_client.h  |  9 +
 3 files changed, 33 insertions(+)

diff --git a/drivers/gpu/drm/i915/gt/intel_context.c 
b/drivers/gpu/drm/i915/gt/intel_context.c
index a53b26178f0a..a2f1245741bb 100644
--- a/drivers/gpu/drm/i915/gt/intel_context.c
+++ b/drivers/gpu/drm/i915/gt/intel_context.c
@@ -6,6 +6,7 @@
 #include "gem/i915_gem_context.h"
 #include "gem/i915_gem_pm.h"
 
+#include "i915_drm_client.h"
 #include "i915_drv.h"
 #include "i915_trace.h"
 
@@ -50,6 +51,7 @@ intel_context_create(struct intel_engine_cs *engine)
 
 int intel_context_alloc_state(struct intel_context *ce)
 {
+   struct i915_gem_context *ctx;
int err = 0;
 
if (mutex_lock_interruptible(>pin_mutex))
@@ -66,6 +68,18 @@ int intel_context_alloc_state(struct intel_context *ce)
goto unlock;
 
set_bit(CONTEXT_ALLOC_BIT, >flags);
+
+   rcu_read_lock();
+   ctx = rcu_dereference(ce->gem_context);
+   if (ctx && !kref_get_unless_zero(>ref))
+   ctx = NULL;
+   rcu_read_unlock();
+   if (ctx) {
+   if (ctx->client)
+   i915_drm_client_add_context_objects(ctx->client,
+   ce);
+   i915_gem_context_put(ctx);
+   }
}
 
 unlock:
diff --git a/drivers/gpu/drm/i915/i915_drm_client.c 
b/drivers/gpu/drm/i915/i915_drm_client.c
index 2e5e69edc0f9..a61356012df8 100644
--- a/drivers/gpu/drm/i915/i915_drm_client.c
+++ b/drivers/gpu/drm/i915/i915_drm_client.c
@@ -144,4 +144,14 @@ bool i915_drm_client_remove_object(struct 
drm_i915_gem_object *obj)
 
return true;
 }
+
+void i915_drm_client_add_context_objects(struct i915_drm_client *client,
+struct intel_context *ce)
+{
+   if (ce->state)
+   i915_drm_client_add_object(client, ce->state->obj);
+
+   if (ce->ring != ce->engine->legacy.ring && ce->ring->vma)
+   i915_drm_client_add_object(client, ce->ring->vma->obj);
+}
 #endif
diff --git a/drivers/gpu/drm/i915/i915_drm_client.h 
b/drivers/gpu/drm/i915/i915_drm_client.h
index 5f58fdf7dcb8..69cedfcd3d69 100644
--- a/drivers/gpu/drm/i915/i915_drm_client.h
+++ b/drivers/gpu/drm/i915/i915_drm_client.h
@@ -14,6 +14,7 @@
 
 #include "i915_file_private.h"
 #include "gem/i915_gem_object_types.h"
+#include "gt/intel_context_types.h"
 
 #define I915_LAST_UABI_ENGINE_CLASS I915_ENGINE_CLASS_COMPUTE
 
@@ -70,6 +71,8 @@ void i915_drm_client_fdinfo(struct drm_printer *p, struct 
drm_file *file);
 void i915_drm_client_add_object(struct i915_drm_client *client,
struct drm_i915_gem_object *obj);
 bool i915_drm_client_remove_object(struct drm_i915_gem_object *obj);
+void i915_drm_client_add_context_objects(struct i915_drm_client *client,
+struct intel_context *ce);
 #else
 static inline void i915_drm_client_add_object(struct i915_drm_client *client,
  struct drm_i915_gem_object *obj)
@@ -79,6 +82,12 @@ static inline void i915_drm_client_add_object(struct 
i915_drm_client *client,
 static inline bool i915_drm_client_remove_object(struct drm_i915_gem_object 
*obj)
 {
 }
+
+static inline void
+i915_drm_client_add_context_objects(struct i915_drm_client *client,
+   struct intel_context *ce)
+{
+}
 #endif
 
 #endif /* !__I915_DRM_CLIENT_H__ */
-- 
2.39.2



[PATCH 3/5] drm/i915: Track page table backing store usage

2023-07-27 Thread Tvrtko Ursulin
From: Tvrtko Ursulin 

Account page table backing store against the owning client memory usage
stats.

Signed-off-by: Tvrtko Ursulin 
Reviewed-by: Aravind Iddamsetty 
---
 drivers/gpu/drm/i915/gt/intel_gtt.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/drivers/gpu/drm/i915/gt/intel_gtt.c 
b/drivers/gpu/drm/i915/gt/intel_gtt.c
index 731d9f2bbc56..065099362a98 100644
--- a/drivers/gpu/drm/i915/gt/intel_gtt.c
+++ b/drivers/gpu/drm/i915/gt/intel_gtt.c
@@ -58,6 +58,9 @@ struct drm_i915_gem_object *alloc_pt_lmem(struct 
i915_address_space *vm, int sz)
if (!IS_ERR(obj)) {
obj->base.resv = i915_vm_resv_get(vm);
obj->shares_resv_from = vm;
+
+   if (vm->fpriv)
+   i915_drm_client_add_object(vm->fpriv->client, obj);
}
 
return obj;
@@ -79,6 +82,9 @@ struct drm_i915_gem_object *alloc_pt_dma(struct 
i915_address_space *vm, int sz)
if (!IS_ERR(obj)) {
obj->base.resv = i915_vm_resv_get(vm);
obj->shares_resv_from = vm;
+
+   if (vm->fpriv)
+   i915_drm_client_add_object(vm->fpriv->client, obj);
}
 
return obj;
-- 
2.39.2



[PATCH 2/5] drm/i915: Record which client owns a VM

2023-07-27 Thread Tvrtko Ursulin
From: Tvrtko Ursulin 

To enable accounting of indirect client memory usage (such as page tables)
in the following patch, lets start recording the creator of each PPGTT.

Signed-off-by: Tvrtko Ursulin 
Reviewed-by: Aravind Iddamsetty 
---
 drivers/gpu/drm/i915/gem/i915_gem_context.c   | 11 ---
 drivers/gpu/drm/i915/gem/i915_gem_context_types.h |  3 +++
 drivers/gpu/drm/i915/gem/selftests/mock_context.c |  4 ++--
 drivers/gpu/drm/i915/gt/intel_gtt.h   |  1 +
 4 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c 
b/drivers/gpu/drm/i915/gem/i915_gem_context.c
index 9a9ff84c90d7..35cf6608180e 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c
@@ -279,7 +279,8 @@ static int proto_context_set_protected(struct 
drm_i915_private *i915,
 }
 
 static struct i915_gem_proto_context *
-proto_context_create(struct drm_i915_private *i915, unsigned int flags)
+proto_context_create(struct drm_i915_file_private *fpriv,
+struct drm_i915_private *i915, unsigned int flags)
 {
struct i915_gem_proto_context *pc, *err;
 
@@ -287,6 +288,7 @@ proto_context_create(struct drm_i915_private *i915, 
unsigned int flags)
if (!pc)
return ERR_PTR(-ENOMEM);
 
+   pc->fpriv = fpriv;
pc->num_user_engines = -1;
pc->user_engines = NULL;
pc->user_flags = BIT(UCONTEXT_BANNABLE) |
@@ -1621,6 +1623,7 @@ i915_gem_create_context(struct drm_i915_private *i915,
err = PTR_ERR(ppgtt);
goto err_ctx;
}
+   ppgtt->vm.fpriv = pc->fpriv;
vm = >vm;
}
if (vm)
@@ -1740,7 +1743,7 @@ int i915_gem_context_open(struct drm_i915_private *i915,
/* 0 reserved for invalid/unassigned ppgtt */
xa_init_flags(_priv->vm_xa, XA_FLAGS_ALLOC1);
 
-   pc = proto_context_create(i915, 0);
+   pc = proto_context_create(file_priv, i915, 0);
if (IS_ERR(pc)) {
err = PTR_ERR(pc);
goto err;
@@ -1822,6 +1825,7 @@ int i915_gem_vm_create_ioctl(struct drm_device *dev, void 
*data,
 
GEM_BUG_ON(id == 0); /* reserved for invalid/unassigned ppgtt */
args->vm_id = id;
+   ppgtt->vm.fpriv = file_priv;
return 0;
 
 err_put:
@@ -2284,7 +2288,8 @@ int i915_gem_context_create_ioctl(struct drm_device *dev, 
void *data,
return -EIO;
}
 
-   ext_data.pc = proto_context_create(i915, args->flags);
+   ext_data.pc = proto_context_create(file->driver_priv, i915,
+  args->flags);
if (IS_ERR(ext_data.pc))
return PTR_ERR(ext_data.pc);
 
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context_types.h 
b/drivers/gpu/drm/i915/gem/i915_gem_context_types.h
index cb78214a7dcd..c573c067779f 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_context_types.h
+++ b/drivers/gpu/drm/i915/gem/i915_gem_context_types.h
@@ -188,6 +188,9 @@ struct i915_gem_proto_engine {
  * CONTEXT_CREATE_SET_PARAM during GEM_CONTEXT_CREATE.
  */
 struct i915_gem_proto_context {
+   /** @fpriv: Client which creates the context */
+   struct drm_i915_file_private *fpriv;
+
/** @vm: See _gem_context.vm */
struct i915_address_space *vm;
 
diff --git a/drivers/gpu/drm/i915/gem/selftests/mock_context.c 
b/drivers/gpu/drm/i915/gem/selftests/mock_context.c
index 8ac6726ec16b..125584ada282 100644
--- a/drivers/gpu/drm/i915/gem/selftests/mock_context.c
+++ b/drivers/gpu/drm/i915/gem/selftests/mock_context.c
@@ -83,7 +83,7 @@ live_context(struct drm_i915_private *i915, struct file *file)
int err;
u32 id;
 
-   pc = proto_context_create(i915, 0);
+   pc = proto_context_create(fpriv, i915, 0);
if (IS_ERR(pc))
return ERR_CAST(pc);
 
@@ -152,7 +152,7 @@ kernel_context(struct drm_i915_private *i915,
struct i915_gem_context *ctx;
struct i915_gem_proto_context *pc;
 
-   pc = proto_context_create(i915, 0);
+   pc = proto_context_create(NULL, i915, 0);
if (IS_ERR(pc))
return ERR_CAST(pc);
 
diff --git a/drivers/gpu/drm/i915/gt/intel_gtt.h 
b/drivers/gpu/drm/i915/gt/intel_gtt.h
index 4d6296cdbcfd..7192a534a654 100644
--- a/drivers/gpu/drm/i915/gt/intel_gtt.h
+++ b/drivers/gpu/drm/i915/gt/intel_gtt.h
@@ -248,6 +248,7 @@ struct i915_address_space {
struct drm_mm mm;
struct intel_gt *gt;
struct drm_i915_private *i915;
+   struct drm_i915_file_private *fpriv;
struct device *dma;
u64 total;  /* size addr space maps (ex. 2GB for ggtt) */
u64 reserved;   /* size addr space reserved */
-- 
2.39.2



[PULL] drm-intel-fixes

2023-07-27 Thread Tvrtko Ursulin
Hi Dave, Daniel,

Only two small fixes for the 6.5 RC this week - one display for display
(DPT) corruption under memory pressure, and one for selftests theoretical
edge case.

Regards,

Tvrtko

drm-intel-fixes-2023-07-27:
- Use shmem for dpt objects [dpt] (Radhakrishna Sripada)
- Fix an error handling path in igt_write_huge() (Christophe JAILLET)
The following changes since commit 6eaae198076080886b9e7d57f4ae06fa782f90ef:

  Linux 6.5-rc3 (2023-07-23 15:24:10 -0700)

are available in the Git repository at:

  git://anongit.freedesktop.org/drm/drm-intel tags/drm-intel-fixes-2023-07-27

for you to fetch changes up to e354f67733115b4453268f61e6e072e9b1ea7a2f:

  drm/i915: Fix an error handling path in igt_write_huge() (2023-07-25 08:38:12 
+0100)


- Use shmem for dpt objects [dpt] (Radhakrishna Sripada)
- Fix an error handling path in igt_write_huge() (Christophe JAILLET)


Christophe JAILLET (1):
  drm/i915: Fix an error handling path in igt_write_huge()

Radhakrishna Sripada (1):
  drm/i915/dpt: Use shmem for dpt objects

 drivers/gpu/drm/i915/display/intel_dpt.c| 4 +++-
 drivers/gpu/drm/i915/gem/selftests/huge_pages.c | 6 --
 2 files changed, 7 insertions(+), 3 deletions(-)


Re: [PATCH 16/17] cgroup/drm: Expose memory stats

2023-07-26 Thread Tvrtko Ursulin



On 21/07/2023 23:21, Tejun Heo wrote:

On Wed, Jul 12, 2023 at 12:46:04PM +0100, Tvrtko Ursulin wrote:

   $ cat drm.memory.stat
   card0 region=system total=12898304 shared=0 active=0 resident=12111872 
purgeable=167936
   card0 region=stolen-system total=0 shared=0 active=0 resident=0 purgeable=0

Data is generated on demand for simplicity of implementation, i.e. no running
totals are kept or accounted during migrations and such. Various
optimisations such as cheaper collection of data are possible but
deliberately left out for now.

Overall, the feature is deemed to be useful to container orchestration
software (and manual management).

Limits, either soft or hard, are not envisaged to be implemented on top of
this approach due to the on-demand nature of collecting the stats.


So, yeah, if you want to add memory controls, we better think through how
the fd ownership migration should work.


It would be quite easy to make the implicit migration fail - just a
matter of failing the first ioctl, which is what triggers the migration,
after the file descriptor access from a new owner.


But I don't think I can really add that in the RFC given I have no hard 
controls or anything like that.


With GPU usage throttling it doesn't really apply, at least I don't 
think it does, since even when migrated to a lower budget group it would 
just get immediately de-prioritized.


I don't think hard GPU time limits are feasible in general, and while 
soft might be, again I don't see that any limiting would necessarily 
have to run immediately on implicit migration.


Second part of the story are hypothetical/future memory controls.

I think first thing to say is that implicit migration is important, but 
it is not really established to use the file descriptor from two places 
or to migrate more than once. It is simply a fresh fd which gets sent to
clients from Xorg, which is one of the legacy ways of doing things.


So we probably can just ignore that given no significant amount of 
memory ownership would be getting migrated.


And for drm.memory.stat I think what I have is good enough - both 
private and shared data get accounted, for any clients that have handles 
to particular buffers.


Maarten was working on memory controls so maybe he would have more 
thoughts on memory ownership and implicit migration.


But I don't think there is anything incompatible with that and 
drm.memory.stats as proposed here, given how the categories reported are 
the established ones from the DRM fdinfo spec, and it is fact of the 
matter that we can have multiple memory regions per driver.


The main thing that would change between this RFC and future memory 
controls in the area of drm.memory.stat is the implementation - it would 
have to get changed under the hood from "collect on query" to "account 
at allocation/free/etc". But that is just implementation details.


Regards,

Tvrtko


Re: [PATCH 16/17] cgroup/drm: Expose memory stats

2023-07-26 Thread Tvrtko Ursulin



On 26/07/2023 11:14, Maarten Lankhorst wrote:

Hey,

On 2023-07-22 00:21, Tejun Heo wrote:

On Wed, Jul 12, 2023 at 12:46:04PM +0100, Tvrtko Ursulin wrote:

   $ cat drm.memory.stat
   card0 region=system total=12898304 shared=0 active=0 
resident=12111872 purgeable=167936
   card0 region=stolen-system total=0 shared=0 active=0 resident=0 
purgeable=0


Data is generated on demand for simplicty of implementation ie. no 
running

totals are kept or accounted during migrations and such. Various
optimisations such as cheaper collection of data are possible but
deliberately left out for now.

Overall, the feature is deemed to be useful to container orchestration
software (and manual management).

Limits, either soft or hard, are not envisaged to be implemented on 
top of

this approach due on demand nature of collecting the stats.


So, yeah, if you want to add memory controls, we better think through how
the fd ownership migration should work.
I've taken a look at the series, since I have been working on cgroup 
memory eviction.


The scheduling stuff will work for i915, since it has a purely software 
execlist scheduler, but I don't think it will work for GuC (firmware) 
scheduling or other drivers that use the generic drm scheduler.


It actually works - I used to have a blurb in the cover letter about it 
but apparently I dropped it. Just a bit less well with many clients, 
since there are fewer priority levels.


All that the design requires from the individual drivers is some way to
react to the "you are over budget by this much" signal. The rest is 
driver and backend specific.


For something like this,  you would probably want it to work inside the 
drm scheduler first. Presumably, this can be done by setting a weight on 
each runqueue, and perhaps adding a callback to update one for a running 
queue. Calculating the weights hierarchically might be fun..


It is not needed to work in drm scheduler first. In fact drm scheduler 
based drivers can plug into what I have since it already has the notion 
of scheduling priorities.


They would only need to implement a hook which allows the cgroup
controller to query client GPU utilisation and another to receive the
over-budget signal.


Amdgpu and msm AFAIK could be easy candidates because they both support 
per client utilisation and priorities.


Looks like I need to put all this info back into the cover letter.

Also, hierarchic weights and time budgets are all already there. What 
could be done later is make this all smarter and respect the time budget 
with more precision. That would however, in many cases including Intel, 
require co-operation with the firmware. In any case it is only work in 
the implementation, while the cgroup control interface remains the same.


I have taken a look at how the rest of cgroup controllers change 
ownership when moved to a different cgroup, and the answer was: not at 
all. If we attempt to create the scheduler controls only on the first 
time the fd is used, you could probably get rid of all the tracking.


Can you send a CPU file descriptor from process A to process B and have 
CPU usage belonging to process B show up in process' A cgroup, or 
vice-versa? Nope, I am not making any sense, am I? My point being it is 
not like-to-like, model is different.


No ownership transfer would mean in wide deployments all GPU utilisation 
would be assigned to Xorg and so there is no point to any of this. No 
way to throttle a cgroup with un-important GPU clients for instance.



This can be done very easily with the drm scheduler.

WRT memory, I think the consensus is to track system memory like normal 
memory. Stolen memory doesn't need to be tracked. It's kernel-only
memory, used for internal bookkeeping only.


The only time userspace can directly manipulate stolen memory, is by 
mapping the pinned initial framebuffer to its own address space. The 
only allocation it can do is when a framebuffer is displayed, and 
framebuffer compression creates some stolen memory. Userspace is not

aware of this though, and has no way to manipulate those contents.


Stolen memory is irrelevant and not something cgroup controller knows 
about. Point is drivers say which memory regions they have and their 
utilisation.


Imagine instead of stolen it said vram0, or on Intel multi-tile it shows 
local0 and local1. People working with containers are interested to see 
this breakdown. I guess the parallel and use case here is closer to 
memory.numa_stat.


Regards,

Tvrtko


Re: [PATCH 15/17] cgroup/drm: Expose GPU utilisation

2023-07-25 Thread Tvrtko Ursulin



On 21/07/2023 23:20, Tejun Heo wrote:

On Fri, Jul 21, 2023 at 12:19:32PM -1000, Tejun Heo wrote:

On Wed, Jul 12, 2023 at 12:46:03PM +0100, Tvrtko Ursulin wrote:

+  drm.active_us
+   GPU time used by the group recursively including all child groups.


Maybe instead add drm.stat and have "usage_usec" inside? That'd be more
consistent with cpu side.


Could be, but no strong opinion from my side either way. Perhaps it boils down 
to what could be put in the file, I mean to decide whether keyed format makes 
sense or not.
 

Also, shouldn't this be keyed by the drm device?
 
It could have that too, or it could come later. Fun with GPUs is that it could be keyed not only by the device, but also by the type of GPU engine. (Which are a) vendor specific and b) some are fully independent, some partially so, and some not at all - so it could get complicated semantics-wise really fast.)


If for now I'd go with drm.stat/usage_usec containing the total time spent how 
would you suggest adding per device granularity? Files as documented are either 
flag or nested, not both at the same time. So something like:

usage_usec 10
card0 usage_usec 5
card1 usage_usec 5

Would or would not fly? Have two files along the lines of drm.stat and 
drm.dev_stat?

While on this general topic, you will notice that for memory stats I have _sort 
of_ nested keyed per device format, for example on integrated Intel GPU:

  $ cat drm.memory.stat
  card0 region=system total=12898304 shared=0 active=0 resident=12111872 
purgeable=167936
  card0 region=stolen-system total=0 shared=0 active=0 resident=0 purgeable=0

On a discrete Intel GPU two more lines would appear with memory regions of
local and local-system. But then on some server class multi-tile GPUs even 
further regions with more than one device local memory region. And users do 
want to see this granularity for container use cases at least.

Anyway, this may not be compatible with the nested key format as documented in 
cgroup-v2.rst, although it does not explicitly say.

Should I cheat and create key names based on device and memory region name and 
let userspace parse it? Like:

  $ cat drm.memory.stat
  card0.system total=12898304 shared=0 active=0 resident=12111872 
purgeable=167936
  card0.stolen-system total=0 shared=0 active=0 resident=0 purgeable=0

Regards,

Tvrtko


Re: [PATCH 12/17] cgroup/drm: Introduce weight based drm cgroup control

2023-07-25 Thread Tvrtko Ursulin



On 21/07/2023 23:17, Tejun Heo wrote:

On Wed, Jul 12, 2023 at 12:46:00PM +0100, Tvrtko Ursulin wrote:

+DRM scheduling soft limits
+~~


Please don't say soft limits for this. It means something different for
memcg, so it gets really confusing. Call it "weight based CPU time control"
and maybe call the triggering points as thresholds.


Yes sorry, you said that before and I forgot to reword it all when 
re-spinning. I have now marked it as TODO in my email client so 
hopefully next time round I don't forget.


Regards,

Tvrtko


[PATCH v2] drm/i915: Avoid GGTT flushing on non-GGTT paths of i915_vma_pin_iomap

2023-07-25 Thread Tvrtko Ursulin
From: Tvrtko Ursulin 

Commit 4bc91dbde0da ("drm/i915/lmem: Bypass aperture when lmem is available")
added a code path which does not map via GGTT, but was still setting the
ggtt write bit, and so triggering the GGTT flushing.

Fix it by not setting that bit unless the GGTT mapping path was used, and
replace the flush with wmb() in i915_vma_flush_writes().

This also works for the i915_gem_object_pin_map path added in
d976521a995a ("drm/i915: extend i915_vma_pin_iomap()").

It is hard to say if the fix has any observable effect, given that the
write-combine buffer gets flushed from intel_gt_flush_ggtt_writes too, but
apart from code clarity, skipping the needless GGTT flushing could be
beneficial on platforms with non-coherent GGTT. (See the code flow in
intel_gt_flush_ggtt_writes().)

v2:
 * Improve comment in i915_vma_flush_writes(). (Andi)

Signed-off-by: Tvrtko Ursulin 
Fixes: 4bc91dbde0da ("drm/i915/lmem: Bypass aperture when lmem is available")
References: d976521a995a ("drm/i915: extend i915_vma_pin_iomap()")
Cc: Radhakrishna Sripada 
Cc:  # v5.14+
Reviewed-by: Andi Shyti 
---
 drivers/gpu/drm/i915/i915_vma.c | 15 ++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/i915_vma.c b/drivers/gpu/drm/i915/i915_vma.c
index ffb425ba591c..7788b03b86d6 100644
--- a/drivers/gpu/drm/i915/i915_vma.c
+++ b/drivers/gpu/drm/i915/i915_vma.c
@@ -602,7 +602,9 @@ void __iomem *i915_vma_pin_iomap(struct i915_vma *vma)
if (err)
goto err_unpin;
 
-   i915_vma_set_ggtt_write(vma);
+   if (!i915_gem_object_is_lmem(vma->obj) &&
+   i915_vma_is_map_and_fenceable(vma))
+   i915_vma_set_ggtt_write(vma);
 
/* NB Access through the GTT requires the device to be awake. */
return page_mask_bits(ptr);
@@ -615,8 +617,19 @@ void __iomem *i915_vma_pin_iomap(struct i915_vma *vma)
 
 void i915_vma_flush_writes(struct i915_vma *vma)
 {
+   /*
+* i915_vma_iomap() could have mapped the underlying memory in one
+* of the three ways, depending on which we have to choose the most
+* appropriate flushing mechanism.
+*
+* If the mapping method was via the aperture the appropriate flag will
+* be set via i915_vma_set_ggtt_write(), and if not then we know it is
+* enough to simply flush the CPU side write-combine buffer.
+*/
if (i915_vma_unset_ggtt_write(vma))
intel_gt_flush_ggtt_writes(vma->vm->gt);
+   else
+   wmb();
 }
 
 void i915_vma_unpin_iomap(struct i915_vma *vma)
-- 
2.39.2



Re: [Intel-gfx] [PATCH] drm/i915: Avoid GGTT flushing on non-GGTT paths of i915_vma_pin_iomap

2023-07-25 Thread Tvrtko Ursulin



On 24/07/2023 21:16, Andi Shyti wrote:

Hi Tvrtko,

On Mon, Jul 24, 2023 at 01:56:33PM +0100, Tvrtko Ursulin wrote:

From: Tvrtko Ursulin 

Commit 4bc91dbde0da ("drm/i915/lmem: Bypass aperture when lmem is available")
added a code path which does not map via GGTT, but was still setting the
ggtt write bit, and so triggering the GGTT flushing.

Fix it by not setting that bit unless the GGTT mapping path was used, and
replace the flush with wmb() in i915_vma_flush_writes().

This also works for the i915_gem_object_pin_map path added in
d976521a995a ("drm/i915: extend i915_vma_pin_iomap()").

It is hard to say if the fix has any observable effect, given that the
write-combine buffer gets flushed from intel_gt_flush_ggtt_writes too, but
apart from code clarity, skipping the needless GGTT flushing could be
beneficial on platforms with non-coherent GGTT. (See the code flow in
intel_gt_flush_ggtt_writes().)

Signed-off-by: Tvrtko Ursulin 
Fixes: 4bc91dbde0da ("drm/i915/lmem: Bypass aperture when lmem is available")
References: d976521a995a ("drm/i915: extend i915_vma_pin_iomap()")
Cc: Radhakrishna Sripada 
Cc:  # v5.14+
---
  drivers/gpu/drm/i915/i915_vma.c | 6 +-
  1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/i915_vma.c b/drivers/gpu/drm/i915/i915_vma.c
index ffb425ba591c..f2b626cd2755 100644
--- a/drivers/gpu/drm/i915/i915_vma.c
+++ b/drivers/gpu/drm/i915/i915_vma.c
@@ -602,7 +602,9 @@ void __iomem *i915_vma_pin_iomap(struct i915_vma *vma)
if (err)
goto err_unpin;
  
-	i915_vma_set_ggtt_write(vma);

+   if (!i915_gem_object_is_lmem(vma->obj) &&
+   i915_vma_is_map_and_fenceable(vma))
+   i915_vma_set_ggtt_write(vma);
  
  	/* NB Access through the GTT requires the device to be awake. */

return page_mask_bits(ptr);
@@ -617,6 +619,8 @@ void i915_vma_flush_writes(struct i915_vma *vma)
  {
if (i915_vma_unset_ggtt_write(vma))
intel_gt_flush_ggtt_writes(vma->vm->gt);
+   else
+   wmb(); /* Just flush the write-combine buffer. */


is flush the right word? Can you expand more the explanation in
this comment and why this point of synchronization is needed
here? (I am even wondering if it is really needed).


If you are hinting flush isn't the right word then I am not remembering 
what else do we use for it?


It is needed because i915_vma_flush_writes()'s point AFAIU is to make sure 
CPU writes after i915_vma_pin_iomap() have landed in RAM. All three 
methods the latter can map the buffer are WC, therefore "flushing" of 
the WC buffer is needed for former to do something (what it promises).


Currently the wmb() is in intel_gt_flush_ggtt_writes(). But only one of 
the three mapping paths is via GGTT. So my logic is that calling it for 
paths not interacting with GGTT is confusing and not needed.



Anyway, it looks good:

Reviewed-by: Andi Shyti 


Thanks. If you don't see a hole in my logic I can improve the comment. I 
considered it initially but then thought it is obvious enough from 
looking at the i915_vma_pin_iomap. I can comment it more.


Regards,

Tvrtko



Andi


  }
  
  void i915_vma_unpin_iomap(struct i915_vma *vma)

--
2.39.2


Re: [PATCH] drm/i915: Avoid GGTT flushing on non-GGTT paths of i915_vma_pin_iomap

2023-07-25 Thread Tvrtko Ursulin



On 25/07/2023 00:38, Sripada, Radhakrishna wrote:

Hi Tvrtko,

The changes makes sense and based on the description looks good.
I am bit skeptical about the exec buffer failure reported by ci hence,
withholding the r-b for now. If you believe the CI failure is unrelated
please feel free to add my r-b.


This failure:
https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_121236v1/shard-snb7/igt@gem_pp...@blt-vs-render-ctxn.html

Test or machine is not entirely stable looking at it's history, but with 
a couple different failure signatures:


https://intel-gfx-ci.01.org/tree/drm-tip/igt@gem_pp...@blt-vs-render-ctxn.html

But agreed that we need to be careful. I requested a re-run for a start.


On a side note on platforms with non-coherent ggtt do we really
need to use the barriers twice under intel_gt_flush_ggtt_writes?


You mean:

intel_gt_flush_ggtt_writes()
{
...
wmb();
...
intel_gt_chipset_flush();
wmb();

?

I'd guess it is not needed twice on the intel_gt_flush_ggtt_writes() 
path, but happens to be like that for direct callers of 
intel_gt_chipset_flush().


Maybe there is scope to tidy this all, for instance the first direct 
caller I opened does this:


rpcs_query_batch()
{
...
__i915_gem_object_flush_map(rpcs, 0, 64);
i915_gem_object_unpin_map(rpcs);

intel_gt_chipset_flush(vma->vm->gt);

Where I think __i915_gem_object_flush_map() could actually do the right 
thing and issue a flush appropriate for the mapping that was used. But 
it is work and double flush does not really harm. I don't think it does 
at least.


Regards,

Tvrtko



--Radhakrishna(RK) Sripada


-Original Message-
From: Tvrtko Ursulin 
Sent: Monday, July 24, 2023 5:57 AM
To: intel-...@lists.freedesktop.org; dri-devel@lists.freedesktop.org
Cc: Ursulin, Tvrtko ; Sripada, Radhakrishna
; sta...@vger.kernel.org
Subject: [PATCH] drm/i915: Avoid GGTT flushing on non-GGTT paths of
i915_vma_pin_iomap

From: Tvrtko Ursulin 

Commit 4bc91dbde0da ("drm/i915/lmem: Bypass aperture when lmem is
available")
added a code path which does not map via GGTT, but was still setting the
ggtt write bit, and so triggering the GGTT flushing.

Fix it by not setting that bit unless the GGTT mapping path was used, and
replace the flush with wmb() in i915_vma_flush_writes().

This also works for the i915_gem_object_pin_map path added in
d976521a995a ("drm/i915: extend i915_vma_pin_iomap()").

It is hard to say if the fix has any observable effect, given that the
write-combine buffer gets flushed from intel_gt_flush_ggtt_writes too, but
apart from code clarity, skipping the needless GGTT flushing could be
beneficial on platforms with non-coherent GGTT. (See the code flow in
intel_gt_flush_ggtt_writes().)

Signed-off-by: Tvrtko Ursulin 
Fixes: 4bc91dbde0da ("drm/i915/lmem: Bypass aperture when lmem is
available")
References: d976521a995a ("drm/i915: extend i915_vma_pin_iomap()")
Cc: Radhakrishna Sripada 
Cc:  # v5.14+
---
  drivers/gpu/drm/i915/i915_vma.c | 6 +-
  1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/i915_vma.c
b/drivers/gpu/drm/i915/i915_vma.c
index ffb425ba591c..f2b626cd2755 100644
--- a/drivers/gpu/drm/i915/i915_vma.c
+++ b/drivers/gpu/drm/i915/i915_vma.c
@@ -602,7 +602,9 @@ void __iomem *i915_vma_pin_iomap(struct i915_vma
*vma)
if (err)
goto err_unpin;

-   i915_vma_set_ggtt_write(vma);
+   if (!i915_gem_object_is_lmem(vma->obj) &&
+   i915_vma_is_map_and_fenceable(vma))
+   i915_vma_set_ggtt_write(vma);

/* NB Access through the GTT requires the device to be awake. */
return page_mask_bits(ptr);
@@ -617,6 +619,8 @@ void i915_vma_flush_writes(struct i915_vma *vma)
  {
if (i915_vma_unset_ggtt_write(vma))
intel_gt_flush_ggtt_writes(vma->vm->gt);
+   else
+   wmb(); /* Just flush the write-combine buffer. */
  }

  void i915_vma_unpin_iomap(struct i915_vma *vma)
--
2.39.2




[PATCH] drm/i915: Avoid GGTT flushing on non-GGTT paths of i915_vma_pin_iomap

2023-07-24 Thread Tvrtko Ursulin
From: Tvrtko Ursulin 

Commit 4bc91dbde0da ("drm/i915/lmem: Bypass aperture when lmem is available")
added a code path which does not map via GGTT, but was still setting the
ggtt write bit, and so triggering the GGTT flushing.

Fix it by not setting that bit unless the GGTT mapping path was used, and
replace the flush with wmb() in i915_vma_flush_writes().

This also works for the i915_gem_object_pin_map path added in
d976521a995a ("drm/i915: extend i915_vma_pin_iomap()").

It is hard to say if the fix has any observable effect, given that the
write-combine buffer gets flushed from intel_gt_flush_ggtt_writes too, but
apart from code clarity, skipping the needless GGTT flushing could be
beneficial on platforms with non-coherent GGTT. (See the code flow in
intel_gt_flush_ggtt_writes().)

Signed-off-by: Tvrtko Ursulin 
Fixes: 4bc91dbde0da ("drm/i915/lmem: Bypass aperture when lmem is available")
References: d976521a995a ("drm/i915: extend i915_vma_pin_iomap()")
Cc: Radhakrishna Sripada 
Cc:  # v5.14+
---
 drivers/gpu/drm/i915/i915_vma.c | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/i915_vma.c b/drivers/gpu/drm/i915/i915_vma.c
index ffb425ba591c..f2b626cd2755 100644
--- a/drivers/gpu/drm/i915/i915_vma.c
+++ b/drivers/gpu/drm/i915/i915_vma.c
@@ -602,7 +602,9 @@ void __iomem *i915_vma_pin_iomap(struct i915_vma *vma)
if (err)
goto err_unpin;
 
-   i915_vma_set_ggtt_write(vma);
+   if (!i915_gem_object_is_lmem(vma->obj) &&
+   i915_vma_is_map_and_fenceable(vma))
+   i915_vma_set_ggtt_write(vma);
 
/* NB Access through the GTT requires the device to be awake. */
return page_mask_bits(ptr);
@@ -617,6 +619,8 @@ void i915_vma_flush_writes(struct i915_vma *vma)
 {
if (i915_vma_unset_ggtt_write(vma))
intel_gt_flush_ggtt_writes(vma->vm->gt);
+   else
+   wmb(); /* Just flush the write-combine buffer. */
 }
 
 void i915_vma_unpin_iomap(struct i915_vma *vma)
-- 
2.39.2



[PATCH] drm/i915: Tidy for_each_set_bit usage with abox_regs

2023-07-24 Thread Tvrtko Ursulin
From: Tvrtko Ursulin 

For_each_set_bit wants the max number of bits to walk and not the byte
storage size of the source field.

In this case there is no bug since abox_mask can mostly contain bits 0-2.

Another funny thing is that both sizeof(abox_mask), where abox_mask is
unsigned long, and BITS_PER_TYPE(DISPLAY_INFO->abox_mask)), are 8 (on
64-bit builds) so there is even less between them.

Anyway, why not make it explicit to what the constraint is.

Signed-off-by: Tvrtko Ursulin 
References: 62afef2811e4 ("drm/i915/rkl: RKL uses ABOX0 for pixel transfers")
Cc: Ville Syrjälä 
Cc: Aditya Swarup 
Cc: Matt Roper 
---
 drivers/gpu/drm/i915/display/intel_display_power.c | 9 ++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/i915/display/intel_display_power.c 
b/drivers/gpu/drm/i915/display/intel_display_power.c
index 38225e5d311e..27a484892908 100644
--- a/drivers/gpu/drm/i915/display/intel_display_power.c
+++ b/drivers/gpu/drm/i915/display/intel_display_power.c
@@ -1170,7 +1170,8 @@ static void icl_mbus_init(struct drm_i915_private 
*dev_priv)
if (DISPLAY_VER(dev_priv) == 12)
abox_regs |= BIT(0);
 
-   for_each_set_bit(i, &abox_regs, sizeof(abox_regs))
+   for_each_set_bit(i, &abox_regs,
+BITS_PER_TYPE(DISPLAY_INFO(dev_priv)->abox_mask))
intel_de_rmw(dev_priv, MBUS_ABOX_CTL(i), mask, val);
 }
 
@@ -1623,11 +1624,13 @@ static void tgl_bw_buddy_init(struct drm_i915_private 
*dev_priv)
if (table[config].page_mask == 0) {
drm_dbg(&dev_priv->drm,
"Unknown memory configuration; disabling address buddy 
logic.\n");
-   for_each_set_bit(i, &abox_mask, sizeof(abox_mask))
+   for_each_set_bit(i, &abox_mask,
+
BITS_PER_TYPE(DISPLAY_INFO(dev_priv)->abox_mask))
intel_de_write(dev_priv, BW_BUDDY_CTL(i),
   BW_BUDDY_DISABLE);
} else {
-   for_each_set_bit(i, &abox_mask, sizeof(abox_mask)) {
+   for_each_set_bit(i, &abox_mask,
+
BITS_PER_TYPE(DISPLAY_INFO(dev_priv)->abox_mask)) {
intel_de_write(dev_priv, BW_BUDDY_PAGE_MASK(i),
   table[config].page_mask);
 
-- 
2.39.2



[PATCH] drm/i915: Use the i915_vma_flush_writes helper

2023-07-21 Thread Tvrtko Ursulin
From: Tvrtko Ursulin 

We can use the existing helper in flush_write_domain() and save some lines
of code.

Signed-off-by: Tvrtko Ursulin 
---
 drivers/gpu/drm/i915/gem/i915_gem_domain.c | 6 ++
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_domain.c 
b/drivers/gpu/drm/i915/gem/i915_gem_domain.c
index dfaaa8b66ac3..ffddec1d2a76 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_domain.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_domain.c
@@ -68,10 +68,8 @@ flush_write_domain(struct drm_i915_gem_object *obj, unsigned 
int flush_domains)
switch (obj->write_domain) {
case I915_GEM_DOMAIN_GTT:
spin_lock(&obj->vma.lock);
-   for_each_ggtt_vma(vma, obj) {
-   if (i915_vma_unset_ggtt_write(vma))
-   intel_gt_flush_ggtt_writes(vma->vm->gt);
-   }
+   for_each_ggtt_vma(vma, obj)
+   i915_vma_flush_writes(vma);
spin_unlock(&obj->vma.lock);
 
i915_gem_object_flush_frontbuffer(obj, ORIGIN_CPU);
-- 
2.39.2



Re: [PATCH v3] drm/i915: Refactor PAT/object cache handling

2023-07-21 Thread Tvrtko Ursulin



On 21/07/2023 05:28, Yang, Fei wrote:

[snip]

@@ -27,15 +28,8 @@ static bool gpu_write_needs_clflush(struct
drm_i915_gem_object *obj)


The code change here looks accurate, but while we're here, I have a
side question about this function in general...it was originally
introduced in commit 48004881f693 ("drm/i915: Mark CPU cache as
dirty when used for
rendering") which states that GPU rendering ends up in the CPU cache
(and thus needs a clflush later to make sure it lands in memory).
That makes sense to me for LLC platforms, but is it really true for
non-LLC snooping platforms (like MTL) as the commit states?


For non-LLC platforms objects can be set to 1-way coherent which
means GPU rendering ending up in CPU cache as well, so for non-LLC
platform the logic here should be checking 1-way coherent flag.


That's the part that I'm questioning (and not just for MTL, but for
all of our other non-LLC platforms too).  Just because there's
coherency doesn't mean that device writes landed in the CPU cache.
Coherency is also achieved if device writes invalidate the contents of the CPU 
cache.
I thought our non-LLC snooping platforms were coherent due to
write-invalidate rather than write-update, but I can't find it
specifically documented anywhere at the moment.  If write-invalidate
was used, then there shouldn't be a need for a later clflush either.


[Trying to consolidate by doing a combined reply to the discussion so far.]

On the write-invalidate vs write-update I don't know. If you did not
find it in bspec then I doubt I would. I can have a browse still.


Matt was correct. Quote Ron Silvas from SW ARCH, "MTL GPU doesn't write to
CPU cache, it simply snoop CPU cache on its way to RAM."


Does it apply to all snooping platforms?

And for the cache level/mode based condition, how about replacing it with this:

/*
 * Fully coherent cached access may end up with data in the CPU cache
 * which hasn't hit memory yet.
 */
return i915_gem_object_has_cache_mode(obj, I915_CACHE_MODE_WB) &&
   i915_gem_object_has_cache_flag(obj, I915_CACHE_FLAG_COH2W);

?

Although that would mean old I915_CACHE_LLC on old platforms is actually 2-way 
coherent.

I am struggling to find a comprehensive explanation in bspec, but for instance 
605 makes it sounds like it is fully coherent. Perhaps it really is and I 
should fix the legacy and Gen12 table..

And if the write-invalidate applies to all snooping platforms then we extend it 
to:

/*
 * Fully coherent cached access may end up with data in the CPU cache
 * which hasn't hit memory yet.
 *
 * But not on snooping platforms, where it is impossible due
 * write-invalidate.
 */
return !HAS_SNOOP(i915) &&
   (i915_gem_object_has_cache_mode(obj, I915_CACHE_MODE_WB) &&
i915_gem_object_has_cache_flag(obj, I915_CACHE_FLAG_COH2W));

That would prevent any flushing on MTL and make you happy from that aspect.

In fact, the snooping check could be before the cache mode check.

For i915_gem_object_can_bypass_llc it would be ideal if a condition based on 
the absence of I915_BO_CACHE_COHERENT_FOR_WRITE would work. At least according 
to the kerneldoc for @cache_coherent:

 * I915_BO_CACHE_COHERENT_FOR_WRITE:
 *
 * When writing through the CPU cache, the GPU is still coherent. Note
 * that this also implies I915_BO_CACHE_COHERENT_FOR_READ.

So for objects without it set, we need to force a flush.

And make __i915_gem_object_update_coherency not set it for WB without 1-way 
coherency set.

According to bspec that would seem correct, because with 1-way snooping on MTL, 
GPU snoops the IA until first GPU access. So anything the CPU writes before the 
first GPU access would be coherent and so no need to flush in set pages. But if 
non-coherent WB is set then we need to flush.

I'll trybot it is and see what happens.


My understanding
was that snooping platforms just invalidated the CPU cache to
prevent future CPU reads from seeing stale data but didn't actually
stick any new data in there?  Am I off track or is the original
logic of this function not quite right?

Anyway, even if the logic of this function is wrong, it's a mistake
that would only hurt performance


Yes, this logic will introduce performance impact because it's
missing the checking for obj->pat_set_by_user. For objects with
pat_set_by_user==true, even if the object is snooping or 1-way
coherent, we don't want to enforce a clflush here since the
coherency is supposed to be handled by user space.


What should I add you think to fix it?


I think the simplest would be

 if (obj->pat_set_by_user)
 return false;

because even checking for incoherent WB is unnecessary, simply no
need for the KMD to initiate a flush if PAT is set by user.


Add a check for non-coherent WB in gpu_write_needs_clflush as an additional 
condition for returning 

Re: [PATCH v3] drm/i915: Refactor PAT/object cache handling

2023-07-20 Thread Tvrtko Ursulin



[Here let me just focus on the points which did not get further discussion in 
follow ups yet.]

On 19/07/2023 23:31, Matt Roper wrote:

On Wed, Jul 19, 2023 at 01:37:30PM +0100, Tvrtko Ursulin wrote:

From: Tvrtko Ursulin 

Commit 9275277d5324 ("drm/i915: use pat_index instead of cache_level") has
introduced PAT indices to i915 internal APIs, partially replacing the
usage of driver internal cache_level, but has also added a few
questionable design decisions which this patch tries to improve upon.

Principal change is to invert the per platform cache level to PAT index
table which was added by the referenced commit, and by doing so enable
i915 to understand the cache mode between PAT indices, changing them from
opaque to transparent.

Once we have the inverted table we are able to remove the hidden false
"return true" from i915_gem_object_has_cache_level.

Other changes/fixes/improvements we are able to do:

1)
Replace the enum i915_cache_level with i915_cache_t, composed of a more
detailed representation of each cache mode (base mode plus flags).

For instance this way we are able to express the difference between WB and
1-way coherent WB on Meteorlake. Which in turn enables us to map the i915
"cached" mode to the correct Meteorlake PAT index.

2)
We can cache PAT indices of the caching modes used by the driver itself in
struct drm_i915_private, which eliminates the runtime calls to
i915_gem_get_pat_index from both high- and low-level i915 components.

3)
We can also cache the caching modes used by the driver for coherent
access and for display buffers.

4)
Remove the incorrect references to enum i915_cache_level from low level
PTE encode vfuncs, since those are actually given PAT indices by their
callers.

5)
Because i915 now understands PAT indices, we can remove the overly
aggressive flushing triggered from i915_gem_object_can_bypass_llc() and
limit it to non-coherent write-back mode only.

6)
Finally we are able to replace the platform dependent cache mode to string
code in debugfs and elsewhere by the single implementation based on
i915_cache_t.

v2:
  * Fix PAT-to-cache-mode table for PVC. (Fei)
  * Cache display caching mode too. (Fei)
  * Improve and document criteria in i915_gem_object_can_bypass_llc() (Matt)

v3:
  * Checkpath issues.
  * Cache mode flags check fixed.

Signed-off-by: Tvrtko Ursulin 
Fixes: 9275277d5324 ("drm/i915: use pat_index instead of cache_level")
Cc: Chris Wilson 
Cc: Fei Yang 
Cc: Andi Shyti 
Cc: Matt Roper 
---
  drivers/gpu/drm/i915/Makefile |   1 +
  .../drm/i915/display/intel_plane_initial.c|   3 +-
  drivers/gpu/drm/i915/gem/i915_gem_domain.c|  56 ---
  drivers/gpu/drm/i915/gem/i915_gem_domain.h|   5 +-
  .../gpu/drm/i915/gem/i915_gem_execbuffer.c|  13 +-
  drivers/gpu/drm/i915/gem/i915_gem_internal.c  |   4 +-
  drivers/gpu/drm/i915/gem/i915_gem_mman.c  |  12 +-
  drivers/gpu/drm/i915/gem/i915_gem_object.c| 152 +++---
  drivers/gpu/drm/i915/gem/i915_gem_object.h|  11 +-
  .../gpu/drm/i915/gem/i915_gem_object_types.h  | 116 +
  drivers/gpu/drm/i915/gem/i915_gem_shmem.c |   8 +-
  drivers/gpu/drm/i915/gem/i915_gem_stolen.c|  11 +-
  drivers/gpu/drm/i915/gem/i915_gem_ttm_move.c  |  44 ++---
  drivers/gpu/drm/i915/gem/i915_gem_userptr.c   |   2 +-
  .../drm/i915/gem/selftests/huge_gem_object.c  |   4 +-
  .../gpu/drm/i915/gem/selftests/huge_pages.c   |   6 +-
  drivers/gpu/drm/i915/gt/gen6_ppgtt.c  |   4 +-
  drivers/gpu/drm/i915/gt/gen8_ppgtt.c  |  19 +--
  drivers/gpu/drm/i915/gt/intel_engine_cs.c |   2 +-
  drivers/gpu/drm/i915/gt/intel_ggtt.c  |  33 ++--
  drivers/gpu/drm/i915/gt/intel_ggtt_gmch.c |   4 +-
  drivers/gpu/drm/i915/gt/intel_gtt.c   |   2 +-
  drivers/gpu/drm/i915/gt/intel_gtt.h   |   3 +-
  drivers/gpu/drm/i915/gt/intel_migrate.c   |  11 +-
  drivers/gpu/drm/i915/gt/intel_ppgtt.c |   6 +-
  .../gpu/drm/i915/gt/intel_ring_submission.c   |   4 +-
  drivers/gpu/drm/i915/gt/intel_timeline.c  |   2 +-
  drivers/gpu/drm/i915/gt/selftest_hangcheck.c  |   2 +-
  drivers/gpu/drm/i915/gt/selftest_migrate.c|   9 +-
  drivers/gpu/drm/i915/gt/selftest_reset.c  |  14 +-
  drivers/gpu/drm/i915/gt/selftest_tlb.c|   5 +-
  .../gpu/drm/i915/gt/selftest_workarounds.c|   2 +-
  drivers/gpu/drm/i915/gt/uc/intel_uc_fw.c  |   8 +-
  drivers/gpu/drm/i915/i915_cache.c |  91 +++
  drivers/gpu/drm/i915/i915_cache.h |  60 +++
  drivers/gpu/drm/i915/i915_debugfs.c   |  53 +-
  drivers/gpu/drm/i915/i915_driver.c|   5 +
  drivers/gpu/drm/i915/i915_drv.h   |   5 +
  drivers/gpu/drm/i915/i915_gem.c   |  21 +--
  drivers/gpu/drm/i915/i915_gpu_error.c |   7 +-
  drivers/gpu/drm/i915/i915_pci.c   |  82 +-
  drivers/gpu/drm/i915/i915_perf.c  |   2 +-
  

Re: [PATCH v3] drm/i915: Refactor PAT/object cache handling

2023-07-20 Thread Tvrtko Ursulin



On 20/07/2023 01:22, Matt Roper wrote:

On Wed, Jul 19, 2023 at 05:07:15PM -0700, Yang, Fei wrote:

[snip]

@@ -27,15 +28,8 @@ static bool gpu_write_needs_clflush(struct 
drm_i915_gem_object *obj)


The code change here looks accurate, but while we're here, I have a side
question about this function in general...it was originally introduced
in commit 48004881f693 ("drm/i915: Mark CPU cache as dirty when used for
rendering") which states that GPU rendering ends up in the CPU cache
(and thus needs a clflush later to make sure it lands in memory).  That
makes sense to me for LLC platforms, but is it really true for non-LLC
snooping platforms (like MTL) as the commit states?


For non-LLC platforms objects can be set to 1-way coherent which means
GPU rendering ending up in CPU cache as well, so for non-LLC platform
the logic here should be checking 1-way coherent flag.


That's the part that I'm questioning (and not just for MTL, but for all
of our other non-LLC platforms too).  Just because there's coherency
doesn't mean that device writes landed in the CPU cache.  Coherency is
also achieved if device writes invalidate the contents of the CPU cache.
I thought our non-LLC snooping platforms were coherent due to
write-invalidate rather than write-update, but I can't find it
specifically documented anywhere at the moment.  If write-invalidate was
used, then there shouldn't be a need for a later clflush either.


[Trying to consolidate by doing a combined reply to the discussion so far.]

On the write-invalidate vs write-update I don't know. If you did not 
find it in bspec then I doubt I would. I can have a browse still.



My understanding
was that snooping platforms just invalidated the CPU cache to prevent
future CPU reads from seeing stale data but didn't actually stick any
new data in there?  Am I off track or is the original logic of this
function not quite right?

Anyway, even if the logic of this function is wrong, it's a mistake that
would only hurt performance


Yes, this logic will introduce performance impact because it's missing the
checking for obj->pat_set_by_user. For objects with pat_set_by_user==true,
even if the object is snooping or 1-way coherent, we don't want to enforce
a clflush here since the coherency is supposed to be handled by user space.


What should I add you think to fix it?

Add a check for non-coherent WB in gpu_write_needs_clflush as an 
additional condition for returning false?


And then if Matt is correct write-invalidate is used also !HAS_LLC 
should just return false?



(flushing more often than we truly need to)
rather than functionality, so not something we really need to dig into
right now as part of this patch.


  if (IS_DGFX(i915))
  return false;

-/*
- * For objects created by userspace through GEM_CREATE with pat_index
- * set by set_pat extension, i915_gem_object_has_cache_level() will
- * always return true, because the coherency of such object is managed
- * by userspace. Othereise the call here would fall back to checking
- * whether the object is un-cached or write-through.
- */
-return !(i915_gem_object_has_cache_level(obj, I915_CACHE_NONE) ||
- i915_gem_object_has_cache_level(obj, I915_CACHE_WT));
+return i915_gem_object_has_cache_mode(obj, I915_CACHE_MODE_UC) != 1 &&
+   i915_gem_object_has_cache_mode(obj, I915_CACHE_MODE_WT) != 1;
  }


[snip]

@@ -640,15 +640,9 @@ static inline int use_cpu_reloc(const struct reloc_cache 
*cache,
  if (DBG_FORCE_RELOC == FORCE_GTT_RELOC)
  return false;

-/*
- * For objects created by userspace through GEM_CREATE with pat_index
- * set by set_pat extension, i915_gem_object_has_cache_level() always
- * return true, otherwise the call would fall back to checking whether
- * the object is un-cached.
- */
  return (cache->has_llc ||
  obj->cache_dirty ||
-!i915_gem_object_has_cache_level(obj, I915_CACHE_NONE));
+i915_gem_object_has_cache_mode(obj, I915_CACHE_MODE_UC) != 1);


Platforms with relocations and platforms with user-specified PAT have no
overlap, right?  So a -1 return should be impossible here and this is
one case where we could just treat the return value as a boolean, right?




Hm no, or maybe. My thinking behind tri-state is to allow a safe option 
for "don't know". In case PAT index to cache mode table is not fully 
populated on some future platform.



My understanding is that the condition here means to say that, if GPU
access is uncached, don't use CPU reloc because the CPU cache might
contain stale data. This condition is sufficient for snooping platforms.
But from MTL onward, the condition should be whether the GPU access is
coherent with CPU. So, we should be checking 1-way coherent flag instead
of UC mode, because even if the GPU access is WB, it's still non-coherent,
thus CPU cache could be out-dated.


Honestly the matrix of caching decision/logic 

Re: [RFC v5 00/17] DRM cgroup controller with scheduling control and memory stats

2023-07-20 Thread Tvrtko Ursulin



Hi,

On 19/07/2023 21:31, T.J. Mercier wrote:

On Wed, Jul 12, 2023 at 4:47 AM Tvrtko Ursulin
 wrote:


   drm.memory.stat
 A nested file containing cumulative memory statistics for the whole
 sub-hierarchy, broken down into separate GPUs and separate memory
 regions supported by the latter.

 For example::

   $ cat drm.memory.stat
   card0 region=system total=12898304 shared=0 active=0 
resident=12111872 purgeable=167936
   card0 region=stolen-system total=0 shared=0 active=0 resident=0 
purgeable=0

 Card designation corresponds to the DRM device names and multiple line
 entries can be present per card.

 Memory region names should be expected to be driver specific with the
 exception of 'system' which is standardised and applicable for GPUs
 which can operate on system memory buffers.

 Sub-keys 'resident' and 'purgeable' are optional.

 Per category region usage is reported in bytes.

  * Feedback from people interested in drm.active_us and drm.memory.stat is
required to understand the use cases and their usefulness (of the fields).

Memory stats are something which was easy to add to my series, since I was
already working on the fdinfo memory stats patches, but the question is how
useful it is.


Hi Tvrtko,

I think this style of driver-defined categories for reporting of
memory could potentially allow us to eliminate the GPU memory tracking
tracepoint used on Android (gpu_mem_total). This would involve reading
drm.memory.stat at the root cgroup (I see it's currently disabled on


I can put it available under root too, don't think there is any 
technical reason to not have it. In fact, now that I look at it again, 
memory.stat is present on root so that would align with my general 
guideline to keep the two as similar as possible.



the root), which means traversing the whole cgroup tree under the
cgroup lock to generate the values on-demand. This would be done
rarely, but I still wonder what the cost of that would turn out to be.


Yeah that's ugly. I could eliminate cgroup_lock by being a bit smarter. 
Just didn't think it worth it for the RFC.


Basically to account memory stats for any sub-tree I need the equivalent 
one struct drm_memory_stats per DRM device present in the hierarchy. So 
I could pre-allocate a few and restart if run out of spares, or 
something. They are really small so pre-allocating a good number, based 
on past state or something, should work well enough. Or even total 
number of DRM devices in a system as a pessimistic and safe option for 
most reasonable deployments.



The drm_memory_stats categories in the output don't seem like a big
value-add for this use-case, but no real objection to them being


You mean the fact there are different categories is not a value add for 
your use case because you would only use one?


The idea was to align 1:1 with DRM memory stats fdinfo and somewhat 
emulate how memory.stat also offers a breakdown.



there. I know it's called the DRM cgroup controller, but it'd be nice
if there were a way to make the mem tracking part work for any driver
that wishes to participate as many of our devices don't use a DRM
driver. But making that work doesn't look like it would fit very


Ah that would be a challenge indeed to which I don't have any answers 
right now.


Hm if you have a DRM device somewhere in the chain memory stats would 
still show up. Like if you had a dma-buf producer which is not a DRM 
driver, but then that buffer was imported by a DRM driver, it would show 
up in a cgroup. Or vice-versa. But if there aren't any in the whole 
chain then it would not.



cleanly into this controller, so I'll just shut up now.


Not at all, good feedback!

Regards,

Tvrtko


Re: [PATCH 2/2] drm/i915: Avoid -Wconstant-logical-operand in nsecs_to_jiffies_timeout()

2023-07-20 Thread Tvrtko Ursulin



On 18/07/2023 22:44, Nathan Chancellor wrote:

A proposed update to clang's -Wconstant-logical-operand to warn when the
left hand side is a constant shows the following instance in
nsecs_to_jiffies_timeout() when NSEC_PER_SEC is not a multiple of HZ,
such as CONFIG_HZ=300:

   drivers/gpu/drm/i915/gem/i915_gem_wait.c:189:24: warning: use of logical 
'&&' with constant operand [-Wconstant-logical-operand]
 189 | if (NSEC_PER_SEC % HZ &&
 | ~ ^
   drivers/gpu/drm/i915/gem/i915_gem_wait.c:189:24: note: use '&' for a bitwise 
operation
 189 | if (NSEC_PER_SEC % HZ &&
 |   ^~
 |   &
   drivers/gpu/drm/i915/gem/i915_gem_wait.c:189:24: note: remove constant to 
silence this warning
   1 warning generated.

Turn this into an explicit comparison against zero to make the
expression a boolean to make it clear this should be a logical check,
not a bitwise one.


So -Wconstant-logical-operand only triggers when it is a constant but 
not a zero constant? Why does that make sense — is it not a kludge to avoid 
too much noise?


Personally, it all feels a bit over the top as a warning,  since code in 
both cases should optimise away. And we may end up papering over it if 
it becomes a default.


Then again this patch IMO does make the code more readable, so I am 
happy to take this one via our tree. Or either give ack to bring it in 
via drm-misc-next:


Acked-by: Tvrtko Ursulin 

Let me know which route works best.

Regards,

Tvrtko


Link: https://reviews.llvm.org/D142609
Signed-off-by: Nathan Chancellor 
---
  drivers/gpu/drm/i915/gem/i915_gem_wait.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_wait.c 
b/drivers/gpu/drm/i915/gem/i915_gem_wait.c
index 4a33ad2d122b..d4b918fb11ce 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_wait.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_wait.c
@@ -186,7 +186,7 @@ i915_gem_object_wait(struct drm_i915_gem_object *obj,
  static inline unsigned long nsecs_to_jiffies_timeout(const u64 n)
  {
/* nsecs_to_jiffies64() does not guard against overflow */
-   if (NSEC_PER_SEC % HZ &&
+   if ((NSEC_PER_SEC % HZ) != 0 &&
div_u64(n, NSEC_PER_SEC) >= MAX_JIFFY_OFFSET / HZ)
return MAX_JIFFY_OFFSET;
  



[PULL] drm-intel-fixes

2023-07-20 Thread Tvrtko Ursulin
Hi Dave, Daniel,

Only two fixes for the 6.5 rc window this week - one perf/OA use after
free on Xe_HP platforms and one defconfig build fix for GCC versions older
than 8.

Regards,

Tvrtko

drm-intel-fixes-2023-07-20:
- Add sentinel to xehp_oa_b_counters [perf] (Andrzej Hajda)
- Revert "drm/i915: use localized __diag_ignore_all() instead of per file" 
(Jani Nikula)
The following changes since commit fdf0eaf11452d72945af31804e2a1048ee1b574c:

  Linux 6.5-rc2 (2023-07-16 15:10:37 -0700)

are available in the Git repository at:

  git://anongit.freedesktop.org/drm/drm-intel tags/drm-intel-fixes-2023-07-20

for you to fetch changes up to 2c27770a7bc88ef7f6614d11d96d8e62017d0b78:

  Revert "drm/i915: use localized __diag_ignore_all() instead of per file" 
(2023-07-17 13:39:04 +0100)


- Add sentinel to xehp_oa_b_counters [perf] (Andrzej Hajda)
- Revert "drm/i915: use localized __diag_ignore_all() instead of per file" 
(Jani Nikula)


Andrzej Hajda (1):
  drm/i915/perf: add sentinel to xehp_oa_b_counters

Jani Nikula (1):
  Revert "drm/i915: use localized __diag_ignore_all() instead of per file"

 drivers/gpu/drm/i915/Makefile   | 5 +
 drivers/gpu/drm/i915/display/intel_display_device.c | 5 -
 drivers/gpu/drm/i915/display/intel_fbdev.c  | 5 -
 drivers/gpu/drm/i915/i915_pci.c | 5 -
 drivers/gpu/drm/i915/i915_perf.c| 1 +
 5 files changed, 6 insertions(+), 15 deletions(-)


[PATCH v3] drm/i915: Refactor PAT/object cache handling

2023-07-19 Thread Tvrtko Ursulin
From: Tvrtko Ursulin 

Commit 9275277d5324 ("drm/i915: use pat_index instead of cache_level") has
introduced PAT indices to i915 internal APIs, partially replacing the
usage of driver internal cache_level, but has also added a few
questionable design decisions which this patch tries to improve upon.

Principal change is to invert the per platform cache level to PAT index
table which was added by the referenced commit, and by doing so enable
i915 to understand the cache mode between PAT indices, changing them from
opaque to transparent.

Once we have the inverted table we are able to remove the hidden false
"return true" from i915_gem_object_has_cache_level.

Other changes/fixes/improvements we are able to do:

1)
Replace the enum i915_cache_level with i915_cache_t, composed of a more
detailed representation of each cache mode (base mode plus flags).

For instance this way we are able to express the difference between WB and
1-way coherent WB on Meteorlake. Which in turn enables us to map the i915
"cached" mode to the correct Meteorlake PAT index.

2)
We can cache PAT indices of the caching modes used by the driver itself in
struct drm_i915_private, which eliminates the runtime calls to
i915_gem_get_pat_index from both high- and low-level i915 components.

3)
We can also cache the caching modes used by the driver for coherent
access and for display buffers.

4)
Remove the incorrect references to enum i915_cache_level from low level
PTE encode vfuncs, since those are actually given PAT indices by their
callers.

5)
Because i915 now understands PAT indices, we can remove the overly
aggressive flushing triggered from i915_gem_object_can_bypass_llc() and
limit it to non-coherent write-back mode only.

6)
Finally we are able to replace the platform dependent cache mode to string
code in debugfs and elsewhere by the single implementation based on
i915_cache_t.

v2:
 * Fix PAT-to-cache-mode table for PVC. (Fei)
 * Cache display caching mode too. (Fei)
 * Improve and document criteria in i915_gem_object_can_bypass_llc() (Matt)

v3:
 * Checkpatch issues.
 * Cache mode flags check fixed.

Signed-off-by: Tvrtko Ursulin 
Fixes: 9275277d5324 ("drm/i915: use pat_index instead of cache_level")
Cc: Chris Wilson 
Cc: Fei Yang 
Cc: Andi Shyti 
Cc: Matt Roper 
---
 drivers/gpu/drm/i915/Makefile |   1 +
 .../drm/i915/display/intel_plane_initial.c|   3 +-
 drivers/gpu/drm/i915/gem/i915_gem_domain.c|  56 ---
 drivers/gpu/drm/i915/gem/i915_gem_domain.h|   5 +-
 .../gpu/drm/i915/gem/i915_gem_execbuffer.c|  13 +-
 drivers/gpu/drm/i915/gem/i915_gem_internal.c  |   4 +-
 drivers/gpu/drm/i915/gem/i915_gem_mman.c  |  12 +-
 drivers/gpu/drm/i915/gem/i915_gem_object.c| 152 +++---
 drivers/gpu/drm/i915/gem/i915_gem_object.h|  11 +-
 .../gpu/drm/i915/gem/i915_gem_object_types.h  | 116 +
 drivers/gpu/drm/i915/gem/i915_gem_shmem.c |   8 +-
 drivers/gpu/drm/i915/gem/i915_gem_stolen.c|  11 +-
 drivers/gpu/drm/i915/gem/i915_gem_ttm_move.c  |  44 ++---
 drivers/gpu/drm/i915/gem/i915_gem_userptr.c   |   2 +-
 .../drm/i915/gem/selftests/huge_gem_object.c  |   4 +-
 .../gpu/drm/i915/gem/selftests/huge_pages.c   |   6 +-
 drivers/gpu/drm/i915/gt/gen6_ppgtt.c  |   4 +-
 drivers/gpu/drm/i915/gt/gen8_ppgtt.c  |  19 +--
 drivers/gpu/drm/i915/gt/intel_engine_cs.c |   2 +-
 drivers/gpu/drm/i915/gt/intel_ggtt.c  |  33 ++--
 drivers/gpu/drm/i915/gt/intel_ggtt_gmch.c |   4 +-
 drivers/gpu/drm/i915/gt/intel_gtt.c   |   2 +-
 drivers/gpu/drm/i915/gt/intel_gtt.h   |   3 +-
 drivers/gpu/drm/i915/gt/intel_migrate.c   |  11 +-
 drivers/gpu/drm/i915/gt/intel_ppgtt.c |   6 +-
 .../gpu/drm/i915/gt/intel_ring_submission.c   |   4 +-
 drivers/gpu/drm/i915/gt/intel_timeline.c  |   2 +-
 drivers/gpu/drm/i915/gt/selftest_hangcheck.c  |   2 +-
 drivers/gpu/drm/i915/gt/selftest_migrate.c|   9 +-
 drivers/gpu/drm/i915/gt/selftest_reset.c  |  14 +-
 drivers/gpu/drm/i915/gt/selftest_tlb.c|   5 +-
 .../gpu/drm/i915/gt/selftest_workarounds.c|   2 +-
 drivers/gpu/drm/i915/gt/uc/intel_uc_fw.c  |   8 +-
 drivers/gpu/drm/i915/i915_cache.c |  91 +++
 drivers/gpu/drm/i915/i915_cache.h |  60 +++
 drivers/gpu/drm/i915/i915_debugfs.c   |  53 +-
 drivers/gpu/drm/i915/i915_driver.c|   5 +
 drivers/gpu/drm/i915/i915_drv.h   |   5 +
 drivers/gpu/drm/i915/i915_gem.c   |  21 +--
 drivers/gpu/drm/i915/i915_gpu_error.c |   7 +-
 drivers/gpu/drm/i915/i915_pci.c   |  82 +-
 drivers/gpu/drm/i915/i915_perf.c  |   2 +-
 drivers/gpu/drm/i915/intel_device_info.h  |   6 +-
 drivers/gpu/drm/i915/selftests/i915_gem.c |   5 +-
 .../gpu/drm/i915/selftests/i915_gem_evict.c   |   8 +-
 drivers/gpu/drm/i915/selftests/i915_gem_gtt.c |  13 +-
 drivers/gpu/drm/i915/selftes

[PATCH v2] drm/i915: Refactor PAT/object cache handling

2023-07-18 Thread Tvrtko Ursulin
From: Tvrtko Ursulin 

Commit 9275277d5324 ("drm/i915: use pat_index instead of cache_level") has
introduced PAT indices to i915 internal APIs, partially replacing the
usage of driver internal cache_level, but has also added a few
questionable design decisions which this patch tries to improve upon.

Principal change is to invert the per platform cache level to PAT index
table which was added by the referenced commit, and by doing so enable
i915 to understand the cache mode between PAT indices, changing them from
opaque to transparent.

Once we have the inverted table we are able to remove the hidden false
"return true" from i915_gem_object_has_cache_level.

Other changes/fixes/improvements we are able to do:

1)
Replace the enum i915_cache_level with i915_cache_t, composed of a more
detailed representation of each cache mode (base mode plus flags).

For instance this way we are able to express the difference between WB and
1-way coherent WB on Meteorlake. Which in turn enables us to map the i915
"cached" mode to the correct Meteorlake PAT index.

2)
We can cache PAT indices of the caching modes used by the driver itself in
struct drm_i915_private, which eliminates the runtime calls to
i915_gem_get_pat_index from both high- and low-level i915 components.

3)
We can also cache the caching modes used by the driver for coherent
access and for display buffers.

4)
Remove the incorrect references to enum i915_cache_level from low level
PTE encode vfuncs, since those are actually given PAT indices by their
callers.

5)
Because i915 now understands PAT indices, we can remove the overly
aggressive flushing triggered from i915_gem_object_can_bypass_llc() and
limit it to non-coherent write-back mode only.

6)
Finally we are able to replace the platform dependent cache mode to string
code in debugfs and elsewhere by the single implementation based on
i915_cache_t.

v2:
 * Fix PAT-to-cache-mode table for PVC. (Fei)
 * Cache display caching mode too. (Fei)
 * Improve and document criteria in i915_gem_object_can_bypass_llc() (Matt)

Signed-off-by: Tvrtko Ursulin 
Fixes: 9275277d5324 ("drm/i915: use pat_index instead of cache_level")
Cc: Chris Wilson 
Cc: Fei Yang 
Cc: Andi Shyti 
Cc: Matt Roper 
---
 drivers/gpu/drm/i915/Makefile |   1 +
 .../drm/i915/display/intel_plane_initial.c|   3 +-
 drivers/gpu/drm/i915/gem/i915_gem_domain.c|  56 +++
 drivers/gpu/drm/i915/gem/i915_gem_domain.h|   5 +-
 .../gpu/drm/i915/gem/i915_gem_execbuffer.c|  13 +-
 drivers/gpu/drm/i915/gem/i915_gem_internal.c  |   4 +-
 drivers/gpu/drm/i915/gem/i915_gem_mman.c  |  12 +-
 drivers/gpu/drm/i915/gem/i915_gem_object.c| 147 ++
 drivers/gpu/drm/i915/gem/i915_gem_object.h|  11 +-
 .../gpu/drm/i915/gem/i915_gem_object_types.h  | 116 +-
 drivers/gpu/drm/i915/gem/i915_gem_shmem.c |   8 +-
 drivers/gpu/drm/i915/gem/i915_gem_stolen.c|  11 +-
 drivers/gpu/drm/i915/gem/i915_gem_ttm_move.c  |  44 +++---
 drivers/gpu/drm/i915/gem/i915_gem_userptr.c   |   2 +-
 .../drm/i915/gem/selftests/huge_gem_object.c  |   4 +-
 .../gpu/drm/i915/gem/selftests/huge_pages.c   |   6 +-
 drivers/gpu/drm/i915/gt/gen6_ppgtt.c  |   4 +-
 drivers/gpu/drm/i915/gt/gen8_ppgtt.c  |  19 +--
 drivers/gpu/drm/i915/gt/intel_engine_cs.c |   2 +-
 drivers/gpu/drm/i915/gt/intel_ggtt.c  |  33 ++--
 drivers/gpu/drm/i915/gt/intel_ggtt_gmch.c |   4 +-
 drivers/gpu/drm/i915/gt/intel_gtt.c   |   2 +-
 drivers/gpu/drm/i915/gt/intel_gtt.h   |   3 +-
 drivers/gpu/drm/i915/gt/intel_migrate.c   |  11 +-
 drivers/gpu/drm/i915/gt/intel_ppgtt.c |   6 +-
 .../gpu/drm/i915/gt/intel_ring_submission.c   |   4 +-
 drivers/gpu/drm/i915/gt/intel_timeline.c  |   2 +-
 drivers/gpu/drm/i915/gt/selftest_hangcheck.c  |   2 +-
 drivers/gpu/drm/i915/gt/selftest_migrate.c|   9 +-
 drivers/gpu/drm/i915/gt/selftest_reset.c  |  14 +-
 drivers/gpu/drm/i915/gt/selftest_tlb.c|   5 +-
 .../gpu/drm/i915/gt/selftest_workarounds.c|   2 +-
 drivers/gpu/drm/i915/gt/uc/intel_uc_fw.c  |   8 +-
 drivers/gpu/drm/i915/i915_cache.c |  92 +++
 drivers/gpu/drm/i915/i915_cache.h |  61 
 drivers/gpu/drm/i915/i915_debugfs.c   |  53 +--
 drivers/gpu/drm/i915/i915_driver.c|   5 +
 drivers/gpu/drm/i915/i915_drv.h   |   5 +
 drivers/gpu/drm/i915/i915_gem.c   |  21 +--
 drivers/gpu/drm/i915/i915_gpu_error.c |   7 +-
 drivers/gpu/drm/i915/i915_pci.c   |  82 +-
 drivers/gpu/drm/i915/i915_perf.c  |   2 +-
 drivers/gpu/drm/i915/intel_device_info.h  |   6 +-
 drivers/gpu/drm/i915/selftests/i915_gem.c |   5 +-
 .../gpu/drm/i915/selftests/i915_gem_evict.c   |   8 +-
 drivers/gpu/drm/i915/selftests/i915_gem_gtt.c |  13 +-
 drivers/gpu/drm/i915/selftests/igt_spinner.c  |   2 +-
 .../drm/i915/selftests/intel_mem

[CI 4/4] drm/i915: Expose RPS thresholds in sysfs

2023-07-17 Thread Tvrtko Ursulin
From: Tvrtko Ursulin 

User feedback indicates significant performance gains are possible in
specific games with non default RPS up/down thresholds.

Expose these tunables via sysfs which will allow users to achieve best
performance when running games and best power efficiency elsewhere.

Note this patch supports non GuC based platforms only.

v2:
 * Make checkpatch happy.

Signed-off-by: Tvrtko Ursulin 
References: https://gitlab.freedesktop.org/drm/intel/-/issues/8389
Cc: Rodrigo Vivi 
Reviewed-by: Rodrigo Vivi 
Reviewed-by: Andi Shyti 
---
 drivers/gpu/drm/i915/gt/intel_gt_sysfs_pm.c | 108 
 1 file changed, 108 insertions(+)

diff --git a/drivers/gpu/drm/i915/gt/intel_gt_sysfs_pm.c 
b/drivers/gpu/drm/i915/gt/intel_gt_sysfs_pm.c
index ee2b44f896a2..f0dea54880af 100644
--- a/drivers/gpu/drm/i915/gt/intel_gt_sysfs_pm.c
+++ b/drivers/gpu/drm/i915/gt/intel_gt_sysfs_pm.c
@@ -700,6 +700,80 @@ static const struct attribute *media_perf_power_attrs[] = {
NULL
 };
 
+static ssize_t
+rps_up_threshold_pct_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *buf)
+{
+   struct intel_gt *gt = intel_gt_sysfs_get_drvdata(kobj, attr->attr.name);
+   struct intel_rps *rps = &gt->rps;
+
+   return sysfs_emit(buf, "%u\n", intel_rps_get_up_threshold(rps));
+}
+
+static ssize_t
+rps_up_threshold_pct_store(struct kobject *kobj, struct kobj_attribute *attr,
+  const char *buf, size_t count)
+{
+   struct intel_gt *gt = intel_gt_sysfs_get_drvdata(kobj, attr->attr.name);
+   struct intel_rps *rps = &gt->rps;
+   int ret;
+   u8 val;
+
+   ret = kstrtou8(buf, 10, &val);
+   if (ret)
+   return ret;
+
+   ret = intel_rps_set_up_threshold(rps, val);
+
+   return ret == 0 ? count : ret;
+}
+
+static struct kobj_attribute rps_up_threshold_pct =
+   __ATTR(rps_up_threshold_pct,
+  0664,
+  rps_up_threshold_pct_show,
+  rps_up_threshold_pct_store);
+
+static ssize_t
+rps_down_threshold_pct_show(struct kobject *kobj, struct kobj_attribute *attr,
+   char *buf)
+{
+   struct intel_gt *gt = intel_gt_sysfs_get_drvdata(kobj, attr->attr.name);
+   struct intel_rps *rps = &gt->rps;
+
+   return sysfs_emit(buf, "%u\n", intel_rps_get_down_threshold(rps));
+}
+
+static ssize_t
+rps_down_threshold_pct_store(struct kobject *kobj, struct kobj_attribute *attr,
+const char *buf, size_t count)
+{
+   struct intel_gt *gt = intel_gt_sysfs_get_drvdata(kobj, attr->attr.name);
+   struct intel_rps *rps = &gt->rps;
+   int ret;
+   u8 val;
+
+   ret = kstrtou8(buf, 10, &val);
+   if (ret)
+   return ret;
+
+   ret = intel_rps_set_down_threshold(rps, val);
+
+   return ret == 0 ? count : ret;
+}
+
+static struct kobj_attribute rps_down_threshold_pct =
+   __ATTR(rps_down_threshold_pct,
+  0664,
+  rps_down_threshold_pct_show,
+  rps_down_threshold_pct_store);
+
+static const struct attribute * const gen6_gt_rps_attrs[] = {
+   &rps_up_threshold_pct.attr,
+   &rps_down_threshold_pct.attr,
+   NULL
+};
+
 static ssize_t
 default_min_freq_mhz_show(struct kobject *kobj, struct kobj_attribute *attr, 
char *buf)
 {
@@ -722,9 +796,37 @@ default_max_freq_mhz_show(struct kobject *kobj, struct 
kobj_attribute *attr, cha
 static struct kobj_attribute default_max_freq_mhz =
 __ATTR(rps_max_freq_mhz, 0444, default_max_freq_mhz_show, NULL);
 
+static ssize_t
+default_rps_up_threshold_pct_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf)
+{
+   struct intel_gt *gt = kobj_to_gt(kobj->parent);
+
+   return sysfs_emit(buf, "%u\n", gt->defaults.rps_up_threshold);
+}
+
+static struct kobj_attribute default_rps_up_threshold_pct =
+__ATTR(rps_up_threshold_pct, 0444, default_rps_up_threshold_pct_show, NULL);
+
+static ssize_t
+default_rps_down_threshold_pct_show(struct kobject *kobj,
+   struct kobj_attribute *attr,
+   char *buf)
+{
+   struct intel_gt *gt = kobj_to_gt(kobj->parent);
+
+   return sysfs_emit(buf, "%u\n", gt->defaults.rps_down_threshold);
+}
+
+static struct kobj_attribute default_rps_down_threshold_pct =
+__ATTR(rps_down_threshold_pct, 0444, default_rps_down_threshold_pct_show, 
NULL);
+
 static const struct attribute * const rps_defaults_attrs[] = {
	&default_min_freq_mhz.attr,
	&default_max_freq_mhz.attr,
+   &default_rps_up_threshold_pct.attr,
+   &default_rps_down_threshold_pct.attr,
NULL
 };
 
@@ -752,6 +854,12 @@ static int intel_sysfs_rps_init(struct intel_gt *gt, 
struct kobject *kobj)
if (IS_VALLEYVIEW(gt->i915) || IS_CHERRYVIEW(gt->i915))
ret = sysfs_create_file(kobj, vlv_attr);
 
+  

[CI 3/4] drm/i915: Add helpers for managing rps thresholds

2023-07-17 Thread Tvrtko Ursulin
From: Tvrtko Ursulin 

In preparation for exposing via sysfs add helpers for managing rps
thresholds.

v2:
 * Force sw and hw re-programming on threshold change.

Signed-off-by: Tvrtko Ursulin 
Cc: Rodrigo Vivi 
Reviewed-by: Rodrigo Vivi 
Reviewed-by: Andi Shyti 
---
 drivers/gpu/drm/i915/gt/intel_rps.c | 54 +
 drivers/gpu/drm/i915/gt/intel_rps.h |  4 +++
 2 files changed, 58 insertions(+)

diff --git a/drivers/gpu/drm/i915/gt/intel_rps.c 
b/drivers/gpu/drm/i915/gt/intel_rps.c
index 69847f919586..092542f53aad 100644
--- a/drivers/gpu/drm/i915/gt/intel_rps.c
+++ b/drivers/gpu/drm/i915/gt/intel_rps.c
@@ -16,7 +16,9 @@
 #include "intel_gt.h"
 #include "intel_gt_clock_utils.h"
 #include "intel_gt_irq.h"
+#include "intel_gt_pm.h"
 #include "intel_gt_pm_irq.h"
+#include "intel_gt_print.h"
 #include "intel_gt_regs.h"
 #include "intel_mchbar_regs.h"
 #include "intel_pcode.h"
@@ -2576,6 +2578,58 @@ int intel_rps_set_min_frequency(struct intel_rps *rps, 
u32 val)
return set_min_freq(rps, val);
 }
 
+u8 intel_rps_get_up_threshold(struct intel_rps *rps)
+{
+   return rps->power.up_threshold;
+}
+
+static int rps_set_threshold(struct intel_rps *rps, u8 *threshold, u8 val)
+{
+   int ret;
+
+   if (val > 100)
+   return -EINVAL;
+
+   ret = mutex_lock_interruptible(&rps->lock);
+   if (ret)
+   return ret;
+
+   if (*threshold == val)
+   goto out_unlock;
+
+   *threshold = val;
+
+   /* Force reset. */
+   rps->last_freq = -1;
+   mutex_lock(&rps->power.mutex);
+   rps->power.mode = -1;
+   mutex_unlock(&rps->power.mutex);
+
+   intel_rps_set(rps, clamp(rps->cur_freq,
+rps->min_freq_softlimit,
+rps->max_freq_softlimit));
+
+out_unlock:
+   mutex_unlock(&rps->lock);
+
+   return ret;
+}
+
+int intel_rps_set_up_threshold(struct intel_rps *rps, u8 threshold)
+{
+   return rps_set_threshold(rps, &rps->power.up_threshold, threshold);
+}
+
+u8 intel_rps_get_down_threshold(struct intel_rps *rps)
+{
+   return rps->power.down_threshold;
+}
+
+int intel_rps_set_down_threshold(struct intel_rps *rps, u8 threshold)
+{
+   return rps_set_threshold(rps, &rps->power.down_threshold, threshold);
+}
+
 static void intel_rps_set_manual(struct intel_rps *rps, bool enable)
 {
struct intel_uncore *uncore = rps_to_uncore(rps);
diff --git a/drivers/gpu/drm/i915/gt/intel_rps.h 
b/drivers/gpu/drm/i915/gt/intel_rps.h
index a3fa987aa91f..92fb01f5a452 100644
--- a/drivers/gpu/drm/i915/gt/intel_rps.h
+++ b/drivers/gpu/drm/i915/gt/intel_rps.h
@@ -37,6 +37,10 @@ void intel_rps_mark_interactive(struct intel_rps *rps, bool 
interactive);
 
 int intel_gpu_freq(struct intel_rps *rps, int val);
 int intel_freq_opcode(struct intel_rps *rps, int val);
+u8 intel_rps_get_up_threshold(struct intel_rps *rps);
+int intel_rps_set_up_threshold(struct intel_rps *rps, u8 threshold);
+u8 intel_rps_get_down_threshold(struct intel_rps *rps);
+int intel_rps_set_down_threshold(struct intel_rps *rps, u8 threshold);
 u32 intel_rps_read_actual_frequency(struct intel_rps *rps);
 u32 intel_rps_read_actual_frequency_fw(struct intel_rps *rps);
 u32 intel_rps_get_requested_frequency(struct intel_rps *rps);
-- 
2.39.2



[CI 1/4] drm/i915: Move setting of rps thresholds to init

2023-07-17 Thread Tvrtko Ursulin
From: Tvrtko Ursulin 

Since 36d516be867c ("drm/i915/gt: Switch to manual evaluation of RPS")
thresholds are invariant so lets move their setting to init time.

Signed-off-by: Tvrtko Ursulin 
Cc: Rodrigo Vivi 
Reviewed-by: Rodrigo Vivi 
Reviewed-by: Andi Shyti 
---
 drivers/gpu/drm/i915/gt/intel_rps.c | 27 ---
 1 file changed, 16 insertions(+), 11 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_rps.c 
b/drivers/gpu/drm/i915/gt/intel_rps.c
index e92e626d4994..20d44549f65e 100644
--- a/drivers/gpu/drm/i915/gt/intel_rps.c
+++ b/drivers/gpu/drm/i915/gt/intel_rps.c
@@ -672,7 +672,6 @@ static void rps_set_power(struct intel_rps *rps, int 
new_power)
 {
struct intel_gt *gt = rps_to_gt(rps);
struct intel_uncore *uncore = gt->uncore;
-   u32 threshold_up = 0, threshold_down = 0; /* in % */
u32 ei_up = 0, ei_down = 0;
 
	lockdep_assert_held(&rps->power.mutex);
@@ -680,9 +679,6 @@ static void rps_set_power(struct intel_rps *rps, int 
new_power)
if (new_power == rps->power.mode)
return;
 
-   threshold_up = 95;
-   threshold_down = 85;
-
/* Note the units here are not exactly 1us, but 1280ns. */
switch (new_power) {
case LOW_POWER:
@@ -709,17 +705,22 @@ static void rps_set_power(struct intel_rps *rps, int 
new_power)
 
GT_TRACE(gt,
 "changing power mode [%d], up %d%% @ %dus, down %d%% @ %dus\n",
-new_power, threshold_up, ei_up, threshold_down, ei_down);
+new_power,
+rps->power.up_threshold, ei_up,
+rps->power.down_threshold, ei_down);
 
set(uncore, GEN6_RP_UP_EI,
intel_gt_ns_to_pm_interval(gt, ei_up * 1000));
set(uncore, GEN6_RP_UP_THRESHOLD,
-   intel_gt_ns_to_pm_interval(gt, ei_up * threshold_up * 10));
+   intel_gt_ns_to_pm_interval(gt,
+  ei_up * rps->power.up_threshold * 10));
 
set(uncore, GEN6_RP_DOWN_EI,
intel_gt_ns_to_pm_interval(gt, ei_down * 1000));
set(uncore, GEN6_RP_DOWN_THRESHOLD,
-   intel_gt_ns_to_pm_interval(gt, ei_down * threshold_down * 10));
+   intel_gt_ns_to_pm_interval(gt,
+  ei_down *
+  rps->power.down_threshold * 10));
 
set(uncore, GEN6_RP_CONTROL,
(GRAPHICS_VER(gt->i915) > 9 ? 0 : GEN6_RP_MEDIA_TURBO) |
@@ -731,8 +732,6 @@ static void rps_set_power(struct intel_rps *rps, int 
new_power)
 
 skip_hw_write:
rps->power.mode = new_power;
-   rps->power.up_threshold = threshold_up;
-   rps->power.down_threshold = threshold_down;
 }
 
 static void gen6_rps_set_thresholds(struct intel_rps *rps, u8 val)
@@ -1559,10 +1558,12 @@ void intel_rps_enable(struct intel_rps *rps)
return;
 
GT_TRACE(rps_to_gt(rps),
-"min:%x, max:%x, freq:[%d, %d]\n",
+"min:%x, max:%x, freq:[%d, %d], thresholds:[%u, %u]\n",
 rps->min_freq, rps->max_freq,
 intel_gpu_freq(rps, rps->min_freq),
-intel_gpu_freq(rps, rps->max_freq));
+intel_gpu_freq(rps, rps->max_freq),
+rps->power.up_threshold,
+rps->power.down_threshold);
 
GEM_BUG_ON(rps->max_freq < rps->min_freq);
GEM_BUG_ON(rps->idle_freq > rps->max_freq);
@@ -2015,6 +2016,10 @@ void intel_rps_init(struct intel_rps *rps)
}
}
 
+   /* Set default thresholds in % */
+   rps->power.up_threshold = 95;
+   rps->power.down_threshold = 85;
+
/* Finally allow us to boost to max by default */
rps->boost_freq = rps->max_freq;
rps->idle_freq = rps->min_freq;
-- 
2.39.2



[CI 2/4] drm/i915: Record default rps threshold values

2023-07-17 Thread Tvrtko Ursulin
From: Tvrtko Ursulin 

Record the default values as preparation for exposing the sysfs controls.

Signed-off-by: Tvrtko Ursulin 
Cc: Rodrigo Vivi 
Reviewed-by: Rodrigo Vivi 
Reviewed-by: Andi Shyti 
---
 drivers/gpu/drm/i915/gt/intel_gt_types.h | 3 +++
 drivers/gpu/drm/i915/gt/intel_rps.c  | 2 ++
 2 files changed, 5 insertions(+)

diff --git a/drivers/gpu/drm/i915/gt/intel_gt_types.h 
b/drivers/gpu/drm/i915/gt/intel_gt_types.h
index f08c2556aa25..1b22d7a50665 100644
--- a/drivers/gpu/drm/i915/gt/intel_gt_types.h
+++ b/drivers/gpu/drm/i915/gt/intel_gt_types.h
@@ -83,6 +83,9 @@ enum intel_submission_method {
 struct gt_defaults {
u32 min_freq;
u32 max_freq;
+
+   u8 rps_up_threshold;
+   u8 rps_down_threshold;
 };
 
 enum intel_gt_type {
diff --git a/drivers/gpu/drm/i915/gt/intel_rps.c 
b/drivers/gpu/drm/i915/gt/intel_rps.c
index 20d44549f65e..69847f919586 100644
--- a/drivers/gpu/drm/i915/gt/intel_rps.c
+++ b/drivers/gpu/drm/i915/gt/intel_rps.c
@@ -2018,7 +2018,9 @@ void intel_rps_init(struct intel_rps *rps)
 
/* Set default thresholds in % */
rps->power.up_threshold = 95;
+   rps_to_gt(rps)->defaults.rps_up_threshold = rps->power.up_threshold;
rps->power.down_threshold = 85;
+   rps_to_gt(rps)->defaults.rps_down_threshold = rps->power.down_threshold;
 
/* Finally allow us to boost to max by default */
rps->boost_freq = rps->max_freq;
-- 
2.39.2



<    1   2   3   4   5   6   7   8   9   10   >