[PATCH 4/7] drm/i915: Track page table backing store usage
From: Tvrtko Ursulin Account page table backing store against the owning client memory usage stats. Signed-off-by: Tvrtko Ursulin Reviewed-by: Aravind Iddamsetty --- drivers/gpu/drm/i915/gt/intel_gtt.c | 6 ++ 1 file changed, 6 insertions(+) diff --git a/drivers/gpu/drm/i915/gt/intel_gtt.c b/drivers/gpu/drm/i915/gt/intel_gtt.c index 13944a14ea2d..c3f2b379 100644 --- a/drivers/gpu/drm/i915/gt/intel_gtt.c +++ b/drivers/gpu/drm/i915/gt/intel_gtt.c @@ -58,6 +58,9 @@ struct drm_i915_gem_object *alloc_pt_lmem(struct i915_address_space *vm, int sz) if (!IS_ERR(obj)) { obj->base.resv = i915_vm_resv_get(vm); obj->shares_resv_from = vm; + + if (vm->fpriv) + i915_drm_client_add_object(vm->fpriv->client, obj); } return obj; @@ -79,6 +82,9 @@ struct drm_i915_gem_object *alloc_pt_dma(struct i915_address_space *vm, int sz) if (!IS_ERR(obj)) { obj->base.resv = i915_vm_resv_get(vm); obj->shares_resv_from = vm; + + if (vm->fpriv) + i915_drm_client_add_object(vm->fpriv->client, obj); } return obj; -- 2.39.2
[PATCH 2/7] drm/i915: Add ability for tracking buffer objects per client
From: Tvrtko Ursulin In order to show per client memory usage lets add some infrastructure which enables tracking buffer objects owned by clients. We add a per client list protected by a new per client lock and to support delayed destruction (post client exit) we make tracked objects hold references to the owning client. Also, object memory region teardown is moved to the existing RCU free callback to allow safe dereference from the fdinfo RCU read section. Signed-off-by: Tvrtko Ursulin Reviewed-by: Aravind Iddamsetty --- drivers/gpu/drm/i915/gem/i915_gem_object.c| 13 +-- .../gpu/drm/i915/gem/i915_gem_object_types.h | 12 +++ drivers/gpu/drm/i915/i915_drm_client.c| 36 +++ drivers/gpu/drm/i915/i915_drm_client.h| 32 + 4 files changed, 90 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_object.c b/drivers/gpu/drm/i915/gem/i915_gem_object.c index c26d87555825..25eeeb863209 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_object.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_object.c @@ -106,6 +106,10 @@ void i915_gem_object_init(struct drm_i915_gem_object *obj, INIT_LIST_HEAD(>mm.link); +#ifdef CONFIG_PROC_FS + INIT_LIST_HEAD(>client_link); +#endif + INIT_LIST_HEAD(>lut_list); spin_lock_init(>lut_lock); @@ -293,6 +297,10 @@ void __i915_gem_free_object_rcu(struct rcu_head *head) container_of(head, typeof(*obj), rcu); struct drm_i915_private *i915 = to_i915(obj->base.dev); + /* We need to keep this alive for RCU read access from fdinfo. 
*/ + if (obj->mm.n_placements > 1) + kfree(obj->mm.placements); + i915_gem_object_free(obj); GEM_BUG_ON(!atomic_read(>mm.free_count)); @@ -389,9 +397,6 @@ void __i915_gem_free_object(struct drm_i915_gem_object *obj) if (obj->ops->release) obj->ops->release(obj); - if (obj->mm.n_placements > 1) - kfree(obj->mm.placements); - if (obj->shares_resv_from) i915_vm_resv_put(obj->shares_resv_from); @@ -442,6 +447,8 @@ static void i915_gem_free_object(struct drm_gem_object *gem_obj) GEM_BUG_ON(i915_gem_object_is_framebuffer(obj)); + i915_drm_client_remove_object(obj); + /* * Before we free the object, make sure any pure RCU-only * read-side critical sections are complete, e.g. diff --git a/drivers/gpu/drm/i915/gem/i915_gem_object_types.h b/drivers/gpu/drm/i915/gem/i915_gem_object_types.h index 2292404007c8..0c5cdab278b6 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_object_types.h +++ b/drivers/gpu/drm/i915/gem/i915_gem_object_types.h @@ -302,6 +302,18 @@ struct drm_i915_gem_object { */ struct i915_address_space *shares_resv_from; +#ifdef CONFIG_PROC_FS + /** +* @client: @i915_drm_client which created the object +*/ + struct i915_drm_client *client; + + /** +* @client_link: Link into @i915_drm_client.objects_list +*/ + struct list_head client_link; +#endif + union { struct rcu_head rcu; struct llist_node freed; diff --git a/drivers/gpu/drm/i915/i915_drm_client.c b/drivers/gpu/drm/i915/i915_drm_client.c index 2a44b3876cb5..2e5e69edc0f9 100644 --- a/drivers/gpu/drm/i915/i915_drm_client.c +++ b/drivers/gpu/drm/i915/i915_drm_client.c @@ -28,6 +28,10 @@ struct i915_drm_client *i915_drm_client_alloc(void) kref_init(>kref); spin_lock_init(>ctx_lock); INIT_LIST_HEAD(>ctx_list); +#ifdef CONFIG_PROC_FS + spin_lock_init(>objects_lock); + INIT_LIST_HEAD(>objects_list); +#endif return client; } @@ -108,4 +112,36 @@ void i915_drm_client_fdinfo(struct drm_printer *p, struct drm_file *file) for (i = 0; i < ARRAY_SIZE(uabi_class_names); i++) show_client_class(p, i915, file_priv->client, i); 
} + +void i915_drm_client_add_object(struct i915_drm_client *client, + struct drm_i915_gem_object *obj) +{ + unsigned long flags; + + GEM_WARN_ON(obj->client); + GEM_WARN_ON(!list_empty(>client_link)); + + spin_lock_irqsave(>objects_lock, flags); + obj->client = i915_drm_client_get(client); + list_add_tail_rcu(>client_link, >objects_list); + spin_unlock_irqrestore(>objects_lock, flags); +} + +bool i915_drm_client_remove_object(struct drm_i915_gem_object *obj) +{ + struct i915_drm_client *client = fetch_and_zero(>client); + unsigned long flags; + + /* Object may not be associated with a client. */ + if (!client) + return false; + + spin_lock_irqsave(>objects_lock, flags); + list_del_rcu(>client_link); + spin_unlock_irqrestore(>objects_lock, flags); + + i915_drm_client_put(client); +
[PATCH v8 0/7] fdinfo memory stats
From: Tvrtko Ursulin A short series to enable fdinfo memory stats for i915. I added tracking of most classes of objects (user objects, page tables, context state, ring buffers) which contribute to client's memory footprint and am accouting their memory use along the similar lines as in Rob's msm code, just that with i915 specific code we can show a memory region breakdown and so support discrete and multi-tile GPUs properly. And also reflect that our objects can have multiple allowed backing stores. The existing helper Rob added is then used to dump the per memory region stats to fdinfo. The basic objects-per-client infrastructure can later be extended to cover all objects and so avoid needing to walk the IDR under the client's file table lock, which would further avoid distburbing the running clients by parallel fdinfo readers. Example fdinfo format: # cat /proc/1383/fdinfo/8 pos:0 flags: 0212 mnt_id: 21 ino:397 drm-driver: i915 drm-client-id: 18 drm-pdev: :00:02.0 drm-total-system: 125 MiB drm-shared-system: 16 MiB drm-active-system: 110 MiB drm-resident-system:125 MiB drm-purgeable-system: 2 MiB drm-total-stolen-system:0 drm-shared-stolen-system: 0 drm-active-stolen-system: 0 drm-resident-stolen-system: 0 drm-purgeable-stolen-system:0 drm-engine-render: 25662044495 ns drm-engine-copy:0 ns drm-engine-video: 0 ns drm-engine-video-enhance: 0 ns Example gputop output: DRM minor 0 PID SMEM SMEMRSS render copy videoNAME 1233 124M 124M |||||||| neverball 1130 59M 59M |█▌ ||||||| Xorg 1207 12M 12M |||||||| xfwm4 Or with Wayland: DRM minor 0 PID MEM RSSrendercopy videovideo-enhance NAME 2093 191M 191M |▊ || || || | gnome-shell DRM minor 128 PID MEM RSSrendercopy videovideo-enhance NAME 2551 71M 71M |██▉|| || || | neverball 2553 50M 50M | || || || | Xwayland Example intel_gpu_top output, aggregated mode: intel-gpu-top: Intel Dg1 (Gen12) @ /dev/dri/card1 - 21/ 577 MHz; 71% RC6 8 irqs/s ENGINES BUSY MI_SEMA MI_WAIT Render/3D2.80% |▉ | 0% 0% Blitter0.01% |▏ | 0% 0% 
Video0.00% | | 0% 0% VideoEnhance0.00% | | 0% 0% PID MEM RSS Render/3D BlitterVideoNAME 50783 109M 107M |▎ ||||||| neverball Region breakdown mode (needs more width for best experience): intel-gpu-top: Intel Dg1 (Gen12) @ /dev/dri/card1 - 18/ 555 MHz; 65% RC6 8 irqs/s ENGINES BUSY MI_SEMA MI_WAIT Render/3D2.52% |▉ | 0% 0% Blitter0.00% | | 0% 0% Video0.00% | | 0% 0% VideoEnhance0.00% | | 0% 0% PID RAM RSS VRAM VRSS Video NAME 50783 34M 32M 75M 75M |▏ || || || | neverball v2: * Now actually per client. v3: * Track imported dma-buf objects. v4: * Rely on DRM GEM handles for tracking user objects. * Fix internal object accounting (no placements). v5: * Fixed brain fart of overwriting the loop cursor. * Fixed object destruction racing with fdinfo reads. * Take reference to GEM context while using it. v6: * Rebase, cover letter update. v7: * New patch in series for making region names consistent and stable. v8: * New patch in series - stop losing accuracy in drm_file.c::print_size(). Test-with: 20230922134437.234888-1-tvrtko.ursu...@linux.intel.com Tvrtko Ursulin (7): drm: Do not round to megabytes for greater than 1MiB sizes in fdinfo stats drm/i915: Add ability for tracking buffer objects per client drm/i915: Record which client owns a VM drm/i915: Track page table backing store usage drm/i915: Account ring buffer and context state storage drm/i915: Add stable memory region names drm/i915: Implement fdinfo memory stats printing drivers/gpu/drm/drm_file.c| 2 +- drivers/gpu/drm/i915/gem/i915_gem_context.c | 11 +- .../gpu/drm/i915/gem/i915_gem_context_types.h | 3 + drivers/gpu/drm/i915/gem/i915_gem_object.c| 13 ++- .../gpu/drm/i915/gem/i915_gem_object_types.h | 12 ++ .../gpu/drm/i915/gem/selftests/mock_context.c | 4 +- drivers/gpu/drm/i915/gt/intel_context.c | 14 +++ drivers
[PATCH 1/7] drm: Do not round to megabytes for greater than 1MiB sizes in fdinfo stats
From: Tvrtko Ursulin It is better not to lose precision and not revert to 1 MiB size granularity for every size greater than 1 MiB. Sizes in KiB should not be so troublesome to read (and in fact machine parsing is I expect the norm here), they align with other api like /proc/meminfo, and they allow writing tests for the interface without having to embed drm.ko implementation knowledge into them. (Like knowing that minimum buffer size one can use for successful verification has to be 1MiB aligned, and on top account for any pre-existing memory utilisation outside of driver's control.) But probably even more importantly I think that it is just better to show the accurate sizes and not arbitrary lose precision for a little bit of a stretched use case of eyeballing fdinfo text directly. Signed-off-by: Tvrtko Ursulin Cc: Rob Clark Cc: Adrián Larumbe Cc: steven.pr...@arm.com --- drivers/gpu/drm/drm_file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/drm_file.c b/drivers/gpu/drm/drm_file.c index e692770ef6d3..ecb5038009e7 100644 --- a/drivers/gpu/drm/drm_file.c +++ b/drivers/gpu/drm/drm_file.c @@ -913,7 +913,7 @@ static void print_size(struct drm_printer *p, const char *stat, unsigned u; for (u = 0; u < ARRAY_SIZE(units) - 1; u++) { - if (sz < SZ_1K) + if (sz == 0 || !IS_ALIGNED(sz, SZ_1K)) break; sz = div_u64(sz, SZ_1K); } -- 2.39.2
Re: [PATCH 6/6] drm/i915: Implement fdinfo memory stats printing
On 27/09/2023 14:23, Tvrtko Ursulin wrote: On 27/09/2023 07:54, Andi Shyti wrote: Hi Tvrtko, Use the newly added drm_print_memory_stats helper to show memory utilisation of our objects in drm/driver specific fdinfo output. To collect the stats we walk the per memory regions object lists and accumulate object size into the respective drm_memory_stats categories. v2: * Only account against the active region. * Use DMA_RESV_USAGE_BOOKKEEP when testing for active. (Tejas) v3: * Update commit text. (Aravind) * Update to use memory regions uabi names. Signed-off-by: Tvrtko Ursulin Cc: Aravind Iddamsetty Cc: Rob Clark Cc: Andi Shyti Cc: Tejas Upadhyay Reviewed-by: Andi Shyti # v1 Reviewed-by: Aravind Iddamsetty # v2 Reviewed-by: Andi Shyti Thanks guys, just the IGTs remaining now. I've just sent a respin of one patch in that series which will hopefully fix things up. Actually no, I forgot that I decided I will respin the i915 series with yet one more patch. Stay tuned please. Regards, Tvrtko * https://patchwork.freedesktop.org/series/124118/ First two patches is what we need to merge the kernel side, while the rest are intel_gpu_top fixes followed by per client memory support. Regards, Tvrtko
Re: [PATCH 6/6] drm/i915: Implement fdinfo memory stats printing
On 27/09/2023 07:54, Andi Shyti wrote: Hi Tvrtko, Use the newly added drm_print_memory_stats helper to show memory utilisation of our objects in drm/driver specific fdinfo output. To collect the stats we walk the per memory regions object lists and accumulate object size into the respective drm_memory_stats categories. v2: * Only account against the active region. * Use DMA_RESV_USAGE_BOOKKEEP when testing for active. (Tejas) v3: * Update commit text. (Aravind) * Update to use memory regions uabi names. Signed-off-by: Tvrtko Ursulin Cc: Aravind Iddamsetty Cc: Rob Clark Cc: Andi Shyti Cc: Tejas Upadhyay Reviewed-by: Andi Shyti # v1 Reviewed-by: Aravind Iddamsetty # v2 Reviewed-by: Andi Shyti Thanks guys, just the IGTs remaining now. I've just sent a respin of one patch in that series which will hopefully fix things up. * https://patchwork.freedesktop.org/series/124118/ First two patches is what we need to merge the kernel side, while the rest are intel_gpu_top fixes followed by per client memory support. Regards, Tvrtko
Re: [PATCH v2] drm/i915: Do not disable preemption for resets
On 26/09/2023 11:26, Andi Shyti wrote: Hi Tvrtko, On Tue, Sep 26, 2023 at 11:08:55AM +0100, Tvrtko Ursulin wrote: From: Tvrtko Ursulin Commit ade8a0f59844 ("drm/i915: Make all GPU resets atomic") added a preempt disable section over the hardware reset callback to prepare the driver for being able to reset from atomic contexts. In retrospect I can see that the work item at a time was about removing the struct mutex from the reset path. Code base also briefly entertained the idea of doing the reset under stop_machine in order to serialize userspace mmap and temporary glitch in the fence registers (see eb8d0f5af4ec ("drm/i915: Remove GPU reset dependence on struct_mutex"), but that never materialized and was soon removed in 2caffbf11762 ("drm/i915: Revoke mmaps and prevent access to fence registers across reset") and replaced with a SRCU based solution. As such, as far as I can see, today we still have a requirement that resets must not sleep (invoked from submission tasklets), but no need to support invoking them from a truly atomic context. Given that the preemption section is problematic on RT kernels, since the uncore lock becomes a sleeping lock and so is invalid in such section, lets try and remove it. Potential downside is that our short waits on GPU to complete the reset may get extended if CPU scheduling interferes, but in practice that probably isn't a deal breaker. In terms of mechanics, since the preemption disabled block is being removed we just need to replace a few of the wait_for_atomic macros into busy looping versions which will work (and not complain) when called from non-atomic sections. v2: * Fix timeouts which are now in us. (Andi) * Update one comment as a drive by. (Andi) Signed-off-by: Tvrtko Ursulin Cc: Chris Wilson Cc: Paul Gortmaker Cc: Sebastian Andrzej Siewior Cc: Andi Shyti Reviewed-by: Andi Shyti Thank you, pushed to drm-intel-gt-next! Regards, Tvrtko
Re: [Intel-gfx] [Patch v1] drm/i915: Add uAPI to query micro-controller FW version
On 27/09/2023 05:14, Balasubrawmanian, Vivaik wrote: Due to a bug in GuC firmware, Mesa can't enable by default the usage of compute engines in DG2 and newer. A new GuC firmware fixed the issue but until now there was no way for Mesa to know if KMD was running with the fixed GuC version or not, so this uAPI is required. Is the firmware bug making the ccs engines generally useless, or just not suitable for this specific Mesa use case? It may be expanded in future to query other firmware versions too. More information: https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/23661 Mesa usage: https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/25233 Cc: John Harrison Cc: Daniele Ceraolo Spurio Cc: José Roberto de Souza Signed-off-by: Vivaik Balasubrawmanian --- drivers/gpu/drm/i915/i915_query.c | 47 +++ include/uapi/drm/i915_drm.h | 32 + 2 files changed, 79 insertions(+) diff --git a/drivers/gpu/drm/i915/i915_query.c b/drivers/gpu/drm/i915/i915_query.c index 00871ef99792..7f22a49faae7 100644 --- a/drivers/gpu/drm/i915/i915_query.c +++ b/drivers/gpu/drm/i915/i915_query.c @@ -551,6 +551,52 @@ static int query_hwconfig_blob(struct drm_i915_private *i915, return hwconfig->size; } +static int +query_uc_fw_version(struct drm_i915_private *i915, struct drm_i915_query_item *query) +{ + struct drm_i915_query_uc_fw_version __user *query_ptr = u64_to_user_ptr(query->data_ptr); + size_t size = sizeof(struct drm_i915_query_uc_fw_version); + struct drm_i915_query_uc_fw_version resp; + + if (query->length == 0) { + query->length = size; + return 0; + } else if (query->length != size) { + drm_dbg(>drm, + "Invalid uc_fw_version query item size=%u expected=%zu\n", + query->length, size); + return -EINVAL; + } + + if (copy_from_user(, query_ptr, size)) + return -EFAULT; The above can probably be replaced by using the copy_query_item() helper and it would work a bit better even since no reason to reject a buffer too large. 
+ + if (resp.pad || resp.pad2 || resp.reserved) { + drm_dbg(>drm, + "Invalid input fw version query structure parameters received"); + return -EINVAL; + } + + switch (resp.uc_type) { + case I915_QUERY_UC_TYPE_GUC: { + struct intel_guc *guc = >gt0.uc.guc; + + resp.major_ver = guc->submission_version.major; + resp.minor_ver = guc->submission_version.minor; + resp.patch_ver = guc->submission_version.patch; Submission version is not the same as fw version, right? So DRM_I915_QUERY_UC_FW_VERSION and uapi kerneldoc is misleading. Name the query type I915_QUERY_UC_TYPE_GUC*_SUBMISSION* and make it clear? Regards, Tvrtko + resp.branch_ver = 0; + break; + } + default: + return -EINVAL; + } + + if (copy_to_user(query_ptr, , size)) + return -EFAULT; + + return 0; +} + static int (* const i915_query_funcs[])(struct drm_i915_private *dev_priv, struct drm_i915_query_item *query_item) = { query_topology_info, @@ -559,6 +605,7 @@ static int (* const i915_query_funcs[])(struct drm_i915_private *dev_priv, query_memregion_info, query_hwconfig_blob, query_geometry_subslices, + query_uc_fw_version, }; int i915_query_ioctl(struct drm_device *dev, void *data, struct drm_file *file) diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h index 7000e5910a1d..9be241fb77d8 100644 --- a/include/uapi/drm/i915_drm.h +++ b/include/uapi/drm/i915_drm.h @@ -3013,6 +3013,7 @@ struct drm_i915_query_item { * - %DRM_I915_QUERY_MEMORY_REGIONS (see struct drm_i915_query_memory_regions) * - %DRM_I915_QUERY_HWCONFIG_BLOB (see `GuC HWCONFIG blob uAPI`) * - %DRM_I915_QUERY_GEOMETRY_SUBSLICES (see struct drm_i915_query_topology_info) + * - %DRM_I915_QUERY_UC_FW_VERSION (see struct drm_i915_query_uc_fw_version) */ __u64 query_id; #define DRM_I915_QUERY_TOPOLOGY_INFO 1 @@ -3021,6 +3022,7 @@ struct drm_i915_query_item { #define DRM_I915_QUERY_MEMORY_REGIONS 4 #define DRM_I915_QUERY_HWCONFIG_BLOB 5 #define DRM_I915_QUERY_GEOMETRY_SUBSLICES 6 +#define DRM_I915_QUERY_UC_FW_VERSION 7 /* Must be kept 
compact -- no holes and well documented */ /** @@ -3213,6 +3215,36 @@ struct drm_i915_query_topology_info { __u8 data[]; }; +/** +* struct drm_i915_query_uc_fw_version - query a micro-controller firmware version +* +* Given a uc_type this will return the major, minor, patch and branch version +* of the micro-controller firmware. +*/ +struct drm_i915_query_uc_fw_version { + /** @uc: The micro-controller type to query firmware version */ +#define I915_QUERY_UC_TYPE_GUC 0 + __u16 uc_type; + + /** @pad: MBZ */ + __u16 pad; + + /* @major_ver: major uc fw
Re: [Intel-gfx] [PATCH v4 3/3] drm/i915/gt: Timeout when waiting for idle in suspending
On 26/09/2023 20:05, Alan Previn wrote: When suspending, add a timeout when calling intel_gt_pm_wait_for_idle else if we have a lost G2H event that holds a wakeref (which would be indicative of a bug elsewhere in the driver), driver will at least complete the suspend-resume cycle, (albeit not hitting all the targets for low power hw counters), instead of hanging in the kernel. Signed-off-by: Alan Previn Reviewed-by: Rodrigo Vivi Tested-by: Mousumi Jana --- drivers/gpu/drm/i915/gt/intel_engine_cs.c | 2 +- drivers/gpu/drm/i915/gt/intel_gt_pm.c | 6 +- drivers/gpu/drm/i915/gt/intel_gt_pm.h | 7 ++- drivers/gpu/drm/i915/intel_wakeref.c | 14 ++ drivers/gpu/drm/i915/intel_wakeref.h | 6 -- 5 files changed, 26 insertions(+), 9 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c b/drivers/gpu/drm/i915/gt/intel_engine_cs.c index 84a75c95f3f7..9c6151b78e1d 100644 --- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c +++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c @@ -687,7 +687,7 @@ void intel_engines_release(struct intel_gt *gt) if (!engine->release) continue; - intel_wakeref_wait_for_idle(>wakeref); + intel_wakeref_wait_for_idle(>wakeref, 0); GEM_BUG_ON(intel_engine_pm_is_awake(engine)); engine->release(engine); diff --git a/drivers/gpu/drm/i915/gt/intel_gt_pm.c b/drivers/gpu/drm/i915/gt/intel_gt_pm.c index 59b5658a17fb..820217c06dc7 100644 --- a/drivers/gpu/drm/i915/gt/intel_gt_pm.c +++ b/drivers/gpu/drm/i915/gt/intel_gt_pm.c @@ -289,6 +289,7 @@ int intel_gt_resume(struct intel_gt *gt) static void wait_for_suspend(struct intel_gt *gt) { + int timeout_ms = CONFIG_DRM_I915_MAX_REQUEST_BUSYWAIT ? : 1; CONFIG_DRM_I915_MAX_REQUEST_BUSYWAIT is in ns so assigning it to _ms is a bit too arbitrary. Why not the existing I915_GT_SUSPEND_IDLE_TIMEOUT for instance? /* * On rare occasions, we've observed the fence completion trigger * free_engines asynchronously via rcu_call. Ensure those are done. 
@@ -308,7 +309,10 @@ static void wait_for_suspend(struct intel_gt *gt) intel_gt_retire_requests(gt); } - intel_gt_pm_wait_for_idle(gt); + /* we are suspending, so we shouldn't be waiting forever */ + if (intel_gt_pm_wait_timeout_for_idle(gt, timeout_ms) == -ETIMEDOUT) + gt_warn(gt, "bailing from %s after %d milisec timeout\n", + __func__, timeout_ms); Does the timeout in intel_gt_pm_wait_timeout_for_idle always comes in pair with the timeout first in intel_gt_wait_for_idle? Also, is the timeout here hit from the intel_gt_suspend_prepare, intel_gt_suspend_late, or can be both? Main concern is that we need to be sure there are no possible ill-effects, like letting the GPU/GuC scribble on some memory we unmapped (or will unmap), having let the suspend continue after timing out, and not perhaps doing the forced wedge like wait_for_suspend() does on the existing timeout path. Would it be possible to handle the lost G2H events directly in the respective component instead of here? Like apply the timeout during the step which explicitly idles the CT for suspend (presumably that exists?), and so cleanup from there once declared a lost event. 
Regards, Tvrtko } void intel_gt_suspend_prepare(struct intel_gt *gt) diff --git a/drivers/gpu/drm/i915/gt/intel_gt_pm.h b/drivers/gpu/drm/i915/gt/intel_gt_pm.h index 6c9a46452364..5358acc2b5b1 100644 --- a/drivers/gpu/drm/i915/gt/intel_gt_pm.h +++ b/drivers/gpu/drm/i915/gt/intel_gt_pm.h @@ -68,7 +68,12 @@ static inline void intel_gt_pm_might_put(struct intel_gt *gt) static inline int intel_gt_pm_wait_for_idle(struct intel_gt *gt) { - return intel_wakeref_wait_for_idle(>wakeref); + return intel_wakeref_wait_for_idle(>wakeref, 0); +} + +static inline int intel_gt_pm_wait_timeout_for_idle(struct intel_gt *gt, int timeout_ms) +{ + return intel_wakeref_wait_for_idle(>wakeref, timeout_ms); } void intel_gt_pm_init_early(struct intel_gt *gt); diff --git a/drivers/gpu/drm/i915/intel_wakeref.c b/drivers/gpu/drm/i915/intel_wakeref.c index 718f2f1b6174..383a37521415 100644 --- a/drivers/gpu/drm/i915/intel_wakeref.c +++ b/drivers/gpu/drm/i915/intel_wakeref.c @@ -111,14 +111,20 @@ void __intel_wakeref_init(struct intel_wakeref *wf, "wakeref.work", >work, 0); } -int intel_wakeref_wait_for_idle(struct intel_wakeref *wf) +int intel_wakeref_wait_for_idle(struct intel_wakeref *wf, int timeout_ms) { - int err; + int err = 0; might_sleep(); - err = wait_var_event_killable(>wakeref, - !intel_wakeref_is_active(wf)); + if (!timeout_ms) + err = wait_var_event_killable(>wakeref, + !intel_wakeref_is_active(wf)); + else if
Re: [PATCH 5/6] drm/i915: Add stable memory region names
On 26/09/2023 16:29, Iddamsetty, Aravind wrote: On 22-09-2023 19:16, Tvrtko Ursulin wrote: From: Tvrtko Ursulin At the moment memory region names are a bit too varied and too inconsistent to be used for ABI purposes, like for upcoming fdinfo memory stats. System memory can be either system or system-ttm. Local memory has the instance number appended, others do not. Not only inconsistent but this kind of implementation detail is uninteresting for intended users of fdinfo memory stats. Add a stable name always formed as $type$instance. Could have chosen a different stable scheme, but I think any consistent and stable scheme should do just fine. Signed-off-by: Tvrtko Ursulin --- drivers/gpu/drm/i915/intel_memory_region.c | 19 +++ drivers/gpu/drm/i915/intel_memory_region.h | 1 + 2 files changed, 20 insertions(+) diff --git a/drivers/gpu/drm/i915/intel_memory_region.c b/drivers/gpu/drm/i915/intel_memory_region.c index 3d1fdea9811d..60a03340bbd4 100644 --- a/drivers/gpu/drm/i915/intel_memory_region.c +++ b/drivers/gpu/drm/i915/intel_memory_region.c @@ -216,6 +216,22 @@ static int intel_memory_region_memtest(struct intel_memory_region *mem, return err; } +static const char *region_type_str(u16 type) +{ + switch (type) { + case INTEL_MEMORY_SYSTEM: + return "system"; + case INTEL_MEMORY_LOCAL: + return "local"; + case INTEL_MEMORY_STOLEN_LOCAL: + return "stolen-local"; + case INTEL_MEMORY_STOLEN_SYSTEM: + return "stolen-system"; + default: + return "unknown"; + } +} + struct intel_memory_region * intel_memory_region_create(struct drm_i915_private *i915, resource_size_t start, @@ -244,6 +260,9 @@ intel_memory_region_create(struct drm_i915_private *i915, mem->type = type; mem->instance = instance; + snprintf(mem->uabi_name, sizeof(mem->uabi_name), "%s%u", +region_type_str(type), instance); + mutex_init(>objects.lock); INIT_LIST_HEAD(>objects.list); diff --git a/drivers/gpu/drm/i915/intel_memory_region.h b/drivers/gpu/drm/i915/intel_memory_region.h index 
2953ed5c3248..9ba36454e51b 100644 --- a/drivers/gpu/drm/i915/intel_memory_region.h +++ b/drivers/gpu/drm/i915/intel_memory_region.h @@ -80,6 +80,7 @@ struct intel_memory_region { u16 instance; enum intel_region_id id; char name[16]; + char uabi_name[16]; Just a thought instead of creating a new field, can't we derive this with name and instance? I'd rather not snprintf on every fdinfo read - for every pid and every drm fd versus 2-3 strings kept around. I did briefly wonder if mr->name could be dropped, that is renamed to mr->uabi_name, but I guess there is some value to print the internal name in some log messages, to leave a trace of what underlying implementation is used. Although I am not too sure about the value of that either since it is implied from the kernel version. Then on top the usage in i915_gem_create/repr_name I could replace with mr->uabi_name and simplify. If there is any value in printing the name there, versus just uabi type:instance integers. Dunno. All I know is fdinfo should have stable names and not confuse with implementation details so I need something.. Regards, Tvrtko
Re: [Intel-gfx] [PATCH] drm/i915: Do not disable preemption for resets
On 26/09/2023 10:18, Andi Shyti wrote: Hi Tvrtko, Commit ade8a0f59844 ("drm/i915: Make all GPU resets atomic") added a preempt disable section over the hardware reset callback to prepare the driver for being able to reset from atomic contexts. In retrospect I can see that the work item at a time was about removing the struct mutex from the reset path. Code base also briefly entertained the idea of doing the reset under stop_machine in order to serialize userspace mmap and temporary glitch in the fence registers (see eb8d0f5af4ec ("drm/i915: Remove GPU reset dependence on struct_mutex"), but that never materialized and was soon removed in 2caffbf11762 ("drm/i915: Revoke mmaps and prevent access to fence registers across reset") and replaced with a SRCU based solution. As such, as far as I can see, today we still have a requirement that resets must not sleep (invoked from submission tasklets), but no need to support invoking them from a truly atomic context. Given that the preemption section is problematic on RT kernels, since the uncore lock becomes a sleeping lock and so is invalid in such section, lets try and remove it. Potential downside is that our short waits on GPU to complete the reset may get extended if CPU scheduling interferes, but in practice that probably isn't a deal breaker. In terms of mechanics, since the preemption disabled block is being removed we just need to replace a few of the wait_for_atomic macros into busy looping versions which will work (and not complain) when called from non-atomic sections. 
looks reasonable, few unrelated questions --- drivers/gpu/drm/i915/gt/intel_reset.c | 12 +--- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c b/drivers/gpu/drm/i915/gt/intel_reset.c index e2152f75ba2e..6916eba3bd33 100644 --- a/drivers/gpu/drm/i915/gt/intel_reset.c +++ b/drivers/gpu/drm/i915/gt/intel_reset.c @@ -167,13 +167,13 @@ static int i915_do_reset(struct intel_gt *gt, /* Assert reset for at least 20 usec, and wait for acknowledgement. */ is this /20/50/ ? Unrelated change but okay. pci_write_config_byte(pdev, I915_GDRST, GRDOM_RESET_ENABLE); udelay(50); - err = wait_for_atomic(i915_in_reset(pdev), 50); + err = _wait_for_atomic(i915_in_reset(pdev), 50, 0); wait_for_atomic() waits in milliseconds, while _wait_for_atomic() waits in microseconds, I think you need to update the timer. Ah.. well spotted! Do you think we might need a wait_for_atomic_preempt() macro? err = wait_for_atomic_preempt(i915_in_reset(pdev), 50); I don't see what it would do? _wait_for_atomic when ATOMIC == 0 already enables preemption. To allow passing in milliseconds? I fear one more macro would create more confusion. Regards, Tvrtko
[PATCH v2] drm/i915: Do not disable preemption for resets
From: Tvrtko Ursulin Commit ade8a0f59844 ("drm/i915: Make all GPU resets atomic") added a preempt disable section over the hardware reset callback to prepare the driver for being able to reset from atomic contexts. In retrospect I can see that the work item at a time was about removing the struct mutex from the reset path. Code base also briefly entertained the idea of doing the reset under stop_machine in order to serialize userspace mmap and temporary glitch in the fence registers (see eb8d0f5af4ec ("drm/i915: Remove GPU reset dependence on struct_mutex"), but that never materialized and was soon removed in 2caffbf11762 ("drm/i915: Revoke mmaps and prevent access to fence registers across reset") and replaced with a SRCU based solution. As such, as far as I can see, today we still have a requirement that resets must not sleep (invoked from submission tasklets), but no need to support invoking them from a truly atomic context. Given that the preemption section is problematic on RT kernels, since the uncore lock becomes a sleeping lock and so is invalid in such section, lets try and remove it. Potential downside is that our short waits on GPU to complete the reset may get extended if CPU scheduling interferes, but in practice that probably isn't a deal breaker. In terms of mechanics, since the preemption disabled block is being removed we just need to replace a few of the wait_for_atomic macros into busy looping versions which will work (and not complain) when called from non-atomic sections. v2: * Fix timeouts which are now in us. (Andi) * Update one comment as a drive by. 
(Andi) Signed-off-by: Tvrtko Ursulin Cc: Chris Wilson Cc: Paul Gortmaker Cc: Sebastian Andrzej Siewior Cc: Andi Shyti --- drivers/gpu/drm/i915/gt/intel_reset.c | 14 ++ 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c b/drivers/gpu/drm/i915/gt/intel_reset.c index 98575d79c446..a21e939fdbf6 100644 --- a/drivers/gpu/drm/i915/gt/intel_reset.c +++ b/drivers/gpu/drm/i915/gt/intel_reset.c @@ -161,16 +161,16 @@ static int i915_do_reset(struct intel_gt *gt, struct pci_dev *pdev = to_pci_dev(gt->i915->drm.dev); int err; - /* Assert reset for at least 20 usec, and wait for acknowledgement. */ + /* Assert reset for at least 50 usec, and wait for acknowledgement. */ pci_write_config_byte(pdev, I915_GDRST, GRDOM_RESET_ENABLE); udelay(50); - err = wait_for_atomic(i915_in_reset(pdev), 50); + err = _wait_for_atomic(i915_in_reset(pdev), 5, 0); /* Clear the reset request. */ pci_write_config_byte(pdev, I915_GDRST, 0); udelay(50); if (!err) - err = wait_for_atomic(!i915_in_reset(pdev), 50); + err = _wait_for_atomic(!i915_in_reset(pdev), 5, 0); return err; } @@ -190,7 +190,7 @@ static int g33_do_reset(struct intel_gt *gt, struct pci_dev *pdev = to_pci_dev(gt->i915->drm.dev); pci_write_config_byte(pdev, I915_GDRST, GRDOM_RESET_ENABLE); - return wait_for_atomic(g4x_reset_complete(pdev), 50); + return _wait_for_atomic(g4x_reset_complete(pdev), 5, 0); } static int g4x_do_reset(struct intel_gt *gt, @@ -207,7 +207,7 @@ static int g4x_do_reset(struct intel_gt *gt, pci_write_config_byte(pdev, I915_GDRST, GRDOM_MEDIA | GRDOM_RESET_ENABLE); - ret = wait_for_atomic(g4x_reset_complete(pdev), 50); + ret = _wait_for_atomic(g4x_reset_complete(pdev), 5, 0); if (ret) { GT_TRACE(gt, "Wait for media reset failed\n"); goto out; @@ -215,7 +215,7 @@ static int g4x_do_reset(struct intel_gt *gt, pci_write_config_byte(pdev, I915_GDRST, GRDOM_RENDER | GRDOM_RESET_ENABLE); - ret = wait_for_atomic(g4x_reset_complete(pdev), 50); + ret = 
_wait_for_atomic(g4x_reset_complete(pdev), 5, 0); if (ret) { GT_TRACE(gt, "Wait for render reset failed\n"); goto out; @@ -785,9 +785,7 @@ int __intel_gt_reset(struct intel_gt *gt, intel_engine_mask_t engine_mask) reset_mask = wa_14015076503_start(gt, engine_mask, !retry); GT_TRACE(gt, "engine_mask=%x\n", reset_mask); - preempt_disable(); ret = reset(gt, reset_mask, retry); - preempt_enable(); wa_14015076503_end(gt, reset_mask); } -- 2.39.2
Re: [Intel-gfx] [PATCH] drm/i915: Zap some empty lines
On 25/09/2023 15:14, Andi Shyti wrote: Hi Tvrtko, On Wed, Sep 20, 2023 at 09:57:15AM +0100, Tvrtko Ursulin wrote: From: Tvrtko Ursulin Recent refactoring left an unsightly block of empty lines. Remove them. Signed-off-by: Tvrtko Ursulin Cc: Dnyaneshwar Bhadane Cc: Anusha Srivatsa Cc: Radhakrishna Sripada as this isn't merged yet: Reviewed-by: Andi Shyti Thanks, I am catching up with things and this wasn't so important. If you have a spare moment feel free to push it? Regards, Tvrtko
[RFC] drm/i915: Allow dmabuf mmap forwarding
From: Tvrtko Ursulin Allow mmap forwarding for imported buffers in order to allow minigbm mmap to work on aperture-less platforms such as Meteorlake. So far i915 did not allow mmap on imported buffers but from minigbm perspective that worked because the DRM_IOCTL_I915_GEM_MMAP_GTT fallback would then be attempted, and would be successful. This stops working on Meteorlake since there is no aperture. Allow i915 to mmap imported buffers using forwarding via dma_buf_mmap(), which allows the primary minigbm path of DRM_IOCTL_I915_GEM_MMAP_OFFSET / I915_MMAP_OFFSET_WB to work. Signed-off-by: Tvrtko Ursulin Cc: Daniel Vetter Cc: Christian König Cc: Matthew Auld Cc: Nirmoy Das --- 1) It is unclear to me if any real userspace depends on this, but there are certainly compliance suites which fail. 2) It is also a bit unclear to me if dma_buf_mmap() is exactly intended for this kind of use. It seems that it is, but I also found some old mailing list discussions suggesting there might be some unresolved questions around VMA revocation. 1 + 2 = RFC for now. Daniel and Christian were involved in 2) in the past so comments would be appreciated. 
Test-with: 20230925131539.32743-1-tvrtko.ursu...@linux.intel.com --- drivers/gpu/drm/i915/gem/i915_gem_mman.c | 78 +++ .../gpu/drm/i915/gem/i915_gem_object_types.h | 1 + 2 files changed, 65 insertions(+), 14 deletions(-) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_mman.c b/drivers/gpu/drm/i915/gem/i915_gem_mman.c index aa4d842d4c5a..78c84c0a8b08 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_mman.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_mman.c @@ -5,6 +5,7 @@ */ #include +#include #include #include #include @@ -664,6 +665,7 @@ insert_mmo(struct drm_i915_gem_object *obj, struct i915_mmap_offset *mmo) static struct i915_mmap_offset * mmap_offset_attach(struct drm_i915_gem_object *obj, enum i915_mmap_type mmap_type, + bool forward_mmap, struct drm_file *file) { struct drm_i915_private *i915 = to_i915(obj->base.dev); @@ -682,6 +684,7 @@ mmap_offset_attach(struct drm_i915_gem_object *obj, mmo->obj = obj; mmo->mmap_type = mmap_type; + mmo->forward_mmap = forward_mmap; drm_vma_node_reset(>vma_node); err = drm_vma_offset_add(obj->base.dev->vma_offset_manager, @@ -714,12 +717,25 @@ mmap_offset_attach(struct drm_i915_gem_object *obj, return ERR_PTR(err); } +static bool +should_forward_mmap(struct drm_i915_gem_object *obj, + enum i915_mmap_type mmap_type) +{ + if (!obj->base.import_attach) + return false; + + return mmap_type == I915_MMAP_TYPE_WB || + mmap_type == I915_MMAP_TYPE_WC || + mmap_type == I915_MMAP_TYPE_UC; +} + static int __assign_mmap_offset(struct drm_i915_gem_object *obj, enum i915_mmap_type mmap_type, u64 *offset, struct drm_file *file) { struct i915_mmap_offset *mmo; + bool should_forward; if (i915_gem_object_never_mmap(obj)) return -ENODEV; @@ -735,12 +751,15 @@ __assign_mmap_offset(struct drm_i915_gem_object *obj, if (mmap_type == I915_MMAP_TYPE_FIXED) return -ENODEV; + should_forward = should_forward_mmap(obj, mmap_type); + if (mmap_type != I915_MMAP_TYPE_GTT && !i915_gem_object_has_struct_page(obj) && - !i915_gem_object_has_iomem(obj)) + 
!i915_gem_object_has_iomem(obj) && + !should_forward) return -ENODEV; - mmo = mmap_offset_attach(obj, mmap_type, file); + mmo = mmap_offset_attach(obj, mmap_type, should_forward, file); if (IS_ERR(mmo)) return PTR_ERR(mmo); @@ -936,6 +955,32 @@ static struct file *mmap_singleton(struct drm_i915_private *i915) return file; } +static void +__vma_mmap_pgprot(struct vm_area_struct *vma, enum i915_mmap_type mmap_type) +{ + const pgprot_t pgprot =vm_get_page_prot(vma->vm_flags); + + switch (mmap_type) { + case I915_MMAP_TYPE_WC: + vma->vm_page_prot = pgprot_writecombine(pgprot); + break; + case I915_MMAP_TYPE_FIXED: + GEM_WARN_ON(1); + fallthrough; + case I915_MMAP_TYPE_WB: + vma->vm_page_prot = pgprot; + break; + case I915_MMAP_TYPE_UC: + vma->vm_page_prot = pgprot_noncached(pgprot); + break; + case I915_MMAP_TYPE_GTT: + vma->vm_page_prot = pgprot_writecombine(pgprot); + break; + } + + vma->vm_page_prot = pgprot_decrypted(vma->vm_page_prot); +} + static int i915_gem_object_mmap(struct drm_i915_gem_object *obj, struct i915_mmap_offset *mmo, @@ -953,6 +998,20 @@ i915_gem_object_mma
Re: [PATCH v6 2/6] drm/panfrost: Add fdinfo support GPU load metrics
On 22/09/2023 16:23, Steven Price wrote: On 22/09/2023 14:53, Tvrtko Ursulin wrote: On 22/09/2023 11:57, Adrián Larumbe wrote: On 20.09.2023 16:40, Tvrtko Ursulin wrote: On 20/09/2023 00:34, Adrián Larumbe wrote: The drm-stats fdinfo tags made available to user space are drm-engine, drm-cycles, drm-max-freq and drm-curfreq, one per job slot. This deviates from standard practice in other DRM drivers, where a single set of key:value pairs is provided for the whole render engine. However, Panfrost has separate queues for fragment and vertex/tiler jobs, so a decision was made to calculate bus cycles and workload times separately. Maximum operating frequency is calculated at devfreq initialisation time. Current frequency is made available to user space because nvtop uses it when performing engine usage calculations. It is important to bear in mind that both GPU cycle and kernel time numbers provided are at best rough estimations, and always reported in excess from the actual figure because of two reasons: - Excess time because of the delay between the end of a job processing, the subsequent job IRQ and the actual time of the sample. - Time spent in the engine queue waiting for the GPU to pick up the next job. To avoid race conditions during enablement/disabling, a reference counting mechanism was introduced, and a job flag that tells us whether a given job increased the refcount. This is necessary, because user space can toggle cycle counting through a debugfs file, and a given job might have been in flight by the time cycle counting was disabled. The main goal of the debugfs cycle counter knob is letting tools like nvtop or IGT's gputop switch it at any time, to avoid power waste in case no engine usage measuring is necessary. 
Signed-off-by: Adrián Larumbe Reviewed-by: Boris Brezillon Reviewed-by: Steven Price --- drivers/gpu/drm/panfrost/Makefile | 2 + drivers/gpu/drm/panfrost/panfrost_debugfs.c | 20 drivers/gpu/drm/panfrost/panfrost_debugfs.h | 13 + drivers/gpu/drm/panfrost/panfrost_devfreq.c | 8 +++ drivers/gpu/drm/panfrost/panfrost_devfreq.h | 3 ++ drivers/gpu/drm/panfrost/panfrost_device.c | 2 + drivers/gpu/drm/panfrost/panfrost_device.h | 13 + drivers/gpu/drm/panfrost/panfrost_drv.c | 57 - drivers/gpu/drm/panfrost/panfrost_gpu.c | 41 +++ drivers/gpu/drm/panfrost/panfrost_gpu.h | 4 ++ drivers/gpu/drm/panfrost/panfrost_job.c | 24 + drivers/gpu/drm/panfrost/panfrost_job.h | 5 ++ 12 files changed, 191 insertions(+), 1 deletion(-) create mode 100644 drivers/gpu/drm/panfrost/panfrost_debugfs.c create mode 100644 drivers/gpu/drm/panfrost/panfrost_debugfs.h diff --git a/drivers/gpu/drm/panfrost/Makefile b/drivers/gpu/drm/panfrost/Makefile index 7da2b3f02ed9..2c01c1e7523e 100644 --- a/drivers/gpu/drm/panfrost/Makefile +++ b/drivers/gpu/drm/panfrost/Makefile @@ -12,4 +12,6 @@ panfrost-y := \ panfrost_perfcnt.o \ panfrost_dump.o +panfrost-$(CONFIG_DEBUG_FS) += panfrost_debugfs.o + obj-$(CONFIG_DRM_PANFROST) += panfrost.o diff --git a/drivers/gpu/drm/panfrost/panfrost_debugfs.c b/drivers/gpu/drm/panfrost/panfrost_debugfs.c new file mode 100644 index ..cc14eccba206 --- /dev/null +++ b/drivers/gpu/drm/panfrost/panfrost_debugfs.c @@ -0,0 +1,20 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright 2023 Collabora ltd. 
*/ + +#include +#include +#include +#include +#include + +#include "panfrost_device.h" +#include "panfrost_gpu.h" +#include "panfrost_debugfs.h" + +void panfrost_debugfs_init(struct drm_minor *minor) +{ + struct drm_device *dev = minor->dev; + struct panfrost_device *pfdev = platform_get_drvdata(to_platform_device(dev->dev)); + + debugfs_create_atomic_t("profile", 0600, minor->debugfs_root, >profile_mode); +} diff --git a/drivers/gpu/drm/panfrost/panfrost_debugfs.h b/drivers/gpu/drm/panfrost/panfrost_debugfs.h new file mode 100644 index ..db1c158bcf2f --- /dev/null +++ b/drivers/gpu/drm/panfrost/panfrost_debugfs.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright 2023 Collabora ltd. + */ + +#ifndef PANFROST_DEBUGFS_H +#define PANFROST_DEBUGFS_H + +#ifdef CONFIG_DEBUG_FS +void panfrost_debugfs_init(struct drm_minor *minor); +#endif + +#endif /* PANFROST_DEBUGFS_H */ diff --git a/drivers/gpu/drm/panfrost/panfrost_devfreq.c b/drivers/gpu/drm/panfrost/panfrost_devfreq.c index 58dfb15a8757..28caffc689e2 100644 --- a/drivers/gpu/drm/panfrost/panfrost_devfreq.c +++ b/drivers/gpu/drm/panfrost/panfrost_devfreq.c @@ -58,6 +58,7 @@ static int panfrost_devfreq_get_dev_status(struct device *dev, spin_lock_irqsave(>lock, irqflags); panfrost_devfreq_update_utilization(pfdevfreq); + pfdevfreq->current_frequency = status->current_frequency; st
Re: [Intel-gfx] [PATCH 2/3] drm/i915/mtl: Add a PMU counter for total active ticks
On 22/09/2023 23:25, john.c.harri...@intel.com wrote: From: Umesh Nerlige Ramappa Current engine busyness interface exposed by GuC has a few issues: - The busyness of active engine is calculated using 2 values provided by GuC and is prone to race between CPU reading those values and GuC updating them. Any sort of HW synchronization would be at the cost of scheduling latencies. - GuC provides only 32 bit values for busyness and KMD has to run a worker to extend the values to 64 bit. In addition KMD also needs to extend the GT timestamp to 64 bits so that it can be used to calculate active busyness for an engine. To address these issues, GuC provides a new interface to calculate engine busyness. GuC accumulates the busyness ticks in a 64 bit value and also internally updates the busyness for an active context using a periodic timer. This simplifies the KMD implementation such that KMD only needs to relay the busyness value to the user. In addition to fixing the interface, GuC also periodically provides the total active ticks that the GT has been running for. This counter is exposed to the user so that the % busyness can be calculated as follows: busyness % = (engine active ticks/total active ticks) * 100. AFAIU I915_PMU_TOTAL_ACTIVE_TICKS only runs when GT is awake, right? So if GT is awake 10% of the time, and engine is busy that 100% of that time, which is 10% of the real/wall time, the busyness by this formula comes up as 100%. Which wouldn't be useful for intel_gpu_top and alike. How to scale it back to wall time? Again AFAIU there is no info about tick frequency, so how does one know what a delta in total active ticks means? Going back on the higher level, I am not convinced we need to add a new uapi just for MTL. If the tick period is known internally we could just use v2 internally and expose the current uapi using it. Any timebase conversion error is unlikely to be relevant because userspace only looks at deltas over relatively short periods (seconds). Ie. 
I don't think that the clock drift error would accumulate so it would need to be really huge to be relevant over short sampling periods. Regards, Tvrtko Implement the new interface and start by adding a new counter for total active ticks. Signed-off-by: Umesh Nerlige Ramappa Signed-off-by: John Harrison --- .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 24 +++ .../gpu/drm/i915/gt/uc/intel_guc_submission.h | 1 + drivers/gpu/drm/i915/i915_pmu.c | 6 + include/uapi/drm/i915_drm.h | 2 ++ 4 files changed, 33 insertions(+) diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c index 88465d701c278..0c1fee5360777 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c @@ -1607,6 +1607,30 @@ static ktime_t busy_v2_guc_engine_busyness(struct intel_engine_cs *engine, ktime return ns_to_ktime(total); } +static u64 busy_v1_intel_guc_total_active_ticks(struct intel_guc *guc) +{ + return guc->busy.v1.gt_stamp; +} + +static u64 busy_v2_intel_guc_total_active_ticks(struct intel_guc *guc) +{ + u64 ticks_gt; + + __busy_v2_get_engine_usage_record(guc, NULL, NULL, NULL, _gt); + + return ticks_gt; +} + +u64 intel_guc_total_active_ticks(struct intel_gt *gt) +{ + struct intel_guc *guc = >uc.guc; + + if (GUC_SUBMIT_VER(guc) < MAKE_GUC_VER(1, 3, 1)) + return busy_v1_intel_guc_total_active_ticks(guc); + else + return busy_v2_intel_guc_total_active_ticks(guc); +} + static int busy_v2_guc_action_enable_usage_stats_device(struct intel_guc *guc) { u32 offset = guc_engine_usage_offset_v2_device(guc); diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.h index c57b29cdb1a64..f6d42838825f2 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.h +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.h @@ -30,6 +30,7 @@ void intel_guc_dump_active_requests(struct intel_engine_cs *engine, struct drm_printer *m); 
void intel_guc_busyness_park(struct intel_gt *gt); void intel_guc_busyness_unpark(struct intel_gt *gt); +u64 intel_guc_total_active_ticks(struct intel_gt *gt); bool intel_guc_virtual_engine_has_heartbeat(const struct intel_engine_cs *ve); diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c index d35973b411863..4f52636eb4a80 100644 --- a/drivers/gpu/drm/i915/i915_pmu.c +++ b/drivers/gpu/drm/i915/i915_pmu.c @@ -563,6 +563,8 @@ config_status(struct drm_i915_private *i915, u64 config) break; case I915_PMU_SOFTWARE_GT_AWAKE_TIME: break; + case I915_PMU_TOTAL_ACTIVE_TICKS: + break; default: return -ENOENT; } @@
Re: [Intel-gfx] [PATCH 3/3] drm/i915/mtl: Add counters for engine busyness ticks
On 22/09/2023 23:25, john.c.harri...@intel.com wrote: From: Umesh Nerlige Ramappa In new version of GuC engine busyness, GuC provides engine busyness ticks as a 64 bit counter. Add a new counter to relay this value to the user as is. Signed-off-by: Umesh Nerlige Ramappa Signed-off-by: John Harrison --- drivers/gpu/drm/i915/gt/intel_engine.h| 1 + drivers/gpu/drm/i915/gt/intel_engine_cs.c | 16 + drivers/gpu/drm/i915/gt/intel_engine_types.h | 12 drivers/gpu/drm/i915/gt/intel_engine_user.c | 1 + .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 67 ++- drivers/gpu/drm/i915/i915_pmu.c | 25 ++- drivers/gpu/drm/i915/i915_pmu.h | 2 +- include/uapi/drm/i915_drm.h | 13 +++- 8 files changed, 116 insertions(+), 21 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/intel_engine.h b/drivers/gpu/drm/i915/gt/intel_engine.h index b58c30ac8ef02..57af7ec8ecd82 100644 --- a/drivers/gpu/drm/i915/gt/intel_engine.h +++ b/drivers/gpu/drm/i915/gt/intel_engine.h @@ -249,6 +249,7 @@ void intel_engine_dump_active_requests(struct list_head *requests, ktime_t intel_engine_get_busy_time(struct intel_engine_cs *engine, ktime_t *now); +u64 intel_engine_get_busy_ticks(struct intel_engine_cs *engine); void intel_engine_get_hung_entity(struct intel_engine_cs *engine, struct intel_context **ce, struct i915_request **rq); diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c b/drivers/gpu/drm/i915/gt/intel_engine_cs.c index 84a75c95f3f7d..1c9ffb1ae9889 100644 --- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c +++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c @@ -2426,6 +2426,22 @@ ktime_t intel_engine_get_busy_time(struct intel_engine_cs *engine, ktime_t *now) return engine->busyness(engine, now); } +/** + * intel_engine_get_busy_ticks() - Return current accumulated engine busyness + * ticks + * @engine: engine to report on + * + * Returns accumulated ticks @engine was busy since engine stats were enabled. 
+ */ +u64 intel_engine_get_busy_ticks(struct intel_engine_cs *engine) +{ + if (!engine->busyness_ticks || + !(engine->flags & I915_ENGINE_SUPPORTS_TICKS_STATS)) + return 0; + + return engine->busyness_ticks(engine); +} + struct intel_context * intel_engine_create_virtual(struct intel_engine_cs **siblings, unsigned int count, unsigned long flags) diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h b/drivers/gpu/drm/i915/gt/intel_engine_types.h index 40fd8f984d64b..a88d40c74d604 100644 --- a/drivers/gpu/drm/i915/gt/intel_engine_types.h +++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h @@ -548,6 +548,11 @@ struct intel_engine_cs { ktime_t (*busyness)(struct intel_engine_cs *engine, ktime_t *now); + /* +* Get engine busyness ticks +*/ + u64 (*busyness_ticks)(struct intel_engine_cs *engine); + struct intel_engine_execlists execlists; /* @@ -574,6 +579,7 @@ struct intel_engine_cs { #define I915_ENGINE_HAS_EU_PRIORITYBIT(10) #define I915_ENGINE_FIRST_RENDER_COMPUTE BIT(11) #define I915_ENGINE_USES_WA_HOLD_CCS_SWITCHOUT BIT(12) +#define I915_ENGINE_SUPPORTS_TICKS_STATS BIT(13) unsigned int flags; /* @@ -649,6 +655,12 @@ intel_engine_supports_stats(const struct intel_engine_cs *engine) return engine->flags & I915_ENGINE_SUPPORTS_STATS; } +static inline bool +intel_engine_supports_tick_stats(const struct intel_engine_cs *engine) +{ + return engine->flags & I915_ENGINE_SUPPORTS_TICKS_STATS; +} + static inline bool intel_engine_has_preemption(const struct intel_engine_cs *engine) { diff --git a/drivers/gpu/drm/i915/gt/intel_engine_user.c b/drivers/gpu/drm/i915/gt/intel_engine_user.c index dcedff41a825f..69eb610b5ab0a 100644 --- a/drivers/gpu/drm/i915/gt/intel_engine_user.c +++ b/drivers/gpu/drm/i915/gt/intel_engine_user.c @@ -100,6 +100,7 @@ static void set_scheduler_caps(struct drm_i915_private *i915) MAP(HAS_PREEMPTION, PREEMPTION), MAP(HAS_SEMAPHORES, SEMAPHORES), MAP(SUPPORTS_STATS, ENGINE_BUSY_STATS), + MAP(SUPPORTS_TICKS_STATS, ENGINE_BUSY_TICKS_STATS), #undef MAP 
}; struct intel_engine_cs *engine; diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c index 0c1fee5360777..71749fb9ad35b 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c @@ -1289,12 +1289,7 @@ static void busy_v1_guc_update_pm_timestamp(struct intel_guc *guc, ktime_t *now) guc->busy.v1.gt_stamp = ((u64)gt_stamp_hi << 32) | gt_stamp_lo; } -/* - * Unlike the execlist mode of submission total and active times are in terms
Re: [PATCH v6 6/6] drm/drm-file: Show finer-grained BO sizes in drm_show_memory_stats
On 22/09/2023 12:03, Adrián Larumbe wrote: On 21.09.2023 11:14, Tvrtko Ursulin wrote: On 20/09/2023 16:32, Tvrtko Ursulin wrote: On 20/09/2023 00:34, Adrián Larumbe wrote: The current implementation will try to pick the highest available size display unit as soon as the BO size exceeds that of the previous multiplier. That can lead to loss of precision in contexts of low memory usage. The new selection criteria try to preserve precision, whilst also increasing the display unit selection threshold to render more accurate values. Signed-off-by: Adrián Larumbe Reviewed-by: Boris Brezillon Reviewed-by: Steven Price --- drivers/gpu/drm/drm_file.c | 5 - 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/drm_file.c b/drivers/gpu/drm/drm_file.c index 762965e3d503..34cfa128ffe5 100644 --- a/drivers/gpu/drm/drm_file.c +++ b/drivers/gpu/drm/drm_file.c @@ -872,6 +872,8 @@ void drm_send_event(struct drm_device *dev, struct drm_pending_event *e) } EXPORT_SYMBOL(drm_send_event); +#define UPPER_UNIT_THRESHOLD 100 + static void print_size(struct drm_printer *p, const char *stat, const char *region, u64 sz) { @@ -879,7 +881,8 @@ static void print_size(struct drm_printer *p, const char *stat, unsigned u; for (u = 0; u < ARRAY_SIZE(units) - 1; u++) { - if (sz < SZ_1K) + if ((sz & (SZ_1K - 1)) && IS_ALIGNED worth it at all? + sz < UPPER_UNIT_THRESHOLD * SZ_1K) break; Excuse me for a late comment (I was away). I did not get what what is special about a ~10% threshold? Sounds to me just going with the lower unit, when size is not aligned to the higher one, would be better than sometimes precision-sometimes-not. FWIW both current and the threshold option make testing the feature very annoying. How so? I have to build in the knowledge of implementation details of print_size() into my IGT in order to use the right size BOs, so test is able to verify stats move as expected. It just feels wrong. So I'd really propose we simply use smaller unit when unaligned. 
Like I said in the previous reply, for drm files whose overall BO size sum is enormous but not a multiple of a MiB, this would render huge number representations in KiB. I don't find this particularly comfortable to read, and then this extra precision would mean nothing to nvtop or gputop, which would have to scale the size to their available screen dimensions when plotting them. I don't think numbers in KiB are so huge. And I don't think people will end up reading them manually a lot anyway, since you have to hunt the pid, and fd, etc.. It is much more realistic that some tool like gputop will be used. And I don't think consistency of units across drivers or whatever matters. Even better to keep userspace parser on their toes and make then follow drm-usage-stats.rst and not any implementations, at some point in time. Regards, Tvrtko
Re: [PATCH v6 2/6] drm/panfrost: Add fdinfo support GPU load metrics
On 22/09/2023 11:57, Adrián Larumbe wrote: On 20.09.2023 16:40, Tvrtko Ursulin wrote: On 20/09/2023 00:34, Adrián Larumbe wrote: The drm-stats fdinfo tags made available to user space are drm-engine, drm-cycles, drm-max-freq and drm-curfreq, one per job slot. This deviates from standard practice in other DRM drivers, where a single set of key:value pairs is provided for the whole render engine. However, Panfrost has separate queues for fragment and vertex/tiler jobs, so a decision was made to calculate bus cycles and workload times separately. Maximum operating frequency is calculated at devfreq initialisation time. Current frequency is made available to user space because nvtop uses it when performing engine usage calculations. It is important to bear in mind that both GPU cycle and kernel time numbers provided are at best rough estimations, and always reported in excess from the actual figure because of two reasons: - Excess time because of the delay between the end of a job processing, the subsequent job IRQ and the actual time of the sample. - Time spent in the engine queue waiting for the GPU to pick up the next job. To avoid race conditions during enablement/disabling, a reference counting mechanism was introduced, and a job flag that tells us whether a given job increased the refcount. This is necessary, because user space can toggle cycle counting through a debugfs file, and a given job might have been in flight by the time cycle counting was disabled. The main goal of the debugfs cycle counter knob is letting tools like nvtop or IGT's gputop switch it at any time, to avoid power waste in case no engine usage measuring is necessary. 
Signed-off-by: Adrián Larumbe Reviewed-by: Boris Brezillon Reviewed-by: Steven Price --- drivers/gpu/drm/panfrost/Makefile | 2 + drivers/gpu/drm/panfrost/panfrost_debugfs.c | 20 drivers/gpu/drm/panfrost/panfrost_debugfs.h | 13 + drivers/gpu/drm/panfrost/panfrost_devfreq.c | 8 +++ drivers/gpu/drm/panfrost/panfrost_devfreq.h | 3 ++ drivers/gpu/drm/panfrost/panfrost_device.c | 2 + drivers/gpu/drm/panfrost/panfrost_device.h | 13 + drivers/gpu/drm/panfrost/panfrost_drv.c | 57 - drivers/gpu/drm/panfrost/panfrost_gpu.c | 41 +++ drivers/gpu/drm/panfrost/panfrost_gpu.h | 4 ++ drivers/gpu/drm/panfrost/panfrost_job.c | 24 + drivers/gpu/drm/panfrost/panfrost_job.h | 5 ++ 12 files changed, 191 insertions(+), 1 deletion(-) create mode 100644 drivers/gpu/drm/panfrost/panfrost_debugfs.c create mode 100644 drivers/gpu/drm/panfrost/panfrost_debugfs.h diff --git a/drivers/gpu/drm/panfrost/Makefile b/drivers/gpu/drm/panfrost/Makefile index 7da2b3f02ed9..2c01c1e7523e 100644 --- a/drivers/gpu/drm/panfrost/Makefile +++ b/drivers/gpu/drm/panfrost/Makefile @@ -12,4 +12,6 @@ panfrost-y := \ panfrost_perfcnt.o \ panfrost_dump.o +panfrost-$(CONFIG_DEBUG_FS) += panfrost_debugfs.o + obj-$(CONFIG_DRM_PANFROST) += panfrost.o diff --git a/drivers/gpu/drm/panfrost/panfrost_debugfs.c b/drivers/gpu/drm/panfrost/panfrost_debugfs.c new file mode 100644 index ..cc14eccba206 --- /dev/null +++ b/drivers/gpu/drm/panfrost/panfrost_debugfs.c @@ -0,0 +1,20 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright 2023 Collabora ltd. 
*/ + +#include +#include +#include +#include +#include + +#include "panfrost_device.h" +#include "panfrost_gpu.h" +#include "panfrost_debugfs.h" + +void panfrost_debugfs_init(struct drm_minor *minor) +{ + struct drm_device *dev = minor->dev; + struct panfrost_device *pfdev = platform_get_drvdata(to_platform_device(dev->dev)); + + debugfs_create_atomic_t("profile", 0600, minor->debugfs_root, >profile_mode); +} diff --git a/drivers/gpu/drm/panfrost/panfrost_debugfs.h b/drivers/gpu/drm/panfrost/panfrost_debugfs.h new file mode 100644 index ..db1c158bcf2f --- /dev/null +++ b/drivers/gpu/drm/panfrost/panfrost_debugfs.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright 2023 Collabora ltd. + */ + +#ifndef PANFROST_DEBUGFS_H +#define PANFROST_DEBUGFS_H + +#ifdef CONFIG_DEBUG_FS +void panfrost_debugfs_init(struct drm_minor *minor); +#endif + +#endif /* PANFROST_DEBUGFS_H */ diff --git a/drivers/gpu/drm/panfrost/panfrost_devfreq.c b/drivers/gpu/drm/panfrost/panfrost_devfreq.c index 58dfb15a8757..28caffc689e2 100644 --- a/drivers/gpu/drm/panfrost/panfrost_devfreq.c +++ b/drivers/gpu/drm/panfrost/panfrost_devfreq.c @@ -58,6 +58,7 @@ static int panfrost_devfreq_get_dev_status(struct device *dev, spin_lock_irqsave(>lock, irqflags); panfrost_devfreq_update_utilization(pfdevfreq); + pfdevfreq->current_frequency = status->current_frequency; status->total_time = ktime_to_ns(ktime_add(pfdevfreq->busy_time,
[PATCH 6/6] drm/i915: Implement fdinfo memory stats printing
From: Tvrtko Ursulin Use the newly added drm_print_memory_stats helper to show memory utilisation of our objects in drm/driver specific fdinfo output. To collect the stats we walk the per memory regions object lists and accumulate object size into the respective drm_memory_stats categories. v2: * Only account against the active region. * Use DMA_RESV_USAGE_BOOKKEEP when testing for active. (Tejas) v3: * Update commit text. (Aravind) * Update to use memory regions uabi names. Signed-off-by: Tvrtko Ursulin Cc: Aravind Iddamsetty Cc: Rob Clark Cc: Andi Shyti Cc: Tejas Upadhyay Reviewed-by: Andi Shyti # v1 Reviewed-by: Aravind Iddamsetty # v2 --- drivers/gpu/drm/i915/i915_drm_client.c | 64 ++ 1 file changed, 64 insertions(+) diff --git a/drivers/gpu/drm/i915/i915_drm_client.c b/drivers/gpu/drm/i915/i915_drm_client.c index a61356012df8..7efffdaa508d 100644 --- a/drivers/gpu/drm/i915/i915_drm_client.c +++ b/drivers/gpu/drm/i915/i915_drm_client.c @@ -45,6 +45,68 @@ void __i915_drm_client_free(struct kref *kref) } #ifdef CONFIG_PROC_FS +static void +obj_meminfo(struct drm_i915_gem_object *obj, + struct drm_memory_stats stats[INTEL_REGION_UNKNOWN]) +{ + const enum intel_region_id id = obj->mm.region ? 
+ obj->mm.region->id : INTEL_REGION_SMEM; + const u64 sz = obj->base.size; + + if (obj->base.handle_count > 1) + stats[id].shared += sz; + else + stats[id].private += sz; + + if (i915_gem_object_has_pages(obj)) { + stats[id].resident += sz; + + if (!dma_resv_test_signaled(obj->base.resv, + DMA_RESV_USAGE_BOOKKEEP)) + stats[id].active += sz; + else if (i915_gem_object_is_shrinkable(obj) && +obj->mm.madv == I915_MADV_DONTNEED) + stats[id].purgeable += sz; + } +} + +static void show_meminfo(struct drm_printer *p, struct drm_file *file) +{ + struct drm_memory_stats stats[INTEL_REGION_UNKNOWN] = {}; + struct drm_i915_file_private *fpriv = file->driver_priv; + struct i915_drm_client *client = fpriv->client; + struct drm_i915_private *i915 = fpriv->i915; + struct drm_i915_gem_object *obj; + struct intel_memory_region *mr; + struct list_head *pos; + unsigned int id; + + /* Public objects. */ + spin_lock(>table_lock); + idr_for_each_entry(>object_idr, obj, id) + obj_meminfo(obj, stats); + spin_unlock(>table_lock); + + /* Internal objects. */ + rcu_read_lock(); + list_for_each_rcu(pos, >objects_list) { + obj = i915_gem_object_get_rcu(list_entry(pos, typeof(*obj), +client_link)); + if (!obj) + continue; + obj_meminfo(obj, stats); + i915_gem_object_put(obj); + } + rcu_read_unlock(); + + for_each_memory_region(mr, i915, id) + drm_print_memory_stats(p, + [id], + DRM_GEM_OBJECT_RESIDENT | + DRM_GEM_OBJECT_PURGEABLE, + mr->uabi_name); +} + static const char * const uabi_class_names[] = { [I915_ENGINE_CLASS_RENDER] = "render", [I915_ENGINE_CLASS_COPY] = "copy", @@ -106,6 +168,8 @@ void i915_drm_client_fdinfo(struct drm_printer *p, struct drm_file *file) * ** */ + show_meminfo(p, file); + if (GRAPHICS_VER(i915) < 8) return; -- 2.39.2
[PATCH 2/6] drm/i915: Record which client owns a VM
From: Tvrtko Ursulin To enable accounting of indirect client memory usage (such as page tables) in the following patch, lets start recording the creator of each PPGTT. Signed-off-by: Tvrtko Ursulin Reviewed-by: Aravind Iddamsetty --- drivers/gpu/drm/i915/gem/i915_gem_context.c | 11 --- drivers/gpu/drm/i915/gem/i915_gem_context_types.h | 3 +++ drivers/gpu/drm/i915/gem/selftests/mock_context.c | 4 ++-- drivers/gpu/drm/i915/gt/intel_gtt.h | 1 + 4 files changed, 14 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c b/drivers/gpu/drm/i915/gem/i915_gem_context.c index 9a9ff84c90d7..35cf6608180e 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_context.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c @@ -279,7 +279,8 @@ static int proto_context_set_protected(struct drm_i915_private *i915, } static struct i915_gem_proto_context * -proto_context_create(struct drm_i915_private *i915, unsigned int flags) +proto_context_create(struct drm_i915_file_private *fpriv, +struct drm_i915_private *i915, unsigned int flags) { struct i915_gem_proto_context *pc, *err; @@ -287,6 +288,7 @@ proto_context_create(struct drm_i915_private *i915, unsigned int flags) if (!pc) return ERR_PTR(-ENOMEM); + pc->fpriv = fpriv; pc->num_user_engines = -1; pc->user_engines = NULL; pc->user_flags = BIT(UCONTEXT_BANNABLE) | @@ -1621,6 +1623,7 @@ i915_gem_create_context(struct drm_i915_private *i915, err = PTR_ERR(ppgtt); goto err_ctx; } + ppgtt->vm.fpriv = pc->fpriv; vm = >vm; } if (vm) @@ -1740,7 +1743,7 @@ int i915_gem_context_open(struct drm_i915_private *i915, /* 0 reserved for invalid/unassigned ppgtt */ xa_init_flags(_priv->vm_xa, XA_FLAGS_ALLOC1); - pc = proto_context_create(i915, 0); + pc = proto_context_create(file_priv, i915, 0); if (IS_ERR(pc)) { err = PTR_ERR(pc); goto err; @@ -1822,6 +1825,7 @@ int i915_gem_vm_create_ioctl(struct drm_device *dev, void *data, GEM_BUG_ON(id == 0); /* reserved for invalid/unassigned ppgtt */ args->vm_id = id; + ppgtt->vm.fpriv = 
file_priv; return 0; err_put: @@ -2284,7 +2288,8 @@ int i915_gem_context_create_ioctl(struct drm_device *dev, void *data, return -EIO; } - ext_data.pc = proto_context_create(i915, args->flags); + ext_data.pc = proto_context_create(file->driver_priv, i915, + args->flags); if (IS_ERR(ext_data.pc)) return PTR_ERR(ext_data.pc); diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context_types.h b/drivers/gpu/drm/i915/gem/i915_gem_context_types.h index cb78214a7dcd..c573c067779f 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_context_types.h +++ b/drivers/gpu/drm/i915/gem/i915_gem_context_types.h @@ -188,6 +188,9 @@ struct i915_gem_proto_engine { * CONTEXT_CREATE_SET_PARAM during GEM_CONTEXT_CREATE. */ struct i915_gem_proto_context { + /** @fpriv: Client which creates the context */ + struct drm_i915_file_private *fpriv; + /** @vm: See _gem_context.vm */ struct i915_address_space *vm; diff --git a/drivers/gpu/drm/i915/gem/selftests/mock_context.c b/drivers/gpu/drm/i915/gem/selftests/mock_context.c index 8ac6726ec16b..125584ada282 100644 --- a/drivers/gpu/drm/i915/gem/selftests/mock_context.c +++ b/drivers/gpu/drm/i915/gem/selftests/mock_context.c @@ -83,7 +83,7 @@ live_context(struct drm_i915_private *i915, struct file *file) int err; u32 id; - pc = proto_context_create(i915, 0); + pc = proto_context_create(fpriv, i915, 0); if (IS_ERR(pc)) return ERR_CAST(pc); @@ -152,7 +152,7 @@ kernel_context(struct drm_i915_private *i915, struct i915_gem_context *ctx; struct i915_gem_proto_context *pc; - pc = proto_context_create(i915, 0); + pc = proto_context_create(NULL, i915, 0); if (IS_ERR(pc)) return ERR_CAST(pc); diff --git a/drivers/gpu/drm/i915/gt/intel_gtt.h b/drivers/gpu/drm/i915/gt/intel_gtt.h index 346ec8ec2edd..8cf62f5134a9 100644 --- a/drivers/gpu/drm/i915/gt/intel_gtt.h +++ b/drivers/gpu/drm/i915/gt/intel_gtt.h @@ -248,6 +248,7 @@ struct i915_address_space { struct drm_mm mm; struct intel_gt *gt; struct drm_i915_private *i915; + struct drm_i915_file_private *fpriv; struct 
device *dma; u64 total; /* size addr space maps (ex. 2GB for ggtt) */ u64 reserved; /* size addr space reserved */ -- 2.39.2
[PATCH 5/6] drm/i915: Add stable memory region names
From: Tvrtko Ursulin At the moment memory region names are a bit too varied and too inconsistent to be used for ABI purposes, like for upcoming fdinfo memory stats. System memory can be either system or system-ttm. Local memory has the instance number appended, others do not. Not only inconsistent but this kind of implementation detail is uninteresting for intended users of fdinfo memory stats. Add a stable name always formed as $type$instance. Could have chosen a different stable scheme, but I think any consistent and stable scheme should do just fine. Signed-off-by: Tvrtko Ursulin --- drivers/gpu/drm/i915/intel_memory_region.c | 19 +++ drivers/gpu/drm/i915/intel_memory_region.h | 1 + 2 files changed, 20 insertions(+) diff --git a/drivers/gpu/drm/i915/intel_memory_region.c b/drivers/gpu/drm/i915/intel_memory_region.c index 3d1fdea9811d..60a03340bbd4 100644 --- a/drivers/gpu/drm/i915/intel_memory_region.c +++ b/drivers/gpu/drm/i915/intel_memory_region.c @@ -216,6 +216,22 @@ static int intel_memory_region_memtest(struct intel_memory_region *mem, return err; } +static const char *region_type_str(u16 type) +{ + switch (type) { + case INTEL_MEMORY_SYSTEM: + return "system"; + case INTEL_MEMORY_LOCAL: + return "local"; + case INTEL_MEMORY_STOLEN_LOCAL: + return "stolen-local"; + case INTEL_MEMORY_STOLEN_SYSTEM: + return "stolen-system"; + default: + return "unknown"; + } +} + struct intel_memory_region * intel_memory_region_create(struct drm_i915_private *i915, resource_size_t start, @@ -244,6 +260,9 @@ intel_memory_region_create(struct drm_i915_private *i915, mem->type = type; mem->instance = instance; + snprintf(mem->uabi_name, sizeof(mem->uabi_name), "%s%u", +region_type_str(type), instance); + mutex_init(>objects.lock); INIT_LIST_HEAD(>objects.list); diff --git a/drivers/gpu/drm/i915/intel_memory_region.h b/drivers/gpu/drm/i915/intel_memory_region.h index 2953ed5c3248..9ba36454e51b 100644 --- a/drivers/gpu/drm/i915/intel_memory_region.h +++ 
b/drivers/gpu/drm/i915/intel_memory_region.h @@ -80,6 +80,7 @@ struct intel_memory_region { u16 instance; enum intel_region_id id; char name[16]; + char uabi_name[16]; bool private; /* not for userspace */ struct { -- 2.39.2
[PATCH 3/6] drm/i915: Track page table backing store usage
From: Tvrtko Ursulin Account page table backing store against the owning client memory usage stats. Signed-off-by: Tvrtko Ursulin Reviewed-by: Aravind Iddamsetty --- drivers/gpu/drm/i915/gt/intel_gtt.c | 6 ++ 1 file changed, 6 insertions(+) diff --git a/drivers/gpu/drm/i915/gt/intel_gtt.c b/drivers/gpu/drm/i915/gt/intel_gtt.c index 13944a14ea2d..c3f2b379 100644 --- a/drivers/gpu/drm/i915/gt/intel_gtt.c +++ b/drivers/gpu/drm/i915/gt/intel_gtt.c @@ -58,6 +58,9 @@ struct drm_i915_gem_object *alloc_pt_lmem(struct i915_address_space *vm, int sz) if (!IS_ERR(obj)) { obj->base.resv = i915_vm_resv_get(vm); obj->shares_resv_from = vm; + + if (vm->fpriv) + i915_drm_client_add_object(vm->fpriv->client, obj); } return obj; @@ -79,6 +82,9 @@ struct drm_i915_gem_object *alloc_pt_dma(struct i915_address_space *vm, int sz) if (!IS_ERR(obj)) { obj->base.resv = i915_vm_resv_get(vm); obj->shares_resv_from = vm; + + if (vm->fpriv) + i915_drm_client_add_object(vm->fpriv->client, obj); } return obj; -- 2.39.2
[PATCH 1/6] drm/i915: Add ability for tracking buffer objects per client
From: Tvrtko Ursulin In order to show per client memory usage lets add some infrastructure which enables tracking buffer objects owned by clients. We add a per client list protected by a new per client lock and to support delayed destruction (post client exit) we make tracked objects hold references to the owning client. Also, object memory region teardown is moved to the existing RCU free callback to allow safe dereference from the fdinfo RCU read section. Signed-off-by: Tvrtko Ursulin Reviewed-by: Aravind Iddamsetty --- drivers/gpu/drm/i915/gem/i915_gem_object.c| 13 +-- .../gpu/drm/i915/gem/i915_gem_object_types.h | 12 +++ drivers/gpu/drm/i915/i915_drm_client.c| 36 +++ drivers/gpu/drm/i915/i915_drm_client.h| 32 + 4 files changed, 90 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_object.c b/drivers/gpu/drm/i915/gem/i915_gem_object.c index c26d87555825..25eeeb863209 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_object.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_object.c @@ -106,6 +106,10 @@ void i915_gem_object_init(struct drm_i915_gem_object *obj, INIT_LIST_HEAD(>mm.link); +#ifdef CONFIG_PROC_FS + INIT_LIST_HEAD(>client_link); +#endif + INIT_LIST_HEAD(>lut_list); spin_lock_init(>lut_lock); @@ -293,6 +297,10 @@ void __i915_gem_free_object_rcu(struct rcu_head *head) container_of(head, typeof(*obj), rcu); struct drm_i915_private *i915 = to_i915(obj->base.dev); + /* We need to keep this alive for RCU read access from fdinfo. 
*/ + if (obj->mm.n_placements > 1) + kfree(obj->mm.placements); + i915_gem_object_free(obj); GEM_BUG_ON(!atomic_read(>mm.free_count)); @@ -389,9 +397,6 @@ void __i915_gem_free_object(struct drm_i915_gem_object *obj) if (obj->ops->release) obj->ops->release(obj); - if (obj->mm.n_placements > 1) - kfree(obj->mm.placements); - if (obj->shares_resv_from) i915_vm_resv_put(obj->shares_resv_from); @@ -442,6 +447,8 @@ static void i915_gem_free_object(struct drm_gem_object *gem_obj) GEM_BUG_ON(i915_gem_object_is_framebuffer(obj)); + i915_drm_client_remove_object(obj); + /* * Before we free the object, make sure any pure RCU-only * read-side critical sections are complete, e.g. diff --git a/drivers/gpu/drm/i915/gem/i915_gem_object_types.h b/drivers/gpu/drm/i915/gem/i915_gem_object_types.h index 2292404007c8..0c5cdab278b6 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_object_types.h +++ b/drivers/gpu/drm/i915/gem/i915_gem_object_types.h @@ -302,6 +302,18 @@ struct drm_i915_gem_object { */ struct i915_address_space *shares_resv_from; +#ifdef CONFIG_PROC_FS + /** +* @client: @i915_drm_client which created the object +*/ + struct i915_drm_client *client; + + /** +* @client_link: Link into @i915_drm_client.objects_list +*/ + struct list_head client_link; +#endif + union { struct rcu_head rcu; struct llist_node freed; diff --git a/drivers/gpu/drm/i915/i915_drm_client.c b/drivers/gpu/drm/i915/i915_drm_client.c index 2a44b3876cb5..2e5e69edc0f9 100644 --- a/drivers/gpu/drm/i915/i915_drm_client.c +++ b/drivers/gpu/drm/i915/i915_drm_client.c @@ -28,6 +28,10 @@ struct i915_drm_client *i915_drm_client_alloc(void) kref_init(>kref); spin_lock_init(>ctx_lock); INIT_LIST_HEAD(>ctx_list); +#ifdef CONFIG_PROC_FS + spin_lock_init(>objects_lock); + INIT_LIST_HEAD(>objects_list); +#endif return client; } @@ -108,4 +112,36 @@ void i915_drm_client_fdinfo(struct drm_printer *p, struct drm_file *file) for (i = 0; i < ARRAY_SIZE(uabi_class_names); i++) show_client_class(p, i915, file_priv->client, i); 
} + +void i915_drm_client_add_object(struct i915_drm_client *client, + struct drm_i915_gem_object *obj) +{ + unsigned long flags; + + GEM_WARN_ON(obj->client); + GEM_WARN_ON(!list_empty(>client_link)); + + spin_lock_irqsave(>objects_lock, flags); + obj->client = i915_drm_client_get(client); + list_add_tail_rcu(>client_link, >objects_list); + spin_unlock_irqrestore(>objects_lock, flags); +} + +bool i915_drm_client_remove_object(struct drm_i915_gem_object *obj) +{ + struct i915_drm_client *client = fetch_and_zero(>client); + unsigned long flags; + + /* Object may not be associated with a client. */ + if (!client) + return false; + + spin_lock_irqsave(>objects_lock, flags); + list_del_rcu(>client_link); + spin_unlock_irqrestore(>objects_lock, flags); + + i915_drm_client_put(client); +
[PATCH 4/6] drm/i915: Account ring buffer and context state storage
From: Tvrtko Ursulin Account ring buffers and logical context space against the owning client memory usage stats. Signed-off-by: Tvrtko Ursulin Reviewed-by: Aravind Iddamsetty --- drivers/gpu/drm/i915/gt/intel_context.c | 14 ++ drivers/gpu/drm/i915/i915_drm_client.c | 10 ++ drivers/gpu/drm/i915/i915_drm_client.h | 9 + 3 files changed, 33 insertions(+) diff --git a/drivers/gpu/drm/i915/gt/intel_context.c b/drivers/gpu/drm/i915/gt/intel_context.c index a53b26178f0a..a2f1245741bb 100644 --- a/drivers/gpu/drm/i915/gt/intel_context.c +++ b/drivers/gpu/drm/i915/gt/intel_context.c @@ -6,6 +6,7 @@ #include "gem/i915_gem_context.h" #include "gem/i915_gem_pm.h" +#include "i915_drm_client.h" #include "i915_drv.h" #include "i915_trace.h" @@ -50,6 +51,7 @@ intel_context_create(struct intel_engine_cs *engine) int intel_context_alloc_state(struct intel_context *ce) { + struct i915_gem_context *ctx; int err = 0; if (mutex_lock_interruptible(>pin_mutex)) @@ -66,6 +68,18 @@ int intel_context_alloc_state(struct intel_context *ce) goto unlock; set_bit(CONTEXT_ALLOC_BIT, >flags); + + rcu_read_lock(); + ctx = rcu_dereference(ce->gem_context); + if (ctx && !kref_get_unless_zero(>ref)) + ctx = NULL; + rcu_read_unlock(); + if (ctx) { + if (ctx->client) + i915_drm_client_add_context_objects(ctx->client, + ce); + i915_gem_context_put(ctx); + } } unlock: diff --git a/drivers/gpu/drm/i915/i915_drm_client.c b/drivers/gpu/drm/i915/i915_drm_client.c index 2e5e69edc0f9..a61356012df8 100644 --- a/drivers/gpu/drm/i915/i915_drm_client.c +++ b/drivers/gpu/drm/i915/i915_drm_client.c @@ -144,4 +144,14 @@ bool i915_drm_client_remove_object(struct drm_i915_gem_object *obj) return true; } + +void i915_drm_client_add_context_objects(struct i915_drm_client *client, +struct intel_context *ce) +{ + if (ce->state) + i915_drm_client_add_object(client, ce->state->obj); + + if (ce->ring != ce->engine->legacy.ring && ce->ring->vma) + i915_drm_client_add_object(client, ce->ring->vma->obj); +} #endif diff --git 
a/drivers/gpu/drm/i915/i915_drm_client.h b/drivers/gpu/drm/i915/i915_drm_client.h index 5f58fdf7dcb8..69cedfcd3d69 100644 --- a/drivers/gpu/drm/i915/i915_drm_client.h +++ b/drivers/gpu/drm/i915/i915_drm_client.h @@ -14,6 +14,7 @@ #include "i915_file_private.h" #include "gem/i915_gem_object_types.h" +#include "gt/intel_context_types.h" #define I915_LAST_UABI_ENGINE_CLASS I915_ENGINE_CLASS_COMPUTE @@ -70,6 +71,8 @@ void i915_drm_client_fdinfo(struct drm_printer *p, struct drm_file *file); void i915_drm_client_add_object(struct i915_drm_client *client, struct drm_i915_gem_object *obj); bool i915_drm_client_remove_object(struct drm_i915_gem_object *obj); +void i915_drm_client_add_context_objects(struct i915_drm_client *client, +struct intel_context *ce); #else static inline void i915_drm_client_add_object(struct i915_drm_client *client, struct drm_i915_gem_object *obj) @@ -79,6 +82,12 @@ static inline void i915_drm_client_add_object(struct i915_drm_client *client, static inline bool i915_drm_client_remove_object(struct drm_i915_gem_object *obj) { } + +static inline void +i915_drm_client_add_context_objects(struct i915_drm_client *client, + struct intel_context *ce) +{ +} #endif #endif /* !__I915_DRM_CLIENT_H__ */ -- 2.39.2
[PATCH v7 0/6] fdinfo memory stats
From: Tvrtko Ursulin A short series to enable fdinfo memory stats for i915. I added tracking of most classes of objects (user objects, page tables, context state, ring buffers) which contribute to client's memory footprint and am accounting their memory use along the similar lines as in Rob's msm code, just that with i915 specific code we can show a memory region breakdown and so support discrete and multi-tile GPUs properly. And also reflect that our objects can have multiple allowed backing stores. The existing helper Rob added is then used to dump the per memory region stats to fdinfo. The basic objects-per-client infrastructure can later be extended to cover all objects and so avoid needing to walk the IDR under the client's file table lock, which would further avoid disturbing the running clients by parallel fdinfo readers. Example fdinfo format: # cat /proc/1383/fdinfo/8 pos:0 flags: 0212 mnt_id: 21 ino:397 drm-driver: i915 drm-client-id: 18 drm-pdev: :00:02.0 drm-total-system: 125 MiB drm-shared-system: 16 MiB drm-active-system: 110 MiB drm-resident-system:125 MiB drm-purgeable-system: 2 MiB drm-total-stolen-system:0 drm-shared-stolen-system: 0 drm-active-stolen-system: 0 drm-resident-stolen-system: 0 drm-purgeable-stolen-system:0 drm-engine-render: 25662044495 ns drm-engine-copy:0 ns drm-engine-video: 0 ns drm-engine-video-enhance: 0 ns Example gputop output: DRM minor 0 PID SMEM SMEMRSS render copy videoNAME 1233 124M 124M |||||||| neverball 1130 59M 59M |█▌ ||||||| Xorg 1207 12M 12M |||||||| xfwm4 Or with Wayland: DRM minor 0 PID MEM RSSrendercopy videovideo-enhance NAME 2093 191M 191M |▊ || || || | gnome-shell DRM minor 128 PID MEM RSSrendercopy videovideo-enhance NAME 2551 71M 71M |██▉|| || || | neverball 2553 50M 50M | || || || | Xwayland Example intel_gpu_top output, aggregated mode: intel-gpu-top: Intel Dg1 (Gen12) @ /dev/dri/card1 - 21/ 577 MHz; 71% RC6 8 irqs/s ENGINES BUSY MI_SEMA MI_WAIT Render/3D2.80% |▉ | 0% 0% Blitter0.01% |▏ | 0% 0% 
Video0.00% | | 0% 0% VideoEnhance0.00% | | 0% 0% PID MEM RSS Render/3D BlitterVideoNAME 50783 109M 107M |▎ ||||||| neverball Region breakdown mode (needs more width for best experience): intel-gpu-top: Intel Dg1 (Gen12) @ /dev/dri/card1 - 18/ 555 MHz; 65% RC6 8 irqs/s ENGINES BUSY MI_SEMA MI_WAIT Render/3D2.52% |▉ | 0% 0% Blitter0.00% | | 0% 0% Video0.00% | | 0% 0% VideoEnhance0.00% | | 0% 0% PID RAM RSS VRAM VRSS Video NAME 50783 34M 32M 75M 75M |▏ || || || | neverball v2: * Now actually per client. v3: * Track imported dma-buf objects. v4: * Rely on DRM GEM handles for tracking user objects. * Fix internal object accounting (no placements). v5: * Fixed brain fart of overwriting the loop cursor. * Fixed object destruction racing with fdinfo reads. * Take reference to GEM context while using it. v6: * Rebase, cover letter update. v7: * New patch in series for making region names consistent and stable. Test-with: 20230922134437.234888-1-tvrtko.ursu...@linux.intel.com Tvrtko Ursulin (6): drm/i915: Add ability for tracking buffer objects per client drm/i915: Record which client owns a VM drm/i915: Track page table backing store usage drm/i915: Account ring buffer and context state storage drm/i915: Add stable memory region names drm/i915: Implement fdinfo memory stats printing drivers/gpu/drm/i915/gem/i915_gem_context.c | 11 +- .../gpu/drm/i915/gem/i915_gem_context_types.h | 3 + drivers/gpu/drm/i915/gem/i915_gem_object.c| 13 ++- .../gpu/drm/i915/gem/i915_gem_object_types.h | 12 ++ .../gpu/drm/i915/gem/selftests/mock_context.c | 4 +- drivers/gpu/drm/i915/gt/intel_context.c | 14 +++ drivers/gpu/drm/i915/gt/intel_gtt.c | 6 + drivers/gpu/drm/i915/gt/intel_gtt.h | 1 + drivers/gpu/drm/i915/i915_drm_client.c| 110 ++ drivers/gpu/drm/i915/i915_drm_client.h
Re: [PATCH 5/5] drm/i915: Implement fdinfo memory stats printing
On 22/09/2023 09:48, Iddamsetty, Aravind wrote: On 21-09-2023 17:18, Tvrtko Ursulin wrote: From: Tvrtko Ursulin Use the newly added drm_print_memory_stats helper to show memory utilisation of our objects in drm/driver specific fdinfo output. To collect the stats we walk the per memory regions object lists and accumulate object size into the respective drm_memory_stats categories. Objects with multiple possible placements are reported in multiple regions for total and shared sizes, while other categories are I guess you forgot to correct this. Ah yes, will fix. counted only for the currently active region. v2: * Only account against the active region. * Use DMA_RESV_USAGE_BOOKKEEP when testing for active. (Tejas) Signed-off-by: Tvrtko Ursulin Cc: Aravind Iddamsetty Cc: Rob Clark Cc: Andi Shyti Cc: Tejas Upadhyay Reviewed-by: Andi Shyti # v1 --- drivers/gpu/drm/i915/i915_drm_client.c | 64 ++ 1 file changed, 64 insertions(+) diff --git a/drivers/gpu/drm/i915/i915_drm_client.c b/drivers/gpu/drm/i915/i915_drm_client.c index a61356012df8..94abc2fb2ea6 100644 --- a/drivers/gpu/drm/i915/i915_drm_client.c +++ b/drivers/gpu/drm/i915/i915_drm_client.c @@ -45,6 +45,68 @@ void __i915_drm_client_free(struct kref *kref) } #ifdef CONFIG_PROC_FS +static void +obj_meminfo(struct drm_i915_gem_object *obj, + struct drm_memory_stats stats[INTEL_REGION_UNKNOWN]) +{ + const enum intel_region_id id = obj->mm.region ? 
+ obj->mm.region->id : INTEL_REGION_SMEM; + const u64 sz = obj->base.size; + + if (obj->base.handle_count > 1) + stats[id].shared += sz; + else + stats[id].private += sz; + + if (i915_gem_object_has_pages(obj)) { + stats[id].resident += sz; + + if (!dma_resv_test_signaled(obj->base.resv, + DMA_RESV_USAGE_BOOKKEEP)) + stats[id].active += sz; + else if (i915_gem_object_is_shrinkable(obj) && +obj->mm.madv == I915_MADV_DONTNEED) + stats[id].purgeable += sz; + } +} + +static void show_meminfo(struct drm_printer *p, struct drm_file *file) +{ + struct drm_memory_stats stats[INTEL_REGION_UNKNOWN] = {}; + struct drm_i915_file_private *fpriv = file->driver_priv; + struct i915_drm_client *client = fpriv->client; + struct drm_i915_private *i915 = fpriv->i915; + struct drm_i915_gem_object *obj; + struct intel_memory_region *mr; + struct list_head *pos; + unsigned int id; + + /* Public objects. */ + spin_lock(>table_lock); + idr_for_each_entry(>object_idr, obj, id) + obj_meminfo(obj, stats); + spin_unlock(>table_lock); + + /* Internal objects. */ + rcu_read_lock(); + list_for_each_rcu(pos, >objects_list) { + obj = i915_gem_object_get_rcu(list_entry(pos, typeof(*obj), +client_link)); + if (!obj) + continue; + obj_meminfo(obj, stats); + i915_gem_object_put(obj); + } + rcu_read_unlock(); + + for_each_memory_region(mr, i915, id) + drm_print_memory_stats(p, + [id], + DRM_GEM_OBJECT_RESIDENT | + DRM_GEM_OBJECT_PURGEABLE, + mr->name); +} + static const char * const uabi_class_names[] = { [I915_ENGINE_CLASS_RENDER] = "render", [I915_ENGINE_CLASS_COPY] = "copy", @@ -106,6 +168,8 @@ void i915_drm_client_fdinfo(struct drm_printer *p, struct drm_file *file) * ** */ + show_meminfo(p, file); + if (GRAPHICS_VER(i915) < 8) return; Reviewed-by: Aravind Iddamsetty Thank you! Would you be able to also look at the IGTs I posted yesterday? Regards, Tvrtko
[PATCH 5/5] drm/i915: Implement fdinfo memory stats printing
From: Tvrtko Ursulin Use the newly added drm_print_memory_stats helper to show memory utilisation of our objects in drm/driver specific fdinfo output. To collect the stats we walk the per memory regions object lists and accumulate object size into the respective drm_memory_stats categories. Objects with multiple possible placements are reported in multiple regions for total and shared sizes, while other categories are counted only for the currently active region. v2: * Only account against the active region. * Use DMA_RESV_USAGE_BOOKKEEP when testing for active. (Tejas) Signed-off-by: Tvrtko Ursulin Cc: Aravind Iddamsetty Cc: Rob Clark Cc: Andi Shyti Cc: Tejas Upadhyay Reviewed-by: Andi Shyti # v1 --- drivers/gpu/drm/i915/i915_drm_client.c | 64 ++ 1 file changed, 64 insertions(+) diff --git a/drivers/gpu/drm/i915/i915_drm_client.c b/drivers/gpu/drm/i915/i915_drm_client.c index a61356012df8..94abc2fb2ea6 100644 --- a/drivers/gpu/drm/i915/i915_drm_client.c +++ b/drivers/gpu/drm/i915/i915_drm_client.c @@ -45,6 +45,68 @@ void __i915_drm_client_free(struct kref *kref) } #ifdef CONFIG_PROC_FS +static void +obj_meminfo(struct drm_i915_gem_object *obj, + struct drm_memory_stats stats[INTEL_REGION_UNKNOWN]) +{ + const enum intel_region_id id = obj->mm.region ? 
+ obj->mm.region->id : INTEL_REGION_SMEM; + const u64 sz = obj->base.size; + + if (obj->base.handle_count > 1) + stats[id].shared += sz; + else + stats[id].private += sz; + + if (i915_gem_object_has_pages(obj)) { + stats[id].resident += sz; + + if (!dma_resv_test_signaled(obj->base.resv, + DMA_RESV_USAGE_BOOKKEEP)) + stats[id].active += sz; + else if (i915_gem_object_is_shrinkable(obj) && +obj->mm.madv == I915_MADV_DONTNEED) + stats[id].purgeable += sz; + } +} + +static void show_meminfo(struct drm_printer *p, struct drm_file *file) +{ + struct drm_memory_stats stats[INTEL_REGION_UNKNOWN] = {}; + struct drm_i915_file_private *fpriv = file->driver_priv; + struct i915_drm_client *client = fpriv->client; + struct drm_i915_private *i915 = fpriv->i915; + struct drm_i915_gem_object *obj; + struct intel_memory_region *mr; + struct list_head *pos; + unsigned int id; + + /* Public objects. */ + spin_lock(>table_lock); + idr_for_each_entry(>object_idr, obj, id) + obj_meminfo(obj, stats); + spin_unlock(>table_lock); + + /* Internal objects. */ + rcu_read_lock(); + list_for_each_rcu(pos, >objects_list) { + obj = i915_gem_object_get_rcu(list_entry(pos, typeof(*obj), +client_link)); + if (!obj) + continue; + obj_meminfo(obj, stats); + i915_gem_object_put(obj); + } + rcu_read_unlock(); + + for_each_memory_region(mr, i915, id) + drm_print_memory_stats(p, + [id], + DRM_GEM_OBJECT_RESIDENT | + DRM_GEM_OBJECT_PURGEABLE, + mr->name); +} + static const char * const uabi_class_names[] = { [I915_ENGINE_CLASS_RENDER] = "render", [I915_ENGINE_CLASS_COPY] = "copy", @@ -106,6 +168,8 @@ void i915_drm_client_fdinfo(struct drm_printer *p, struct drm_file *file) * ** */ + show_meminfo(p, file); + if (GRAPHICS_VER(i915) < 8) return; -- 2.39.2
[PATCH 2/5] drm/i915: Record which client owns a VM
From: Tvrtko Ursulin To enable accounting of indirect client memory usage (such as page tables) in the following patch, lets start recording the creator of each PPGTT. Signed-off-by: Tvrtko Ursulin Reviewed-by: Aravind Iddamsetty --- drivers/gpu/drm/i915/gem/i915_gem_context.c | 11 --- drivers/gpu/drm/i915/gem/i915_gem_context_types.h | 3 +++ drivers/gpu/drm/i915/gem/selftests/mock_context.c | 4 ++-- drivers/gpu/drm/i915/gt/intel_gtt.h | 1 + 4 files changed, 14 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c b/drivers/gpu/drm/i915/gem/i915_gem_context.c index 9a9ff84c90d7..35cf6608180e 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_context.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c @@ -279,7 +279,8 @@ static int proto_context_set_protected(struct drm_i915_private *i915, } static struct i915_gem_proto_context * -proto_context_create(struct drm_i915_private *i915, unsigned int flags) +proto_context_create(struct drm_i915_file_private *fpriv, +struct drm_i915_private *i915, unsigned int flags) { struct i915_gem_proto_context *pc, *err; @@ -287,6 +288,7 @@ proto_context_create(struct drm_i915_private *i915, unsigned int flags) if (!pc) return ERR_PTR(-ENOMEM); + pc->fpriv = fpriv; pc->num_user_engines = -1; pc->user_engines = NULL; pc->user_flags = BIT(UCONTEXT_BANNABLE) | @@ -1621,6 +1623,7 @@ i915_gem_create_context(struct drm_i915_private *i915, err = PTR_ERR(ppgtt); goto err_ctx; } + ppgtt->vm.fpriv = pc->fpriv; vm = >vm; } if (vm) @@ -1740,7 +1743,7 @@ int i915_gem_context_open(struct drm_i915_private *i915, /* 0 reserved for invalid/unassigned ppgtt */ xa_init_flags(_priv->vm_xa, XA_FLAGS_ALLOC1); - pc = proto_context_create(i915, 0); + pc = proto_context_create(file_priv, i915, 0); if (IS_ERR(pc)) { err = PTR_ERR(pc); goto err; @@ -1822,6 +1825,7 @@ int i915_gem_vm_create_ioctl(struct drm_device *dev, void *data, GEM_BUG_ON(id == 0); /* reserved for invalid/unassigned ppgtt */ args->vm_id = id; + ppgtt->vm.fpriv = 
file_priv; return 0; err_put: @@ -2284,7 +2288,8 @@ int i915_gem_context_create_ioctl(struct drm_device *dev, void *data, return -EIO; } - ext_data.pc = proto_context_create(i915, args->flags); + ext_data.pc = proto_context_create(file->driver_priv, i915, + args->flags); if (IS_ERR(ext_data.pc)) return PTR_ERR(ext_data.pc); diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context_types.h b/drivers/gpu/drm/i915/gem/i915_gem_context_types.h index cb78214a7dcd..c573c067779f 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_context_types.h +++ b/drivers/gpu/drm/i915/gem/i915_gem_context_types.h @@ -188,6 +188,9 @@ struct i915_gem_proto_engine { * CONTEXT_CREATE_SET_PARAM during GEM_CONTEXT_CREATE. */ struct i915_gem_proto_context { + /** @fpriv: Client which creates the context */ + struct drm_i915_file_private *fpriv; + /** @vm: See _gem_context.vm */ struct i915_address_space *vm; diff --git a/drivers/gpu/drm/i915/gem/selftests/mock_context.c b/drivers/gpu/drm/i915/gem/selftests/mock_context.c index 8ac6726ec16b..125584ada282 100644 --- a/drivers/gpu/drm/i915/gem/selftests/mock_context.c +++ b/drivers/gpu/drm/i915/gem/selftests/mock_context.c @@ -83,7 +83,7 @@ live_context(struct drm_i915_private *i915, struct file *file) int err; u32 id; - pc = proto_context_create(i915, 0); + pc = proto_context_create(fpriv, i915, 0); if (IS_ERR(pc)) return ERR_CAST(pc); @@ -152,7 +152,7 @@ kernel_context(struct drm_i915_private *i915, struct i915_gem_context *ctx; struct i915_gem_proto_context *pc; - pc = proto_context_create(i915, 0); + pc = proto_context_create(NULL, i915, 0); if (IS_ERR(pc)) return ERR_CAST(pc); diff --git a/drivers/gpu/drm/i915/gt/intel_gtt.h b/drivers/gpu/drm/i915/gt/intel_gtt.h index 346ec8ec2edd..8cf62f5134a9 100644 --- a/drivers/gpu/drm/i915/gt/intel_gtt.h +++ b/drivers/gpu/drm/i915/gt/intel_gtt.h @@ -248,6 +248,7 @@ struct i915_address_space { struct drm_mm mm; struct intel_gt *gt; struct drm_i915_private *i915; + struct drm_i915_file_private *fpriv; struct 
device *dma; u64 total; /* size addr space maps (ex. 2GB for ggtt) */ u64 reserved; /* size addr space reserved */ -- 2.39.2
[PATCH 4/5] drm/i915: Account ring buffer and context state storage
From: Tvrtko Ursulin Account ring buffers and logical context space against the owning client memory usage stats. Signed-off-by: Tvrtko Ursulin Reviewed-by: Aravind Iddamsetty --- drivers/gpu/drm/i915/gt/intel_context.c | 14 ++ drivers/gpu/drm/i915/i915_drm_client.c | 10 ++ drivers/gpu/drm/i915/i915_drm_client.h | 9 + 3 files changed, 33 insertions(+) diff --git a/drivers/gpu/drm/i915/gt/intel_context.c b/drivers/gpu/drm/i915/gt/intel_context.c index a53b26178f0a..a2f1245741bb 100644 --- a/drivers/gpu/drm/i915/gt/intel_context.c +++ b/drivers/gpu/drm/i915/gt/intel_context.c @@ -6,6 +6,7 @@ #include "gem/i915_gem_context.h" #include "gem/i915_gem_pm.h" +#include "i915_drm_client.h" #include "i915_drv.h" #include "i915_trace.h" @@ -50,6 +51,7 @@ intel_context_create(struct intel_engine_cs *engine) int intel_context_alloc_state(struct intel_context *ce) { + struct i915_gem_context *ctx; int err = 0; if (mutex_lock_interruptible(>pin_mutex)) @@ -66,6 +68,18 @@ int intel_context_alloc_state(struct intel_context *ce) goto unlock; set_bit(CONTEXT_ALLOC_BIT, >flags); + + rcu_read_lock(); + ctx = rcu_dereference(ce->gem_context); + if (ctx && !kref_get_unless_zero(>ref)) + ctx = NULL; + rcu_read_unlock(); + if (ctx) { + if (ctx->client) + i915_drm_client_add_context_objects(ctx->client, + ce); + i915_gem_context_put(ctx); + } } unlock: diff --git a/drivers/gpu/drm/i915/i915_drm_client.c b/drivers/gpu/drm/i915/i915_drm_client.c index 2e5e69edc0f9..a61356012df8 100644 --- a/drivers/gpu/drm/i915/i915_drm_client.c +++ b/drivers/gpu/drm/i915/i915_drm_client.c @@ -144,4 +144,14 @@ bool i915_drm_client_remove_object(struct drm_i915_gem_object *obj) return true; } + +void i915_drm_client_add_context_objects(struct i915_drm_client *client, +struct intel_context *ce) +{ + if (ce->state) + i915_drm_client_add_object(client, ce->state->obj); + + if (ce->ring != ce->engine->legacy.ring && ce->ring->vma) + i915_drm_client_add_object(client, ce->ring->vma->obj); +} #endif diff --git 
a/drivers/gpu/drm/i915/i915_drm_client.h b/drivers/gpu/drm/i915/i915_drm_client.h index 5f58fdf7dcb8..69cedfcd3d69 100644 --- a/drivers/gpu/drm/i915/i915_drm_client.h +++ b/drivers/gpu/drm/i915/i915_drm_client.h @@ -14,6 +14,7 @@ #include "i915_file_private.h" #include "gem/i915_gem_object_types.h" +#include "gt/intel_context_types.h" #define I915_LAST_UABI_ENGINE_CLASS I915_ENGINE_CLASS_COMPUTE @@ -70,6 +71,8 @@ void i915_drm_client_fdinfo(struct drm_printer *p, struct drm_file *file); void i915_drm_client_add_object(struct i915_drm_client *client, struct drm_i915_gem_object *obj); bool i915_drm_client_remove_object(struct drm_i915_gem_object *obj); +void i915_drm_client_add_context_objects(struct i915_drm_client *client, +struct intel_context *ce); #else static inline void i915_drm_client_add_object(struct i915_drm_client *client, struct drm_i915_gem_object *obj) @@ -79,6 +82,12 @@ static inline void i915_drm_client_add_object(struct i915_drm_client *client, static inline bool i915_drm_client_remove_object(struct drm_i915_gem_object *obj) { } + +static inline void +i915_drm_client_add_context_objects(struct i915_drm_client *client, + struct intel_context *ce) +{ +} #endif #endif /* !__I915_DRM_CLIENT_H__ */ -- 2.39.2
[PATCH 3/5] drm/i915: Track page table backing store usage
From: Tvrtko Ursulin Account page table backing store against the owning client memory usage stats. Signed-off-by: Tvrtko Ursulin Reviewed-by: Aravind Iddamsetty --- drivers/gpu/drm/i915/gt/intel_gtt.c | 6 ++ 1 file changed, 6 insertions(+) diff --git a/drivers/gpu/drm/i915/gt/intel_gtt.c b/drivers/gpu/drm/i915/gt/intel_gtt.c index 13944a14ea2d..c3f2b379 100644 --- a/drivers/gpu/drm/i915/gt/intel_gtt.c +++ b/drivers/gpu/drm/i915/gt/intel_gtt.c @@ -58,6 +58,9 @@ struct drm_i915_gem_object *alloc_pt_lmem(struct i915_address_space *vm, int sz) if (!IS_ERR(obj)) { obj->base.resv = i915_vm_resv_get(vm); obj->shares_resv_from = vm; + + if (vm->fpriv) + i915_drm_client_add_object(vm->fpriv->client, obj); } return obj; @@ -79,6 +82,9 @@ struct drm_i915_gem_object *alloc_pt_dma(struct i915_address_space *vm, int sz) if (!IS_ERR(obj)) { obj->base.resv = i915_vm_resv_get(vm); obj->shares_resv_from = vm; + + if (vm->fpriv) + i915_drm_client_add_object(vm->fpriv->client, obj); } return obj; -- 2.39.2
[PATCH 1/5] drm/i915: Add ability for tracking buffer objects per client
From: Tvrtko Ursulin In order to show per client memory usage lets add some infrastructure which enables tracking buffer objects owned by clients. We add a per client list protected by a new per client lock and to support delayed destruction (post client exit) we make tracked objects hold references to the owning client. Also, object memory region teardown is moved to the existing RCU free callback to allow safe dereference from the fdinfo RCU read section. Signed-off-by: Tvrtko Ursulin Reviewed-by: Aravind Iddamsetty --- drivers/gpu/drm/i915/gem/i915_gem_object.c| 13 +-- .../gpu/drm/i915/gem/i915_gem_object_types.h | 12 +++ drivers/gpu/drm/i915/i915_drm_client.c| 36 +++ drivers/gpu/drm/i915/i915_drm_client.h| 32 + 4 files changed, 90 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_object.c b/drivers/gpu/drm/i915/gem/i915_gem_object.c index c26d87555825..25eeeb863209 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_object.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_object.c @@ -106,6 +106,10 @@ void i915_gem_object_init(struct drm_i915_gem_object *obj, INIT_LIST_HEAD(>mm.link); +#ifdef CONFIG_PROC_FS + INIT_LIST_HEAD(>client_link); +#endif + INIT_LIST_HEAD(>lut_list); spin_lock_init(>lut_lock); @@ -293,6 +297,10 @@ void __i915_gem_free_object_rcu(struct rcu_head *head) container_of(head, typeof(*obj), rcu); struct drm_i915_private *i915 = to_i915(obj->base.dev); + /* We need to keep this alive for RCU read access from fdinfo. 
*/ + if (obj->mm.n_placements > 1) + kfree(obj->mm.placements); + i915_gem_object_free(obj); GEM_BUG_ON(!atomic_read(>mm.free_count)); @@ -389,9 +397,6 @@ void __i915_gem_free_object(struct drm_i915_gem_object *obj) if (obj->ops->release) obj->ops->release(obj); - if (obj->mm.n_placements > 1) - kfree(obj->mm.placements); - if (obj->shares_resv_from) i915_vm_resv_put(obj->shares_resv_from); @@ -442,6 +447,8 @@ static void i915_gem_free_object(struct drm_gem_object *gem_obj) GEM_BUG_ON(i915_gem_object_is_framebuffer(obj)); + i915_drm_client_remove_object(obj); + /* * Before we free the object, make sure any pure RCU-only * read-side critical sections are complete, e.g. diff --git a/drivers/gpu/drm/i915/gem/i915_gem_object_types.h b/drivers/gpu/drm/i915/gem/i915_gem_object_types.h index 2292404007c8..0c5cdab278b6 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_object_types.h +++ b/drivers/gpu/drm/i915/gem/i915_gem_object_types.h @@ -302,6 +302,18 @@ struct drm_i915_gem_object { */ struct i915_address_space *shares_resv_from; +#ifdef CONFIG_PROC_FS + /** +* @client: @i915_drm_client which created the object +*/ + struct i915_drm_client *client; + + /** +* @client_link: Link into @i915_drm_client.objects_list +*/ + struct list_head client_link; +#endif + union { struct rcu_head rcu; struct llist_node freed; diff --git a/drivers/gpu/drm/i915/i915_drm_client.c b/drivers/gpu/drm/i915/i915_drm_client.c index 2a44b3876cb5..2e5e69edc0f9 100644 --- a/drivers/gpu/drm/i915/i915_drm_client.c +++ b/drivers/gpu/drm/i915/i915_drm_client.c @@ -28,6 +28,10 @@ struct i915_drm_client *i915_drm_client_alloc(void) kref_init(>kref); spin_lock_init(>ctx_lock); INIT_LIST_HEAD(>ctx_list); +#ifdef CONFIG_PROC_FS + spin_lock_init(>objects_lock); + INIT_LIST_HEAD(>objects_list); +#endif return client; } @@ -108,4 +112,36 @@ void i915_drm_client_fdinfo(struct drm_printer *p, struct drm_file *file) for (i = 0; i < ARRAY_SIZE(uabi_class_names); i++) show_client_class(p, i915, file_priv->client, i); 
} + +void i915_drm_client_add_object(struct i915_drm_client *client, + struct drm_i915_gem_object *obj) +{ + unsigned long flags; + + GEM_WARN_ON(obj->client); + GEM_WARN_ON(!list_empty(>client_link)); + + spin_lock_irqsave(>objects_lock, flags); + obj->client = i915_drm_client_get(client); + list_add_tail_rcu(>client_link, >objects_list); + spin_unlock_irqrestore(>objects_lock, flags); +} + +bool i915_drm_client_remove_object(struct drm_i915_gem_object *obj) +{ + struct i915_drm_client *client = fetch_and_zero(>client); + unsigned long flags; + + /* Object may not be associated with a client. */ + if (!client) + return false; + + spin_lock_irqsave(>objects_lock, flags); + list_del_rcu(>client_link); + spin_unlock_irqrestore(>objects_lock, flags); + + i915_drm_client_put(client); +
[PATCH v7 0/5] fdinfo memory stats
From: Tvrtko Ursulin A short series to enable fdinfo memory stats for i915. I added tracking of most classes of objects (user objects, page tables, context state, ring buffers) which contribute to client's memory footprint and am accouting their memory use along the similar lines as in Rob's msm code, just that with i915 specific code we can show a memory region breakdown and so support discrete and multi-tile GPUs properly. And also reflect that our objects can have multiple allowed backing stores. The existing helper Rob added is then used to dump the per memory region stats to fdinfo. The basic objects-per-client infrastructure can later be extended to cover all objects and so avoid needing to walk the IDR under the client's file table lock, which would further avoid distburbing the running clients by parallel fdinfo readers. Example fdinfo format: # cat /proc/1383/fdinfo/8 pos:0 flags: 0212 mnt_id: 21 ino:397 drm-driver: i915 drm-client-id: 18 drm-pdev: :00:02.0 drm-total-system: 125 MiB drm-shared-system: 16 MiB drm-active-system: 110 MiB drm-resident-system:125 MiB drm-purgeable-system: 2 MiB drm-total-stolen-system:0 drm-shared-stolen-system: 0 drm-active-stolen-system: 0 drm-resident-stolen-system: 0 drm-purgeable-stolen-system:0 drm-engine-render: 25662044495 ns drm-engine-copy:0 ns drm-engine-video: 0 ns drm-engine-video-enhance: 0 ns Example gputop output: DRM minor 0 PID SMEM SMEMRSS render copy videoNAME 1233 124M 124M |||||||| neverball 1130 59M 59M |█▌ ||||||| Xorg 1207 12M 12M |||||||| xfwm4 Or with Wayland: DRM minor 0 PID MEM RSSrendercopy videovideo-enhance NAME 2093 191M 191M |▊ || || || | gnome-shell DRM minor 128 PID MEM RSSrendercopy videovideo-enhance NAME 2551 71M 71M |██▉|| || || | neverball 2553 50M 50M | || || || | Xwayland v2: * Now actually per client. v3: * Track imported dma-buf objects. v4: * Rely on DRM GEM handles for tracking user objects. * Fix internal object accounting (no placements). 
v5: * Fixed brain fart of overwriting the loop cursor. * Fixed object destruction racing with fdinfo reads. * Take reference to GEM context while using it. v6: * Rebase, cover letter update. v7: * Account against active region only. * Cover all dma_resv usage when testing for activity. Test-with: 20230921114557.192629-1-tvrtko.ursu...@linux.intel.com Tvrtko Ursulin (5): drm/i915: Add ability for tracking buffer objects per client drm/i915: Record which client owns a VM drm/i915: Track page table backing store usage drm/i915: Account ring buffer and context state storage drm/i915: Implement fdinfo memory stats printing drivers/gpu/drm/i915/gem/i915_gem_context.c | 11 +- .../gpu/drm/i915/gem/i915_gem_context_types.h | 3 + drivers/gpu/drm/i915/gem/i915_gem_object.c| 13 ++- .../gpu/drm/i915/gem/i915_gem_object_types.h | 12 ++ .../gpu/drm/i915/gem/selftests/mock_context.c | 4 +- drivers/gpu/drm/i915/gt/intel_context.c | 14 +++ drivers/gpu/drm/i915/gt/intel_gtt.c | 6 + drivers/gpu/drm/i915/gt/intel_gtt.h | 1 + drivers/gpu/drm/i915/i915_drm_client.c| 110 ++ drivers/gpu/drm/i915/i915_drm_client.h| 41 +++ 10 files changed, 207 insertions(+), 8 deletions(-) -- 2.39.2
Re: [Intel-gfx] [PATCH] drm/i915/gem: Allow users to disable waitboost
On 20/09/2023 22:56, Vinay Belgaumkar wrote: Provide a bit to disable waitboost while waiting on a gem object. Waitboost results in increased power consumption by requesting RP0 while waiting for the request to complete. Add a bit in the gem_wait() IOCTL where this can be disabled. This is related to the libva API change here - Link: https://github.com/XinfengZhang/libva/commit/3d90d18c67609a73121bb71b20ee4776b54b61a7 This link does not appear to lead to userspace code using this uapi? Cc: Rodrigo Vivi Signed-off-by: Vinay Belgaumkar --- drivers/gpu/drm/i915/gem/i915_gem_wait.c | 9 ++--- drivers/gpu/drm/i915/i915_request.c | 3 ++- drivers/gpu/drm/i915/i915_request.h | 1 + include/uapi/drm/i915_drm.h | 1 + 4 files changed, 10 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_wait.c b/drivers/gpu/drm/i915/gem/i915_gem_wait.c index d4b918fb11ce..955885ec859d 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_wait.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_wait.c @@ -72,7 +72,8 @@ i915_gem_object_wait_reservation(struct dma_resv *resv, struct dma_fence *fence; long ret = timeout ?: 1; - i915_gem_object_boost(resv, flags); + if (!(flags & I915_WAITBOOST_DISABLE)) + i915_gem_object_boost(resv, flags); dma_resv_iter_begin(, resv, dma_resv_usage_rw(flags & I915_WAIT_ALL)); @@ -236,7 +237,7 @@ i915_gem_wait_ioctl(struct drm_device *dev, void *data, struct drm_file *file) ktime_t start; long ret; - if (args->flags != 0) + if (args->flags != 0 || args->flags != I915_GEM_WAITBOOST_DISABLE) return -EINVAL; obj = i915_gem_object_lookup(file, args->bo_handle); @@ -248,7 +249,9 @@ i915_gem_wait_ioctl(struct drm_device *dev, void *data, struct drm_file *file) ret = i915_gem_object_wait(obj, I915_WAIT_INTERRUPTIBLE | I915_WAIT_PRIORITY | - I915_WAIT_ALL, + I915_WAIT_ALL | + (args->flags & I915_GEM_WAITBOOST_DISABLE ? 
+ I915_WAITBOOST_DISABLE : 0), to_wait_timeout(args->timeout_ns)); if (args->timeout_ns > 0) { diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c index f59081066a19..2957409b4b2a 100644 --- a/drivers/gpu/drm/i915/i915_request.c +++ b/drivers/gpu/drm/i915/i915_request.c @@ -2044,7 +2044,8 @@ long i915_request_wait_timeout(struct i915_request *rq, * but at a cost of spending more power processing the workload * (bad for battery). */ - if (flags & I915_WAIT_PRIORITY && !i915_request_started(rq)) + if (!(flags & I915_WAITBOOST_DISABLE) && (flags & I915_WAIT_PRIORITY) && + !i915_request_started(rq)) intel_rps_boost(rq); wait.tsk = current; diff --git a/drivers/gpu/drm/i915/i915_request.h b/drivers/gpu/drm/i915/i915_request.h index 0ac55b2e4223..3cc00e8254dc 100644 --- a/drivers/gpu/drm/i915/i915_request.h +++ b/drivers/gpu/drm/i915/i915_request.h @@ -445,6 +445,7 @@ long i915_request_wait(struct i915_request *rq, #define I915_WAIT_INTERRUPTIBLE BIT(0) #define I915_WAIT_PRIORITYBIT(1) /* small priority bump for the request */ #define I915_WAIT_ALL BIT(2) /* used by i915_gem_object_wait() */ +#define I915_WAITBOOST_DISABLE BIT(3) /* used by i915_gem_object_wait() */ void i915_request_show(struct drm_printer *m, const struct i915_request *rq, diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h index 7000e5910a1d..4adee70e39cf 100644 --- a/include/uapi/drm/i915_drm.h +++ b/include/uapi/drm/i915_drm.h @@ -1928,6 +1928,7 @@ struct drm_i915_gem_wait { /** Handle of BO we shall wait on */ __u32 bo_handle; __u32 flags; +#define I915_GEM_WAITBOOST_DISABLE (1u<<0) Probably would be good to avoid mentioning waitboost in the uapi since so far it wasn't an explicit feature/contract. Something like I915_GEM_WAIT_BACKGROUND_PRIORITY? Low priority? I also wonder if there could be a possible angle to help Rob (+cc) upstream the syncobj/fence deadline code if our media driver might make use of that somehow. 
Like if either we could wire up the deadline into GEM_WAIT (in a backward compatible manner), or if media could use sync fd wait instead. Assuming they have an out fence already, which may not be true. Regards, Tvrtko /** Number of nanoseconds to wait, Returns time remaining. */ __s64 timeout_ns; };
Re: [PATCH v6 6/6] drm/drm-file: Show finer-grained BO sizes in drm_show_memory_stats
On 20/09/2023 16:32, Tvrtko Ursulin wrote: On 20/09/2023 00:34, Adrián Larumbe wrote: The current implementation will try to pick the highest available size display unit as soon as the BO size exceeds that of the previous multiplier. That can lead to loss of precision in contexts of low memory usage. The new selection criteria try to preserve precision, whilst also increasing the display unit selection threshold to render more accurate values. Signed-off-by: Adrián Larumbe Reviewed-by: Boris Brezillon Reviewed-by: Steven Price --- drivers/gpu/drm/drm_file.c | 5 - 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/drm_file.c b/drivers/gpu/drm/drm_file.c index 762965e3d503..34cfa128ffe5 100644 --- a/drivers/gpu/drm/drm_file.c +++ b/drivers/gpu/drm/drm_file.c @@ -872,6 +872,8 @@ void drm_send_event(struct drm_device *dev, struct drm_pending_event *e) } EXPORT_SYMBOL(drm_send_event); +#define UPPER_UNIT_THRESHOLD 100 + static void print_size(struct drm_printer *p, const char *stat, const char *region, u64 sz) { @@ -879,7 +881,8 @@ static void print_size(struct drm_printer *p, const char *stat, unsigned u; for (u = 0; u < ARRAY_SIZE(units) - 1; u++) { - if (sz < SZ_1K) + if ((sz & (SZ_1K - 1)) && IS_ALIGNED worth it at all? + sz < UPPER_UNIT_THRESHOLD * SZ_1K) break; Excuse me for a late comment (I was away). I did not get what is special about a ~10% threshold? Sounds to me just going with the lower unit, when size is not aligned to the higher one, would be better than sometimes precision-sometimes-not. FWIW both current and the threshold option make testing the feature very annoying. So I'd really propose we simply use smaller unit when unaligned. Regards, Tvrtko
Re: [PATCH v6 4/6] drm/drm_file: Add DRM obj's RSS reporting function for fdinfo
On 20/09/2023 00:34, Adrián Larumbe wrote: Some BO's might be mapped onto physical memory chunkwise and on demand, like Panfrost's tiler heap. In this case, even though the drm_gem_shmem_object page array might already be allocated, only a very small fraction of the BO is currently backed by system memory, but drm_show_memory_stats will then proceed to add its entire virtual size to the file's total resident size regardless. This led to very unrealistic RSS sizes being reckoned for Panfrost, where said tiler heap buffer is initially allocated with a virtual size of 128 MiB, but only a small part of it will eventually be backed by system memory after successive GPU page faults. Provide a new DRM object generic function that would allow drivers to return a more accurate RSS size for their BOs. Signed-off-by: Adrián Larumbe Reviewed-by: Boris Brezillon Reviewed-by: Steven Price --- drivers/gpu/drm/drm_file.c | 5 - include/drm/drm_gem.h | 9 + 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/drm_file.c b/drivers/gpu/drm/drm_file.c index 883d83bc0e3d..762965e3d503 100644 --- a/drivers/gpu/drm/drm_file.c +++ b/drivers/gpu/drm/drm_file.c @@ -944,7 +944,10 @@ void drm_show_memory_stats(struct drm_printer *p, struct drm_file *file) } if (s & DRM_GEM_OBJECT_RESIDENT) { - status.resident += obj->size; + if (obj->funcs && obj->funcs->rss) + status.resident += obj->funcs->rss(obj); + else + status.resident += obj->size; Presumably you'd want the same smaller size in both active and purgeable? Or you can end up with more in those two than in rss which would look odd. Also, alternative to adding a new callback could be adding multiple output parameters to the existing obj->func->status() which maybe ends up simpler due fewer callbacks? Like: s = obj->funcs->status(obj, _status, ) And adjust the code flow to pick up the rss if driver signaled it supports reporting it. 
Regards, Tvrtko } else { /* If already purged or not yet backed by pages, don't * count it as purgeable: diff --git a/include/drm/drm_gem.h b/include/drm/drm_gem.h index bc9f6aa2f3fe..16364487fde9 100644 --- a/include/drm/drm_gem.h +++ b/include/drm/drm_gem.h @@ -208,6 +208,15 @@ struct drm_gem_object_funcs { */ enum drm_gem_object_status (*status)(struct drm_gem_object *obj); + /** +* @rss: +* +* Return resident size of the object in physical memory. +* +* Called by drm_show_memory_stats(). +*/ + size_t (*rss)(struct drm_gem_object *obj); + /** * @vm_ops: *
Re: [PATCH v6 2/6] drm/panfrost: Add fdinfo support GPU load metrics
On 20/09/2023 00:34, Adrián Larumbe wrote: The drm-stats fdinfo tags made available to user space are drm-engine, drm-cycles, drm-max-freq and drm-curfreq, one per job slot. This deviates from standard practice in other DRM drivers, where a single set of key:value pairs is provided for the whole render engine. However, Panfrost has separate queues for fragment and vertex/tiler jobs, so a decision was made to calculate bus cycles and workload times separately. Maximum operating frequency is calculated at devfreq initialisation time. Current frequency is made available to user space because nvtop uses it when performing engine usage calculations. It is important to bear in mind that both GPU cycle and kernel time numbers provided are at best rough estimations, and always reported in excess from the actual figure because of two reasons: - Excess time because of the delay between the end of a job processing, the subsequent job IRQ and the actual time of the sample. - Time spent in the engine queue waiting for the GPU to pick up the next job. To avoid race conditions during enablement/disabling, a reference counting mechanism was introduced, and a job flag that tells us whether a given job increased the refcount. This is necessary, because user space can toggle cycle counting through a debugfs file, and a given job might have been in flight by the time cycle counting was disabled. The main goal of the debugfs cycle counter knob is letting tools like nvtop or IGT's gputop switch it at any time, to avoid power waste in case no engine usage measuring is necessary. 
Signed-off-by: Adrián Larumbe Reviewed-by: Boris Brezillon Reviewed-by: Steven Price --- drivers/gpu/drm/panfrost/Makefile | 2 + drivers/gpu/drm/panfrost/panfrost_debugfs.c | 20 drivers/gpu/drm/panfrost/panfrost_debugfs.h | 13 + drivers/gpu/drm/panfrost/panfrost_devfreq.c | 8 +++ drivers/gpu/drm/panfrost/panfrost_devfreq.h | 3 ++ drivers/gpu/drm/panfrost/panfrost_device.c | 2 + drivers/gpu/drm/panfrost/panfrost_device.h | 13 + drivers/gpu/drm/panfrost/panfrost_drv.c | 57 - drivers/gpu/drm/panfrost/panfrost_gpu.c | 41 +++ drivers/gpu/drm/panfrost/panfrost_gpu.h | 4 ++ drivers/gpu/drm/panfrost/panfrost_job.c | 24 + drivers/gpu/drm/panfrost/panfrost_job.h | 5 ++ 12 files changed, 191 insertions(+), 1 deletion(-) create mode 100644 drivers/gpu/drm/panfrost/panfrost_debugfs.c create mode 100644 drivers/gpu/drm/panfrost/panfrost_debugfs.h diff --git a/drivers/gpu/drm/panfrost/Makefile b/drivers/gpu/drm/panfrost/Makefile index 7da2b3f02ed9..2c01c1e7523e 100644 --- a/drivers/gpu/drm/panfrost/Makefile +++ b/drivers/gpu/drm/panfrost/Makefile @@ -12,4 +12,6 @@ panfrost-y := \ panfrost_perfcnt.o \ panfrost_dump.o +panfrost-$(CONFIG_DEBUG_FS) += panfrost_debugfs.o + obj-$(CONFIG_DRM_PANFROST) += panfrost.o diff --git a/drivers/gpu/drm/panfrost/panfrost_debugfs.c b/drivers/gpu/drm/panfrost/panfrost_debugfs.c new file mode 100644 index ..cc14eccba206 --- /dev/null +++ b/drivers/gpu/drm/panfrost/panfrost_debugfs.c @@ -0,0 +1,20 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright 2023 Collabora ltd. 
*/ + +#include +#include +#include +#include +#include + +#include "panfrost_device.h" +#include "panfrost_gpu.h" +#include "panfrost_debugfs.h" + +void panfrost_debugfs_init(struct drm_minor *minor) +{ + struct drm_device *dev = minor->dev; + struct panfrost_device *pfdev = platform_get_drvdata(to_platform_device(dev->dev)); + + debugfs_create_atomic_t("profile", 0600, minor->debugfs_root, >profile_mode); +} diff --git a/drivers/gpu/drm/panfrost/panfrost_debugfs.h b/drivers/gpu/drm/panfrost/panfrost_debugfs.h new file mode 100644 index ..db1c158bcf2f --- /dev/null +++ b/drivers/gpu/drm/panfrost/panfrost_debugfs.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright 2023 Collabora ltd. + */ + +#ifndef PANFROST_DEBUGFS_H +#define PANFROST_DEBUGFS_H + +#ifdef CONFIG_DEBUG_FS +void panfrost_debugfs_init(struct drm_minor *minor); +#endif + +#endif /* PANFROST_DEBUGFS_H */ diff --git a/drivers/gpu/drm/panfrost/panfrost_devfreq.c b/drivers/gpu/drm/panfrost/panfrost_devfreq.c index 58dfb15a8757..28caffc689e2 100644 --- a/drivers/gpu/drm/panfrost/panfrost_devfreq.c +++ b/drivers/gpu/drm/panfrost/panfrost_devfreq.c @@ -58,6 +58,7 @@ static int panfrost_devfreq_get_dev_status(struct device *dev, spin_lock_irqsave(>lock, irqflags); panfrost_devfreq_update_utilization(pfdevfreq); + pfdevfreq->current_frequency = status->current_frequency; status->total_time = ktime_to_ns(ktime_add(pfdevfreq->busy_time, pfdevfreq->idle_time)); @@ -117,6 +118,7 @@ int panfrost_devfreq_init(struct panfrost_device *pfdev) struct devfreq *devfreq; struct
Re: [PATCH v6 6/6] drm/drm-file: Show finer-grained BO sizes in drm_show_memory_stats
On 20/09/2023 00:34, Adrián Larumbe wrote: The current implementation will try to pick the highest available size display unit as soon as the BO size exceeds that of the previous multiplier. That can lead to loss of precision in contexts of low memory usage. The new selection criteria try to preserve precision, whilst also increasing the display unit selection threshold to render more accurate values. Signed-off-by: Adrián Larumbe Reviewed-by: Boris Brezillon Reviewed-by: Steven Price --- drivers/gpu/drm/drm_file.c | 5 - 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/drm_file.c b/drivers/gpu/drm/drm_file.c index 762965e3d503..34cfa128ffe5 100644 --- a/drivers/gpu/drm/drm_file.c +++ b/drivers/gpu/drm/drm_file.c @@ -872,6 +872,8 @@ void drm_send_event(struct drm_device *dev, struct drm_pending_event *e) } EXPORT_SYMBOL(drm_send_event); +#define UPPER_UNIT_THRESHOLD 100 + static void print_size(struct drm_printer *p, const char *stat, const char *region, u64 sz) { @@ -879,7 +881,8 @@ static void print_size(struct drm_printer *p, const char *stat, unsigned u; for (u = 0; u < ARRAY_SIZE(units) - 1; u++) { - if (sz < SZ_1K) + if ((sz & (SZ_1K - 1)) && IS_ALIGNED worth it at all? + sz < UPPER_UNIT_THRESHOLD * SZ_1K) break; Excuse me for a late comment (I was away). I did not get what is special about a ~10% threshold? Sounds to me just going with the lower unit, when size is not aligned to the higher one, would be better than sometimes precision-sometimes-not. Regards, Tvrtko sz = div_u64(sz, SZ_1K); }
Re: [Intel-gfx] [PATCH 5/5] drm/i915: Implement fdinfo memory stats printing
On 24/08/2023 12:35, Upadhyay, Tejas wrote: -Original Message- From: Intel-gfx On Behalf Of Tvrtko Ursulin Sent: Friday, July 7, 2023 6:32 PM To: intel-...@lists.freedesktop.org; dri-devel@lists.freedesktop.org Subject: [Intel-gfx] [PATCH 5/5] drm/i915: Implement fdinfo memory stats printing From: Tvrtko Ursulin Use the newly added drm_print_memory_stats helper to show memory utilisation of our objects in drm/driver specific fdinfo output. To collect the stats we walk the per memory regions object lists and accumulate object size into the respective drm_memory_stats categories. Objects with multiple possible placements are reported in multiple regions for total and shared sizes, while other categories are counted only for the currently active region. Signed-off-by: Tvrtko Ursulin Cc: Aravind Iddamsetty Cc: Rob Clark --- drivers/gpu/drm/i915/i915_drm_client.c | 85 ++ 1 file changed, 85 insertions(+) diff --git a/drivers/gpu/drm/i915/i915_drm_client.c b/drivers/gpu/drm/i915/i915_drm_client.c index ffccb6239789..5c77d6987d90 100644 --- a/drivers/gpu/drm/i915/i915_drm_client.c +++ b/drivers/gpu/drm/i915/i915_drm_client.c @@ -45,6 +45,89 @@ void __i915_drm_client_free(struct kref *kref) } #ifdef CONFIG_PROC_FS +static void +obj_meminfo(struct drm_i915_gem_object *obj, + struct drm_memory_stats stats[INTEL_REGION_UNKNOWN]) { + struct intel_memory_region *mr; + u64 sz = obj->base.size; + enum intel_region_id id; + unsigned int i; + + /* Attribute size and shared to all possible memory regions. */ + for (i = 0; i < obj->mm.n_placements; i++) { + mr = obj->mm.placements[i]; + id = mr->id; + + if (obj->base.handle_count > 1) + stats[id].shared += sz; + else + stats[id].private += sz; + } + + /* Attribute other categories to only the current region. 
*/ + mr = obj->mm.region; + if (mr) + id = mr->id; + else + id = INTEL_REGION_SMEM; + + if (!obj->mm.n_placements) { + if (obj->base.handle_count > 1) + stats[id].shared += sz; + else + stats[id].private += sz; + } + + if (i915_gem_object_has_pages(obj)) { + stats[id].resident += sz; + + if (!dma_resv_test_signaled(obj->base.resv, + dma_resv_usage_rw(true))) Should not DMA_RESV_USAGE_BOOKKEEP also considered active (why only "rw")? Some app is syncing with syncjobs and has added dma_fence with DMA_RESV_USAGE_BOOKKEEP during execbuf while that BO is busy on waiting on work! Hmm do we have a path which adds DMA_RESV_USAGE_BOOKKEEP usage in execbuf? Rob, any comments here? Given how I basically lifted the logic from 686b21b5f6ca ("drm: Add fdinfo memory stats"), does it sound plausible to upgrade the test against all fences? Regards, Tvrtko + stats[id].active += sz; + else if (i915_gem_object_is_shrinkable(obj) && +obj->mm.madv == I915_MADV_DONTNEED) + stats[id].purgeable += sz; + } +} + +static void show_meminfo(struct drm_printer *p, struct drm_file *file) +{ + struct drm_memory_stats stats[INTEL_REGION_UNKNOWN] = {}; + struct drm_i915_file_private *fpriv = file->driver_priv; + struct i915_drm_client *client = fpriv->client; + struct drm_i915_private *i915 = fpriv->i915; + struct drm_i915_gem_object *obj; + struct intel_memory_region *mr; + struct list_head *pos; + unsigned int id; + + /* Public objects. */ + spin_lock(>table_lock); + idr_for_each_entry (>object_idr, obj, id) + obj_meminfo(obj, stats); + spin_unlock(>table_lock); + + /* Internal objects. 
*/ + rcu_read_lock(); + list_for_each_rcu(pos, >objects_list) { + obj = i915_gem_object_get_rcu(list_entry(pos, typeof(*obj), +client_link)); + if (!obj) + continue; + obj_meminfo(obj, stats); + i915_gem_object_put(obj); + } + rcu_read_unlock(); + + for_each_memory_region(mr, i915, id) + drm_print_memory_stats(p, + [id], + DRM_GEM_OBJECT_RESIDENT | + DRM_GEM_OBJECT_PURGEABLE, + mr->name); +} + static const char * const uabi_class_names[] = { [I915_ENGINE_CLASS_RENDER] = "render", [I915_ENGINE_CLASS_COPY
Re: [PATCH v2] drm: Update file owner during use
On 28/08/2023 20:58, Rob Clark wrote: On Wed, Jun 21, 2023 at 2:48 AM Tvrtko Ursulin wrote: From: Tvrtko Ursulin With the typical model where the display server opens the file descriptor and then hands it over to the client(*), we were showing stale data in debugfs. Fix it by updating the drm_file->pid on ioctl access from a different process. The field is also made RCU protected to allow for lockless readers. Update side is protected with dev->filelist_mutex. Before: $ cat /sys/kernel/debug/dri/0/clients command pid dev master a uid magic Xorg 2344 0 yy 0 0 Xorg 2344 0 ny 0 2 Xorg 2344 0 ny 0 3 Xorg 2344 0 ny 0 4 After: $ cat /sys/kernel/debug/dri/0/clients command tgid dev master a uid magic Xorg 830 0 yy 0 0 xfce4-session 880 0 ny 0 1 xfwm4 943 0 ny 0 2 neverball 1095 0 ny 0 3 *) More detailed and historically accurate description of various handover implementation kindly provided by Emil Velikov: """ The traditional model, the server was the orchestrator managing the primary device node. From the fd, to the master status and authentication. But looking at the fd alone, this has varied across the years. IIRC in the DRI1 days, Xorg (libdrm really) would have a list of open fd(s) and reuse those whenever needed, DRI2 the client was responsible for open() themselves and with DRI3 the fd was passed to the client. Around the inception of DRI3 and systemd-logind, the latter became another possible orchestrator. Whereby Xorg and Wayland compositors could ask it for the fd. For various reasons (hysterical and genuine ones) Xorg has a fallback path going the open(), whereas Wayland compositors are moving to solely relying on logind... some never had fallback even. Over the past few years, more projects have emerged which provide functionality similar (be that on API level, Dbus, or otherwise) to systemd-logind. """ v2: * Fixed typo in commit text and added a fine historical explanation from Emil. 
Signed-off-by: Tvrtko Ursulin Cc: "Christian König" Cc: Daniel Vetter Acked-by: Christian König Reviewed-by: Emil Velikov Reviewed-by: Rob Clark Tested-by: Rob Clark Thanks. If everyone else is happy with this approach I don't have the commit rights for drm-misc. Regards, Tvrtko --- drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c | 6 ++-- drivers/gpu/drm/drm_auth.c | 3 +- drivers/gpu/drm/drm_debugfs.c | 10 --- drivers/gpu/drm/drm_file.c | 40 +++-- drivers/gpu/drm/drm_ioctl.c | 3 ++ drivers/gpu/drm/nouveau/nouveau_drm.c | 5 +++- drivers/gpu/drm/vmwgfx/vmwgfx_gem.c | 6 ++-- include/drm/drm_file.h | 13 ++-- 8 files changed, 71 insertions(+), 15 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c index 74055cba3dc9..849097dff02b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c @@ -963,6 +963,7 @@ static int amdgpu_debugfs_gem_info_show(struct seq_file *m, void *unused) list_for_each_entry(file, >filelist, lhead) { struct task_struct *task; struct drm_gem_object *gobj; + struct pid *pid; int id; /* @@ -972,8 +973,9 @@ static int amdgpu_debugfs_gem_info_show(struct seq_file *m, void *unused) * Therefore, we need to protect this ->comm access using RCU. */ rcu_read_lock(); - task = pid_task(file->pid, PIDTYPE_TGID); - seq_printf(m, "pid %8d command %s:\n", pid_nr(file->pid), + pid = rcu_dereference(file->pid); + task = pid_task(pid, PIDTYPE_TGID); + seq_printf(m, "pid %8d command %s:\n", pid_nr(pid), task ? 
task->comm : ""); rcu_read_unlock(); diff --git a/drivers/gpu/drm/drm_auth.c b/drivers/gpu/drm/drm_auth.c index cf92a9ae8034..2ed2585ded37 100644 --- a/drivers/gpu/drm/drm_auth.c +++ b/drivers/gpu/drm/drm_auth.c @@ -235,7 +235,8 @@ static int drm_new_set_master(struct drm_device *dev, struct drm_file *fpriv) static int drm_master_check_perm(struct drm_device *dev, struct drm_file *file_priv) { - if (file_priv->pid == task_pid(current) && file_priv->was_master) + if (file_priv->was_master && + rcu_access_pointer(file_priv->pid) == task_pid(current)) return 0; if (!capable(CAP_SYS_
Re: [Intel-gfx] [PATCH] drm/i915: Do not disable preemption for resets
On 13/09/2023 18:04, Valentin Schneider wrote: On Wed, 13 Sept 2023 at 18:48, Sebastian Andrzej Siewior wrote: On 2023-07-05 10:30:25 [+0100], Tvrtko Ursulin wrote: From: Tvrtko Ursulin Commit ade8a0f59844 ("drm/i915: Make all GPU resets atomic") added a preempt disable section over the hardware reset callback to prepare the driver for being able to reset from atomic contexts. … This missed the v6.6 merge window. Has this been dropped for some reason or just missed by chance? Can this be still applied, please? Just an FYI, but I happened to be looking at an internal bug report for exactly this error site, so +1 here :) It looks I failed to collect an r-b before the summer break and so it fell off my radar. Definitely want to merge it so I will try again. Regards, Tvrtko
[PATCH] drm/i915: Zap some empty lines
From: Tvrtko Ursulin Recent refactoring left an unsightly block of empty lines. Remove them. Signed-off-by: Tvrtko Ursulin Cc: Dnyaneshwar Bhadane Cc: Anusha Srivatsa Cc: Radhakrishna Sripada --- drivers/gpu/drm/i915/i915_drv.h | 7 --- 1 file changed, 7 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 87ffc477c3b1..511eba3bbdba 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -646,13 +646,6 @@ IS_SUBPLATFORM(const struct drm_i915_private *i915, #define IS_TIGERLAKE_UY(i915) \ IS_SUBPLATFORM(i915, INTEL_TIGERLAKE, INTEL_SUBPLATFORM_UY) - - - - - - - #define IS_XEHPSDV_GRAPHICS_STEP(__i915, since, until) \ (IS_XEHPSDV(__i915) && IS_GRAPHICS_STEP(__i915, since, until)) -- 2.39.2
Re: [PATCH 1/8] drm/sched: Convert drm scheduler to use a work queue rather than kthread
On 03/08/2023 15:43, Matthew Brost wrote: On Thu, Aug 03, 2023 at 11:11:13AM +0100, Tvrtko Ursulin wrote: On 01/08/2023 21:50, Matthew Brost wrote: In XE, the new Intel GPU driver, a choice has made to have a 1 to 1 mapping between a drm_gpu_scheduler and drm_sched_entity. At first this seems a bit odd but let us explain the reasoning below. 1. In XE the submission order from multiple drm_sched_entity is not guaranteed to be the same completion even if targeting the same hardware engine. This is because in XE we have a firmware scheduler, the GuC, which allowed to reorder, timeslice, and preempt submissions. If a using shared drm_gpu_scheduler across multiple drm_sched_entity, the TDR falls apart as the TDR expects submission order == completion order. Using a dedicated drm_gpu_scheduler per drm_sched_entity solve this problem. 2. In XE submissions are done via programming a ring buffer (circular buffer), a drm_gpu_scheduler provides a limit on number of jobs, if the limit of number jobs is set to RING_SIZE / MAX_SIZE_PER_JOB we get flow control on the ring for free. A problem with this design is currently a drm_gpu_scheduler uses a kthread for submission / job cleanup. This doesn't scale if a large number of drm_gpu_scheduler are used. To work around the scaling issue, use a worker rather than kthread for submission / job cleanup. 
v2: - (Rob Clark) Fix msm build - Pass in run work queue v3: - (Boris) don't have loop in worker Signed-off-by: Matthew Brost --- drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 14 +- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 14 +- drivers/gpu/drm/etnaviv/etnaviv_sched.c | 2 +- drivers/gpu/drm/lima/lima_sched.c | 2 +- drivers/gpu/drm/msm/adreno/adreno_device.c | 6 +- drivers/gpu/drm/msm/msm_ringbuffer.c| 2 +- drivers/gpu/drm/panfrost/panfrost_job.c | 2 +- drivers/gpu/drm/scheduler/sched_main.c | 136 +++- drivers/gpu/drm/v3d/v3d_sched.c | 10 +- include/drm/gpu_scheduler.h | 14 +- 10 files changed, 113 insertions(+), 89 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c index f60753f97ac5..9c2a10aeb0b3 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c @@ -1489,9 +1489,9 @@ static int amdgpu_debugfs_test_ib_show(struct seq_file *m, void *unused) for (i = 0; i < AMDGPU_MAX_RINGS; i++) { struct amdgpu_ring *ring = adev->rings[i]; - if (!ring || !ring->sched.thread) + if (!ring || !ring->sched.ready) continue; - kthread_park(ring->sched.thread); + drm_sched_run_wq_stop(>sched); It would be good to split out adding of these wrappers (including adding one for ring->sched.thread/ready) to a standalong preceding patch. That way at least some mechanical changes to various drivers would be separated from functional changes. Sure. Also, perhaps do not have the wq in the name if it is not really needed to be verbose with the underlying implementation like that? Like would drm_sched_run/pause. Or even __drm_sched_start/stop, dunno, just an idea. Sure. 
} seq_printf(m, "run ib test:\n"); @@ -1505,9 +1505,9 @@ static int amdgpu_debugfs_test_ib_show(struct seq_file *m, void *unused) for (i = 0; i < AMDGPU_MAX_RINGS; i++) { struct amdgpu_ring *ring = adev->rings[i]; - if (!ring || !ring->sched.thread) + if (!ring || !ring->sched.ready) continue; - kthread_unpark(ring->sched.thread); + drm_sched_run_wq_start(>sched); } up_write(>reset_domain->sem); @@ -1727,7 +1727,7 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 val) ring = adev->rings[val]; - if (!ring || !ring->funcs->preempt_ib || !ring->sched.thread) + if (!ring || !ring->funcs->preempt_ib || !ring->sched.ready) return -EINVAL; /* the last preemption failed */ @@ -1745,7 +1745,7 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 val) goto pro_end; /* stop the scheduler */ - kthread_park(ring->sched.thread); + drm_sched_run_wq_stop(>sched); /* preempt the IB */ r = amdgpu_ring_preempt_ib(ring); @@ -1779,7 +1779,7 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 val) failure: /* restart the scheduler */ - kthread_unpark(ring->sched.thread); + drm_sched_run_wq_start(>sched); up_read(>reset_domain->sem); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index fac9312b1695..00c9c03c8f94 100644 --- a/drivers/gpu/drm/amd/amdgpu/amd
Re: [PATCH 1/8] drm/sched: Convert drm scheduler to use a work queue rather than kthread
On 03/08/2023 15:56, Christian König wrote: Am 03.08.23 um 16:43 schrieb Matthew Brost: On Thu, Aug 03, 2023 at 11:11:13AM +0100, Tvrtko Ursulin wrote: On 01/08/2023 21:50, Matthew Brost wrote: [SNIP] sched->ops = ops; sched->hw_submission_limit = hw_submission; sched->name = name; + sched->run_wq = run_wq ? : system_wq; I still think it is not nice to implicitly move everyone over to the shared system wq. Maybe even more so with now one at a time execution, since effect on latency can be even greater. No one that has a stake in this has pushed back that I can recall. Open to feedback stakeholders (maintainers of drivers that use the drm scheduler). > No objections to using the system_wq here. Drivers can still pass in their own or simply use system_highpri_wq instead. Additional to that the system_wq isn't single threaded, it will create as much threads as needed to fully utilize all CPUs. The i915 doesn't use the DRM scheduler last time I looked. Has that changed? Have you considered kthread_work as a backend? Maybe it would work to have callers pass in a kthread_worker they create, or provide a drm_sched helper to create one, which would then be passed to drm_sched_init. That would enable per driver kthread_worker, or per device, or whatever granularity each driver would want/need/desire. driver init: struct drm_sched_worker = drm_sched_create_worker(...); queue/whatever init: drm_sched_init(.., worker, ...); This idea doesn't seem to work for varitey of reasons. Will type it out if needed but not going to spend time on this unless someone with a stake raises this as an issue. Agree completely. kthread_work is for real time workers IIRC. AFAIK it is indicated if one needs to tweak the kthread priority, but that is not the only use case. I am curious to know why the idea does not work for variety of reasons. 
You could create one inside drm_sched_init if not passed in, which would keep the behaviour for existing drivers more similar - they would still have a 1:1 kthread context for their exclusive use. Part of the idea of a work queue is so a user can't directly create a kthread via an IOCTL (XE_EXEC_QUEUE_CREATE). What you suggesting exposes this issue. Yeah, prevent that is indeed a very good idea. Nope, I wasn't suggesting that at all. I was suggesting as many kthread_workers (these are threads) as the implementation wants. Xe can create one per device. Someone else can create one per hw engine, whatever. One kthread_*work* per entity does not mean one thread per XE_EXEC_QUEUE_CREATE. Kthread_work is just a unit of work executed by the kthread_worker thread. Same in that conceptual relationship as workqueue and workitem. Difference is it may work better for single-shot re-arming design if regression in submission latency concerns any stakeholders. And I *think* self-re-arming would be less problematic latency wise since kthread_worker consumes everything queued without relinquishing control and execution context would be guaranteed not to be shared with random system stuff. So this is essentially so we can use a loop? Seems like a lot effort for what is pure speculation. Again if a stakeholder raises an issue we can address then. Instead of a loop what you usually do in the worker is to submit one item (if possible) and then re-queue yourself if there is more work to do. This way you give others chance to run as well and/or cancel the work etc... Yeah I was pointing out loop in the worker was bad months ago (or more) so it is not about that. Here my point is whether it can be done better than silently convert everyone to system_wq. Hence my proposal is to *keep* closer to the thread semantics for everyone and at the same time _allow_ the option of custom workqueue/whatever. Where is the problem there? Regards, Tvrtko
Re: [PATCH 1/8] drm/sched: Convert drm scheduler to use a work queue rather than kthread
On 01/08/2023 21:50, Matthew Brost wrote: In XE, the new Intel GPU driver, a choice has made to have a 1 to 1 mapping between a drm_gpu_scheduler and drm_sched_entity. At first this seems a bit odd but let us explain the reasoning below. 1. In XE the submission order from multiple drm_sched_entity is not guaranteed to be the same completion even if targeting the same hardware engine. This is because in XE we have a firmware scheduler, the GuC, which allowed to reorder, timeslice, and preempt submissions. If a using shared drm_gpu_scheduler across multiple drm_sched_entity, the TDR falls apart as the TDR expects submission order == completion order. Using a dedicated drm_gpu_scheduler per drm_sched_entity solve this problem. 2. In XE submissions are done via programming a ring buffer (circular buffer), a drm_gpu_scheduler provides a limit on number of jobs, if the limit of number jobs is set to RING_SIZE / MAX_SIZE_PER_JOB we get flow control on the ring for free. A problem with this design is currently a drm_gpu_scheduler uses a kthread for submission / job cleanup. This doesn't scale if a large number of drm_gpu_scheduler are used. To work around the scaling issue, use a worker rather than kthread for submission / job cleanup. 
v2: - (Rob Clark) Fix msm build - Pass in run work queue v3: - (Boris) don't have loop in worker Signed-off-by: Matthew Brost --- drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 14 +- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 14 +- drivers/gpu/drm/etnaviv/etnaviv_sched.c | 2 +- drivers/gpu/drm/lima/lima_sched.c | 2 +- drivers/gpu/drm/msm/adreno/adreno_device.c | 6 +- drivers/gpu/drm/msm/msm_ringbuffer.c| 2 +- drivers/gpu/drm/panfrost/panfrost_job.c | 2 +- drivers/gpu/drm/scheduler/sched_main.c | 136 +++- drivers/gpu/drm/v3d/v3d_sched.c | 10 +- include/drm/gpu_scheduler.h | 14 +- 10 files changed, 113 insertions(+), 89 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c index f60753f97ac5..9c2a10aeb0b3 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c @@ -1489,9 +1489,9 @@ static int amdgpu_debugfs_test_ib_show(struct seq_file *m, void *unused) for (i = 0; i < AMDGPU_MAX_RINGS; i++) { struct amdgpu_ring *ring = adev->rings[i]; - if (!ring || !ring->sched.thread) + if (!ring || !ring->sched.ready) continue; - kthread_park(ring->sched.thread); + drm_sched_run_wq_stop(>sched); It would be good to split out adding of these wrappers (including adding one for ring->sched.thread/ready) to a standalong preceding patch. That way at least some mechanical changes to various drivers would be separated from functional changes. Also, perhaps do not have the wq in the name if it is not really needed to be verbose with the underlying implementation like that? Like would drm_sched_run/pause. Or even __drm_sched_start/stop, dunno, just an idea. 
} seq_printf(m, "run ib test:\n"); @@ -1505,9 +1505,9 @@ static int amdgpu_debugfs_test_ib_show(struct seq_file *m, void *unused) for (i = 0; i < AMDGPU_MAX_RINGS; i++) { struct amdgpu_ring *ring = adev->rings[i]; - if (!ring || !ring->sched.thread) + if (!ring || !ring->sched.ready) continue; - kthread_unpark(ring->sched.thread); + drm_sched_run_wq_start(>sched); } up_write(>reset_domain->sem); @@ -1727,7 +1727,7 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 val) ring = adev->rings[val]; - if (!ring || !ring->funcs->preempt_ib || !ring->sched.thread) + if (!ring || !ring->funcs->preempt_ib || !ring->sched.ready) return -EINVAL; /* the last preemption failed */ @@ -1745,7 +1745,7 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 val) goto pro_end; /* stop the scheduler */ - kthread_park(ring->sched.thread); + drm_sched_run_wq_stop(>sched); /* preempt the IB */ r = amdgpu_ring_preempt_ib(ring); @@ -1779,7 +1779,7 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 val) failure: /* restart the scheduler */ - kthread_unpark(ring->sched.thread); + drm_sched_run_wq_start(>sched); up_read(>reset_domain->sem); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index fac9312b1695..00c9c03c8f94 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2364,7 +2364,7 @@ static int amdgpu_device_init_schedulers(struct amdgpu_device *adev) break; } - r = drm_sched_init(>sched, _sched_ops, +
Re: [PATCH 5/5] drm/i915: Implement fdinfo memory stats printing
On 03/08/2023 06:15, Iddamsetty, Aravind wrote: On 27-07-2023 15:43, Tvrtko Ursulin wrote: From: Tvrtko Ursulin Use the newly added drm_print_memory_stats helper to show memory utilisation of our objects in drm/driver specific fdinfo output. To collect the stats we walk the per memory regions object lists and accumulate object size into the respective drm_memory_stats categories. Objects with multiple possible placements are reported in multiple regions for total and shared sizes, while other categories are counted only for the currently active region. Signed-off-by: Tvrtko Ursulin Cc: Aravind Iddamsetty Cc: Rob Clark > --- drivers/gpu/drm/i915/i915_drm_client.c | 85 ++ 1 file changed, 85 insertions(+) diff --git a/drivers/gpu/drm/i915/i915_drm_client.c b/drivers/gpu/drm/i915/i915_drm_client.c index a61356012df8..9e7a6075ee25 100644 --- a/drivers/gpu/drm/i915/i915_drm_client.c +++ b/drivers/gpu/drm/i915/i915_drm_client.c @@ -45,6 +45,89 @@ void __i915_drm_client_free(struct kref *kref) } #ifdef CONFIG_PROC_FS +static void +obj_meminfo(struct drm_i915_gem_object *obj, + struct drm_memory_stats stats[INTEL_REGION_UNKNOWN]) +{ + struct intel_memory_region *mr; + u64 sz = obj->base.size; + enum intel_region_id id; + unsigned int i; + + /* Attribute size and shared to all possible memory regions. */ + for (i = 0; i < obj->mm.n_placements; i++) { + mr = obj->mm.placements[i]; + id = mr->id; + + if (obj->base.handle_count > 1) + stats[id].shared += sz; + else + stats[id].private += sz; + } + + /* Attribute other categories to only the current region. */ + mr = obj->mm.region; + if (mr) + id = mr->id; + else + id = INTEL_REGION_SMEM; + + if (!obj->mm.n_placements) { I guess we do not expect to have n_placements set to public objects, is that right? I think they are the only ones which can have placements. It is via I915_GEM_CREATE_EXT_MEMORY_REGIONS userspace is able to create them. 
My main conundrum in this patch is a few lines above, the loop which adds shared and private. Question is, if an object can be either smem or lmem, how do we want to report it? This patch adds the size for all possible regions and resident and active only to the currently active. But perhaps that is wrong. Maybe I should change it so that the size is only counted against the active region and multiple regions are just ignored. Then, if an object is migrated due to access patterns or memory pressure, the total size would migrate too. I think I was trying to achieve something here (have more visibility on what kind of backing store clients are allocating) which maybe does not work too well with the current categories. Namely if userspace allocates say one 1MiB object with placement in either smem or lmem, and it is currently resident in lmem, I wanted it to show as: total-smem: 1 MiB resident-smem: 0 total-lmem: 1 MiB resident-lmem: 1 MiB To constantly show how in theory client could be using memory from either region. Maybe that is misleading and should instead be: total-smem: 0 resident-smem: 0 total-lmem: 1 MiB resident-lmem: 1 MiB ? 
And then if/when the same object gets migrated to smem it changes to (lets assume it is also not resident any more but got swapped out): total-smem: 1 MiB resident-smem: 0 total-lmem: 0 resident-lmem: 0 Regards, Tvrtko + if (obj->base.handle_count > 1) + stats[id].shared += sz; + else + stats[id].private += sz; + } + + if (i915_gem_object_has_pages(obj)) { + stats[id].resident += sz; + + if (!dma_resv_test_signaled(obj->base.resv, + dma_resv_usage_rw(true))) + stats[id].active += sz; + else if (i915_gem_object_is_shrinkable(obj) && +obj->mm.madv == I915_MADV_DONTNEED) + stats[id].purgeable += sz; + } +} + +static void show_meminfo(struct drm_printer *p, struct drm_file *file) +{ + struct drm_memory_stats stats[INTEL_REGION_UNKNOWN] = {}; + struct drm_i915_file_private *fpriv = file->driver_priv; + struct i915_drm_client *client = fpriv->client; + struct drm_i915_private *i915 = fpriv->i915; + struct drm_i915_gem_object *obj; + struct intel_memory_region *mr; + struct list_head *pos; + unsigned int id; + + /* Public objects. */ + spin_lock(>table_lock); + idr_for_each_entry(>object_idr, obj, id) + obj_meminfo(obj, stats); + spin_unlock(>table_lock); + + /* Internal objects. */ + rcu_read_lock(); + list_for_each_rcu(pos, &g
[PULL] drm-intel-fixes
Hi Dave, Daniel, Some fixes for the 6.5 RC this week: one for GVT display I2C handling, which came via gvt-fixes merge, one for premature freeing of request memory, and finally one fix for Gen12 AUX invalidatation flow to correctly align it with the documented sequence. Regards, Tvrtko drm-intel-fixes-2023-08-03: - Fix bug in getting msg length in AUX CH registers handler [gvt] (Yan Zhao) - Gen12 AUX invalidation fixes [gt] (Andi Shyti, Jonathan Cavitt) - Fix premature release of request's reusable memory (Janusz Krzysztofik) - Merge tag 'gvt-fixes-2023-08-02' of https://github.com/intel/gvt-linux into drm-intel-fixes (Tvrtko Ursulin) The following changes since commit 5d0c230f1de8c7515b6567d9afba1f196fb4e2f4: Linux 6.5-rc4 (2023-07-30 13:23:47 -0700) are available in the Git repository at: git://anongit.freedesktop.org/drm/drm-intel tags/drm-intel-fixes-2023-08-03 for you to fetch changes up to 0bc057eae2610c275361766a064a23cc2758f3ff: Merge tag 'gvt-fixes-2023-08-02' of https://github.com/intel/gvt-linux into drm-intel-fixes (2023-08-02 08:14:57 +0100) - Fix bug in getting msg length in AUX CH registers handler [gvt] (Yan Zhao) - Gen12 AUX invalidation fixes [gt] (Andi Shyti, Jonathan Cavitt) - Fix premature release of request's reusable memory (Janusz Krzysztofik) - Merge tag 'gvt-fixes-2023-08-02' of https://github.com/intel/gvt-linux into drm-intel-fixes (Tvrtko Ursulin) Andi Shyti (5): drm/i915/gt: Cleanup aux invalidation registers drm/i915: Add the gen12_needs_ccs_aux_inv helper drm/i915/gt: Rename flags with bit_group_X according to the datasheet drm/i915/gt: Enable the CCS_FLUSH bit in the pipe control and in the CS drm/i915/gt: Support aux invalidation on all engines Janusz Krzysztofik (1): drm/i915: Fix premature release of request's reusable memory Jonathan Cavitt (2): drm/i915/gt: Ensure memory quiesced before invalidation drm/i915/gt: Poll aux invalidation register bit on invalidation Tvrtko Ursulin (1): Merge tag 'gvt-fixes-2023-08-02' of 
https://github.com/intel/gvt-linux into drm-intel-fixes Yan Zhao (1): drm/i915/gvt: Fix bug in getting msg length in AUX CH registers handler drivers/gpu/drm/i915/gt/gen8_engine_cs.c | 140 ++- drivers/gpu/drm/i915/gt/gen8_engine_cs.h | 21 ++-- drivers/gpu/drm/i915/gt/intel_gpu_commands.h | 2 + drivers/gpu/drm/i915/gt/intel_gt_regs.h | 16 +-- drivers/gpu/drm/i915/gt/intel_lrc.c | 17 +--- drivers/gpu/drm/i915/gvt/edid.c | 2 +- drivers/gpu/drm/i915/i915_active.c | 99 +-- drivers/gpu/drm/i915/i915_request.c | 11 +++ 8 files changed, 199 insertions(+), 109 deletions(-)
Re: [PATCH 16/17] cgroup/drm: Expose memory stats
One additional thought on one sub-topic: On 27/07/2023 18:08, Tvrtko Ursulin wrote: [snip] For something like this, you would probably want it to work inside the drm scheduler first. Presumably, this can be done by setting a weight on each runqueue, and perhaps adding a callback to update one for a running queue. Calculating the weights hierarchically might be fun.. It is not needed to work in drm scheduler first. In fact drm scheduler based drivers can plug into what I have since it already has the notion of scheduling priorities. They would only need to implement a hook which allow the cgroup controller to query client GPU utilisation and another to received the over budget signal. Amdgpu and msm AFAIK could be easy candidates because they both support per client utilisation and priorities. Looks like I need to put all this info back into the cover letter. Also, hierarchic weights and time budgets are all already there. What could be done later is make this all smarter and respect the time budget with more precision. That would however, in many cases including Intel, require co-operation with the firmware. In any case it is only work in the implementation, while the cgroup control interface remains the same. I have taken a look at how the rest of cgroup controllers change ownership when moved to a different cgroup, and the answer was: not at all. If we attempt to create the scheduler controls only on the first time the fd is used, you could probably get rid of all the tracking. Can you send a CPU file descriptor from process A to process B and have CPU usage belonging to process B show up in process' A cgroup, or vice-versa? Nope, I am not making any sense, am I? My point being it is not like-to-like, model is different. No ownership transfer would mean in wide deployments all GPU utilisation would be assigned to Xorg and so there is no point to any of this. No way to throttle a cgroup with un-important GPU clients for instance. 
If you just grab the current process' cgroup when a drm_sched_entity is created, you don't have everything charged to X.org. No need for complicated ownership tracking in drm_file. The same equivalent should be done in i915 as well when a context is created as it's not using the drm scheduler. Okay so essentially nuking the concept of DRM clients belongs to one cgroup and instead tracking at the context level. That is an interesting idea. I suspect implementation could require somewhat generalizing the concept of an "execution context", or at least expressing it via the DRM cgroup controller. I can give this a spin, or at least some more detailed thought, once we close on a few more details regarding charging in general. I didn't get much time to brainstorm this just yet, only one downside randomly came to mind later - with this approach for i915 we wouldn't correctly attribute any GPU activity done in the receiving process against our default contexts. Those would still be accounted to the sending process. How much problem in practice that would be remains to be investigated, including if it applies to other drivers too. If there is a good amount of deployed userspace which use the default context, then it would be a bit messy. Regards, Tvrtko *) For non DRM and non i915 people, default context is a GPU submission context implicitly created during the device node open. It always remains valid, including in the receiving process if SCM_RIGHTS is used.
Re: [RFC 4/8] drm/i915: Refactor PAT/object cache handling
On 28/07/2023 08:14, Yang, Fei wrote: [snip] @@ -41,14 +42,17 @@ static bool gpu_write_needs_clflush(struct drm_i915_gem_object *obj) return false; /* - * For objects created by userspace through GEM_CREATE with pat_index - * set by set_pat extension, i915_gem_object_has_cache_level() will - * always return true, because the coherency of such object is managed i915_gem_object_has_cache_level() always return true means this function always return false. - * by userspace. Othereise the call here would fall back to checking - * whether the object is un-cached or write-through. + * Always flush cache for UMD objects with PAT index set. (obj->pat_set_by_user == true) indicates UMD knows how to handle the coherency, forcing clflush in KMD would be redundant. For Meteorlake I made gpu_write_needs_clflush() always return false anyway. Could you please submit a patch with kerneldoc for i915_drm.h explaining what the set domain ioctl is expected to do when set pat extension is used? With the focus on the use cases of how userspace is managing coherency using it, or it isn't, or what. */ - return !(i915_gem_object_has_cache_level(obj, I915_CACHE_NONE) || - i915_gem_object_has_cache_level(obj, I915_CACHE_WT)); + if (obj->pat_set_by_user) + return true; return false; Oops, thank you! I did warn in the cover letter I was getting confused by boolean logic conversions, cross-referencing three versions, and extracting the pat_set_by_user to call sites. :) + + /* + * Fully coherent cached access may end up with data in the CPU cache + * which hasn't hit memory yet. + */ + return i915_gem_object_has_cache_mode(obj, I915_CACHE_MODE_WB) && +i915_gem_object_has_cache_flag(obj, I915_CACHE_FLAG_COH2W); Why checking COH2W here? The logic was, if UC or WT return false, otherwise return true. So, as long as cache_mode is WB, it's sufficient to say true here, right? I was trying to penetrate the reason behind the check. 
Original code was: return !(obj->cache_level == I915_CACHE_NONE || obj->cache_level == I915_CACHE_WT); Which is equivalent to "is it WB", right? (Since it matches on both old LLC flavours.) Which I thought, in the context of this function, is supposed to answer the question of "can there be data in the shared cache written by the GPU but not committed to RAM yet". And then I thought that can only ever happen with 2-way coherency. Otherwise GPU writes never end up in the CPU cache. Did I get that wrong? Maybe I have.. Regards, Tvrtko
Re: [RFC 7/8] drm/i915: Lift the user PAT restriction from use_cpu_reloc
On 28/07/2023 01:09, Matt Roper wrote: On Thu, Jul 27, 2023 at 03:55:03PM +0100, Tvrtko Ursulin wrote: From: Tvrtko Ursulin Now that i915 understands the caching modes behind PAT indices, we can refine the check in use_cpu_reloc() to not reject the uncached PAT if it was set by userspace. Instead it can decide based on the presence of full coherency which should be functionally equivalent on legacy platforms. We can ignore WT since it is only used by the display, and we can ignore Meteorlake since it will fail on the existing "has_llc" condition before the object cache mode check. Signed-off-by: Tvrtko Ursulin Cc: Fei Yang Cc: Matt Roper --- drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c | 9 + 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c index 9d6e49c8a4c6..f74b33670bad 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c @@ -640,16 +640,9 @@ static inline int use_cpu_reloc(const struct reloc_cache *cache, if (DBG_FORCE_RELOC == FORCE_GTT_RELOC) return false; - /* -* For objects created by userspace through GEM_CREATE with pat_index -* set by set_pat extension, i915_gem_object_has_cache_level() always -* return true, otherwise the call would fall back to checking whether -* the object is un-cached. -*/ return (cache->has_llc || obj->cache_dirty || - !(obj->pat_set_by_user || - i915_gem_object_has_cache_mode(obj, I915_CACHE_MODE_UC))); + i915_gem_object_has_cache_flag(obj, I915_CACHE_FLAG_COH2W)); My understanding of relocations is minimal, but does 2W actually matter here (CPU snooping GPU caches)? I would have expected only 1W coherency to be necessary (GPU snooping CPU caches)? I struggled with this one. Original code was: return (cache->has_llc || obj->cache_dirty || obj->cache_level != I915_CACHE_NONE); And I struggled to figure out the intent. 
It is not "don't do CPU relocations for uncached" because it will do them when LLC or dirty regardless. You could be right.. can we interpret it as any mode apart from uncached was viewed as coherent for CPU writes being seen by the GPU? In which case should/could it be based on I915_BO_CACHE_COHERENT_FOR_WRITE? Regards, Tvrtko Matt } static int eb_reserve_vma(struct i915_execbuffer *eb, -- 2.39.2
Re: [RFC 4/8] drm/i915: Refactor PAT/object cache handling
Forgot one part of your reply: On 28/07/2023 00:57, Matt Roper wrote: On Thu, Jul 27, 2023 at 03:55:00PM +0100, Tvrtko Ursulin wrote: From: Tvrtko Ursulin Commit 9275277d5324 ("drm/i915: use pat_index instead of cache_level") has introduced PAT indices to i915 internal APIs, partially replacing the usage of driver internal cache_level, but has also added a few sub- optimal design decisions which this patch tries to improve upon. Principal change here is to invert the per platform cache level to PAT index table which was added by the referenced commit, and by doing so enable i915 to understand the cache mode between PAT indices, changing them from opaque to transparent. Once we have the inverted table we are able to remove the hidden false "return true" from i915_gem_object_has_cache_level and make the involved code path clearer. To achieve this we replace the enum i915_cache_level with i915_cache_t, composed of a more detailed representation of each cache mode (base mode plus flags). In this way we are able to express the differences between different write-back mode coherency settings on Meteorlake, which in turn enables us to map the i915 "cached" mode to the correct Meteorlake PAT index. We can also replace the platform dependent cache mode to string code in debugfs and elsewhere by the single implementation based on i915_cache_t. v2: * Fix PAT-to-cache-mode table for PVC. (Fei) * Cache display caching mode too. (Fei) * Improve and document criteria in i915_gem_object_can_bypass_llc() (Matt) v3: * Checkpath issues. * Cache mode flags check fixed. v4: * Fix intel_device_info->cache_modes array size. (Matt) * Boolean cache mode and flags query. (Matt) * Reduce number of cache macros with some macro magic. * One more checkpatch fix. * Tweak tables to show legacy and Gen12 WB is fully coherent. 
Signed-off-by: Tvrtko Ursulin References: 9275277d5324 ("drm/i915: use pat_index instead of cache_level") Cc: Chris Wilson Cc: Fei Yang Cc: Andi Shyti Cc: Matt Roper --- drivers/gpu/drm/i915/gem/i915_gem_domain.c| 60 + drivers/gpu/drm/i915/gem/i915_gem_domain.h| 5 +- .../gpu/drm/i915/gem/i915_gem_execbuffer.c| 3 +- drivers/gpu/drm/i915/gem/i915_gem_internal.c | 2 +- drivers/gpu/drm/i915/gem/i915_gem_mman.c | 4 +- drivers/gpu/drm/i915/gem/i915_gem_object.c| 117 ++ drivers/gpu/drm/i915/gem/i915_gem_object.h| 11 +- .../gpu/drm/i915/gem/i915_gem_object_types.h | 116 + drivers/gpu/drm/i915/gem/i915_gem_shmem.c | 8 +- drivers/gpu/drm/i915/gem/i915_gem_stolen.c| 2 +- drivers/gpu/drm/i915/gem/i915_gem_ttm_move.c | 20 +-- drivers/gpu/drm/i915/gem/i915_gem_userptr.c | 2 +- .../drm/i915/gem/selftests/huge_gem_object.c | 2 +- .../gpu/drm/i915/gem/selftests/huge_pages.c | 3 +- drivers/gpu/drm/i915/gt/gen8_ppgtt.c | 10 +- drivers/gpu/drm/i915/gt/intel_engine_cs.c | 2 +- drivers/gpu/drm/i915/gt/intel_ggtt.c | 25 ++-- drivers/gpu/drm/i915/gt/intel_ggtt_gmch.c | 4 +- drivers/gpu/drm/i915/gt/intel_gtt.c | 2 +- drivers/gpu/drm/i915/gt/intel_gtt.h | 3 +- drivers/gpu/drm/i915/gt/intel_ppgtt.c | 6 +- .../gpu/drm/i915/gt/intel_ring_submission.c | 4 +- drivers/gpu/drm/i915/gt/intel_timeline.c | 2 +- drivers/gpu/drm/i915/gt/selftest_hangcheck.c | 2 +- .../gpu/drm/i915/gt/selftest_workarounds.c| 2 +- drivers/gpu/drm/i915/i915_cache.c | 89 +++-- drivers/gpu/drm/i915/i915_cache.h | 70 ++- drivers/gpu/drm/i915/i915_debugfs.c | 53 ++-- drivers/gpu/drm/i915/i915_driver.c| 4 +- drivers/gpu/drm/i915/i915_gem.c | 13 -- drivers/gpu/drm/i915/i915_pci.c | 84 +++-- drivers/gpu/drm/i915/i915_perf.c | 2 +- drivers/gpu/drm/i915/intel_device_info.h | 6 +- .../gpu/drm/i915/selftests/i915_gem_evict.c | 4 +- drivers/gpu/drm/i915/selftests/igt_spinner.c | 2 +- .../gpu/drm/i915/selftests/mock_gem_device.c | 14 +-- 36 files changed, 391 insertions(+), 367 deletions(-) diff --git 
a/drivers/gpu/drm/i915/gem/i915_gem_domain.c b/drivers/gpu/drm/i915/gem/i915_gem_domain.c index 57db9c581bf6..c15f83de33af 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_domain.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_domain.c @@ -8,6 +8,7 @@ #include "display/intel_frontbuffer.h" #include "gt/intel_gt.h" +#include "i915_cache.h" #include "i915_drv.h" #include "i915_gem_clflush.h" #include "i915_gem_domain.h" @@ -41,14 +42,17 @@ static bool gpu_write_needs_clflush(struct drm_i915_gem_object *obj) return false; /* -* For objects created by userspace through GEM_CREATE with pat_index -*
Re: [RFC 4/8] drm/i915: Refactor PAT/object cache handling
On 28/07/2023 01:17, Matt Roper wrote: On Thu, Jul 27, 2023 at 04:57:53PM -0700, Matt Roper wrote: On Thu, Jul 27, 2023 at 03:55:00PM +0100, Tvrtko Ursulin wrote: From: Tvrtko Ursulin Commit 9275277d5324 ("drm/i915: use pat_index instead of cache_level") has introduced PAT indices to i915 internal APIs, partially replacing the usage of driver internal cache_level, but has also added a few sub- optimal design decisions which this patch tries to improve upon. Principal change here is to invert the per platform cache level to PAT index table which was added by the referenced commit, and by doing so enable i915 to understand the cache mode between PAT indices, changing them from opaque to transparent. Once we have the inverted table we are able to remove the hidden false "return true" from i915_gem_object_has_cache_level and make the involved code path clearer. To achieve this we replace the enum i915_cache_level with i915_cache_t, composed of a more detailed representation of each cache mode (base mode plus flags). In this way we are able to express the differences between different write-back mode coherency settings on Meteorlake, which in turn enables us to map the i915 "cached" mode to the correct Meteorlake PAT index. We can also replace the platform dependent cache mode to string code in debugfs and elsewhere by the single implementation based on i915_cache_t. v2: * Fix PAT-to-cache-mode table for PVC. (Fei) * Cache display caching mode too. (Fei) * Improve and document criteria in i915_gem_object_can_bypass_llc() (Matt) v3: * Checkpath issues. * Cache mode flags check fixed. v4: * Fix intel_device_info->cache_modes array size. (Matt) * Boolean cache mode and flags query. (Matt) * Reduce number of cache macros with some macro magic. * One more checkpatch fix. * Tweak tables to show legacy and Gen12 WB is fully coherent. 
Signed-off-by: Tvrtko Ursulin References: 9275277d5324 ("drm/i915: use pat_index instead of cache_level") Cc: Chris Wilson Cc: Fei Yang Cc: Andi Shyti Cc: Matt Roper --- drivers/gpu/drm/i915/gem/i915_gem_domain.c| 60 + drivers/gpu/drm/i915/gem/i915_gem_domain.h| 5 +- .../gpu/drm/i915/gem/i915_gem_execbuffer.c| 3 +- drivers/gpu/drm/i915/gem/i915_gem_internal.c | 2 +- drivers/gpu/drm/i915/gem/i915_gem_mman.c | 4 +- drivers/gpu/drm/i915/gem/i915_gem_object.c| 117 ++ drivers/gpu/drm/i915/gem/i915_gem_object.h| 11 +- .../gpu/drm/i915/gem/i915_gem_object_types.h | 116 + drivers/gpu/drm/i915/gem/i915_gem_shmem.c | 8 +- drivers/gpu/drm/i915/gem/i915_gem_stolen.c| 2 +- drivers/gpu/drm/i915/gem/i915_gem_ttm_move.c | 20 +-- drivers/gpu/drm/i915/gem/i915_gem_userptr.c | 2 +- .../drm/i915/gem/selftests/huge_gem_object.c | 2 +- .../gpu/drm/i915/gem/selftests/huge_pages.c | 3 +- drivers/gpu/drm/i915/gt/gen8_ppgtt.c | 10 +- drivers/gpu/drm/i915/gt/intel_engine_cs.c | 2 +- drivers/gpu/drm/i915/gt/intel_ggtt.c | 25 ++-- drivers/gpu/drm/i915/gt/intel_ggtt_gmch.c | 4 +- drivers/gpu/drm/i915/gt/intel_gtt.c | 2 +- drivers/gpu/drm/i915/gt/intel_gtt.h | 3 +- drivers/gpu/drm/i915/gt/intel_ppgtt.c | 6 +- .../gpu/drm/i915/gt/intel_ring_submission.c | 4 +- drivers/gpu/drm/i915/gt/intel_timeline.c | 2 +- drivers/gpu/drm/i915/gt/selftest_hangcheck.c | 2 +- .../gpu/drm/i915/gt/selftest_workarounds.c| 2 +- drivers/gpu/drm/i915/i915_cache.c | 89 +++-- drivers/gpu/drm/i915/i915_cache.h | 70 ++- drivers/gpu/drm/i915/i915_debugfs.c | 53 ++-- drivers/gpu/drm/i915/i915_driver.c| 4 +- drivers/gpu/drm/i915/i915_gem.c | 13 -- drivers/gpu/drm/i915/i915_pci.c | 84 +++-- drivers/gpu/drm/i915/i915_perf.c | 2 +- drivers/gpu/drm/i915/intel_device_info.h | 6 +- .../gpu/drm/i915/selftests/i915_gem_evict.c | 4 +- drivers/gpu/drm/i915/selftests/igt_spinner.c | 2 +- .../gpu/drm/i915/selftests/mock_gem_device.c | 14 +-- 36 files changed, 391 insertions(+), 367 deletions(-) diff --git 
a/drivers/gpu/drm/i915/gem/i915_gem_domain.c b/drivers/gpu/drm/i915/gem/i915_gem_domain.c index 57db9c581bf6..c15f83de33af 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_domain.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_domain.c @@ -8,6 +8,7 @@ #include "display/intel_frontbuffer.h" #include "gt/intel_gt.h" +#include "i915_cache.h" #include "i915_drv.h" #include "i915_gem_clflush.h" #include "i915_gem_domain.h" @@ -41,14 +42,17 @@ static bool gpu_write_needs_clflush(struct drm_i915_gem_object *obj) return false; /* -* For objects created by userspace through GEM_CREATE with
Re: [RFC 5/8] drm/i915: Improve the vm_fault_gtt user PAT index restriction
On 28/07/2023 01:04, Matt Roper wrote: On Thu, Jul 27, 2023 at 03:55:01PM +0100, Tvrtko Ursulin wrote: From: Tvrtko Ursulin Now that i915 understands the caching modes behind PAT indices, we can refine the check in vm_fault_gtt() to not reject the uncached PAT if it was set by userspace on a snoopable platform. Signed-off-by: Tvrtko Ursulin Cc: Fei Yang Cc: Matt Roper --- drivers/gpu/drm/i915/gem/i915_gem_mman.c | 14 +++--- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_mman.c b/drivers/gpu/drm/i915/gem/i915_gem_mman.c index cd7f8ded0d6f..9aa6ecf68432 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_mman.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_mman.c @@ -382,17 +382,9 @@ static vm_fault_t vm_fault_gtt(struct vm_fault *vmf) goto err_reset; } - /* -* For objects created by userspace through GEM_CREATE with pat_index -* set by set_pat extension, coherency is managed by userspace, make -* sure we don't fail handling the vm fault by calling -* i915_gem_object_has_cache_level() which always return true for such -* objects. Otherwise this helper function would fall back to checking -* whether the object is un-cached. -*/ - if (!((obj->pat_set_by_user || - i915_gem_object_has_cache_mode(obj, I915_CACHE_MODE_UC)) || - HAS_LLC(i915))) { + /* Access to snoopable pages through the GTT is incoherent. */ This comment was removed in the previous patch, but now it came back here. Should we have just left it be in the previous patch? Oops yes, fumble when splitting the single patch into this series. I'm not really clear on what it means either. Are we using "GTT" as shorthand to refer to the aperture here? It is about CPU mmap access so I think so. Original code was: /* Access to snoopable pages through the GTT is incoherent. */ if (obj->cache_level != I915_CACHE_NONE && !HAS_LLC(i915)) { ret = -EFAULT; goto err_unpin; } Which was disallowing anything not uncached on snoopable platforms. 
So I made it equivalent to that: /* Access to snoopable pages through the GTT is incoherent. */ if (!i915_gem_object_has_cache_mode(obj, I915_CACHE_MODE_UC) && !HAS_LLC(i915)) { ret = -EFAULT; goto err_unpin; } Should be like-for-like assuming PAT-to-cache-mode tables are all good. On Meteorlake it is no change in behaviour either way due !HAS_LLC. Regards, Tvrtko Matt + if (!i915_gem_object_has_cache_mode(obj, I915_CACHE_MODE_UC) && + !HAS_LLC(i915)) { ret = -EFAULT; goto err_unpin; } -- 2.39.2
Re: [RFC 4/8] drm/i915: Refactor PAT/object cache handling
On 28/07/2023 00:57, Matt Roper wrote: On Thu, Jul 27, 2023 at 03:55:00PM +0100, Tvrtko Ursulin wrote: From: Tvrtko Ursulin Commit 9275277d5324 ("drm/i915: use pat_index instead of cache_level") has introduced PAT indices to i915 internal APIs, partially replacing the usage of driver internal cache_level, but has also added a few sub- optimal design decisions which this patch tries to improve upon. Principal change here is to invert the per platform cache level to PAT index table which was added by the referenced commit, and by doing so enable i915 to understand the cache mode between PAT indices, changing them from opaque to transparent. Once we have the inverted table we are able to remove the hidden false "return true" from i915_gem_object_has_cache_level and make the involved code path clearer. To achieve this we replace the enum i915_cache_level with i915_cache_t, composed of a more detailed representation of each cache mode (base mode plus flags). In this way we are able to express the differences between different write-back mode coherency settings on Meteorlake, which in turn enables us to map the i915 "cached" mode to the correct Meteorlake PAT index. We can also replace the platform dependent cache mode to string code in debugfs and elsewhere by the single implementation based on i915_cache_t. v2: * Fix PAT-to-cache-mode table for PVC. (Fei) * Cache display caching mode too. (Fei) * Improve and document criteria in i915_gem_object_can_bypass_llc() (Matt) v3: * Checkpath issues. * Cache mode flags check fixed. v4: * Fix intel_device_info->cache_modes array size. (Matt) * Boolean cache mode and flags query. (Matt) * Reduce number of cache macros with some macro magic. * One more checkpatch fix. * Tweak tables to show legacy and Gen12 WB is fully coherent. 
Signed-off-by: Tvrtko Ursulin References: 9275277d5324 ("drm/i915: use pat_index instead of cache_level") Cc: Chris Wilson Cc: Fei Yang Cc: Andi Shyti Cc: Matt Roper --- drivers/gpu/drm/i915/gem/i915_gem_domain.c| 60 + drivers/gpu/drm/i915/gem/i915_gem_domain.h| 5 +- .../gpu/drm/i915/gem/i915_gem_execbuffer.c| 3 +- drivers/gpu/drm/i915/gem/i915_gem_internal.c | 2 +- drivers/gpu/drm/i915/gem/i915_gem_mman.c | 4 +- drivers/gpu/drm/i915/gem/i915_gem_object.c| 117 ++ drivers/gpu/drm/i915/gem/i915_gem_object.h| 11 +- .../gpu/drm/i915/gem/i915_gem_object_types.h | 116 + drivers/gpu/drm/i915/gem/i915_gem_shmem.c | 8 +- drivers/gpu/drm/i915/gem/i915_gem_stolen.c| 2 +- drivers/gpu/drm/i915/gem/i915_gem_ttm_move.c | 20 +-- drivers/gpu/drm/i915/gem/i915_gem_userptr.c | 2 +- .../drm/i915/gem/selftests/huge_gem_object.c | 2 +- .../gpu/drm/i915/gem/selftests/huge_pages.c | 3 +- drivers/gpu/drm/i915/gt/gen8_ppgtt.c | 10 +- drivers/gpu/drm/i915/gt/intel_engine_cs.c | 2 +- drivers/gpu/drm/i915/gt/intel_ggtt.c | 25 ++-- drivers/gpu/drm/i915/gt/intel_ggtt_gmch.c | 4 +- drivers/gpu/drm/i915/gt/intel_gtt.c | 2 +- drivers/gpu/drm/i915/gt/intel_gtt.h | 3 +- drivers/gpu/drm/i915/gt/intel_ppgtt.c | 6 +- .../gpu/drm/i915/gt/intel_ring_submission.c | 4 +- drivers/gpu/drm/i915/gt/intel_timeline.c | 2 +- drivers/gpu/drm/i915/gt/selftest_hangcheck.c | 2 +- .../gpu/drm/i915/gt/selftest_workarounds.c| 2 +- drivers/gpu/drm/i915/i915_cache.c | 89 +++-- drivers/gpu/drm/i915/i915_cache.h | 70 ++- drivers/gpu/drm/i915/i915_debugfs.c | 53 ++-- drivers/gpu/drm/i915/i915_driver.c| 4 +- drivers/gpu/drm/i915/i915_gem.c | 13 -- drivers/gpu/drm/i915/i915_pci.c | 84 +++-- drivers/gpu/drm/i915/i915_perf.c | 2 +- drivers/gpu/drm/i915/intel_device_info.h | 6 +- .../gpu/drm/i915/selftests/i915_gem_evict.c | 4 +- drivers/gpu/drm/i915/selftests/igt_spinner.c | 2 +- .../gpu/drm/i915/selftests/mock_gem_device.c | 14 +-- 36 files changed, 391 insertions(+), 367 deletions(-) diff --git 
a/drivers/gpu/drm/i915/gem/i915_gem_domain.c b/drivers/gpu/drm/i915/gem/i915_gem_domain.c index 57db9c581bf6..c15f83de33af 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_domain.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_domain.c @@ -8,6 +8,7 @@ #include "display/intel_frontbuffer.h" #include "gt/intel_gt.h" +#include "i915_cache.h" #include "i915_drv.h" #include "i915_gem_clflush.h" #include "i915_gem_domain.h" @@ -41,14 +42,17 @@ static bool gpu_write_needs_clflush(struct drm_i915_gem_object *obj) return false; /* -* For objects created by userspace through GEM_CREATE with pat_index -* set by set_pat extension, i915_gem_objec
Re: [RFC 3/8] drm/i915: Cache PAT index used by the driver
On 27/07/2023 23:44, Matt Roper wrote: On Thu, Jul 27, 2023 at 03:54:59PM +0100, Tvrtko Ursulin wrote: From: Tvrtko Ursulin Eliminate a bunch of runtime calls to i915_gem_get_pat_index() by caching the interesting PAT indices in struct drm_i915_private. They are static per platform so no need to consult a function every time. Signed-off-by: Tvrtko Ursulin Cc: Matt Roper Cc: Fei Yang --- drivers/gpu/drm/i915/Makefile | 1 + .../gpu/drm/i915/gem/i915_gem_execbuffer.c| 3 +-- drivers/gpu/drm/i915/gem/i915_gem_stolen.c| 7 ++--- drivers/gpu/drm/i915/gem/i915_gem_ttm_move.c | 26 --- .../gpu/drm/i915/gem/selftests/huge_pages.c | 2 +- drivers/gpu/drm/i915/gt/gen6_ppgtt.c | 4 +-- drivers/gpu/drm/i915/gt/gen8_ppgtt.c | 4 +-- drivers/gpu/drm/i915/gt/intel_ggtt.c | 8 ++ drivers/gpu/drm/i915/gt/intel_migrate.c | 11 +++- drivers/gpu/drm/i915/gt/selftest_migrate.c| 9 +++ drivers/gpu/drm/i915/gt/selftest_reset.c | 14 +++--- drivers/gpu/drm/i915/gt/selftest_tlb.c| 5 ++-- drivers/gpu/drm/i915/gt/uc/intel_uc_fw.c | 8 ++ drivers/gpu/drm/i915/i915_cache.c | 18 + drivers/gpu/drm/i915/i915_cache.h | 13 ++ drivers/gpu/drm/i915/i915_driver.c| 3 +++ drivers/gpu/drm/i915/i915_drv.h | 2 ++ drivers/gpu/drm/i915/i915_gem.c | 8 ++ drivers/gpu/drm/i915/i915_gpu_error.c | 8 ++ drivers/gpu/drm/i915/selftests/i915_gem.c | 5 +--- .../gpu/drm/i915/selftests/i915_gem_evict.c | 4 +-- drivers/gpu/drm/i915/selftests/i915_gem_gtt.c | 11 +++- .../drm/i915/selftests/intel_memory_region.c | 4 +-- .../gpu/drm/i915/selftests/mock_gem_device.c | 2 ++ 24 files changed, 89 insertions(+), 91 deletions(-) create mode 100644 drivers/gpu/drm/i915/i915_cache.c create mode 100644 drivers/gpu/drm/i915/i915_cache.h diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile index c5fc91cd58e7..905a51a16588 100644 --- a/drivers/gpu/drm/i915/Makefile +++ b/drivers/gpu/drm/i915/Makefile @@ -35,6 +35,7 @@ subdir-ccflags-y += -I$(srctree)/$(src) # core driver code i915-y += i915_driver.o \ i915_drm_client.o \ + 
i915_cache.o \ i915_config.o \ i915_getparam.o \ i915_ioctl.o \ diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c index 5a687a3686bd..0a1d40220020 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c @@ -1330,8 +1330,7 @@ static void *reloc_iomap(struct i915_vma *batch, ggtt->vm.insert_page(>vm, i915_gem_object_get_dma_address(obj, page), offset, -i915_gem_get_pat_index(ggtt->vm.i915, - I915_CACHE_NONE), +eb->i915->pat_uc, 0); } else { offset += page << PAGE_SHIFT; diff --git a/drivers/gpu/drm/i915/gem/i915_gem_stolen.c b/drivers/gpu/drm/i915/gem/i915_gem_stolen.c index 5b0a5cf9a98a..1c8eb806b7d3 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_stolen.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_stolen.c @@ -563,11 +563,8 @@ static void dbg_poison(struct i915_ggtt *ggtt, while (size) { void __iomem *s; - ggtt->vm.insert_page(>vm, addr, -ggtt->error_capture.start, -i915_gem_get_pat_index(ggtt->vm.i915, - I915_CACHE_NONE), -0); + ggtt->vm.insert_page(>vm, addr, ggtt->error_capture.start, +ggtt->vm.i915->pat_uc, 0); mb(); s = io_mapping_map_wc(>iomap, diff --git a/drivers/gpu/drm/i915/gem/i915_gem_ttm_move.c b/drivers/gpu/drm/i915/gem/i915_gem_ttm_move.c index 7078af2f8f79..6bd6c239f4ac 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_ttm_move.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_ttm_move.c @@ -58,6 +58,16 @@ i915_ttm_cache_level(struct drm_i915_private *i915, struct ttm_resource *res, I915_CACHE_NONE; } +static unsigned int +i915_ttm_cache_pat(struct drm_i915_private *i915, struct ttm_resource *res, + struct ttm_tt *ttm) +{ + return ((HAS_LLC(i915) || HAS_SNOOP(i915)) && + !i915_ttm_gtt_binds_lmem(res) && This matches the existing logic of i915_ttm_cache_level(), but do you know why LMEM buffers are always set to uncached? I don't understand that part. I am not sure - was thinking about that myself - li
Re: [PATCH 2/2] drm/v3d: Expose the total GPU usage stats on debugfs
On 28/07/2023 12:25, Maira Canal wrote: Hi, On 7/28/23 07:16, Tvrtko Ursulin wrote: Hi, On 27/07/2023 15:23, Maíra Canal wrote: The previous patch exposed the accumulated amount of active time per client for each V3D queue. But this doesn't provide a global notion of the GPU usage. Therefore, provide the accumulated amount of active time for each V3D queue (BIN, RENDER, CSD, TFU and CACHE_CLEAN), considering all the jobs submitted to the queue, independent of the client. This data is exposed through the debugfs interface, so that if the interface is queried at two different points of time the usage percentage of each of the queues can be calculated. Just passing observation - I've noticed a mismatch between fdinfo and debugfs in terms of ABI stability and production availability. Not sure if it matters for your intended use cases, just saying that if you plan to have an user facing tool similar to what we have in intel_gpu_top, debugfs may not be the best choice. Do you have a suggestion of a better interface that could be used to expose this data? It would be nice to have something generic, similar to fdinfo, to expose global GPU stats. This way we could expose global GPU stats on gputop, which would be great. I think there is at least two options. With i915 we use perf/PMU, drawback (or not, depends on the view point) is that it requires CAP_SYS_PERFMON. Fits well for exposing global GPU hardware counters. You could go the sysfs route, which would be ABI stable and available in production. This could either be attempted to be somewhat DRM standardized (ala fdinfo), or driver specific. Maybe someone has more ideas. 
Regards, Tvrtko Best Regards, - Maíra Regards, Tvrtko Co-developed-by: Jose Maria Casanova Crespo Signed-off-by: Jose Maria Casanova Crespo Signed-off-by: Maíra Canal --- drivers/gpu/drm/v3d/v3d_debugfs.c | 27 +++ drivers/gpu/drm/v3d/v3d_drv.h | 3 +++ drivers/gpu/drm/v3d/v3d_gem.c | 5 - drivers/gpu/drm/v3d/v3d_irq.c | 24 drivers/gpu/drm/v3d/v3d_sched.c | 13 - 5 files changed, 66 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/v3d/v3d_debugfs.c b/drivers/gpu/drm/v3d/v3d_debugfs.c index 330669f51fa7..3b7329343649 100644 --- a/drivers/gpu/drm/v3d/v3d_debugfs.c +++ b/drivers/gpu/drm/v3d/v3d_debugfs.c @@ -4,6 +4,7 @@ #include #include #include +#include #include #include @@ -236,11 +237,37 @@ static int v3d_measure_clock(struct seq_file *m, void *unused) return 0; } +static int v3d_debugfs_gpu_usage(struct seq_file *m, void *unused) +{ + struct drm_debugfs_entry *entry = m->private; + struct drm_device *dev = entry->dev; + struct v3d_dev *v3d = to_v3d_dev(dev); + enum v3d_queue queue; + u64 timestamp = local_clock(); + u64 active_runtime; + + seq_printf(m, "timestamp: %llu\n", timestamp); + + for (queue = 0; queue < V3D_MAX_QUEUES; queue++) { + if (v3d->queue[queue].start_ns) + active_runtime = timestamp - v3d->queue[queue].start_ns; + else + active_runtime = 0; + + seq_printf(m, "%s: %llu ns\n", + v3d_queue_to_string(queue), + v3d->queue[queue].enabled_ns + active_runtime); + } + + return 0; +} + static const struct drm_debugfs_info v3d_debugfs_list[] = { {"v3d_ident", v3d_v3d_debugfs_ident, 0}, {"v3d_regs", v3d_v3d_debugfs_regs, 0}, {"measure_clock", v3d_measure_clock, 0}, {"bo_stats", v3d_debugfs_bo_stats, 0}, + {"gpu_usage", v3d_debugfs_gpu_usage, 0}, }; void diff --git a/drivers/gpu/drm/v3d/v3d_drv.h b/drivers/gpu/drm/v3d/v3d_drv.h index ee5e12d0db1c..b41b32ecd991 100644 --- a/drivers/gpu/drm/v3d/v3d_drv.h +++ b/drivers/gpu/drm/v3d/v3d_drv.h @@ -38,6 +38,9 @@ struct v3d_queue_state { u64 fence_context; u64 emit_seqno; + + u64 start_ns; + u64 enabled_ns; 
}; /* Performance monitor object. The perform lifetime is controlled by userspace diff --git a/drivers/gpu/drm/v3d/v3d_gem.c b/drivers/gpu/drm/v3d/v3d_gem.c index 40ed0c7c3fad..630ea2db8f8f 100644 --- a/drivers/gpu/drm/v3d/v3d_gem.c +++ b/drivers/gpu/drm/v3d/v3d_gem.c @@ -1014,8 +1014,11 @@ v3d_gem_init(struct drm_device *dev) u32 pt_size = 4096 * 1024; int ret, i; - for (i = 0; i < V3D_MAX_QUEUES; i++) + for (i = 0; i < V3D_MAX_QUEUES; i++) { v3d->queue[i].fence_context = dma_fence_context_alloc(1); + v3d->queue[i].start_ns = 0; + v3d->queue[i].enabled_ns = 0; + } spin_lock_init(>mm_lock); spin_lock_init(>job_lock); diff --git a/drivers/gpu/drm/v3d/v3d_irq.c b/drivers/gpu/drm/v3d/v3d_irq.c index c898800ae9c2..be4ff7559309 100644 --- a/drivers/gpu/drm/v3d/v3d_irq.c +++ b/drivers/gpu/drm/v3d/v3d_irq.c @@ -102,9 +102,13 @@ v3d_irq(in
Re: [PATCH 2/2] drm/v3d: Expose the total GPU usage stats on debugfs
Hi, On 27/07/2023 15:23, Maíra Canal wrote: The previous patch exposed the accumulated amount of active time per client for each V3D queue. But this doesn't provide a global notion of the GPU usage. Therefore, provide the accumulated amount of active time for each V3D queue (BIN, RENDER, CSD, TFU and CACHE_CLEAN), considering all the jobs submitted to the queue, independent of the client. This data is exposed through the debugfs interface, so that if the interface is queried at two different points of time the usage percentage of each of the queues can be calculated. Just passing observation - I've noticed a mismatch between fdinfo and debugfs in terms of ABI stability and production availability. Not sure if it matters for your intended use cases, just saying that if you plan to have an user facing tool similar to what we have in intel_gpu_top, debugfs may not be the best choice. Regards, Tvrtko Co-developed-by: Jose Maria Casanova Crespo Signed-off-by: Jose Maria Casanova Crespo Signed-off-by: Maíra Canal --- drivers/gpu/drm/v3d/v3d_debugfs.c | 27 +++ drivers/gpu/drm/v3d/v3d_drv.h | 3 +++ drivers/gpu/drm/v3d/v3d_gem.c | 5 - drivers/gpu/drm/v3d/v3d_irq.c | 24 drivers/gpu/drm/v3d/v3d_sched.c | 13 - 5 files changed, 66 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/v3d/v3d_debugfs.c b/drivers/gpu/drm/v3d/v3d_debugfs.c index 330669f51fa7..3b7329343649 100644 --- a/drivers/gpu/drm/v3d/v3d_debugfs.c +++ b/drivers/gpu/drm/v3d/v3d_debugfs.c @@ -4,6 +4,7 @@ #include #include #include +#include #include #include @@ -236,11 +237,37 @@ static int v3d_measure_clock(struct seq_file *m, void *unused) return 0; } +static int v3d_debugfs_gpu_usage(struct seq_file *m, void *unused) +{ + struct drm_debugfs_entry *entry = m->private; + struct drm_device *dev = entry->dev; + struct v3d_dev *v3d = to_v3d_dev(dev); + enum v3d_queue queue; + u64 timestamp = local_clock(); + u64 active_runtime; + + seq_printf(m, "timestamp: %llu\n", timestamp); + + for (queue = 0; queue < 
V3D_MAX_QUEUES; queue++) { + if (v3d->queue[queue].start_ns) + active_runtime = timestamp - v3d->queue[queue].start_ns; + else + active_runtime = 0; + + seq_printf(m, "%s: %llu ns\n", + v3d_queue_to_string(queue), + v3d->queue[queue].enabled_ns + active_runtime); + } + + return 0; +} + static const struct drm_debugfs_info v3d_debugfs_list[] = { {"v3d_ident", v3d_v3d_debugfs_ident, 0}, {"v3d_regs", v3d_v3d_debugfs_regs, 0}, {"measure_clock", v3d_measure_clock, 0}, {"bo_stats", v3d_debugfs_bo_stats, 0}, + {"gpu_usage", v3d_debugfs_gpu_usage, 0}, }; void diff --git a/drivers/gpu/drm/v3d/v3d_drv.h b/drivers/gpu/drm/v3d/v3d_drv.h index ee5e12d0db1c..b41b32ecd991 100644 --- a/drivers/gpu/drm/v3d/v3d_drv.h +++ b/drivers/gpu/drm/v3d/v3d_drv.h @@ -38,6 +38,9 @@ struct v3d_queue_state { u64 fence_context; u64 emit_seqno; + + u64 start_ns; + u64 enabled_ns; }; /* Performance monitor object. The perform lifetime is controlled by userspace diff --git a/drivers/gpu/drm/v3d/v3d_gem.c b/drivers/gpu/drm/v3d/v3d_gem.c index 40ed0c7c3fad..630ea2db8f8f 100644 --- a/drivers/gpu/drm/v3d/v3d_gem.c +++ b/drivers/gpu/drm/v3d/v3d_gem.c @@ -1014,8 +1014,11 @@ v3d_gem_init(struct drm_device *dev) u32 pt_size = 4096 * 1024; int ret, i; - for (i = 0; i < V3D_MAX_QUEUES; i++) + for (i = 0; i < V3D_MAX_QUEUES; i++) { v3d->queue[i].fence_context = dma_fence_context_alloc(1); + v3d->queue[i].start_ns = 0; + v3d->queue[i].enabled_ns = 0; + } spin_lock_init(>mm_lock); spin_lock_init(>job_lock); diff --git a/drivers/gpu/drm/v3d/v3d_irq.c b/drivers/gpu/drm/v3d/v3d_irq.c index c898800ae9c2..be4ff7559309 100644 --- a/drivers/gpu/drm/v3d/v3d_irq.c +++ b/drivers/gpu/drm/v3d/v3d_irq.c @@ -102,9 +102,13 @@ v3d_irq(int irq, void *arg) struct v3d_fence *fence = to_v3d_fence(v3d->bin_job->base.irq_fence); struct v3d_file_priv *file = v3d->bin_job->base.file->driver_priv; + u64 runtime = local_clock() - file->start_ns[V3D_BIN]; - file->enabled_ns[V3D_BIN] += local_clock() - file->start_ns[V3D_BIN]; 
file->start_ns[V3D_BIN] = 0; + v3d->queue[V3D_BIN].start_ns = 0; + + file->enabled_ns[V3D_BIN] += runtime; + v3d->queue[V3D_BIN].enabled_ns += runtime; trace_v3d_bcl_irq(>drm, fence->seqno); dma_fence_signal(>base); @@ -115,9 +119,13 @@ v3d_irq(int irq, void *arg) struct v3d_fence *fence =
Re: CPU overhead for drm fdinfo stats
On 27/07/2023 21:58, Alex Deucher wrote: We have a number of customers using these stats, but the issue that keeps coming up is the CPU overhead to gather them, particularly on systems with hundreds of processes using the GPU. Has anyone given any thought to having a single interface to get this information for the entire GPU in one place? Could I have a framed told you so certificate please? :D Well at least it depends on how much CPU overhead would your users be happy to eliminate and how much to keep. So maybe no need for that certificate just yet. I was raising the issue of exponential complexity of walking "total number of processes" x "total number of file descriptors" on a system from the inception of fdinfo. So for that issue the idea was to perhaps expose a list of pids with DRM fds open somewhere, maybe sysfs. That would eliminate walking _all_ processes and trying to parse any their file descriptor. But it would still require walking all file descriptors belonging to processes with DRM fds open. If that wouldn't be enough of a saving for your users then no, I am not aware it was discussed. Assuming at least you were suggesting something like "read all fdinfo for all clients" in one blob. Also in sysfs? I think it would be doable by walking the dev->filelist and invoking drm_show_fdinfo() on them. Out of curiosity are they using the fdinfo parsing code from IGT or something of their own? Regards, Tvrtko
Re: [Intel-gfx] [RFC 2/8] drm/i915: Split PTE encode between Gen12 and Meteorlake
On 27/07/2023 23:25, Matt Roper wrote: On Thu, Jul 27, 2023 at 03:54:58PM +0100, Tvrtko Ursulin wrote: From: Tvrtko Ursulin No need to run extra instructions which will never trigger on platforms before Meteorlake. Signed-off-by: Tvrtko Ursulin --- drivers/gpu/drm/i915/gt/gen8_ppgtt.c | 26 ++ 1 file changed, 26 insertions(+) diff --git a/drivers/gpu/drm/i915/gt/gen8_ppgtt.c b/drivers/gpu/drm/i915/gt/gen8_ppgtt.c index c8568e5d1147..862ac1d2de25 100644 --- a/drivers/gpu/drm/i915/gt/gen8_ppgtt.c +++ b/drivers/gpu/drm/i915/gt/gen8_ppgtt.c @@ -63,6 +63,30 @@ static u64 gen12_pte_encode(dma_addr_t addr, { gen8_pte_t pte = addr | GEN8_PAGE_PRESENT | GEN8_PAGE_RW; + if (unlikely(flags & PTE_READ_ONLY)) + pte &= ~GEN8_PAGE_RW; + + if (flags & PTE_LM) + pte |= GEN12_PPGTT_PTE_LM; + + if (pat_index & BIT(0)) + pte |= GEN12_PPGTT_PTE_PAT0; + + if (pat_index & BIT(1)) + pte |= GEN12_PPGTT_PTE_PAT1; + + if (pat_index & BIT(2)) + pte |= GEN12_PPGTT_PTE_PAT2; + + return pte; +} + +static u64 mtl_pte_encode(dma_addr_t addr, + unsigned int pat_index, + u32 flags) +{ + gen8_pte_t pte = addr | GEN8_PAGE_PRESENT | GEN8_PAGE_RW; + Would it be more readable to start with gen8_pte_t pte = gen12_pte_encode(addr, pat_index, flags); and then |-in only the MTL-specific bit(s) as appropriate? if (unlikely(flags & PTE_READ_ONLY)) pte &= ~GEN8_PAGE_RW; @@ -995,6 +1019,8 @@ struct i915_ppgtt *gen8_ppgtt_create(struct intel_gt *gt, */ ppgtt->vm.alloc_scratch_dma = alloc_pt_dma; + if (GRAPHICS_VER_FULL(gt->i915) >= IP_VER(12, 70)) + ppgtt->vm.pte_encode = mtl_pte_encode; if (GRAPHICS_VER(gt->i915) >= 12) ppgtt->vm.pte_encode = gen12_pte_encode; I think you wanted 'else if' here. Otherwise you clobber the MTL function pointer. Doh this was a strong fail.. Yes and yes.. I even had it like you suggest in that patch I mentioned to you earlier.. https://patchwork.freedesktop.org/patch/546013/?series=120341=2. Do you have an opinion on that one perhaps? Thanks, Tvrtko
Re: [PATCH] drm/i915/gem: Add check for bitmap_zalloc()
Hi, On 28/07/2023 02:58, Jiasheng Jiang wrote: Add the check for the return value of bitmap_zalloc() in order to guarantee the success of the allocation. Fixes: e9b73c67390a ("drm/i915: Reduce memory pressure during shrinker by preallocating swizzle pages") Signed-off-by: Jiasheng Jiang --- drivers/gpu/drm/i915/gem/i915_gem_tiling.c | 5 + 1 file changed, 5 insertions(+) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_tiling.c b/drivers/gpu/drm/i915/gem/i915_gem_tiling.c index a049ca0b7980..e9cf99d95966 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_tiling.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_tiling.c @@ -311,6 +311,11 @@ i915_gem_object_set_tiling(struct drm_i915_gem_object *obj, if (!obj->bit_17) { obj->bit_17 = bitmap_zalloc(obj->base.size >> PAGE_SHIFT, GFP_KERNEL); + if (!obj->bit_17) { + i915_gem_object_unlock(obj); + i915_gem_object_release_mmap_gtt(obj); + return -ENOMEM; + } Hm the comment few lines above says: /* Try to preallocate memory required to save swizzling on put-pages */ Lets emphasis the *try* for now. Then once the obj->bit_17 is attempted to be used we have this: i915_gem_object_save_bit_17_swizzle(..) { ... if (obj->bit_17 == NULL) { obj->bit_17 = bitmap_zalloc(page_count, GFP_KERNEL); if (obj->bit_17 == NULL) { drm_err(obj->base.dev, "Failed to allocate memory for bit 17 record\n"); return; } } So despite this area of the driver being a bit before my time, I'd say it quite possibly works as designed - only *tries* to preallocate but does not have to and can cope with a later failure. Good question might be why wouldn't it be better to do what you suggest. Trade off would be between failing the ioctl and possibly crashing the application, versus visual corruption if at use time allocation fails. The whole swizzling thing also only applies to old GPUs, stuff before Broadwell, which itself was released in 2014. So it is tempting to err on the side of caution and leave it as is. 
I'll mull it over in the background, or maybe someone else will have an opinion too. Regards, Tvrtko } } else { bitmap_free(obj->bit_17);
Re: [PATCH 16/17] cgroup/drm: Expose memory stats
On 27/07/2023 12:54, Maarten Lankhorst wrote: Hey, On 2023-07-26 13:41, Tvrtko Ursulin wrote: On 26/07/2023 11:14, Maarten Lankhorst wrote: Hey, On 2023-07-22 00:21, Tejun Heo wrote: On Wed, Jul 12, 2023 at 12:46:04PM +0100, Tvrtko Ursulin wrote: $ cat drm.memory.stat card0 region=system total=12898304 shared=0 active=0 resident=12111872 purgeable=167936 card0 region=stolen-system total=0 shared=0 active=0 resident=0 purgeable=0 Data is generated on demand for simplicty of implementation ie. no running totals are kept or accounted during migrations and such. Various optimisations such as cheaper collection of data are possible but deliberately left out for now. Overall, the feature is deemed to be useful to container orchestration software (and manual management). Limits, either soft or hard, are not envisaged to be implemented on top of this approach due on demand nature of collecting the stats. So, yeah, if you want to add memory controls, we better think through how the fd ownership migration should work. I've taken a look at the series, since I have been working on cgroup memory eviction. The scheduling stuff will work for i915, since it has a purely software execlist scheduler, but I don't think it will work for GuC (firmware) scheduling or other drivers that use the generic drm scheduler. It actually works - I used to have a blurb in the cover letter about it but apparently I dropped it. Just a bit less well with many clients, since there are fewer priority levels. All that the design requires from the invididual drivers is some way to react to the "you are over budget by this much" signal. The rest is driver and backend specific. What I mean is that this signal may not be applicable since the drm scheduler just schedules jobs that run. Adding a weight might be done in hardware, since it's responsible for scheduling which context gets to run. 
The over budget signal is useless in that case, and you just need to set a scheduling priority for the hardware instead. The over budget callback lets the driver know its assigned budget and its current utilisation. Already with that data drivers could implement something smarter than what I did in my RFC. So I don't think callback is completely useless even for some smarter implementation which potentially ties into firmware scheduling. Anyway, I maintain this is implementation details. For something like this, you would probably want it to work inside the drm scheduler first. Presumably, this can be done by setting a weight on each runqueue, and perhaps adding a callback to update one for a running queue. Calculating the weights hierarchically might be fun.. It is not needed to work in drm scheduler first. In fact drm scheduler based drivers can plug into what I have since it already has the notion of scheduling priorities. They would only need to implement a hook which allow the cgroup controller to query client GPU utilisation and another to received the over budget signal. Amdgpu and msm AFAIK could be easy candidates because they both support per client utilisation and priorities. Looks like I need to put all this info back into the cover letter. Also, hierarchic weights and time budgets are all already there. What could be done later is make this all smarter and respect the time budget with more precision. That would however, in many cases including Intel, require co-operation with the firmware. In any case it is only work in the implementation, while the cgroup control interface remains the same. I have taken a look at how the rest of cgroup controllers change ownership when moved to a different cgroup, and the answer was: not at all. If we attempt to create the scheduler controls only on the first time the fd is used, you could probably get rid of all the tracking. 
Can you send a CPU file descriptor from process A to process B and have CPU usage belonging to process B show up in process' A cgroup, or vice-versa? Nope, I am not making any sense, am I? My point being it is not like-to-like, model is different. No ownership transfer would mean in wide deployments all GPU utilisation would be assigned to Xorg and so there is no point to any of this. No way to throttle a cgroup with un-important GPU clients for instance. If you just grab the current process' cgroup when a drm_sched_entity is created, you don't have everything charged to X.org. No need for complicated ownership tracking in drm_file. The same equivalent should be done in i915 as well when a context is created as it's not using the drm scheduler. Okay so essentially nuking the concept of DRM clients belongs to one cgroup and instead tracking at the context level. That is an interesting idea. I suspect implementation could require somewhat generalizing the concept of an "execution context", or at least expressing it via the DRM cgroup controller. I
Re: [PATCH 16/17] cgroup/drm: Expose memory stats
On 27/07/2023 14:42, Maarten Lankhorst wrote: On 2023-07-26 21:44, Tejun Heo wrote: Hello, On Wed, Jul 26, 2023 at 12:14:24PM +0200, Maarten Lankhorst wrote: So, yeah, if you want to add memory controls, we better think through how the fd ownership migration should work. I've taken a look at the series, since I have been working on cgroup memory eviction. The scheduling stuff will work for i915, since it has a purely software execlist scheduler, but I don't think it will work for GuC (firmware) scheduling or other drivers that use the generic drm scheduler. For something like this, you would probably want it to work inside the drm scheduler first. Presumably, this can be done by setting a weight on each runqueue, and perhaps adding a callback to update one for a running queue. Calculating the weights hierarchically might be fun.. I don't have any idea on this front. The basic idea of making high level distribution decisions in core code and letting individual drivers enforce that in a way which fits them the best makes sense to me but I don't know enough to have an opinion here. I have taken a look at how the rest of cgroup controllers change ownership when moved to a different cgroup, and the answer was: not at all. If we For persistent resources, that's the general rule. Whoever instantiates a resource gets to own it until the resource gets freed. There is an exception with the pid controller and there are discussions around whether we want some sort of migration behavior with memcg but yes by and large instantiator being the owner is the general model cgroup follows. attempt to create the scheduler controls only on the first time the fd is used, you could probably get rid of all the tracking. This can be done very easily with the drm scheduler. WRT memory, I think the consensus is to track system memory like normal memory. Stolen memory doesn't need to be tracked. It's kernel only memory, used for internal bookkeeping only. 
The only time userspace can directly manipulate stolen memory, is by mapping the pinned initial framebuffer to its own address space. The only allocation it can do is when a framebuffer is displayed, and framebuffer compression creates some stolen memory. Userspace is not aware of this though, and has no way to manipulate those contents. So, my dumb understanding: * Ownership of an fd can be established on the first ioctl call and doesn't need to be migrated afterwards. There are no persistent resources to migration on the first call. Yes, keyword is "can". Trouble is migration may or may not happen. One may choose "Plasma X.org" session type in your login manager and all DRM fds would be under Xorg if not migrated. Or one may choose "Plasma Wayland" and migration wouldn't matter. But former is I think has a huge deployed base so that not supporting implicit migration would be a significant asterisk next to the controller. * Memory then can be tracked in a similar way to memcg. Memory gets charged to the initial instantiator and doesn't need to be moved around afterwards. There may be some discrepancies around stolen memory but the magnitude of inaccuracy introduced that way is limited and bound and can be safely ignored. Is that correct? Hey, Yeah mostly, I think we can stop tracking stolen memory. I stopped doing that for Xe, there is literally nothing to control for userspace in there. Right, but for reporting stolen is a red-herring. In this RFC I simply report on all memory regions known by the driver. As I said in the other reply, imagine the keys are 'system' and 'vram0'. Point was just to illustrate multiplicity of regions. Regards, Tvrtko
[RFC 8/8] drm/i915: Refine the caching check in i915_gem_object_can_bypass_llc
From: Tvrtko Ursulin Now that i915 understands the caching modes behind PAT indices, we can refine the check in i915_gem_object_can_bypass_llc() to stop assuming any user PAT can bypass the shared cache (if there is any). Instead we can use the absence of I915_BO_CACHE_COHERENT_FOR_WRITE as the criteria, which is set for all caching modes where writes from the CPU side (in this case buffer clears before handing buffers over to userspace) are fully coherent with respect to reads from the GPU. Signed-off-by: Tvrtko Ursulin Cc: Fei Yang Cc: Matt Roper --- drivers/gpu/drm/i915/gem/i915_gem_object.c | 18 +++--- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_object.c b/drivers/gpu/drm/i915/gem/i915_gem_object.c index ec1f0be43d0d..8c4b54bd3911 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_object.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_object.c @@ -221,12 +221,6 @@ bool i915_gem_object_can_bypass_llc(struct drm_i915_gem_object *obj) if (!(obj->flags & I915_BO_ALLOC_USER)) return false; - /* -* Always flush cache for UMD objects at creation time. -*/ - if (obj->pat_set_by_user) - return true; - /* * EHL and JSL add the 'Bypass LLC' MOCS entry, which should make it * possible for userspace to bypass the GTT caching bits set by the @@ -239,7 +233,17 @@ bool i915_gem_object_can_bypass_llc(struct drm_i915_gem_object *obj) * it, but since i915 takes the stance of always zeroing memory before * handing it to userspace, we need to prevent this. */ - return IS_JSL_EHL(i915); + if (IS_JSL_EHL(i915)) + return true; + + /* +* Any caching mode where writes via CPU cache are not coherent with +* the GPU needs explicit flushing to ensure GPU can not see stale data. +*/ + if (!(obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_WRITE)) + return true; + + return false; } static void i915_gem_close_object(struct drm_gem_object *gem, struct drm_file *file) -- 2.39.2
[RFC 7/8] drm/i915: Lift the user PAT restriction from use_cpu_reloc
From: Tvrtko Ursulin Now that i915 understands the caching modes behind PAT indices, we can refine the check in use_cpu_reloc() to not reject the uncached PAT if it was set by userspace. Instead it can decide based on the presence of full coherency which should be functionally equivalent on legacy platforms. We can ignore WT since it is only used by the display, and we can ignore Meteorlake since it will fail on the existing "has_llc" condition before the object cache mode check. Signed-off-by: Tvrtko Ursulin Cc: Fei Yang Cc: Matt Roper --- drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c | 9 + 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c index 9d6e49c8a4c6..f74b33670bad 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c @@ -640,16 +640,9 @@ static inline int use_cpu_reloc(const struct reloc_cache *cache, if (DBG_FORCE_RELOC == FORCE_GTT_RELOC) return false; - /* -* For objects created by userspace through GEM_CREATE with pat_index -* set by set_pat extension, i915_gem_object_has_cache_level() always -* return true, otherwise the call would fall back to checking whether -* the object is un-cached. -*/ return (cache->has_llc || obj->cache_dirty || - !(obj->pat_set_by_user || - i915_gem_object_has_cache_mode(obj, I915_CACHE_MODE_UC))); + i915_gem_object_has_cache_flag(obj, I915_CACHE_FLAG_COH2W)); } static int eb_reserve_vma(struct i915_execbuffer *eb, -- 2.39.2
[RFC 6/8] drm/i915: Lift the user PAT restriction from gpu_write_needs_clflush
From: Tvrtko Ursulin Now that i915 understands the caching modes behind PAT indices, and having also special cased the Meteorlake snooping fully coherent mode, we can remove the user PAT check from gpu_write_needs_clflush(). Signed-off-by: Tvrtko Ursulin Cc: Fei Yang Cc: Matt Roper --- drivers/gpu/drm/i915/gem/i915_gem_domain.c | 6 -- 1 file changed, 6 deletions(-) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_domain.c b/drivers/gpu/drm/i915/gem/i915_gem_domain.c index c15f83de33af..bf3a2fa0e539 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_domain.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_domain.c @@ -41,12 +41,6 @@ static bool gpu_write_needs_clflush(struct drm_i915_gem_object *obj) if (IS_METEORLAKE(i915)) return false; - /* -* Always flush cache for UMD objects with PAT index set. -*/ - if (obj->pat_set_by_user) - return true; - /* * Fully coherent cached access may end up with data in the CPU cache * which hasn't hit memory yet. -- 2.39.2
[RFC 5/8] drm/i915: Improve the vm_fault_gtt user PAT index restriction
From: Tvrtko Ursulin Now that i915 understands the caching modes behind PAT indices, we can refine the check in vm_fault_gtt() to not reject the uncached PAT if it was set by userspace on a snoopable platform. Signed-off-by: Tvrtko Ursulin Cc: Fei Yang Cc: Matt Roper --- drivers/gpu/drm/i915/gem/i915_gem_mman.c | 14 +++--- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_mman.c b/drivers/gpu/drm/i915/gem/i915_gem_mman.c index cd7f8ded0d6f..9aa6ecf68432 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_mman.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_mman.c @@ -382,17 +382,9 @@ static vm_fault_t vm_fault_gtt(struct vm_fault *vmf) goto err_reset; } - /* -* For objects created by userspace through GEM_CREATE with pat_index -* set by set_pat extension, coherency is managed by userspace, make -* sure we don't fail handling the vm fault by calling -* i915_gem_object_has_cache_level() which always return true for such -* objects. Otherwise this helper function would fall back to checking -* whether the object is un-cached. -*/ - if (!((obj->pat_set_by_user || - i915_gem_object_has_cache_mode(obj, I915_CACHE_MODE_UC)) || - HAS_LLC(i915))) { + /* Access to snoopable pages through the GTT is incoherent. */ + if (!i915_gem_object_has_cache_mode(obj, I915_CACHE_MODE_UC) && + !HAS_LLC(i915)) { ret = -EFAULT; goto err_unpin; } -- 2.39.2
[RFC 4/8] drm/i915: Refactor PAT/object cache handling
From: Tvrtko Ursulin Commit 9275277d5324 ("drm/i915: use pat_index instead of cache_level") has introduced PAT indices to i915 internal APIs, partially replacing the usage of driver internal cache_level, but has also added a few sub- optimal design decisions which this patch tries to improve upon. Principal change here is to invert the per platform cache level to PAT index table which was added by the referenced commit, and by doing so enable i915 to understand the cache mode between PAT indices, changing them from opaque to transparent. Once we have the inverted table we are able to remove the hidden false "return true" from i915_gem_object_has_cache_level and make the involved code path clearer. To achieve this we replace the enum i915_cache_level with i915_cache_t, composed of a more detailed representation of each cache mode (base mode plus flags). In this way we are able to express the differences between different write-back mode coherency settings on Meteorlake, which in turn enables us to map the i915 "cached" mode to the correct Meteorlake PAT index. We can also replace the platform dependent cache mode to string code in debugfs and elsewhere by the single implementation based on i915_cache_t. v2: * Fix PAT-to-cache-mode table for PVC. (Fei) * Cache display caching mode too. (Fei) * Improve and document criteria in i915_gem_object_can_bypass_llc() (Matt) v3: * Checkpath issues. * Cache mode flags check fixed. v4: * Fix intel_device_info->cache_modes array size. (Matt) * Boolean cache mode and flags query. (Matt) * Reduce number of cache macros with some macro magic. * One more checkpatch fix. * Tweak tables to show legacy and Gen12 WB is fully coherent. 
Signed-off-by: Tvrtko Ursulin References: 9275277d5324 ("drm/i915: use pat_index instead of cache_level") Cc: Chris Wilson Cc: Fei Yang Cc: Andi Shyti Cc: Matt Roper --- drivers/gpu/drm/i915/gem/i915_gem_domain.c| 60 + drivers/gpu/drm/i915/gem/i915_gem_domain.h| 5 +- .../gpu/drm/i915/gem/i915_gem_execbuffer.c| 3 +- drivers/gpu/drm/i915/gem/i915_gem_internal.c | 2 +- drivers/gpu/drm/i915/gem/i915_gem_mman.c | 4 +- drivers/gpu/drm/i915/gem/i915_gem_object.c| 117 ++ drivers/gpu/drm/i915/gem/i915_gem_object.h| 11 +- .../gpu/drm/i915/gem/i915_gem_object_types.h | 116 + drivers/gpu/drm/i915/gem/i915_gem_shmem.c | 8 +- drivers/gpu/drm/i915/gem/i915_gem_stolen.c| 2 +- drivers/gpu/drm/i915/gem/i915_gem_ttm_move.c | 20 +-- drivers/gpu/drm/i915/gem/i915_gem_userptr.c | 2 +- .../drm/i915/gem/selftests/huge_gem_object.c | 2 +- .../gpu/drm/i915/gem/selftests/huge_pages.c | 3 +- drivers/gpu/drm/i915/gt/gen8_ppgtt.c | 10 +- drivers/gpu/drm/i915/gt/intel_engine_cs.c | 2 +- drivers/gpu/drm/i915/gt/intel_ggtt.c | 25 ++-- drivers/gpu/drm/i915/gt/intel_ggtt_gmch.c | 4 +- drivers/gpu/drm/i915/gt/intel_gtt.c | 2 +- drivers/gpu/drm/i915/gt/intel_gtt.h | 3 +- drivers/gpu/drm/i915/gt/intel_ppgtt.c | 6 +- .../gpu/drm/i915/gt/intel_ring_submission.c | 4 +- drivers/gpu/drm/i915/gt/intel_timeline.c | 2 +- drivers/gpu/drm/i915/gt/selftest_hangcheck.c | 2 +- .../gpu/drm/i915/gt/selftest_workarounds.c| 2 +- drivers/gpu/drm/i915/i915_cache.c | 89 +++-- drivers/gpu/drm/i915/i915_cache.h | 70 ++- drivers/gpu/drm/i915/i915_debugfs.c | 53 ++-- drivers/gpu/drm/i915/i915_driver.c| 4 +- drivers/gpu/drm/i915/i915_gem.c | 13 -- drivers/gpu/drm/i915/i915_pci.c | 84 +++-- drivers/gpu/drm/i915/i915_perf.c | 2 +- drivers/gpu/drm/i915/intel_device_info.h | 6 +- .../gpu/drm/i915/selftests/i915_gem_evict.c | 4 +- drivers/gpu/drm/i915/selftests/igt_spinner.c | 2 +- .../gpu/drm/i915/selftests/mock_gem_device.c | 14 +-- 36 files changed, 391 insertions(+), 367 deletions(-) diff --git 
a/drivers/gpu/drm/i915/gem/i915_gem_domain.c b/drivers/gpu/drm/i915/gem/i915_gem_domain.c index 57db9c581bf6..c15f83de33af 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_domain.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_domain.c @@ -8,6 +8,7 @@ #include "display/intel_frontbuffer.h" #include "gt/intel_gt.h" +#include "i915_cache.h" #include "i915_drv.h" #include "i915_gem_clflush.h" #include "i915_gem_domain.h" @@ -41,14 +42,17 @@ static bool gpu_write_needs_clflush(struct drm_i915_gem_object *obj) return false; /* -* For objects created by userspace through GEM_CREATE with pat_index -* set by set_pat extension, i915_gem_object_has_cache_level() will -* always return true, because the coherency of such object is managed -* by userspace. Othereise the call here would fal
[RFC 1/8] drm/i915: Skip clflush after GPU writes on Meteorlake
From: Tvrtko Ursulin On Meteorlake CPU cache will not contain stale data after GPU access since write-invalidate protocol is used, which means there is no need to flush before potentially transitioning the buffer to a non-coherent domain. Use the opportunity to document the situation on discrete too. Signed-off-by: Tvrtko Ursulin Cc: Matt Roper Cc: Fei Yang Cc: Matthew Auld Cc: Thomas Hellström --- drivers/gpu/drm/i915/gem/i915_gem_domain.c | 13 + 1 file changed, 13 insertions(+) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_domain.c b/drivers/gpu/drm/i915/gem/i915_gem_domain.c index ffddec1d2a76..57db9c581bf6 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_domain.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_domain.c @@ -24,9 +24,22 @@ static bool gpu_write_needs_clflush(struct drm_i915_gem_object *obj) { struct drm_i915_private *i915 = to_i915(obj->base.dev); + /* +* Discrete GPUs never dirty the CPU cache. +*/ if (IS_DGFX(i915)) return false; + /* +* Cache snooping on Meteorlake is using write-invalidate so GPU writes +* never end up in the CPU cache. +* +* QQQ: Do other snooping platforms behave identically and could we +* therefore write this as "if !HAS_LLC(i915) && HAS_SNOOP(i915)"? +*/ + if (IS_METEORLAKE(i915)) + return false; + /* * For objects created by userspace through GEM_CREATE with pat_index * set by set_pat extension, i915_gem_object_has_cache_level() will -- 2.39.2
[RFC 2/8] drm/i915: Split PTE encode between Gen12 and Meteorlake
From: Tvrtko Ursulin No need to run extra instructions which will never trigger on platforms before Meteorlake. Signed-off-by: Tvrtko Ursulin --- drivers/gpu/drm/i915/gt/gen8_ppgtt.c | 26 ++ 1 file changed, 26 insertions(+) diff --git a/drivers/gpu/drm/i915/gt/gen8_ppgtt.c b/drivers/gpu/drm/i915/gt/gen8_ppgtt.c index c8568e5d1147..862ac1d2de25 100644 --- a/drivers/gpu/drm/i915/gt/gen8_ppgtt.c +++ b/drivers/gpu/drm/i915/gt/gen8_ppgtt.c @@ -63,6 +63,30 @@ static u64 gen12_pte_encode(dma_addr_t addr, { gen8_pte_t pte = addr | GEN8_PAGE_PRESENT | GEN8_PAGE_RW; + if (unlikely(flags & PTE_READ_ONLY)) + pte &= ~GEN8_PAGE_RW; + + if (flags & PTE_LM) + pte |= GEN12_PPGTT_PTE_LM; + + if (pat_index & BIT(0)) + pte |= GEN12_PPGTT_PTE_PAT0; + + if (pat_index & BIT(1)) + pte |= GEN12_PPGTT_PTE_PAT1; + + if (pat_index & BIT(2)) + pte |= GEN12_PPGTT_PTE_PAT2; + + return pte; +} + +static u64 mtl_pte_encode(dma_addr_t addr, + unsigned int pat_index, + u32 flags) +{ + gen8_pte_t pte = addr | GEN8_PAGE_PRESENT | GEN8_PAGE_RW; + if (unlikely(flags & PTE_READ_ONLY)) pte &= ~GEN8_PAGE_RW; @@ -995,6 +1019,8 @@ struct i915_ppgtt *gen8_ppgtt_create(struct intel_gt *gt, */ ppgtt->vm.alloc_scratch_dma = alloc_pt_dma; + if (GRAPHICS_VER_FULL(gt->i915) >= IP_VER(12, 70)) + ppgtt->vm.pte_encode = mtl_pte_encode; if (GRAPHICS_VER(gt->i915) >= 12) ppgtt->vm.pte_encode = gen12_pte_encode; else -- 2.39.2
[RFC 3/8] drm/i915: Cache PAT index used by the driver
From: Tvrtko Ursulin Eliminate a bunch of runtime calls to i915_gem_get_pat_index() by caching the interesting PAT indices in struct drm_i915_private. They are static per platfrom so no need to consult a function every time. Signed-off-by: Tvrtko Ursulin Cc: Matt Roper Cc: Fei Yang --- drivers/gpu/drm/i915/Makefile | 1 + .../gpu/drm/i915/gem/i915_gem_execbuffer.c| 3 +-- drivers/gpu/drm/i915/gem/i915_gem_stolen.c| 7 ++--- drivers/gpu/drm/i915/gem/i915_gem_ttm_move.c | 26 --- .../gpu/drm/i915/gem/selftests/huge_pages.c | 2 +- drivers/gpu/drm/i915/gt/gen6_ppgtt.c | 4 +-- drivers/gpu/drm/i915/gt/gen8_ppgtt.c | 4 +-- drivers/gpu/drm/i915/gt/intel_ggtt.c | 8 ++ drivers/gpu/drm/i915/gt/intel_migrate.c | 11 +++- drivers/gpu/drm/i915/gt/selftest_migrate.c| 9 +++ drivers/gpu/drm/i915/gt/selftest_reset.c | 14 +++--- drivers/gpu/drm/i915/gt/selftest_tlb.c| 5 ++-- drivers/gpu/drm/i915/gt/uc/intel_uc_fw.c | 8 ++ drivers/gpu/drm/i915/i915_cache.c | 18 + drivers/gpu/drm/i915/i915_cache.h | 13 ++ drivers/gpu/drm/i915/i915_driver.c| 3 +++ drivers/gpu/drm/i915/i915_drv.h | 2 ++ drivers/gpu/drm/i915/i915_gem.c | 8 ++ drivers/gpu/drm/i915/i915_gpu_error.c | 8 ++ drivers/gpu/drm/i915/selftests/i915_gem.c | 5 +--- .../gpu/drm/i915/selftests/i915_gem_evict.c | 4 +-- drivers/gpu/drm/i915/selftests/i915_gem_gtt.c | 11 +++- .../drm/i915/selftests/intel_memory_region.c | 4 +-- .../gpu/drm/i915/selftests/mock_gem_device.c | 2 ++ 24 files changed, 89 insertions(+), 91 deletions(-) create mode 100644 drivers/gpu/drm/i915/i915_cache.c create mode 100644 drivers/gpu/drm/i915/i915_cache.h diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile index c5fc91cd58e7..905a51a16588 100644 --- a/drivers/gpu/drm/i915/Makefile +++ b/drivers/gpu/drm/i915/Makefile @@ -35,6 +35,7 @@ subdir-ccflags-y += -I$(srctree)/$(src) # core driver code i915-y += i915_driver.o \ i915_drm_client.o \ + i915_cache.o \ i915_config.o \ i915_getparam.o \ i915_ioctl.o \ diff --git 
a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c index 5a687a3686bd..0a1d40220020 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c @@ -1330,8 +1330,7 @@ static void *reloc_iomap(struct i915_vma *batch, ggtt->vm.insert_page(>vm, i915_gem_object_get_dma_address(obj, page), offset, -i915_gem_get_pat_index(ggtt->vm.i915, - I915_CACHE_NONE), +eb->i915->pat_uc, 0); } else { offset += page << PAGE_SHIFT; diff --git a/drivers/gpu/drm/i915/gem/i915_gem_stolen.c b/drivers/gpu/drm/i915/gem/i915_gem_stolen.c index 5b0a5cf9a98a..1c8eb806b7d3 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_stolen.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_stolen.c @@ -563,11 +563,8 @@ static void dbg_poison(struct i915_ggtt *ggtt, while (size) { void __iomem *s; - ggtt->vm.insert_page(>vm, addr, -ggtt->error_capture.start, -i915_gem_get_pat_index(ggtt->vm.i915, - I915_CACHE_NONE), -0); + ggtt->vm.insert_page(>vm, addr, ggtt->error_capture.start, +ggtt->vm.i915->pat_uc, 0); mb(); s = io_mapping_map_wc(>iomap, diff --git a/drivers/gpu/drm/i915/gem/i915_gem_ttm_move.c b/drivers/gpu/drm/i915/gem/i915_gem_ttm_move.c index 7078af2f8f79..6bd6c239f4ac 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_ttm_move.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_ttm_move.c @@ -58,6 +58,16 @@ i915_ttm_cache_level(struct drm_i915_private *i915, struct ttm_resource *res, I915_CACHE_NONE; } +static unsigned int +i915_ttm_cache_pat(struct drm_i915_private *i915, struct ttm_resource *res, + struct ttm_tt *ttm) +{ + return ((HAS_LLC(i915) || HAS_SNOOP(i915)) && + !i915_ttm_gtt_binds_lmem(res) && + ttm->caching == ttm_cached) ? i915->pat_wb : + i915->pat_uc; +} + static struct intel_memory_region * i915_ttm_region(struct ttm_device *bdev, int ttm_mem_type) { @@ -196,7 +206,7 @@ static struct dma_fence *i915_ttm_accel_move(struct ttm_buffer_object *bo, struct drm
[RFC 0/8] Another take on PAT/object cache mode refactoring
From: Tvrtko Ursulin Good news is that I realized series can be split after all. Bad news is that it is still a lot to go through. drm/i915: Skip clflush after GPU writes on Meteorlake This is based on what Fei found out from hardware architects. If we agree on the function this helper should achieve, the follow-up is checking if other snoopable platforms are the same. drm/i915: Split PTE encode between Gen12 and Meteorlake Not that much related but I feel we don't need to run impossible code on platforms before Meteorlake. Shouldn't be controversial. drm/i915: Cache PAT index used by the driver This one shouldn't be controversial either. Just eliminates a pile of calls to i915_gem_get_pat_index(). drm/i915: Refactor PAT/object cache handling This is most code and the "table reversal" logic which makes i915 understand caching modes behind PAT indices. Review for taste and general "does it make sense" is needed here. Oh and extra care about boolean logic conversion as I was pulling out obj->user_pat_set from inside i915_gem_object_has_cache_level to the call sites. All magic "if user PAT is set assume the worst" are still left in with this patch. drm/i915: Improve the vm_fault_gtt user PAT index restriction drm/i915: Lift the user PAT restriction from gpu_write_needs_clflush drm/i915: Lift the user PAT restriction from use_cpu_reloc drm/i915: Refine the caching check in i915_gem_object_can_bypass_llc This bunch is what removes the "user PAT set special casing". Each of them probably has different reasons why the original cache level check was in them so as many extra pairs of eyes as possible are needed to verify both that I have correctly understood the underlying reasons why each was there, and that I haven't fumbled the logic on the rudimentary level. Or perhaps that it is possible to simplify this further. By maybe using more of I915_BO_CACHE_COHERENT_FOR_... flags, or something. 
Overall, a lot of scrutiny is needed for most of the series since it is complicated and I am juggling multiple things. Cc: Fei Yang Cc: Matt Roper Tvrtko Ursulin (8): drm/i915: Skip clflush after GPU writes on Meteorlake drm/i915: Split PTE encode between Gen12 and Meteorlake drm/i915: Cache PAT index used by the driver drm/i915: Refactor PAT/object cache handling drm/i915: Improve the vm_fault_gtt user PAT index restriction drm/i915: Lift the user PAT restriction from gpu_write_needs_clflush drm/i915: Lift the user PAT restriction from use_cpu_reloc drm/i915: Refine the caching check in i915_gem_object_can_bypass_llc drivers/gpu/drm/i915/Makefile | 1 + drivers/gpu/drm/i915/gem/i915_gem_domain.c| 67 ++--- drivers/gpu/drm/i915/gem/i915_gem_domain.h| 5 +- .../gpu/drm/i915/gem/i915_gem_execbuffer.c| 11 +- drivers/gpu/drm/i915/gem/i915_gem_internal.c | 2 +- drivers/gpu/drm/i915/gem/i915_gem_mman.c | 12 +- drivers/gpu/drm/i915/gem/i915_gem_object.c| 135 ++ drivers/gpu/drm/i915/gem/i915_gem_object.h| 11 +- .../gpu/drm/i915/gem/i915_gem_object_types.h | 116 +-- drivers/gpu/drm/i915/gem/i915_gem_shmem.c | 8 +- drivers/gpu/drm/i915/gem/i915_gem_stolen.c| 9 +- drivers/gpu/drm/i915/gem/i915_gem_ttm_move.c | 46 +++--- drivers/gpu/drm/i915/gem/i915_gem_userptr.c | 2 +- .../drm/i915/gem/selftests/huge_gem_object.c | 2 +- .../gpu/drm/i915/gem/selftests/huge_pages.c | 5 +- drivers/gpu/drm/i915/gt/gen6_ppgtt.c | 4 +- drivers/gpu/drm/i915/gt/gen8_ppgtt.c | 40 -- drivers/gpu/drm/i915/gt/intel_engine_cs.c | 2 +- drivers/gpu/drm/i915/gt/intel_ggtt.c | 33 ++--- drivers/gpu/drm/i915/gt/intel_ggtt_gmch.c | 4 +- drivers/gpu/drm/i915/gt/intel_gtt.c | 2 +- drivers/gpu/drm/i915/gt/intel_gtt.h | 3 +- drivers/gpu/drm/i915/gt/intel_migrate.c | 11 +- drivers/gpu/drm/i915/gt/intel_ppgtt.c | 6 +- .../gpu/drm/i915/gt/intel_ring_submission.c | 4 +- drivers/gpu/drm/i915/gt/intel_timeline.c | 2 +- drivers/gpu/drm/i915/gt/selftest_hangcheck.c | 2 +- drivers/gpu/drm/i915/gt/selftest_migrate.c| 9 +- 
drivers/gpu/drm/i915/gt/selftest_reset.c | 14 +- drivers/gpu/drm/i915/gt/selftest_tlb.c| 5 +- .../gpu/drm/i915/gt/selftest_workarounds.c| 2 +- drivers/gpu/drm/i915/gt/uc/intel_uc_fw.c | 8 +- drivers/gpu/drm/i915/i915_cache.c | 93 drivers/gpu/drm/i915/i915_cache.h | 81 +++ drivers/gpu/drm/i915/i915_debugfs.c | 53 +-- drivers/gpu/drm/i915/i915_driver.c| 5 + drivers/gpu/drm/i915/i915_drv.h | 2 + drivers/gpu/drm/i915/i915_gem.c | 21 +-- drivers/gpu/drm/i915/i915_gpu_error.c | 8 +- drivers/gpu/drm/i915/i915_pci.c | 84 ++- drivers/gpu/drm/i915/i915_perf.c
[PATCH 5/5] drm/i915: Implement fdinfo memory stats printing
From: Tvrtko Ursulin Use the newly added drm_print_memory_stats helper to show memory utilisation of our objects in drm/driver specific fdinfo output. To collect the stats we walk the per memory regions object lists and accumulate object size into the respective drm_memory_stats categories. Objects with multiple possible placements are reported in multiple regions for total and shared sizes, while other categories are counted only for the currently active region. Signed-off-by: Tvrtko Ursulin Cc: Aravind Iddamsetty Cc: Rob Clark --- drivers/gpu/drm/i915/i915_drm_client.c | 85 ++ 1 file changed, 85 insertions(+) diff --git a/drivers/gpu/drm/i915/i915_drm_client.c b/drivers/gpu/drm/i915/i915_drm_client.c index a61356012df8..9e7a6075ee25 100644 --- a/drivers/gpu/drm/i915/i915_drm_client.c +++ b/drivers/gpu/drm/i915/i915_drm_client.c @@ -45,6 +45,89 @@ void __i915_drm_client_free(struct kref *kref) } #ifdef CONFIG_PROC_FS +static void +obj_meminfo(struct drm_i915_gem_object *obj, + struct drm_memory_stats stats[INTEL_REGION_UNKNOWN]) +{ + struct intel_memory_region *mr; + u64 sz = obj->base.size; + enum intel_region_id id; + unsigned int i; + + /* Attribute size and shared to all possible memory regions. */ + for (i = 0; i < obj->mm.n_placements; i++) { + mr = obj->mm.placements[i]; + id = mr->id; + + if (obj->base.handle_count > 1) + stats[id].shared += sz; + else + stats[id].private += sz; + } + + /* Attribute other categories to only the current region. 
*/ + mr = obj->mm.region; + if (mr) + id = mr->id; + else + id = INTEL_REGION_SMEM; + + if (!obj->mm.n_placements) { + if (obj->base.handle_count > 1) + stats[id].shared += sz; + else + stats[id].private += sz; + } + + if (i915_gem_object_has_pages(obj)) { + stats[id].resident += sz; + + if (!dma_resv_test_signaled(obj->base.resv, + dma_resv_usage_rw(true))) + stats[id].active += sz; + else if (i915_gem_object_is_shrinkable(obj) && +obj->mm.madv == I915_MADV_DONTNEED) + stats[id].purgeable += sz; + } +} + +static void show_meminfo(struct drm_printer *p, struct drm_file *file) +{ + struct drm_memory_stats stats[INTEL_REGION_UNKNOWN] = {}; + struct drm_i915_file_private *fpriv = file->driver_priv; + struct i915_drm_client *client = fpriv->client; + struct drm_i915_private *i915 = fpriv->i915; + struct drm_i915_gem_object *obj; + struct intel_memory_region *mr; + struct list_head *pos; + unsigned int id; + + /* Public objects. */ + spin_lock(>table_lock); + idr_for_each_entry(>object_idr, obj, id) + obj_meminfo(obj, stats); + spin_unlock(>table_lock); + + /* Internal objects. */ + rcu_read_lock(); + list_for_each_rcu(pos, >objects_list) { + obj = i915_gem_object_get_rcu(list_entry(pos, typeof(*obj), +client_link)); + if (!obj) + continue; + obj_meminfo(obj, stats); + i915_gem_object_put(obj); + } + rcu_read_unlock(); + + for_each_memory_region(mr, i915, id) + drm_print_memory_stats(p, + [id], + DRM_GEM_OBJECT_RESIDENT | + DRM_GEM_OBJECT_PURGEABLE, + mr->name); +} + static const char * const uabi_class_names[] = { [I915_ENGINE_CLASS_RENDER] = "render", [I915_ENGINE_CLASS_COPY] = "copy", @@ -106,6 +189,8 @@ void i915_drm_client_fdinfo(struct drm_printer *p, struct drm_file *file) * ** */ + show_meminfo(p, file); + if (GRAPHICS_VER(i915) < 8) return; -- 2.39.2
[PATCH v6 0/5] fdinfo memory stats
From: Tvrtko Ursulin A short series to enable fdinfo memory stats for i915. I added tracking of most classes of objects (user objects, page tables, context state, ring buffers) which contribute to client's memory footprint and am accouting their memory use along the similar lines as in Rob's msm code, just that with i915 specific code we can show a memory region breakdown and so support discrete and multi-tile GPUs properly. And also reflect that our objects can have multiple allowed backing stores. The existing helper Rob added is then used to dump the per memory region stats to fdinfo. The basic objects-per-client infrastructure can later be extended to cover all objects and so avoid needing to walk the IDR under the client's file table lock, which would further avoid distburbing the running clients by parallel fdinfo readers. Example fdinfo format: # cat /proc/1383/fdinfo/8 pos:0 flags: 0212 mnt_id: 21 ino:397 drm-driver: i915 drm-client-id: 18 drm-pdev: :00:02.0 drm-total-system: 125 MiB drm-shared-system: 16 MiB drm-active-system: 110 MiB drm-resident-system:125 MiB drm-purgeable-system: 2 MiB drm-total-stolen-system:0 drm-shared-stolen-system: 0 drm-active-stolen-system: 0 drm-resident-stolen-system: 0 drm-purgeable-stolen-system:0 drm-engine-render: 25662044495 ns drm-engine-copy:0 ns drm-engine-video: 0 ns drm-engine-video-enhance: 0 ns Example gputop output: DRM minor 0 PID SMEM SMEMRSS render copy videoNAME 1233 124M 124M |||||||| neverball 1130 59M 59M |█▌ ||||||| Xorg 1207 12M 12M |||||||| xfwm4 Or with Wayland: DRM minor 0 PID MEM RSSrendercopy videovideo-enhance NAME 2093 191M 191M |▊ || || || | gnome-shell DRM minor 128 PID MEM RSSrendercopy videovideo-enhance NAME 2551 71M 71M |██▉|| || || | neverball 2553 50M 50M | || || || | Xwayland v2: * Now actually per client. v3: * Track imported dma-buf objects. v4: * Rely on DRM GEM handles for tracking user objects. * Fix internal object accounting (no placements). 
v5: * Fixed brain fart of overwriting the loop cursor. * Fixed object destruction racing with fdinfo reads. * Take reference to GEM context while using it. v6: * Rebase, cover letter update. Tvrtko Ursulin (5): drm/i915: Add ability for tracking buffer objects per client drm/i915: Record which client owns a VM drm/i915: Track page table backing store usage drm/i915: Account ring buffer and context state storage drm/i915: Implement fdinfo memory stats printing drivers/gpu/drm/i915/gem/i915_gem_context.c | 11 +- .../gpu/drm/i915/gem/i915_gem_context_types.h | 3 + drivers/gpu/drm/i915/gem/i915_gem_object.c| 13 +- .../gpu/drm/i915/gem/i915_gem_object_types.h | 12 ++ .../gpu/drm/i915/gem/selftests/mock_context.c | 4 +- drivers/gpu/drm/i915/gt/intel_context.c | 14 ++ drivers/gpu/drm/i915/gt/intel_gtt.c | 6 + drivers/gpu/drm/i915/gt/intel_gtt.h | 1 + drivers/gpu/drm/i915/i915_drm_client.c| 131 ++ drivers/gpu/drm/i915/i915_drm_client.h| 41 ++ 10 files changed, 228 insertions(+), 8 deletions(-) -- 2.39.2
[PATCH 1/5] drm/i915: Add ability for tracking buffer objects per client
From: Tvrtko Ursulin In order to show per client memory usage lets add some infrastructure which enables tracking buffer objects owned by clients. We add a per client list protected by a new per client lock and to support delayed destruction (post client exit) we make tracked objects hold references to the owning client. Also, object memory region teardown is moved to the existing RCU free callback to allow safe dereference from the fdinfo RCU read section. Signed-off-by: Tvrtko Ursulin --- drivers/gpu/drm/i915/gem/i915_gem_object.c| 13 +-- .../gpu/drm/i915/gem/i915_gem_object_types.h | 12 +++ drivers/gpu/drm/i915/i915_drm_client.c| 36 +++ drivers/gpu/drm/i915/i915_drm_client.h| 32 + 4 files changed, 90 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_object.c b/drivers/gpu/drm/i915/gem/i915_gem_object.c index 97ac6fb37958..3dc4fbb67d2b 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_object.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_object.c @@ -105,6 +105,10 @@ void i915_gem_object_init(struct drm_i915_gem_object *obj, INIT_LIST_HEAD(>mm.link); +#ifdef CONFIG_PROC_FS + INIT_LIST_HEAD(>client_link); +#endif + INIT_LIST_HEAD(>lut_list); spin_lock_init(>lut_lock); @@ -292,6 +296,10 @@ void __i915_gem_free_object_rcu(struct rcu_head *head) container_of(head, typeof(*obj), rcu); struct drm_i915_private *i915 = to_i915(obj->base.dev); + /* We need to keep this alive for RCU read access from fdinfo. 
*/ + if (obj->mm.n_placements > 1) + kfree(obj->mm.placements); + i915_gem_object_free(obj); GEM_BUG_ON(!atomic_read(>mm.free_count)); @@ -388,9 +396,6 @@ void __i915_gem_free_object(struct drm_i915_gem_object *obj) if (obj->ops->release) obj->ops->release(obj); - if (obj->mm.n_placements > 1) - kfree(obj->mm.placements); - if (obj->shares_resv_from) i915_vm_resv_put(obj->shares_resv_from); @@ -441,6 +446,8 @@ static void i915_gem_free_object(struct drm_gem_object *gem_obj) GEM_BUG_ON(i915_gem_object_is_framebuffer(obj)); + i915_drm_client_remove_object(obj); + /* * Before we free the object, make sure any pure RCU-only * read-side critical sections are complete, e.g. diff --git a/drivers/gpu/drm/i915/gem/i915_gem_object_types.h b/drivers/gpu/drm/i915/gem/i915_gem_object_types.h index e72c57716bee..8de2b91b3edf 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_object_types.h +++ b/drivers/gpu/drm/i915/gem/i915_gem_object_types.h @@ -300,6 +300,18 @@ struct drm_i915_gem_object { */ struct i915_address_space *shares_resv_from; +#ifdef CONFIG_PROC_FS + /** +* @client: @i915_drm_client which created the object +*/ + struct i915_drm_client *client; + + /** +* @client_link: Link into @i915_drm_client.objects_list +*/ + struct list_head client_link; +#endif + union { struct rcu_head rcu; struct llist_node freed; diff --git a/drivers/gpu/drm/i915/i915_drm_client.c b/drivers/gpu/drm/i915/i915_drm_client.c index 2a44b3876cb5..2e5e69edc0f9 100644 --- a/drivers/gpu/drm/i915/i915_drm_client.c +++ b/drivers/gpu/drm/i915/i915_drm_client.c @@ -28,6 +28,10 @@ struct i915_drm_client *i915_drm_client_alloc(void) kref_init(>kref); spin_lock_init(>ctx_lock); INIT_LIST_HEAD(>ctx_list); +#ifdef CONFIG_PROC_FS + spin_lock_init(>objects_lock); + INIT_LIST_HEAD(>objects_list); +#endif return client; } @@ -108,4 +112,36 @@ void i915_drm_client_fdinfo(struct drm_printer *p, struct drm_file *file) for (i = 0; i < ARRAY_SIZE(uabi_class_names); i++) show_client_class(p, i915, file_priv->client, i); 
} + +void i915_drm_client_add_object(struct i915_drm_client *client, + struct drm_i915_gem_object *obj) +{ + unsigned long flags; + + GEM_WARN_ON(obj->client); + GEM_WARN_ON(!list_empty(>client_link)); + + spin_lock_irqsave(>objects_lock, flags); + obj->client = i915_drm_client_get(client); + list_add_tail_rcu(>client_link, >objects_list); + spin_unlock_irqrestore(>objects_lock, flags); +} + +bool i915_drm_client_remove_object(struct drm_i915_gem_object *obj) +{ + struct i915_drm_client *client = fetch_and_zero(>client); + unsigned long flags; + + /* Object may not be associated with a client. */ + if (!client) + return false; + + spin_lock_irqsave(>objects_lock, flags); + list_del_rcu(>client_link); + spin_unlock_irqrestore(>objects_lock, flags); + + i915_drm_client_put(client); + + return true; +}
[PATCH 4/5] drm/i915: Account ring buffer and context state storage
From: Tvrtko Ursulin Account ring buffers and logical context space against the owning client memory usage stats. Signed-off-by: Tvrtko Ursulin Reviewed-by: Aravind Iddamsetty --- drivers/gpu/drm/i915/gt/intel_context.c | 14 ++ drivers/gpu/drm/i915/i915_drm_client.c | 10 ++ drivers/gpu/drm/i915/i915_drm_client.h | 9 + 3 files changed, 33 insertions(+) diff --git a/drivers/gpu/drm/i915/gt/intel_context.c b/drivers/gpu/drm/i915/gt/intel_context.c index a53b26178f0a..a2f1245741bb 100644 --- a/drivers/gpu/drm/i915/gt/intel_context.c +++ b/drivers/gpu/drm/i915/gt/intel_context.c @@ -6,6 +6,7 @@ #include "gem/i915_gem_context.h" #include "gem/i915_gem_pm.h" +#include "i915_drm_client.h" #include "i915_drv.h" #include "i915_trace.h" @@ -50,6 +51,7 @@ intel_context_create(struct intel_engine_cs *engine) int intel_context_alloc_state(struct intel_context *ce) { + struct i915_gem_context *ctx; int err = 0; if (mutex_lock_interruptible(>pin_mutex)) @@ -66,6 +68,18 @@ int intel_context_alloc_state(struct intel_context *ce) goto unlock; set_bit(CONTEXT_ALLOC_BIT, >flags); + + rcu_read_lock(); + ctx = rcu_dereference(ce->gem_context); + if (ctx && !kref_get_unless_zero(>ref)) + ctx = NULL; + rcu_read_unlock(); + if (ctx) { + if (ctx->client) + i915_drm_client_add_context_objects(ctx->client, + ce); + i915_gem_context_put(ctx); + } } unlock: diff --git a/drivers/gpu/drm/i915/i915_drm_client.c b/drivers/gpu/drm/i915/i915_drm_client.c index 2e5e69edc0f9..a61356012df8 100644 --- a/drivers/gpu/drm/i915/i915_drm_client.c +++ b/drivers/gpu/drm/i915/i915_drm_client.c @@ -144,4 +144,14 @@ bool i915_drm_client_remove_object(struct drm_i915_gem_object *obj) return true; } + +void i915_drm_client_add_context_objects(struct i915_drm_client *client, +struct intel_context *ce) +{ + if (ce->state) + i915_drm_client_add_object(client, ce->state->obj); + + if (ce->ring != ce->engine->legacy.ring && ce->ring->vma) + i915_drm_client_add_object(client, ce->ring->vma->obj); +} #endif diff --git 
a/drivers/gpu/drm/i915/i915_drm_client.h b/drivers/gpu/drm/i915/i915_drm_client.h index 5f58fdf7dcb8..69cedfcd3d69 100644 --- a/drivers/gpu/drm/i915/i915_drm_client.h +++ b/drivers/gpu/drm/i915/i915_drm_client.h @@ -14,6 +14,7 @@ #include "i915_file_private.h" #include "gem/i915_gem_object_types.h" +#include "gt/intel_context_types.h" #define I915_LAST_UABI_ENGINE_CLASS I915_ENGINE_CLASS_COMPUTE @@ -70,6 +71,8 @@ void i915_drm_client_fdinfo(struct drm_printer *p, struct drm_file *file); void i915_drm_client_add_object(struct i915_drm_client *client, struct drm_i915_gem_object *obj); bool i915_drm_client_remove_object(struct drm_i915_gem_object *obj); +void i915_drm_client_add_context_objects(struct i915_drm_client *client, +struct intel_context *ce); #else static inline void i915_drm_client_add_object(struct i915_drm_client *client, struct drm_i915_gem_object *obj) @@ -79,6 +82,12 @@ static inline void i915_drm_client_add_object(struct i915_drm_client *client, static inline bool i915_drm_client_remove_object(struct drm_i915_gem_object *obj) { } + +static inline void +i915_drm_client_add_context_objects(struct i915_drm_client *client, + struct intel_context *ce) +{ +} #endif #endif /* !__I915_DRM_CLIENT_H__ */ -- 2.39.2
[PATCH 3/5] drm/i915: Track page table backing store usage
From: Tvrtko Ursulin Account page table backing store against the owning client memory usage stats. Signed-off-by: Tvrtko Ursulin Reviewed-by: Aravind Iddamsetty --- drivers/gpu/drm/i915/gt/intel_gtt.c | 6 ++ 1 file changed, 6 insertions(+) diff --git a/drivers/gpu/drm/i915/gt/intel_gtt.c b/drivers/gpu/drm/i915/gt/intel_gtt.c index 731d9f2bbc56..065099362a98 100644 --- a/drivers/gpu/drm/i915/gt/intel_gtt.c +++ b/drivers/gpu/drm/i915/gt/intel_gtt.c @@ -58,6 +58,9 @@ struct drm_i915_gem_object *alloc_pt_lmem(struct i915_address_space *vm, int sz) if (!IS_ERR(obj)) { obj->base.resv = i915_vm_resv_get(vm); obj->shares_resv_from = vm; + + if (vm->fpriv) + i915_drm_client_add_object(vm->fpriv->client, obj); } return obj; @@ -79,6 +82,9 @@ struct drm_i915_gem_object *alloc_pt_dma(struct i915_address_space *vm, int sz) if (!IS_ERR(obj)) { obj->base.resv = i915_vm_resv_get(vm); obj->shares_resv_from = vm; + + if (vm->fpriv) + i915_drm_client_add_object(vm->fpriv->client, obj); } return obj; -- 2.39.2
[PATCH 2/5] drm/i915: Record which client owns a VM
From: Tvrtko Ursulin To enable accounting of indirect client memory usage (such as page tables) in the following patch, lets start recording the creator of each PPGTT. Signed-off-by: Tvrtko Ursulin Reviewed-by: Aravind Iddamsetty --- drivers/gpu/drm/i915/gem/i915_gem_context.c | 11 --- drivers/gpu/drm/i915/gem/i915_gem_context_types.h | 3 +++ drivers/gpu/drm/i915/gem/selftests/mock_context.c | 4 ++-- drivers/gpu/drm/i915/gt/intel_gtt.h | 1 + 4 files changed, 14 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c b/drivers/gpu/drm/i915/gem/i915_gem_context.c index 9a9ff84c90d7..35cf6608180e 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_context.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c @@ -279,7 +279,8 @@ static int proto_context_set_protected(struct drm_i915_private *i915, } static struct i915_gem_proto_context * -proto_context_create(struct drm_i915_private *i915, unsigned int flags) +proto_context_create(struct drm_i915_file_private *fpriv, +struct drm_i915_private *i915, unsigned int flags) { struct i915_gem_proto_context *pc, *err; @@ -287,6 +288,7 @@ proto_context_create(struct drm_i915_private *i915, unsigned int flags) if (!pc) return ERR_PTR(-ENOMEM); + pc->fpriv = fpriv; pc->num_user_engines = -1; pc->user_engines = NULL; pc->user_flags = BIT(UCONTEXT_BANNABLE) | @@ -1621,6 +1623,7 @@ i915_gem_create_context(struct drm_i915_private *i915, err = PTR_ERR(ppgtt); goto err_ctx; } + ppgtt->vm.fpriv = pc->fpriv; vm = >vm; } if (vm) @@ -1740,7 +1743,7 @@ int i915_gem_context_open(struct drm_i915_private *i915, /* 0 reserved for invalid/unassigned ppgtt */ xa_init_flags(_priv->vm_xa, XA_FLAGS_ALLOC1); - pc = proto_context_create(i915, 0); + pc = proto_context_create(file_priv, i915, 0); if (IS_ERR(pc)) { err = PTR_ERR(pc); goto err; @@ -1822,6 +1825,7 @@ int i915_gem_vm_create_ioctl(struct drm_device *dev, void *data, GEM_BUG_ON(id == 0); /* reserved for invalid/unassigned ppgtt */ args->vm_id = id; + ppgtt->vm.fpriv = 
file_priv; return 0; err_put: @@ -2284,7 +2288,8 @@ int i915_gem_context_create_ioctl(struct drm_device *dev, void *data, return -EIO; } - ext_data.pc = proto_context_create(i915, args->flags); + ext_data.pc = proto_context_create(file->driver_priv, i915, + args->flags); if (IS_ERR(ext_data.pc)) return PTR_ERR(ext_data.pc); diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context_types.h b/drivers/gpu/drm/i915/gem/i915_gem_context_types.h index cb78214a7dcd..c573c067779f 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_context_types.h +++ b/drivers/gpu/drm/i915/gem/i915_gem_context_types.h @@ -188,6 +188,9 @@ struct i915_gem_proto_engine { * CONTEXT_CREATE_SET_PARAM during GEM_CONTEXT_CREATE. */ struct i915_gem_proto_context { + /** @fpriv: Client which creates the context */ + struct drm_i915_file_private *fpriv; + /** @vm: See _gem_context.vm */ struct i915_address_space *vm; diff --git a/drivers/gpu/drm/i915/gem/selftests/mock_context.c b/drivers/gpu/drm/i915/gem/selftests/mock_context.c index 8ac6726ec16b..125584ada282 100644 --- a/drivers/gpu/drm/i915/gem/selftests/mock_context.c +++ b/drivers/gpu/drm/i915/gem/selftests/mock_context.c @@ -83,7 +83,7 @@ live_context(struct drm_i915_private *i915, struct file *file) int err; u32 id; - pc = proto_context_create(i915, 0); + pc = proto_context_create(fpriv, i915, 0); if (IS_ERR(pc)) return ERR_CAST(pc); @@ -152,7 +152,7 @@ kernel_context(struct drm_i915_private *i915, struct i915_gem_context *ctx; struct i915_gem_proto_context *pc; - pc = proto_context_create(i915, 0); + pc = proto_context_create(NULL, i915, 0); if (IS_ERR(pc)) return ERR_CAST(pc); diff --git a/drivers/gpu/drm/i915/gt/intel_gtt.h b/drivers/gpu/drm/i915/gt/intel_gtt.h index 4d6296cdbcfd..7192a534a654 100644 --- a/drivers/gpu/drm/i915/gt/intel_gtt.h +++ b/drivers/gpu/drm/i915/gt/intel_gtt.h @@ -248,6 +248,7 @@ struct i915_address_space { struct drm_mm mm; struct intel_gt *gt; struct drm_i915_private *i915; + struct drm_i915_file_private *fpriv; struct 
device *dma; u64 total; /* size addr space maps (ex. 2GB for ggtt) */ u64 reserved; /* size addr space reserved */ -- 2.39.2
[PULL] drm-intel-fixes
Hi Dave, Daniel, Only two small fixes for the 6.5 RC this week - one fix for display (DPT) corruption under memory pressure, and one for selftests theoretical edge case. Regards, Tvrtko drm-intel-fixes-2023-07-27: - Use shmem for dpt objects [dpt] (Radhakrishna Sripada) - Fix an error handling path in igt_write_huge() (Christophe JAILLET) The following changes since commit 6eaae198076080886b9e7d57f4ae06fa782f90ef: Linux 6.5-rc3 (2023-07-23 15:24:10 -0700) are available in the Git repository at: git://anongit.freedesktop.org/drm/drm-intel tags/drm-intel-fixes-2023-07-27 for you to fetch changes up to e354f67733115b4453268f61e6e072e9b1ea7a2f: drm/i915: Fix an error handling path in igt_write_huge() (2023-07-25 08:38:12 +0100) - Use shmem for dpt objects [dpt] (Radhakrishna Sripada) - Fix an error handling path in igt_write_huge() (Christophe JAILLET) Christophe JAILLET (1): drm/i915: Fix an error handling path in igt_write_huge() Radhakrishna Sripada (1): drm/i915/dpt: Use shmem for dpt objects drivers/gpu/drm/i915/display/intel_dpt.c | 4 +++- drivers/gpu/drm/i915/gem/selftests/huge_pages.c | 6 ++++-- 2 files changed, 7 insertions(+), 3 deletions(-)
Re: [PATCH 16/17] cgroup/drm: Expose memory stats
On 21/07/2023 23:21, Tejun Heo wrote: On Wed, Jul 12, 2023 at 12:46:04PM +0100, Tvrtko Ursulin wrote: $ cat drm.memory.stat card0 region=system total=12898304 shared=0 active=0 resident=12111872 purgeable=167936 card0 region=stolen-system total=0 shared=0 active=0 resident=0 purgeable=0 Data is generated on demand for simplicty of implementation ie. no running totals are kept or accounted during migrations and such. Various optimisations such as cheaper collection of data are possible but deliberately left out for now. Overall, the feature is deemed to be useful to container orchestration software (and manual management). Limits, either soft or hard, are not envisaged to be implemented on top of this approach due on demand nature of collecting the stats. So, yeah, if you want to add memory controls, we better think through how the fd ownership migration should work. It would be quite easy to make the implicit migration fail - just the matter of failing the first ioctl, which is what triggers the migration, after the file descriptor access from a new owner. But I don't think I can really add that in the RFC given I have no hard controls or anything like that. With GPU usage throttling it doesn't really apply, at least I don't think it does, since even when migrated to a lower budget group it would just get immediately de-prioritized. I don't think hard GPU time limits are feasible in general, and while soft might be, again I don't see that any limiting would necessarily have to run immediately on implicit migration. Second part of the story are hypothetical/future memory controls. I think first thing to say is that implicit migration is important, but it is not really established to use the file descriptor from two places or to migrate more than once. It is simply fresh fd which gets sent to clients from Xorg, which is one of the legacy ways of doing things. 
So we probably can just ignore that given no significant amount of memory ownership would be getting migrated. And for drm.memory.stat I think what I have is good enough - both private and shared data get accounted, for any clients that have handles to particular buffers. Maarten was working on memory controls so maybe he would have more thoughts on memory ownership and implicit migration. But I don't think there is anything incompatible with that and drm.memory.stats as proposed here, given how the categories reported are the established ones from the DRM fdinfo spec, and it is fact of the matter that we can have multiple memory regions per driver. The main thing that would change between this RFC and future memory controls in the area of drm.memory.stat is the implementation - it would have to get changed under the hood from "collect on query" to "account at allocation/free/etc". But that is just implementation details. Regards, Tvrtko
Re: [PATCH 16/17] cgroup/drm: Expose memory stats
On 26/07/2023 11:14, Maarten Lankhorst wrote: Hey, On 2023-07-22 00:21, Tejun Heo wrote: On Wed, Jul 12, 2023 at 12:46:04PM +0100, Tvrtko Ursulin wrote: $ cat drm.memory.stat card0 region=system total=12898304 shared=0 active=0 resident=12111872 purgeable=167936 card0 region=stolen-system total=0 shared=0 active=0 resident=0 purgeable=0 Data is generated on demand for simplicty of implementation ie. no running totals are kept or accounted during migrations and such. Various optimisations such as cheaper collection of data are possible but deliberately left out for now. Overall, the feature is deemed to be useful to container orchestration software (and manual management). Limits, either soft or hard, are not envisaged to be implemented on top of this approach due on demand nature of collecting the stats. So, yeah, if you want to add memory controls, we better think through how the fd ownership migration should work. I've taken a look at the series, since I have been working on cgroup memory eviction. The scheduling stuff will work for i915, since it has a purely software execlist scheduler, but I don't think it will work for GuC (firmware) scheduling or other drivers that use the generic drm scheduler. It actually works - I used to have a blurb in the cover letter about it but apparently I dropped it. Just a bit less well with many clients, since there are fewer priority levels. All that the design requires from the invididual drivers is some way to react to the "you are over budget by this much" signal. The rest is driver and backend specific. For something like this, you would probably want it to work inside the drm scheduler first. Presumably, this can be done by setting a weight on each runqueue, and perhaps adding a callback to update one for a running queue. Calculating the weights hierarchically might be fun.. It is not needed to work in drm scheduler first. 
In fact drm scheduler based drivers can plug into what I have since it already has the notion of scheduling priorities. They would only need to implement a hook which allow the cgroup controller to query client GPU utilisation and another to received the over budget signal. Amdgpu and msm AFAIK could be easy candidates because they both support per client utilisation and priorities. Looks like I need to put all this info back into the cover letter. Also, hierarchic weights and time budgets are all already there. What could be done later is make this all smarter and respect the time budget with more precision. That would however, in many cases including Intel, require co-operation with the firmware. In any case it is only work in the implementation, while the cgroup control interface remains the same. I have taken a look at how the rest of cgroup controllers change ownership when moved to a different cgroup, and the answer was: not at all. If we attempt to create the scheduler controls only on the first time the fd is used, you could probably get rid of all the tracking. Can you send a CPU file descriptor from process A to process B and have CPU usage belonging to process B show up in process' A cgroup, or vice-versa? Nope, I am not making any sense, am I? My point being it is not like-to-like, model is different. No ownership transfer would mean in wide deployments all GPU utilisation would be assigned to Xorg and so there is no point to any of this. No way to throttle a cgroup with un-important GPU clients for instance. This can be done very easily with the drm scheduler. WRT memory, I think the consensus is to track system memory like normal memory. Stolen memory doesn't need to be tracked. It's kernel only memory, used for internal bookkeeping only. The only time userspace can directly manipulate stolen memory, is by mapping the pinned initial framebuffer to its own address space. 
The only allocation it can do is when a framebuffer is displayed, and framebuffer compression creates some stolen memory. Userspace is not aware of this though, and has no way to manipulate those contents. Stolen memory is irrelevant and not something cgroup controller knows about. Point is drivers say which memory regions they have and their utilisation. Imagine instead of stolen it said vram0, or on Intel multi-tile it shows local0 and local1. People working with containers are interested to see this breakdown. I guess the parallel and use case here is closer to memory.numa_stat. Regards, Tvrtko
Re: [PATCH 15/17] cgroup/drm: Expose GPU utilisation
On 21/07/2023 23:20, Tejun Heo wrote: On Fri, Jul 21, 2023 at 12:19:32PM -1000, Tejun Heo wrote: On Wed, Jul 12, 2023 at 12:46:03PM +0100, Tvrtko Ursulin wrote: + drm.active_us + GPU time used by the group recursively including all child groups. Maybe instead add drm.stat and have "usage_usec" inside? That'd be more consistent with cpu side. Could be, but no strong opinion from my side either way. Perhaps it boils down to what could be put in the file, I mean to decide whether keyed format makes sense or not. Also, shouldn't this be keyed by the drm device? It could have that too, or it could come later. Fun with GPUs that it not only could be keyed by the device, but also by the type of the GPU engine. (Which are a) vendor specific and b) some aree fully independent, some partially so, and some not at all - so it could get complicated semantics wise really fast.) If for now I'd go with drm.stat/usage_usec containing the total time spent how would you suggest adding per device granularity? Files as documented are either flag or nested, not both at the same time. So something like: usage_usec 10 card0 usage_usec 5 card1 usage_usec 5 Would or would not fly? Have two files along the lines of drm.stat and drm.dev_stat? While on this general topic, you will notice that for memory stats I have _sort of_ nested keyed per device format, for example on integrated Intel GPU: $ cat drm.memory.stat card0 region=system total=12898304 shared=0 active=0 resident=12111872 purgeable=167936 card0 region=stolen-system total=0 shared=0 active=0 resident=0 purgeable=0 If one a discrete Intel GPU two more lines would appear with memory regions of local and local-system. But then on some server class multi-tile GPUs even further regions with more than one device local memory region. And users do want to see this granularity for container use cases at least. Anyway, this may not be compatible with the nested key format as documented in cgroup-v2.rst, although it does not explicitly say. 
Should I cheat and create key names based on device and memory region name and let userspace parse it? Like: $ cat drm.memory.stat card0.system total=12898304 shared=0 active=0 resident=12111872 purgeable=167936 card0.stolen-system total=0 shared=0 active=0 resident=0 purgeable=0 Regards, Tvrtko
Re: [PATCH 12/17] cgroup/drm: Introduce weight based drm cgroup control
On 21/07/2023 23:17, Tejun Heo wrote: On Wed, Jul 12, 2023 at 12:46:00PM +0100, Tvrtko Ursulin wrote: +DRM scheduling soft limits +~~~~~~~~~~~~~~~~~~~~~~~~~~ Please don't say soft limits for this. It means something different for memcg, so it gets really confusing. Call it "weight based CPU time control" and maybe call the triggering points as thresholds. Yes sorry, you said that before and I forgot to reword it all when re-spinning. I have now marked it as TODO in my email client so hopefully next time round I don't forget. Regards, Tvrtko
[PATCH v2] drm/i915: Avoid GGTT flushing on non-GGTT paths of i915_vma_pin_iomap
From: Tvrtko Ursulin Commit 4bc91dbde0da ("drm/i915/lmem: Bypass aperture when lmem is available") added a code path which does not map via GGTT, but was still setting the ggtt write bit, and so triggering the GGTT flushing. Fix it by not setting that bit unless the GGTT mapping path was used, and replace the flush with wmb() in i915_vma_flush_writes(). This also works for the i915_gem_object_pin_map path added in d976521a995a ("drm/i915: extend i915_vma_pin_iomap()"). It is hard to say if the fix has any observable effect, given that the write-combine buffer gets flushed from intel_gt_flush_ggtt_writes too, but apart from code clarity, skipping the needless GGTT flushing could be beneficial on platforms with non-coherent GGTT. (See the code flow in intel_gt_flush_ggtt_writes().) v2: * Improve comment in i915_vma_flush_writes(). (Andi) Signed-off-by: Tvrtko Ursulin Fixes: 4bc91dbde0da ("drm/i915/lmem: Bypass aperture when lmem is available") References: d976521a995a ("drm/i915: extend i915_vma_pin_iomap()") Cc: Radhakrishna Sripada Cc: # v5.14+ Reviewed-by: Andi Shyti --- drivers/gpu/drm/i915/i915_vma.c | 15 ++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/i915_vma.c b/drivers/gpu/drm/i915/i915_vma.c index ffb425ba591c..7788b03b86d6 100644 --- a/drivers/gpu/drm/i915/i915_vma.c +++ b/drivers/gpu/drm/i915/i915_vma.c @@ -602,7 +602,9 @@ void __iomem *i915_vma_pin_iomap(struct i915_vma *vma) if (err) goto err_unpin; - i915_vma_set_ggtt_write(vma); + if (!i915_gem_object_is_lmem(vma->obj) && + i915_vma_is_map_and_fenceable(vma)) + i915_vma_set_ggtt_write(vma); /* NB Access through the GTT requires the device to be awake. 
*/ return page_mask_bits(ptr); @@ -615,8 +617,19 @@ void __iomem *i915_vma_pin_iomap(struct i915_vma *vma) void i915_vma_flush_writes(struct i915_vma *vma) { + /* +* i915_vma_iomap() could have mapped the underlying memory in one +* of the three ways, depending on which we have to choose the most +* appropriate flushing mechanism. +* +* If the mapping method was via the aperture the appropriate flag will +* be set via i915_vma_set_ggtt_write(), and if not then we know it is +* enough to simply flush the CPU side write-combine buffer. +*/ if (i915_vma_unset_ggtt_write(vma)) intel_gt_flush_ggtt_writes(vma->vm->gt); + else + wmb(); } void i915_vma_unpin_iomap(struct i915_vma *vma) -- 2.39.2
Re: [Intel-gfx] [PATCH] drm/i915: Avoid GGTT flushing on non-GGTT paths of i915_vma_pin_iomap
On 24/07/2023 21:16, Andi Shyti wrote: Hi Tvrtko, On Mon, Jul 24, 2023 at 01:56:33PM +0100, Tvrtko Ursulin wrote: From: Tvrtko Ursulin Commit 4bc91dbde0da ("drm/i915/lmem: Bypass aperture when lmem is available") added a code path which does not map via GGTT, but was still setting the ggtt write bit, and so triggering the GGTT flushing. Fix it by not setting that bit unless the GGTT mapping path was used, and replace the flush with wmb() in i915_vma_flush_writes(). This also works for the i915_gem_object_pin_map path added in d976521a995a ("drm/i915: extend i915_vma_pin_iomap()"). It is hard to say if the fix has any observable effect, given that the write-combine buffer gets flushed from intel_gt_flush_ggtt_writes too, but apart from code clarity, skipping the needless GGTT flushing could be beneficial on platforms with non-coherent GGTT. (See the code flow in intel_gt_flush_ggtt_writes().) Signed-off-by: Tvrtko Ursulin Fixes: 4bc91dbde0da ("drm/i915/lmem: Bypass aperture when lmem is available") References: d976521a995a ("drm/i915: extend i915_vma_pin_iomap()") Cc: Radhakrishna Sripada Cc: # v5.14+ --- drivers/gpu/drm/i915/i915_vma.c | 6 +- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/i915_vma.c b/drivers/gpu/drm/i915/i915_vma.c index ffb425ba591c..f2b626cd2755 100644 --- a/drivers/gpu/drm/i915/i915_vma.c +++ b/drivers/gpu/drm/i915/i915_vma.c @@ -602,7 +602,9 @@ void __iomem *i915_vma_pin_iomap(struct i915_vma *vma) if (err) goto err_unpin; - i915_vma_set_ggtt_write(vma); + if (!i915_gem_object_is_lmem(vma->obj) && + i915_vma_is_map_and_fenceable(vma)) + i915_vma_set_ggtt_write(vma); /* NB Access through the GTT requires the device to be awake. */ return page_mask_bits(ptr); @@ -617,6 +619,8 @@ void i915_vma_flush_writes(struct i915_vma *vma) { if (i915_vma_unset_ggtt_write(vma)) intel_gt_flush_ggtt_writes(vma->vm->gt); + else + wmb(); /* Just flush the write-combine buffer. */ is flush the right word? 
Can you expand more the explanation in this comment and why this point of synchronization is needed here? (I am even wondering if it is really needed). If you are hinting flush isn't the right word then I am not remembering what else do we use for it? It is needed because i915_flush_writes()'s point AFAIU is to make sure CPU writes after i915_vma_pin_iomap() have landed in RAM. All three methods the latter can map the buffer are WC, therefore "flushing" of the WC buffer is needed for former to do something (what it promises). Currently the wmb() is in intel_gt_flush_ggtt_writes(). But only one of the three mapping paths is via GGTT. So my logic is that calling it for paths not interacting with GGTT is confusing and not needed. Anyway, it looks good: Reviewed-by: Andi Shyti Thanks. If you don't see a hole in my logic I can improve the comment. I considered it initially but then thought it is obvious enough from looking at the i915_vma_pin_iomap. I can comment it more. Regards, Tvrtko Andi } void i915_vma_unpin_iomap(struct i915_vma *vma) -- 2.39.2
Re: [PATCH] drm/i915: Avoid GGTT flushing on non-GGTT paths of i915_vma_pin_iomap
On 25/07/2023 00:38, Sripada, Radhakrishna wrote: Hi Tvrtko, The changes makes sense and based on the description looks good. I am bit skeptical about the exec buffer failure reported by ci hence, withholding the r-b for now. If you believe the CI failure is unrelated please feel free to add my r-b. This failure: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_121236v1/shard-snb7/igt@gem_pp...@blt-vs-render-ctxn.html Test or machine is not entirely stable looking at it's history, but with a couple different failure signatures: https://intel-gfx-ci.01.org/tree/drm-tip/igt@gem_pp...@blt-vs-render-ctxn.html But agreed that we need to be careful. I requested a re-run for a start. On a side note on platforms with non-coherent ggtt do we really need to use the barriers twice under intel_gt_flush_ggtt_writes? You mean: intel_gt_flush_ggtt_writes() { ... wmb(); ... intel_gt_chipset_flush(); wmb(); ? I'd guess it is not needed twice on the intel_gt_flush_ggtt_writes() path, but happens to be like that for direct callers of intel_gt_chipset_flush(). Maybe there is scope to tidy this all, for instance the first direct caller I opened does this: rpcs_query_batch() { ... __i915_gem_object_flush_map(rpcs, 0, 64); i915_gem_object_unpin_map(rpcs); intel_gt_chipset_flush(vma->vm->gt); Where I think __i915_gem_object_flush_map() could actually do the right thing and issue a flush appropriate for the mapping that was used. But it is work and double flush does not really harm. I don't think it does at least. 
Regards, Tvrtko --Radhakrishna(RK) Sripada -Original Message- From: Tvrtko Ursulin Sent: Monday, July 24, 2023 5:57 AM To: intel-...@lists.freedesktop.org; dri-devel@lists.freedesktop.org Cc: Ursulin, Tvrtko ; Sripada, Radhakrishna ; sta...@vger.kernel.org Subject: [PATCH] drm/i915: Avoid GGTT flushing on non-GGTT paths of i915_vma_pin_iomap From: Tvrtko Ursulin Commit 4bc91dbde0da ("drm/i915/lmem: Bypass aperture when lmem is available") added a code path which does not map via GGTT, but was still setting the ggtt write bit, and so triggering the GGTT flushing. Fix it by not setting that bit unless the GGTT mapping path was used, and replace the flush with wmb() in i915_vma_flush_writes(). This also works for the i915_gem_object_pin_map path added in d976521a995a ("drm/i915: extend i915_vma_pin_iomap()"). It is hard to say if the fix has any observable effect, given that the write-combine buffer gets flushed from intel_gt_flush_ggtt_writes too, but apart from code clarity, skipping the needless GGTT flushing could be beneficial on platforms with non-coherent GGTT. (See the code flow in intel_gt_flush_ggtt_writes().) Signed-off-by: Tvrtko Ursulin Fixes: 4bc91dbde0da ("drm/i915/lmem: Bypass aperture when lmem is available") References: d976521a995a ("drm/i915: extend i915_vma_pin_iomap()") Cc: Radhakrishna Sripada Cc: # v5.14+ --- drivers/gpu/drm/i915/i915_vma.c | 6 +- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/i915_vma.c b/drivers/gpu/drm/i915/i915_vma.c index ffb425ba591c..f2b626cd2755 100644 --- a/drivers/gpu/drm/i915/i915_vma.c +++ b/drivers/gpu/drm/i915/i915_vma.c @@ -602,7 +602,9 @@ void __iomem *i915_vma_pin_iomap(struct i915_vma *vma) if (err) goto err_unpin; - i915_vma_set_ggtt_write(vma); + if (!i915_gem_object_is_lmem(vma->obj) && + i915_vma_is_map_and_fenceable(vma)) + i915_vma_set_ggtt_write(vma); /* NB Access through the GTT requires the device to be awake. 
*/ return page_mask_bits(ptr); @@ -617,6 +619,8 @@ void i915_vma_flush_writes(struct i915_vma *vma) { if (i915_vma_unset_ggtt_write(vma)) intel_gt_flush_ggtt_writes(vma->vm->gt); + else + wmb(); /* Just flush the write-combine buffer. */ } void i915_vma_unpin_iomap(struct i915_vma *vma) -- 2.39.2
[PATCH] drm/i915: Avoid GGTT flushing on non-GGTT paths of i915_vma_pin_iomap
From: Tvrtko Ursulin Commit 4bc91dbde0da ("drm/i915/lmem: Bypass aperture when lmem is available") added a code path which does not map via GGTT, but was still setting the ggtt write bit, and so triggering the GGTT flushing. Fix it by not setting that bit unless the GGTT mapping path was used, and replace the flush with wmb() in i915_vma_flush_writes(). This also works for the i915_gem_object_pin_map path added in d976521a995a ("drm/i915: extend i915_vma_pin_iomap()"). It is hard to say if the fix has any observable effect, given that the write-combine buffer gets flushed from intel_gt_flush_ggtt_writes too, but apart from code clarity, skipping the needless GGTT flushing could be beneficial on platforms with non-coherent GGTT. (See the code flow in intel_gt_flush_ggtt_writes().) Signed-off-by: Tvrtko Ursulin Fixes: 4bc91dbde0da ("drm/i915/lmem: Bypass aperture when lmem is available") References: d976521a995a ("drm/i915: extend i915_vma_pin_iomap()") Cc: Radhakrishna Sripada Cc: # v5.14+ --- drivers/gpu/drm/i915/i915_vma.c | 6 +- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/i915_vma.c b/drivers/gpu/drm/i915/i915_vma.c index ffb425ba591c..f2b626cd2755 100644 --- a/drivers/gpu/drm/i915/i915_vma.c +++ b/drivers/gpu/drm/i915/i915_vma.c @@ -602,7 +602,9 @@ void __iomem *i915_vma_pin_iomap(struct i915_vma *vma) if (err) goto err_unpin; - i915_vma_set_ggtt_write(vma); + if (!i915_gem_object_is_lmem(vma->obj) && + i915_vma_is_map_and_fenceable(vma)) + i915_vma_set_ggtt_write(vma); /* NB Access through the GTT requires the device to be awake. */ return page_mask_bits(ptr); @@ -617,6 +619,8 @@ void i915_vma_flush_writes(struct i915_vma *vma) { if (i915_vma_unset_ggtt_write(vma)) intel_gt_flush_ggtt_writes(vma->vm->gt); + else + wmb(); /* Just flush the write-combine buffer. */ } void i915_vma_unpin_iomap(struct i915_vma *vma) -- 2.39.2
[PATCH] drm/i915: Tidy for_each_set_bit usage with abox_regs
From: Tvrtko Ursulin For_each_set_bit wants the max number of bits to walk and not the byte storage size of the source field. In this case there is no bug since abox_mask can mostly contain bits 0-2. Another funny thing is that both sizeof(abox_mask), where abox_mask is unsigned long, and BITS_PER_TYPE(DISPLAY_INFO->abox_mask)), are 8 (on 64-bit builds) so there is even less between them. Anyway, why not make it explicit to what the constraint is. Signed-off-by: Tvrtko Ursulin References: 62afef2811e4 ("drm/i915/rkl: RKL uses ABOX0 for pixel transfers") Cc: Ville Syrjälä Cc: Aditya Swarup Cc: Matt Roper --- drivers/gpu/drm/i915/display/intel_display_power.c | 9 ++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/i915/display/intel_display_power.c b/drivers/gpu/drm/i915/display/intel_display_power.c index 38225e5d311e..27a484892908 100644 --- a/drivers/gpu/drm/i915/display/intel_display_power.c +++ b/drivers/gpu/drm/i915/display/intel_display_power.c @@ -1170,7 +1170,8 @@ static void icl_mbus_init(struct drm_i915_private *dev_priv) if (DISPLAY_VER(dev_priv) == 12) abox_regs |= BIT(0); - for_each_set_bit(i, _regs, sizeof(abox_regs)) + for_each_set_bit(i, _regs, +BITS_PER_TYPE(DISPLAY_INFO(dev_priv)->abox_mask)) intel_de_rmw(dev_priv, MBUS_ABOX_CTL(i), mask, val); } @@ -1623,11 +1624,13 @@ static void tgl_bw_buddy_init(struct drm_i915_private *dev_priv) if (table[config].page_mask == 0) { drm_dbg(_priv->drm, "Unknown memory configuration; disabling address buddy logic.\n"); - for_each_set_bit(i, _mask, sizeof(abox_mask)) + for_each_set_bit(i, _mask, + BITS_PER_TYPE(DISPLAY_INFO(dev_priv)->abox_mask)) intel_de_write(dev_priv, BW_BUDDY_CTL(i), BW_BUDDY_DISABLE); } else { - for_each_set_bit(i, _mask, sizeof(abox_mask)) { + for_each_set_bit(i, _mask, + BITS_PER_TYPE(DISPLAY_INFO(dev_priv)->abox_mask)) { intel_de_write(dev_priv, BW_BUDDY_PAGE_MASK(i), table[config].page_mask); -- 2.39.2
[PATCH] drm/i915: Use the i915_vma_flush_writes helper
From: Tvrtko Ursulin We can use the existing helper in flush_write_domain() and save some lines of code. Signed-off-by: Tvrtko Ursulin --- drivers/gpu/drm/i915/gem/i915_gem_domain.c | 6 ++ 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_domain.c b/drivers/gpu/drm/i915/gem/i915_gem_domain.c index dfaaa8b66ac3..ffddec1d2a76 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_domain.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_domain.c @@ -68,10 +68,8 @@ flush_write_domain(struct drm_i915_gem_object *obj, unsigned int flush_domains) switch (obj->write_domain) { case I915_GEM_DOMAIN_GTT: spin_lock(>vma.lock); - for_each_ggtt_vma(vma, obj) { - if (i915_vma_unset_ggtt_write(vma)) - intel_gt_flush_ggtt_writes(vma->vm->gt); - } + for_each_ggtt_vma(vma, obj) + i915_vma_flush_writes(vma); spin_unlock(>vma.lock); i915_gem_object_flush_frontbuffer(obj, ORIGIN_CPU); -- 2.39.2
Re: [PATCH v3] drm/i915: Refactor PAT/object cache handling
On 21/07/2023 05:28, Yang, Fei wrote: [snip] @@ -27,15 +28,8 @@ static bool gpu_write_needs_clflush(struct drm_i915_gem_object *obj) The code change here looks accurate, but while we're here, I have a side question about this function in general...it was originally introduced in commit 48004881f693 ("drm/i915: Mark CPU cache as dirty when used for rendering") which states that GPU rendering ends up in the CPU cache (and thus needs a clflush later to make sure it lands in memory). That makes sense to me for LLC platforms, but is it really true for non-LLC snooping platforms (like MTL) as the commit states? For non-LLC platforms objects can be set to 1-way coherent which means GPU rendering ending up in CPU cache as well, so for non-LLC platform the logic here should be checking 1-way coherent flag. That's the part that I'm questioning (and not just for MTL, but for all of our other non-LLC platforms too). Just because there's coherency doesn't mean that device writes landed in the CPU cache. Coherency is also achieved if device writes invalidate the contents of the CPU cache. I thought our non-LLC snooping platforms were coherent due to write-invalidate rather than write-update, but I can't find it specifically documented anywhere at the moment. If write-invalidate was used, then there shouldn't be a need for a later clflush either. [Trying to consolidate by doing a combined reply to the discussion so far.] On the write-invalidate vs write-update I don't know. If you did not find it in bspec then I doubt I would. I can have a browse still. Matt was correct. Quote Ron Silvas from SW ARCH, "MTL GPU doesn't write to CPU cache, it simply snoop CPU cache on its way to RAM." Does it apply to all snooping platforms? And for the cache level/mode based condition, how about replacing it with this: /* * Fully coherent cached access may end up with data in the CPU cache * which hasn't hit memory yet. 
*/ return i915_gem_object_has_cache_mode(obj, I915_CACHE_MODE_WB) && i915_gem_object_has_cache_flag(obj, I915_CACHE_FLAG_COH2W); ? Although that would mean old I915_CACHE_LLC on old platforms is actually 2-way coherent. I am struggling to find a comprehensive explanation in bspec, but for instance 605 makes it sound like it is fully coherent. Perhaps it really is and I should fix the legacy and Gen12 table.. And if the write-invalidate applies to all snooping platforms then we extend it to: /* * Fully coherent cached access may end up with data in the CPU cache * which hasn't hit memory yet. * * But not on snooping platforms, where it is impossible due to * write-invalidate. */ return !HAS_SNOOP(i915) && (i915_gem_object_has_cache_mode(obj, I915_CACHE_MODE_WB) && i915_gem_object_has_cache_flag(obj, I915_CACHE_FLAG_COH2W)); That would prevent any flushing on MTL and make you happy from that aspect. In fact, the snooping check could be before the cache mode check. For i915_gem_object_can_bypass_llc it would be ideal if a condition based on the absence of I915_BO_CACHE_COHERENT_FOR_WRITE would work. At least according to the kerneldoc for @cache_coherent: * I915_BO_CACHE_COHERENT_FOR_WRITE: * * When writing through the CPU cache, the GPU is still coherent. Note * that this also implies I915_BO_CACHE_COHERENT_FOR_READ. So for objects without it set, we need to force a flush. And make __i915_gem_object_update_coherency not set it for WB without 1-way coherency set. According to bspec that would seem correct, because with 1-way snooping on MTL, GPU snoops the IA until first GPU access. So anything the CPU writes before the first GPU access would be coherent and so no need to flush in set pages. But if non-coherent WB is set then we need to flush. I'll trybot it as-is and see what happens. My understanding was that snooping platforms just invalidated the CPU cache to prevent future CPU reads from seeing stale data but didn't actually stick any new data in there? 
Am I off track or is the original logic of this function not quite right? Anyway, even if the logic of this function is wrong, it's a mistake that would only hurt performance Yes, this logic will introduce performance impact because it's missing the checking for obj->pat_set_by_user. For objects with pat_set_by_user==true, even if the object is snooping or 1-way coherent, we don't want to enforce a clflush here since the coherency is supposed to be handled by user space. What should I add you think to fix it? I think the simplest would be if (obj->pat_set_by_user) return false; because even checking for incoherent WB is unnecessary, simply no need for the KMD to initiate a flush if PAT is set by user. Add a check for non-coherent WB in gpu_write_needs_clflush as an additional condition for returning
Re: [PATCH v3] drm/i915: Refactor PAT/object cache handling
[Here let me just focus on the points which did not get further discussion in follow ups yet.] On 19/07/2023 23:31, Matt Roper wrote: On Wed, Jul 19, 2023 at 01:37:30PM +0100, Tvrtko Ursulin wrote: From: Tvrtko Ursulin Commit 9275277d5324 ("drm/i915: use pat_index instead of cache_level") has introduced PAT indices to i915 internal APIs, partially replacing the usage of driver internal cache_level, but has also added a few questionable design decisions which this patch tries to improve upon. Principal change is to invert the per platform cache level to PAT index table which was added by the referenced commit, and by doing so enable i915 to understand the cache mode between PAT indices, changing them from opaque to transparent. Once we have the inverted table we are able to remove the hidden false "return true" from i915_gem_object_has_cache_level. Other changes/fixes/improvements we are able to do: 1) Replace the enum i915_cache_level with i915_cache_t, composed of a more detailed representation of each cache mode (base mode plus flags). For instance this way we are able to express the difference between WB and 1-way coherent WB on Meteorlake. Which in turn enables us to map the i915 "cached" mode to the correct Meteorlake PAT index. 2) We can cache PAT indices of the caching modes used by the driver itself in struct drm_i915_private, which eliminates the runtime calls to i915_gem_get_pat_index from both high- and low-level i915 components. 3) We can also cache the caching modes used by the driver for coherent access and for display buffers. 4) Remove the incorrect references to enum i915_cache_level from low level PTE encode vfuncs, since those are actually given PAT indices by their callers. 5) Because i915 now understands PAT indices, we can remove the overly aggressive flushing triggered from i915_gem_object_can_bypass_llc() and limit it to non-coherent write-back mode only. 
6) Finally we are able to replace the platform dependent cache mode to string code in debugfs and elsewhere by the single implementation based on i915_cache_t. v2: * Fix PAT-to-cache-mode table for PVC. (Fei) * Cache display caching mode too. (Fei) * Improve and document criteria in i915_gem_object_can_bypass_llc() (Matt) v3: * Checkpath issues. * Cache mode flags check fixed. Signed-off-by: Tvrtko Ursulin Fixes: 9275277d5324 ("drm/i915: use pat_index instead of cache_level") Cc: Chris Wilson Cc: Fei Yang Cc: Andi Shyti Cc: Matt Roper --- drivers/gpu/drm/i915/Makefile | 1 + .../drm/i915/display/intel_plane_initial.c| 3 +- drivers/gpu/drm/i915/gem/i915_gem_domain.c| 56 --- drivers/gpu/drm/i915/gem/i915_gem_domain.h| 5 +- .../gpu/drm/i915/gem/i915_gem_execbuffer.c| 13 +- drivers/gpu/drm/i915/gem/i915_gem_internal.c | 4 +- drivers/gpu/drm/i915/gem/i915_gem_mman.c | 12 +- drivers/gpu/drm/i915/gem/i915_gem_object.c| 152 +++--- drivers/gpu/drm/i915/gem/i915_gem_object.h| 11 +- .../gpu/drm/i915/gem/i915_gem_object_types.h | 116 + drivers/gpu/drm/i915/gem/i915_gem_shmem.c | 8 +- drivers/gpu/drm/i915/gem/i915_gem_stolen.c| 11 +- drivers/gpu/drm/i915/gem/i915_gem_ttm_move.c | 44 ++--- drivers/gpu/drm/i915/gem/i915_gem_userptr.c | 2 +- .../drm/i915/gem/selftests/huge_gem_object.c | 4 +- .../gpu/drm/i915/gem/selftests/huge_pages.c | 6 +- drivers/gpu/drm/i915/gt/gen6_ppgtt.c | 4 +- drivers/gpu/drm/i915/gt/gen8_ppgtt.c | 19 +-- drivers/gpu/drm/i915/gt/intel_engine_cs.c | 2 +- drivers/gpu/drm/i915/gt/intel_ggtt.c | 33 ++-- drivers/gpu/drm/i915/gt/intel_ggtt_gmch.c | 4 +- drivers/gpu/drm/i915/gt/intel_gtt.c | 2 +- drivers/gpu/drm/i915/gt/intel_gtt.h | 3 +- drivers/gpu/drm/i915/gt/intel_migrate.c | 11 +- drivers/gpu/drm/i915/gt/intel_ppgtt.c | 6 +- .../gpu/drm/i915/gt/intel_ring_submission.c | 4 +- drivers/gpu/drm/i915/gt/intel_timeline.c | 2 +- drivers/gpu/drm/i915/gt/selftest_hangcheck.c | 2 +- drivers/gpu/drm/i915/gt/selftest_migrate.c| 9 +- 
drivers/gpu/drm/i915/gt/selftest_reset.c | 14 +- drivers/gpu/drm/i915/gt/selftest_tlb.c| 5 +- .../gpu/drm/i915/gt/selftest_workarounds.c| 2 +- drivers/gpu/drm/i915/gt/uc/intel_uc_fw.c | 8 +- drivers/gpu/drm/i915/i915_cache.c | 91 +++ drivers/gpu/drm/i915/i915_cache.h | 60 +++ drivers/gpu/drm/i915/i915_debugfs.c | 53 +- drivers/gpu/drm/i915/i915_driver.c| 5 + drivers/gpu/drm/i915/i915_drv.h | 5 + drivers/gpu/drm/i915/i915_gem.c | 21 +-- drivers/gpu/drm/i915/i915_gpu_error.c | 7 +- drivers/gpu/drm/i915/i915_pci.c | 82 +- drivers/gpu/drm/i915/i915_perf.c | 2 +-
Re: [PATCH v3] drm/i915: Refactor PAT/object cache handling
On 20/07/2023 01:22, Matt Roper wrote: On Wed, Jul 19, 2023 at 05:07:15PM -0700, Yang, Fei wrote: [snip] @@ -27,15 +28,8 @@ static bool gpu_write_needs_clflush(struct drm_i915_gem_object *obj) The code change here looks accurate, but while we're here, I have a side question about this function in general...it was originally introduced in commit 48004881f693 ("drm/i915: Mark CPU cache as dirty when used for rendering") which states that GPU rendering ends up in the CPU cache (and thus needs a clflush later to make sure it lands in memory). That makes sense to me for LLC platforms, but is it really true for non-LLC snooping platforms (like MTL) as the commit states? For non-LLC platforms objects can be set to 1-way coherent which means GPU rendering ending up in CPU cache as well, so for non-LLC platform the logic here should be checking 1-way coherent flag. That's the part that I'm questioning (and not just for MTL, but for all of our other non-LLC platforms too). Just because there's coherency doesn't mean that device writes landed in the CPU cache. Coherency is also achieved if device writes invalidate the contents of the CPU cache. I thought our non-LLC snooping platforms were coherent due to write-invalidate rather than write-update, but I can't find it specifically documented anywhere at the moment. If write-invalidate was used, then there shouldn't be a need for a later clflush either. [Trying to consolidate by doing a combined reply to the discussion so far.] On the write-invalidate vs write-update I don't know. If you did not find it in bspec then I doubt I would. I can have a browse still. My understanding was that snooping platforms just invalidated the CPU cache to prevent future CPU reads from seeing stale data but didn't actually stick any new data in there? Am I off track or is the original logic of this function not quite right? 
Anyway, even if the logic of this function is wrong, it's a mistake that would only hurt performance Yes, this logic will introduce performance impact because it's missing the checking for obj->pat_set_by_user. For objects with pat_set_by_user==true, even if the object is snooping or 1-way coherent, we don't want to enforce a clflush here since the coherency is supposed to be handled by user space. What should I add you think to fix it? Add a check for non-coherent WB in gpu_write_needs_clflush as an additional condition for returning false? And then if Matt is correct write-invalidate is used also !HAS_LLC should just return false? (flushing more often than we truly need to) rather than functionality, so not something we really need to dig into right now as part of this patch. if (IS_DGFX(i915)) return false; -/* - * For objects created by userspace through GEM_CREATE with pat_index - * set by set_pat extension, i915_gem_object_has_cache_level() will - * always return true, because the coherency of such object is managed - * by userspace. Othereise the call here would fall back to checking - * whether the object is un-cached or write-through. - */ -return !(i915_gem_object_has_cache_level(obj, I915_CACHE_NONE) || - i915_gem_object_has_cache_level(obj, I915_CACHE_WT)); +return i915_gem_object_has_cache_mode(obj, I915_CACHE_MODE_UC) != 1 && + i915_gem_object_has_cache_mode(obj, I915_CACHE_MODE_WT) != 1; } [snip] @@ -640,15 +640,9 @@ static inline int use_cpu_reloc(const struct reloc_cache *cache, if (DBG_FORCE_RELOC == FORCE_GTT_RELOC) return false; -/* - * For objects created by userspace through GEM_CREATE with pat_index - * set by set_pat extension, i915_gem_object_has_cache_level() always - * return true, otherwise the call would fall back to checking whether - * the object is un-cached. 
- */ return (cache->has_llc || obj->cache_dirty || -!i915_gem_object_has_cache_level(obj, I915_CACHE_NONE)); +i915_gem_object_has_cache_mode(obj, I915_CACHE_MODE_UC) != 1); Platforms with relocations and platforms with user-specified PAT have no overlap, right? So a -1 return should be impossible here and this is one case where we could just treat the return value as a boolean, right? Hm no, or maybe. My thinking behind tri-state is to allow a safe option for "don't know". In case PAT index to cache mode table is not fully populated on some future platform. My understanding is that the condition here means to say that, if GPU access is uncached, don't use CPU reloc because the CPU cache might contain stale data. This condition is sufficient for snooping platforms. But from MTL onward, the condition show be whether the GPU access is coherent with CPU. So, we should be checking 1-way coherent flag instead of UC mode, because even if the GPU access is WB, it's still non-coherent, thus CPU cache could be out-dated. Honestly the matrix of caching decision/logic
Re: [RFC v5 00/17] DRM cgroup controller with scheduling control and memory stats
Hi, On 19/07/2023 21:31, T.J. Mercier wrote: On Wed, Jul 12, 2023 at 4:47 AM Tvrtko Ursulin wrote: drm.memory.stat A nested file containing cumulative memory statistics for the whole sub-hierarchy, broken down into separate GPUs and separate memory regions supported by the latter. For example:: $ cat drm.memory.stat card0 region=system total=12898304 shared=0 active=0 resident=12111872 purgeable=167936 card0 region=stolen-system total=0 shared=0 active=0 resident=0 purgeable=0 Card designation corresponds to the DRM device names and multiple line entries can be present per card. Memory region names should be expected to be driver specific with the exception of 'system' which is standardised and applicable for GPUs which can operate on system memory buffers. Sub-keys 'resident' and 'purgeable' are optional. Per category region usage is reported in bytes. * Feedback from people interested in drm.active_us and drm.memory.stat is required to understand the use cases and their usefulness (of the fields). Memory stats are something which was easy to add to my series, since I was already working on the fdinfo memory stats patches, but the question is how useful it is. Hi Tvrtko, I think this style of driver-defined categories for reporting of memory could potentially allow us to eliminate the GPU memory tracking tracepoint used on Android (gpu_mem_total). This would involve reading drm.memory.stat at the root cgroup (I see it's currently disabled on I can put it available under root too, don't think there is any technical reason to not have it. In fact, now that I look at it again, memory.stat is present on root so that would align with my general guideline to keep the two as similar as possible. the root), which means traversing the whole cgroup tree under the cgroup lock to generate the values on-demand. This would be done rarely, but I still wonder what the cost of that would turn out to be. Yeah that's ugly. I could eliminate cgroup_lock by being a bit smarter. 
Just didn't think it worth it for the RFC. Basically to account memory stats for any sub-tree I need the equivalent one struct drm_memory_stats per DRM device present in the hierarchy. So I could pre-allocate a few and restart if I run out of spares, or something. They are really small so pre-allocating a good number, based on past state or something, should be good enough. Or even total number of DRM devices in a system as a pessimistic and safe option for most reasonable deployments. The drm_memory_stats categories in the output don't seem like a big value-add for this use-case, but no real objection to them being You mean the fact there are different categories is not a value add for your use case because you would only use one? The idea was to align 1:1 with DRM memory stats fdinfo and somewhat emulate how memory.stat also offers a breakdown. there. I know it's called the DRM cgroup controller, but it'd be nice if there were a way to make the mem tracking part work for any driver that wishes to participate as many of our devices don't use a DRM driver. But making that work doesn't look like it would fit very Ah that would be a challenge indeed to which I don't have any answers right now. Hm if you have a DRM device somewhere in the chain memory stats would still show up. Like if you had a dma-buf producer which is not a DRM driver, but then that buffer was imported by a DRM driver, it would show up in a cgroup. Or vice-versa. But if there aren't any in the whole chain then it would not. cleanly into this controller, so I'll just shut up now. Not at all, good feedback! Regards, Tvrtko
Re: [PATCH 2/2] drm/i915: Avoid -Wconstant-logical-operand in nsecs_to_jiffies_timeout()
On 18/07/2023 22:44, Nathan Chancellor wrote: A proposed update to clang's -Wconstant-logical-operand to warn when the left hand side is a constant shows the following instance in nsecs_to_jiffies_timeout() when NSEC_PER_SEC is not a multiple of HZ, such as CONFIG_HZ=300: drivers/gpu/drm/i915/gem/i915_gem_wait.c:189:24: warning: use of logical '&&' with constant operand [-Wconstant-logical-operand] 189 | if (NSEC_PER_SEC % HZ && | ~ ^ drivers/gpu/drm/i915/gem/i915_gem_wait.c:189:24: note: use '&' for a bitwise operation 189 | if (NSEC_PER_SEC % HZ && | ^~ | & drivers/gpu/drm/i915/gem/i915_gem_wait.c:189:24: note: remove constant to silence this warning 1 warning generated. Turn this into an explicit comparison against zero to make the expression a boolean to make it clear this should be a logical check, not a bitwise one. So -Wconstant-logical-operand only triggers when it is a constant but not zero constant? Why does that make sense — is it not a kludge to avoid too much noise? Personally, it all feels a bit over the top as a warning, since code in both cases should optimise away. And we may end up papering over it if it becomes a default. Then again this patch IMO does make the code more readable, so I am happy to take this one via our tree. Or either give ack to bring it in via drm-misc-next: Acked-by: Tvrtko Ursulin Let me know which route works best. 
Regards, Tvrtko Link: https://reviews.llvm.org/D142609 Signed-off-by: Nathan Chancellor --- drivers/gpu/drm/i915/gem/i915_gem_wait.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_wait.c b/drivers/gpu/drm/i915/gem/i915_gem_wait.c index 4a33ad2d122b..d4b918fb11ce 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_wait.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_wait.c @@ -186,7 +186,7 @@ i915_gem_object_wait(struct drm_i915_gem_object *obj, static inline unsigned long nsecs_to_jiffies_timeout(const u64 n) { /* nsecs_to_jiffies64() does not guard against overflow */ - if (NSEC_PER_SEC % HZ && + if ((NSEC_PER_SEC % HZ) != 0 && div_u64(n, NSEC_PER_SEC) >= MAX_JIFFY_OFFSET / HZ) return MAX_JIFFY_OFFSET;
[PULL] drm-intel-fixes
Hi Dave, Daniel, Only two fixes for the 6.5 rc window this week - one perf/OA use after free on Xe_HP platforms and one defconfig build fix for GCC versions older than 8. Regards, Tvrtko drm-intel-fixes-2023-07-20: - Add sentinel to xehp_oa_b_counters [perf] (Andrzej Hajda) - Revert "drm/i915: use localized __diag_ignore_all() instead of per file" (Jani Nikula) The following changes since commit fdf0eaf11452d72945af31804e2a1048ee1b574c: Linux 6.5-rc2 (2023-07-16 15:10:37 -0700) are available in the Git repository at: git://anongit.freedesktop.org/drm/drm-intel tags/drm-intel-fixes-2023-07-20 for you to fetch changes up to 2c27770a7bc88ef7f6614d11d96d8e62017d0b78: Revert "drm/i915: use localized __diag_ignore_all() instead of per file" (2023-07-17 13:39:04 +0100) - Add sentinel to xehp_oa_b_counters [perf] (Andrzej Hajda) - Revert "drm/i915: use localized __diag_ignore_all() instead of per file" (Jani Nikula) Andrzej Hajda (1): drm/i915/perf: add sentinel to xehp_oa_b_counters Jani Nikula (1): Revert "drm/i915: use localized __diag_ignore_all() instead of per file" drivers/gpu/drm/i915/Makefile | 5 + drivers/gpu/drm/i915/display/intel_display_device.c | 5 - drivers/gpu/drm/i915/display/intel_fbdev.c | 5 - drivers/gpu/drm/i915/i915_pci.c | 5 - drivers/gpu/drm/i915/i915_perf.c| 1 + 5 files changed, 6 insertions(+), 15 deletions(-)
[PATCH v3] drm/i915: Refactor PAT/object cache handling
From: Tvrtko Ursulin Commit 9275277d5324 ("drm/i915: use pat_index instead of cache_level") has introduced PAT indices to i915 internal APIs, partially replacing the usage of driver internal cache_level, but has also added a few questionable design decisions which this patch tries to improve upon. Principal change is to invert the per platform cache level to PAT index table which was added by the referenced commit, and by doing so enable i915 to understand the cache mode between PAT indices, changing them from opaque to transparent. Once we have the inverted table we are able to remove the hidden false "return true" from i915_gem_object_has_cache_level. Other changes/fixes/improvements we are able to do: 1) Replace the enum i915_cache_level with i915_cache_t, composed of a more detailed representation of each cache mode (base mode plus flags). For instance this way we are able to express the difference between WB and 1-way coherent WB on Meteorlake. Which in turn enables us to map the i915 "cached" mode to the correct Meteorlake PAT index. 2) We can cache PAT indices of the caching modes used by the driver itself in struct drm_i915_private, which eliminates the runtime calls to i915_gem_get_pat_index from both high- and low-level i915 components. 3) We can also cache the caching modes used by the driver for coherent access and for display buffers. 4) Remove the incorrect references to enum i915_cache_level from low level PTE encode vfuncs, since those are actually given PAT indices by their callers. 5) Because i915 now understands PAT indices, we can remove the overly aggressive flushing triggered from i915_gem_object_can_bypass_llc() and limit it to non-coherent write-back mode only. 6) Finally we are able to replace the platform dependent cache mode to string code in debugfs and elsewhere by the single implementation based on i915_cache_t. v2: * Fix PAT-to-cache-mode table for PVC. (Fei) * Cache display caching mode too. 
(Fei) * Improve and document criteria in i915_gem_object_can_bypass_llc() (Matt) v3: * Checkpath issues. * Cache mode flags check fixed. Signed-off-by: Tvrtko Ursulin Fixes: 9275277d5324 ("drm/i915: use pat_index instead of cache_level") Cc: Chris Wilson Cc: Fei Yang Cc: Andi Shyti Cc: Matt Roper --- drivers/gpu/drm/i915/Makefile | 1 + .../drm/i915/display/intel_plane_initial.c| 3 +- drivers/gpu/drm/i915/gem/i915_gem_domain.c| 56 --- drivers/gpu/drm/i915/gem/i915_gem_domain.h| 5 +- .../gpu/drm/i915/gem/i915_gem_execbuffer.c| 13 +- drivers/gpu/drm/i915/gem/i915_gem_internal.c | 4 +- drivers/gpu/drm/i915/gem/i915_gem_mman.c | 12 +- drivers/gpu/drm/i915/gem/i915_gem_object.c| 152 +++--- drivers/gpu/drm/i915/gem/i915_gem_object.h| 11 +- .../gpu/drm/i915/gem/i915_gem_object_types.h | 116 + drivers/gpu/drm/i915/gem/i915_gem_shmem.c | 8 +- drivers/gpu/drm/i915/gem/i915_gem_stolen.c| 11 +- drivers/gpu/drm/i915/gem/i915_gem_ttm_move.c | 44 ++--- drivers/gpu/drm/i915/gem/i915_gem_userptr.c | 2 +- .../drm/i915/gem/selftests/huge_gem_object.c | 4 +- .../gpu/drm/i915/gem/selftests/huge_pages.c | 6 +- drivers/gpu/drm/i915/gt/gen6_ppgtt.c | 4 +- drivers/gpu/drm/i915/gt/gen8_ppgtt.c | 19 +-- drivers/gpu/drm/i915/gt/intel_engine_cs.c | 2 +- drivers/gpu/drm/i915/gt/intel_ggtt.c | 33 ++-- drivers/gpu/drm/i915/gt/intel_ggtt_gmch.c | 4 +- drivers/gpu/drm/i915/gt/intel_gtt.c | 2 +- drivers/gpu/drm/i915/gt/intel_gtt.h | 3 +- drivers/gpu/drm/i915/gt/intel_migrate.c | 11 +- drivers/gpu/drm/i915/gt/intel_ppgtt.c | 6 +- .../gpu/drm/i915/gt/intel_ring_submission.c | 4 +- drivers/gpu/drm/i915/gt/intel_timeline.c | 2 +- drivers/gpu/drm/i915/gt/selftest_hangcheck.c | 2 +- drivers/gpu/drm/i915/gt/selftest_migrate.c| 9 +- drivers/gpu/drm/i915/gt/selftest_reset.c | 14 +- drivers/gpu/drm/i915/gt/selftest_tlb.c| 5 +- .../gpu/drm/i915/gt/selftest_workarounds.c| 2 +- drivers/gpu/drm/i915/gt/uc/intel_uc_fw.c | 8 +- drivers/gpu/drm/i915/i915_cache.c | 91 +++ drivers/gpu/drm/i915/i915_cache.h | 60 +++ 
drivers/gpu/drm/i915/i915_debugfs.c | 53 +- drivers/gpu/drm/i915/i915_driver.c| 5 + drivers/gpu/drm/i915/i915_drv.h | 5 + drivers/gpu/drm/i915/i915_gem.c | 21 +-- drivers/gpu/drm/i915/i915_gpu_error.c | 7 +- drivers/gpu/drm/i915/i915_pci.c | 82 +- drivers/gpu/drm/i915/i915_perf.c | 2 +- drivers/gpu/drm/i915/intel_device_info.h | 6 +- drivers/gpu/drm/i915/selftests/i915_gem.c | 5 +- .../gpu/drm/i915/selftests/i915_gem_evict.c | 8 +- drivers/gpu/drm/i915/selftests/i915_gem_gtt.c | 13 +- drivers/gpu/drm/i915/selftes
[PATCH v2] drm/i915: Refactor PAT/object cache handling
From: Tvrtko Ursulin Commit 9275277d5324 ("drm/i915: use pat_index instead of cache_level") has introduced PAT indices to i915 internal APIs, partially replacing the usage of driver internal cache_level, but has also added a few questionable design decisions which this patch tries to improve upon. Principal change is to invert the per platform cache level to PAT index table which was added by the referenced commit, and by doing so enable i915 to understand the cache mode between PAT indices, changing them from opaque to transparent. Once we have the inverted table we are able to remove the hidden false "return true" from i915_gem_object_has_cache_level. Other changes/fixes/improvements we are able to do: 1) Replace the enum i915_cache_level with i915_cache_t, composed of a more detailed representation of each cache mode (base mode plus flags). For instance this way we are able to express the difference between WB and 1-way coherent WB on Meteorlake. Which in turn enables us to map the i915 "cached" mode to the correct Meteorlake PAT index. 2) We can cache PAT indices of the caching modes used by the driver itself in struct drm_i915_private, which eliminates the runtime calls to i915_gem_get_pat_index from both high- and low-level i915 components. 3) We can also cache the caching modes used by the driver for coherent access and for display buffers. 4) Remove the incorrect references to enum i915_cache_level from low level PTE encode vfuncs, since those are actually given PAT indices by their callers. 5) Because i915 now understands PAT indices, we can remove the overly aggressive flushing triggered from i915_gem_object_can_bypass_llc() and limit it to non-coherent write-back mode only. 6) Finally we are able to replace the platform dependent cache mode to string code in debugfs and elsewhere by the single implementation based on i915_cache_t. v2: * Fix PAT-to-cache-mode table for PVC. (Fei) * Cache display caching mode too. 
(Fei) * Improve and document criteria in i915_gem_object_can_bypass_llc() (Matt) Signed-off-by: Tvrtko Ursulin Fixes: 9275277d5324 ("drm/i915: use pat_index instead of cache_level") Cc: Chris Wilson Cc: Fei Yang Cc: Andi Shyti Cc: Matt Roper --- drivers/gpu/drm/i915/Makefile | 1 + .../drm/i915/display/intel_plane_initial.c| 3 +- drivers/gpu/drm/i915/gem/i915_gem_domain.c| 56 +++ drivers/gpu/drm/i915/gem/i915_gem_domain.h| 5 +- .../gpu/drm/i915/gem/i915_gem_execbuffer.c| 13 +- drivers/gpu/drm/i915/gem/i915_gem_internal.c | 4 +- drivers/gpu/drm/i915/gem/i915_gem_mman.c | 12 +- drivers/gpu/drm/i915/gem/i915_gem_object.c| 147 ++ drivers/gpu/drm/i915/gem/i915_gem_object.h| 11 +- .../gpu/drm/i915/gem/i915_gem_object_types.h | 116 +- drivers/gpu/drm/i915/gem/i915_gem_shmem.c | 8 +- drivers/gpu/drm/i915/gem/i915_gem_stolen.c| 11 +- drivers/gpu/drm/i915/gem/i915_gem_ttm_move.c | 44 +++--- drivers/gpu/drm/i915/gem/i915_gem_userptr.c | 2 +- .../drm/i915/gem/selftests/huge_gem_object.c | 4 +- .../gpu/drm/i915/gem/selftests/huge_pages.c | 6 +- drivers/gpu/drm/i915/gt/gen6_ppgtt.c | 4 +- drivers/gpu/drm/i915/gt/gen8_ppgtt.c | 19 +-- drivers/gpu/drm/i915/gt/intel_engine_cs.c | 2 +- drivers/gpu/drm/i915/gt/intel_ggtt.c | 33 ++-- drivers/gpu/drm/i915/gt/intel_ggtt_gmch.c | 4 +- drivers/gpu/drm/i915/gt/intel_gtt.c | 2 +- drivers/gpu/drm/i915/gt/intel_gtt.h | 3 +- drivers/gpu/drm/i915/gt/intel_migrate.c | 11 +- drivers/gpu/drm/i915/gt/intel_ppgtt.c | 6 +- .../gpu/drm/i915/gt/intel_ring_submission.c | 4 +- drivers/gpu/drm/i915/gt/intel_timeline.c | 2 +- drivers/gpu/drm/i915/gt/selftest_hangcheck.c | 2 +- drivers/gpu/drm/i915/gt/selftest_migrate.c| 9 +- drivers/gpu/drm/i915/gt/selftest_reset.c | 14 +- drivers/gpu/drm/i915/gt/selftest_tlb.c| 5 +- .../gpu/drm/i915/gt/selftest_workarounds.c| 2 +- drivers/gpu/drm/i915/gt/uc/intel_uc_fw.c | 8 +- drivers/gpu/drm/i915/i915_cache.c | 92 +++ drivers/gpu/drm/i915/i915_cache.h | 61 drivers/gpu/drm/i915/i915_debugfs.c | 53 +-- 
drivers/gpu/drm/i915/i915_driver.c| 5 + drivers/gpu/drm/i915/i915_drv.h | 5 + drivers/gpu/drm/i915/i915_gem.c | 21 +-- drivers/gpu/drm/i915/i915_gpu_error.c | 7 +- drivers/gpu/drm/i915/i915_pci.c | 82 +- drivers/gpu/drm/i915/i915_perf.c | 2 +- drivers/gpu/drm/i915/intel_device_info.h | 6 +- drivers/gpu/drm/i915/selftests/i915_gem.c | 5 +- .../gpu/drm/i915/selftests/i915_gem_evict.c | 8 +- drivers/gpu/drm/i915/selftests/i915_gem_gtt.c | 13 +- drivers/gpu/drm/i915/selftests/igt_spinner.c | 2 +- .../drm/i915/selftests/intel_mem
[CI 4/4] drm/i915: Expose RPS thresholds in sysfs
From: Tvrtko Ursulin User feedback indicates significant performance gains are possible in specific games with non default RPS up/down thresholds. Expose these tunables via sysfs which will allow users to achieve best performance when running games and best power efficiency elsewhere. Note this patch supports non GuC based platforms only. v2: * Make checkpatch happy. Signed-off-by: Tvrtko Ursulin References: https://gitlab.freedesktop.org/drm/intel/-/issues/8389 Cc: Rodrigo Vivi Reviewed-by: Rodrigo Vivi Reviewed-by: Andi Shyti --- drivers/gpu/drm/i915/gt/intel_gt_sysfs_pm.c | 108 1 file changed, 108 insertions(+) diff --git a/drivers/gpu/drm/i915/gt/intel_gt_sysfs_pm.c b/drivers/gpu/drm/i915/gt/intel_gt_sysfs_pm.c index ee2b44f896a2..f0dea54880af 100644 --- a/drivers/gpu/drm/i915/gt/intel_gt_sysfs_pm.c +++ b/drivers/gpu/drm/i915/gt/intel_gt_sysfs_pm.c @@ -700,6 +700,80 @@ static const struct attribute *media_perf_power_attrs[] = { NULL }; +static ssize_t +rps_up_threshold_pct_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct intel_gt *gt = intel_gt_sysfs_get_drvdata(kobj, attr->attr.name); + struct intel_rps *rps = &gt->rps; + + return sysfs_emit(buf, "%u\n", intel_rps_get_up_threshold(rps)); +} + +static ssize_t +rps_up_threshold_pct_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct intel_gt *gt = intel_gt_sysfs_get_drvdata(kobj, attr->attr.name); + struct intel_rps *rps = &gt->rps; + int ret; + u8 val; + + ret = kstrtou8(buf, 10, &val); + if (ret) + return ret; + + ret = intel_rps_set_up_threshold(rps, val); + + return ret == 0 ? 
count : ret; +} + +static struct kobj_attribute rps_up_threshold_pct = + __ATTR(rps_up_threshold_pct, + 0664, + rps_up_threshold_pct_show, + rps_up_threshold_pct_store); + +static ssize_t +rps_down_threshold_pct_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct intel_gt *gt = intel_gt_sysfs_get_drvdata(kobj, attr->attr.name); + struct intel_rps *rps = &gt->rps; + + return sysfs_emit(buf, "%u\n", intel_rps_get_down_threshold(rps)); +} + +static ssize_t +rps_down_threshold_pct_store(struct kobject *kobj, struct kobj_attribute *attr, +const char *buf, size_t count) +{ + struct intel_gt *gt = intel_gt_sysfs_get_drvdata(kobj, attr->attr.name); + struct intel_rps *rps = &gt->rps; + int ret; + u8 val; + + ret = kstrtou8(buf, 10, &val); + if (ret) + return ret; + + ret = intel_rps_set_down_threshold(rps, val); + + return ret == 0 ? count : ret; +} + +static struct kobj_attribute rps_down_threshold_pct = + __ATTR(rps_down_threshold_pct, + 0664, + rps_down_threshold_pct_show, + rps_down_threshold_pct_store); + +static const struct attribute * const gen6_gt_rps_attrs[] = { + &rps_up_threshold_pct.attr, + &rps_down_threshold_pct.attr, + NULL +}; + static ssize_t default_min_freq_mhz_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -722,9 +796,37 @@ default_max_freq_mhz_show(struct kobject *kobj, struct kobj_attribute *attr, cha static struct kobj_attribute default_max_freq_mhz = __ATTR(rps_max_freq_mhz, 0444, default_max_freq_mhz_show, NULL); +static ssize_t +default_rps_up_threshold_pct_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + struct intel_gt *gt = kobj_to_gt(kobj->parent); + + return sysfs_emit(buf, "%u\n", gt->defaults.rps_up_threshold); +} + +static struct kobj_attribute default_rps_up_threshold_pct = +__ATTR(rps_up_threshold_pct, 0444, default_rps_up_threshold_pct_show, NULL); + +static ssize_t +default_rps_down_threshold_pct_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + 
struct intel_gt *gt = kobj_to_gt(kobj->parent); + + return sysfs_emit(buf, "%u\n", gt->defaults.rps_down_threshold); +} + +static struct kobj_attribute default_rps_down_threshold_pct = +__ATTR(rps_down_threshold_pct, 0444, default_rps_down_threshold_pct_show, NULL); + static const struct attribute * const rps_defaults_attrs[] = { &default_min_freq_mhz.attr, &default_max_freq_mhz.attr, + &default_rps_up_threshold_pct.attr, + &default_rps_down_threshold_pct.attr, NULL }; @@ -752,6 +854,12 @@ static int intel_sysfs_rps_init(struct intel_gt *gt, struct kobject *kobj) if (IS_VALLEYVIEW(gt->i915) || IS_CHERRYVIEW(gt->i915)) ret = sysfs_create_file(kobj, vlv_attr); +
[CI 3/4] drm/i915: Add helpers for managing rps thresholds
From: Tvrtko Ursulin In preparation for exposing via sysfs add helpers for managing rps thresholds. v2: * Force sw and hw re-programming on threshold change. Signed-off-by: Tvrtko Ursulin Cc: Rodrigo Vivi Reviewed-by: Rodrigo Vivi Reviewed-by: Andi Shyti --- drivers/gpu/drm/i915/gt/intel_rps.c | 54 + drivers/gpu/drm/i915/gt/intel_rps.h | 4 +++ 2 files changed, 58 insertions(+) diff --git a/drivers/gpu/drm/i915/gt/intel_rps.c b/drivers/gpu/drm/i915/gt/intel_rps.c index 69847f919586..092542f53aad 100644 --- a/drivers/gpu/drm/i915/gt/intel_rps.c +++ b/drivers/gpu/drm/i915/gt/intel_rps.c @@ -16,7 +16,9 @@ #include "intel_gt.h" #include "intel_gt_clock_utils.h" #include "intel_gt_irq.h" +#include "intel_gt_pm.h" #include "intel_gt_pm_irq.h" +#include "intel_gt_print.h" #include "intel_gt_regs.h" #include "intel_mchbar_regs.h" #include "intel_pcode.h" @@ -2576,6 +2578,58 @@ int intel_rps_set_min_frequency(struct intel_rps *rps, u32 val) return set_min_freq(rps, val); } +u8 intel_rps_get_up_threshold(struct intel_rps *rps) +{ + return rps->power.up_threshold; +} + +static int rps_set_threshold(struct intel_rps *rps, u8 *threshold, u8 val) +{ + int ret; + + if (val > 100) + return -EINVAL; + + ret = mutex_lock_interruptible(&rps->lock); + if (ret) + return ret; + + if (*threshold == val) + goto out_unlock; + + *threshold = val; + + /* Force reset. 
*/ + rps->last_freq = -1; + mutex_lock(&rps->power.mutex); + rps->power.mode = -1; + mutex_unlock(&rps->power.mutex); + + intel_rps_set(rps, clamp(rps->cur_freq, +rps->min_freq_softlimit, +rps->max_freq_softlimit)); + +out_unlock: + mutex_unlock(&rps->lock); + + return ret; +} + +int intel_rps_set_up_threshold(struct intel_rps *rps, u8 threshold) +{ + return rps_set_threshold(rps, &rps->power.up_threshold, threshold); +} + +u8 intel_rps_get_down_threshold(struct intel_rps *rps) +{ + return rps->power.down_threshold; +} + +int intel_rps_set_down_threshold(struct intel_rps *rps, u8 threshold) +{ + return rps_set_threshold(rps, &rps->power.down_threshold, threshold); +} + static void intel_rps_set_manual(struct intel_rps *rps, bool enable) { struct intel_uncore *uncore = rps_to_uncore(rps); diff --git a/drivers/gpu/drm/i915/gt/intel_rps.h b/drivers/gpu/drm/i915/gt/intel_rps.h index a3fa987aa91f..92fb01f5a452 100644 --- a/drivers/gpu/drm/i915/gt/intel_rps.h +++ b/drivers/gpu/drm/i915/gt/intel_rps.h @@ -37,6 +37,10 @@ void intel_rps_mark_interactive(struct intel_rps *rps, bool interactive); int intel_gpu_freq(struct intel_rps *rps, int val); int intel_freq_opcode(struct intel_rps *rps, int val); +u8 intel_rps_get_up_threshold(struct intel_rps *rps); +int intel_rps_set_up_threshold(struct intel_rps *rps, u8 threshold); +u8 intel_rps_get_down_threshold(struct intel_rps *rps); +int intel_rps_set_down_threshold(struct intel_rps *rps, u8 threshold); u32 intel_rps_read_actual_frequency(struct intel_rps *rps); u32 intel_rps_read_actual_frequency_fw(struct intel_rps *rps); u32 intel_rps_get_requested_frequency(struct intel_rps *rps); -- 2.39.2
[CI 1/4] drm/i915: Move setting of rps thresholds to init
From: Tvrtko Ursulin Since 36d516be867c ("drm/i915/gt: Switch to manual evaluation of RPS") thresholds are invariant so lets move their setting to init time. Signed-off-by: Tvrtko Ursulin Cc: Rodrigo Vivi Reviewed-by: Rodrigo Vivi Reviewed-by: Andi Shyti --- drivers/gpu/drm/i915/gt/intel_rps.c | 27 --- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/intel_rps.c b/drivers/gpu/drm/i915/gt/intel_rps.c index e92e626d4994..20d44549f65e 100644 --- a/drivers/gpu/drm/i915/gt/intel_rps.c +++ b/drivers/gpu/drm/i915/gt/intel_rps.c @@ -672,7 +672,6 @@ static void rps_set_power(struct intel_rps *rps, int new_power) { struct intel_gt *gt = rps_to_gt(rps); struct intel_uncore *uncore = gt->uncore; - u32 threshold_up = 0, threshold_down = 0; /* in % */ u32 ei_up = 0, ei_down = 0; lockdep_assert_held(>power.mutex); @@ -680,9 +679,6 @@ static void rps_set_power(struct intel_rps *rps, int new_power) if (new_power == rps->power.mode) return; - threshold_up = 95; - threshold_down = 85; - /* Note the units here are not exactly 1us, but 1280ns. */ switch (new_power) { case LOW_POWER: @@ -709,17 +705,22 @@ static void rps_set_power(struct intel_rps *rps, int new_power) GT_TRACE(gt, "changing power mode [%d], up %d%% @ %dus, down %d%% @ %dus\n", -new_power, threshold_up, ei_up, threshold_down, ei_down); +new_power, +rps->power.up_threshold, ei_up, +rps->power.down_threshold, ei_down); set(uncore, GEN6_RP_UP_EI, intel_gt_ns_to_pm_interval(gt, ei_up * 1000)); set(uncore, GEN6_RP_UP_THRESHOLD, - intel_gt_ns_to_pm_interval(gt, ei_up * threshold_up * 10)); + intel_gt_ns_to_pm_interval(gt, + ei_up * rps->power.up_threshold * 10)); set(uncore, GEN6_RP_DOWN_EI, intel_gt_ns_to_pm_interval(gt, ei_down * 1000)); set(uncore, GEN6_RP_DOWN_THRESHOLD, - intel_gt_ns_to_pm_interval(gt, ei_down * threshold_down * 10)); + intel_gt_ns_to_pm_interval(gt, + ei_down * + rps->power.down_threshold * 10)); set(uncore, GEN6_RP_CONTROL, (GRAPHICS_VER(gt->i915) > 9 ? 
0 : GEN6_RP_MEDIA_TURBO) | @@ -731,8 +732,6 @@ static void rps_set_power(struct intel_rps *rps, int new_power) skip_hw_write: rps->power.mode = new_power; - rps->power.up_threshold = threshold_up; - rps->power.down_threshold = threshold_down; } static void gen6_rps_set_thresholds(struct intel_rps *rps, u8 val) @@ -1559,10 +1558,12 @@ void intel_rps_enable(struct intel_rps *rps) return; GT_TRACE(rps_to_gt(rps), -"min:%x, max:%x, freq:[%d, %d]\n", +"min:%x, max:%x, freq:[%d, %d], thresholds:[%u, %u]\n", rps->min_freq, rps->max_freq, intel_gpu_freq(rps, rps->min_freq), -intel_gpu_freq(rps, rps->max_freq)); +intel_gpu_freq(rps, rps->max_freq), +rps->power.up_threshold, +rps->power.down_threshold); GEM_BUG_ON(rps->max_freq < rps->min_freq); GEM_BUG_ON(rps->idle_freq > rps->max_freq); @@ -2015,6 +2016,10 @@ void intel_rps_init(struct intel_rps *rps) } } + /* Set default thresholds in % */ + rps->power.up_threshold = 95; + rps->power.down_threshold = 85; + /* Finally allow us to boost to max by default */ rps->boost_freq = rps->max_freq; rps->idle_freq = rps->min_freq; -- 2.39.2
[CI 2/4] drm/i915: Record default rps threshold values
From: Tvrtko Ursulin Record the default values as preparation for exposing the sysfs controls. Signed-off-by: Tvrtko Ursulin Cc: Rodrigo Vivi Reviewed-by: Rodrigo Vivi Reviewed-by: Andi Shyti --- drivers/gpu/drm/i915/gt/intel_gt_types.h | 3 +++ drivers/gpu/drm/i915/gt/intel_rps.c | 2 ++ 2 files changed, 5 insertions(+) diff --git a/drivers/gpu/drm/i915/gt/intel_gt_types.h b/drivers/gpu/drm/i915/gt/intel_gt_types.h index f08c2556aa25..1b22d7a50665 100644 --- a/drivers/gpu/drm/i915/gt/intel_gt_types.h +++ b/drivers/gpu/drm/i915/gt/intel_gt_types.h @@ -83,6 +83,9 @@ enum intel_submission_method { struct gt_defaults { u32 min_freq; u32 max_freq; + + u8 rps_up_threshold; + u8 rps_down_threshold; }; enum intel_gt_type { diff --git a/drivers/gpu/drm/i915/gt/intel_rps.c b/drivers/gpu/drm/i915/gt/intel_rps.c index 20d44549f65e..69847f919586 100644 --- a/drivers/gpu/drm/i915/gt/intel_rps.c +++ b/drivers/gpu/drm/i915/gt/intel_rps.c @@ -2018,7 +2018,9 @@ void intel_rps_init(struct intel_rps *rps) /* Set default thresholds in % */ rps->power.up_threshold = 95; + rps_to_gt(rps)->defaults.rps_up_threshold = rps->power.up_threshold; rps->power.down_threshold = 85; + rps_to_gt(rps)->defaults.rps_down_threshold = rps->power.down_threshold; /* Finally allow us to boost to max by default */ rps->boost_freq = rps->max_freq; -- 2.39.2