Re: [PATCH v3 2/2] drm/i915/ttm: Failsafe migration blits

2021-11-02 Thread Matthew Auld
On Tue, 2 Nov 2021 at 17:55, Thomas Hellström wrote:
>
>
> On 11/2/21 18:40, Matthew Auld wrote:
> > On Tue, 2 Nov 2021 at 16:39, Thomas Hellström wrote:
> >> If the initial fill blit or copy blit of an object fails, the old
> >> content of the data might be exposed and read as soon as either CPU- or
> >> GPU PTEs are set up to point at the pages.
> >>
> >> Intercept the blit fence with an async callback that checks the
> >> blit fence for errors and if there are errors performs an async cpu blit
> >> instead. If there is a failure to allocate the async dma_fence_work,
> >> allocate it on the stack and sync wait for the blit to complete.
> >>
> >> Add selftests that simulate gpu blit failures and failure to allocate
> >> the async dma_fence_work.
> >>
> >> A previous version of this patch used dma_fence_work; that is now
> >> open-coded, which adds more code but might lower the latency
> >> somewhat in the common non-error case.
> >>
> >> v3:
> >> - Style fixes (Matthew Auld)
> >>
> >> Signed-off-by: Thomas Hellström 
> >> Reviewed-by: Matthew Auld 
> >> ---
> >>   drivers/gpu/drm/i915/gem/i915_gem_ttm_move.c  | 322 +++---
> >>   drivers/gpu/drm/i915/gem/i915_gem_ttm_move.h  |   5 +
> >>   .../drm/i915/gem/selftests/i915_gem_migrate.c |  24 +-
> >>   3 files changed, 295 insertions(+), 56 deletions(-)
> >>
> >> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_ttm_move.c b/drivers/gpu/drm/i915/gem/i915_gem_ttm_move.c
> >> index 0ed6b7f2b95f..b89672c547f8 100644
> >> --- a/drivers/gpu/drm/i915/gem/i915_gem_ttm_move.c
> >> +++ b/drivers/gpu/drm/i915/gem/i915_gem_ttm_move.c
> >> @@ -18,6 +18,29 @@
> >>   #include "gt/intel_gt.h"
> >>   #include "gt/intel_migrate.h"
> >>
> >> +/**
> >> + * DOC: Selftest failure modes for failsafe migration:
> >> + *
> >> + * For fail_gpu_migration, the gpu blit scheduled is always a clear blit
> >> + * rather than a copy blit, and then we force the failure paths as if
> >> + * the blit fence returned an error.
> >> + *
> >> + * For fail_work_allocation we fail the kmalloc of the async worker and
> >> + * instead sync-wait for the gpu blit. If that then fails, or
> >> + * fail_gpu_migration is set to true, a memcpy operation is performed
> >> + * synchronously.
> >> + */
> >> +#ifdef CONFIG_DRM_I915_SELFTEST
> > When pushing maybe make this:
> >
> > #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
> >
> > Which seems to be consistent with most of the other places.
>
> Hmm,
>
> I noticed that i915 is doing that, although I thought these macros were
> primarily intended for C expressions?

i915 is not the only one. AFAIK it just conveniently handles both
built-in and module builds.
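
For illustration, a minimal sketch of the difference, assuming a
hypothetical tristate option CONFIG_FOO (for a plain bool option the
two forms behave the same):

#include <linux/kconfig.h>

/*
 * With CONFIG_FOO=y the preprocessor sees CONFIG_FOO defined; with
 * CONFIG_FOO=m it sees CONFIG_FOO_MODULE instead.  A plain #ifdef
 * therefore only catches the built-in case, while IS_ENABLED()
 * catches both.
 */
#ifdef CONFIG_FOO			/* =y only */
static void foo_builtin_only(void) {}
#endif

#if IS_ENABLED(CONFIG_FOO)		/* =y or =m */
static void foo_builtin_or_module(void) {}
#endif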

>
> /Thomas
>
>
> >
> >> +static bool fail_gpu_migration;
> >> +static bool fail_work_allocation;
> >> +
> >> +void i915_ttm_migrate_set_failure_modes(bool gpu_migration,
> >> +   bool work_allocation)
> >> +{
> >> +   fail_gpu_migration = gpu_migration;
> >> +   fail_work_allocation = work_allocation;
> >> +}
> >> +#endif
> >> +
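
As a usage illustration for the failure-mode knobs quoted above, a
hypothetical sketch of how a selftest might drive them around a forced
migration; trigger_object_migration() and the include path are
assumptions, not the patch's actual selftest code:

/* Hypothetical usage sketch of the failure-mode knobs, not the
 * patch's actual selftest.  trigger_object_migration() is a stand-in
 * helper and the include path is assumed.
 */
#include "gem/i915_gem_ttm_move.h"

static int igt_failsafe_migration_example(void)
{
	int err;

	/* Pretend every gpu blit fails so the memcpy fallback must run. */
	i915_ttm_migrate_set_failure_modes(true, false);

	err = trigger_object_migration();	/* hypothetical helper */

	/* Restore normal behaviour for subsequent tests. */
	i915_ttm_migrate_set_failure_modes(false, false);

	return err;
}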


[PATCH v3 2/2] drm/i915/ttm: Failsafe migration blits

2021-11-02 Thread Thomas Hellström
If the initial fill blit or copy blit of an object fails, the old
content of the data might be exposed and read as soon as either CPU- or
GPU PTEs are set up to point at the pages.

Intercept the blit fence with an async callback that checks the
blit fence for errors and if there are errors performs an async cpu blit
instead. If there is a failure to allocate the async dma_fence_work,
allocate it on the stack and sync wait for the blit to complete.
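
For illustration, a minimal sketch (not the patch's actual code) of the
fence-interception pattern described above, using the generic dma_fence
callback API; all "my_*" names are hypothetical and fence reference
handling is omitted for brevity:

#include <linux/dma-fence.h>
#include <linux/slab.h>
#include <linux/workqueue.h>

struct my_migration_work {
	struct dma_fence_cb cb;
	struct work_struct work;	/* runs the CPU copy fallback */
};

static void my_cpu_copy_fn(struct work_struct *work)
{
	struct my_migration_work *mw = container_of(work, typeof(*mw), work);

	/* ... perform the memcpy-based copy here ... */
	kfree(mw);
}

static void my_blit_done(struct dma_fence *fence, struct dma_fence_cb *cb)
{
	struct my_migration_work *mw = container_of(cb, typeof(*mw), cb);

	if (fence->error)		/* gpu blit failed: schedule CPU fallback */
		schedule_work(&mw->work);
	else
		kfree(mw);
}

static int my_intercept_blit(struct dma_fence *blit)
{
	struct my_migration_work *mw = kmalloc(sizeof(*mw), GFP_KERNEL);

	if (!mw) {
		/* No memory for the async path: fall back to a sync wait. */
		long err = dma_fence_wait(blit, false);

		return err < 0 ? (int)err : 0;
	}

	INIT_WORK(&mw->work, my_cpu_copy_fn);
	if (dma_fence_add_callback(blit, &mw->cb, my_blit_done))
		my_blit_done(blit, &mw->cb);	/* fence already signaled */

	return 0;
}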

Add selftests that simulate gpu blit failures and failure to allocate
the async dma_fence_work.

A previous version of this patch used dma_fence_work; that is now
open-coded, which adds more code but might lower the latency
somewhat in the common non-error case.

v3:
- Style fixes (Matthew Auld)

Signed-off-by: Thomas Hellström 
Reviewed-by: Matthew Auld 
---
 drivers/gpu/drm/i915/gem/i915_gem_ttm_move.c  | 322 +++---
 drivers/gpu/drm/i915/gem/i915_gem_ttm_move.h  |   5 +
 .../drm/i915/gem/selftests/i915_gem_migrate.c |  24 +-
 3 files changed, 295 insertions(+), 56 deletions(-)

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_ttm_move.c b/drivers/gpu/drm/i915/gem/i915_gem_ttm_move.c
index 0ed6b7f2b95f..b89672c547f8 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_ttm_move.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_ttm_move.c
@@ -18,6 +18,29 @@
 #include "gt/intel_gt.h"
 #include "gt/intel_migrate.h"
 
+/**
+ * DOC: Selftest failure modes for failsafe migration:
+ *
+ * For fail_gpu_migration, the gpu blit scheduled is always a clear blit
+ * rather than a copy blit, and then we force the failure paths as if
+ * the blit fence returned an error.
+ *
+ * For fail_work_allocation we fail the kmalloc of the async worker and
+ * instead sync-wait for the gpu blit. If that then fails, or
+ * fail_gpu_migration is set to true, a memcpy operation is performed
+ * synchronously.
+ */
+#ifdef CONFIG_DRM_I915_SELFTEST
+static bool fail_gpu_migration;
+static bool fail_work_allocation;
+
+void i915_ttm_migrate_set_failure_modes(bool gpu_migration,
+   bool work_allocation)
+{
+   fail_gpu_migration = gpu_migration;
+   fail_work_allocation = work_allocation;
+}
+#endif
+
 static enum i915_cache_level
 i915_ttm_cache_level(struct drm_i915_private *i915, struct ttm_resource *res,
 struct ttm_tt *ttm)
@@ -129,11 +152,11 @@ int i915_ttm_move_notify(struct ttm_buffer_object *bo)
return 0;
 }
 
-static int i915_ttm_accel_move(struct ttm_buffer_object *bo,
-  bool clear,
-  struct ttm_resource *dst_mem,
-  struct ttm_tt *dst_ttm,
-  struct sg_table *dst_st)
+static struct dma_fence *i915_ttm_accel_move(struct ttm_buffer_object *bo,
+bool clear,
+struct ttm_resource *dst_mem,
+struct ttm_tt *dst_ttm,
+struct sg_table *dst_st)
 {
struct drm_i915_private *i915 = container_of(bo->bdev, typeof(*i915),
 bdev);
@@ -144,30 +167,29 @@ static int i915_ttm_accel_move(struct ttm_buffer_object *bo,
int ret;
 
	if (!i915->gt.migrate.context || intel_gt_is_wedged(&i915->gt))
-   return -EINVAL;
+   return ERR_PTR(-EINVAL);
+
+   /* With fail_gpu_migration, we always perform a GPU clear. */
+   if (I915_SELFTEST_ONLY(fail_gpu_migration))
+   clear = true;
 
dst_level = i915_ttm_cache_level(i915, dst_mem, dst_ttm);
if (clear) {
-   if (bo->type == ttm_bo_type_kernel)
-   return -EINVAL;
+   if (bo->type == ttm_bo_type_kernel &&
+   !I915_SELFTEST_ONLY(fail_gpu_migration))
+   return ERR_PTR(-EINVAL);
 
intel_engine_pm_get(i915->gt.migrate.context->engine);
		ret = intel_context_migrate_clear(i915->gt.migrate.context, NULL,
						  dst_st->sgl, dst_level,
						  i915_ttm_gtt_binds_lmem(dst_mem),
						  0, &rq);
-
-   if (!ret && rq) {
-   i915_request_wait(rq, 0, MAX_SCHEDULE_TIMEOUT);
-   i915_request_put(rq);
-   }
-   intel_engine_pm_put(i915->gt.migrate.context->engine);
} else {
struct i915_refct_sgt *src_rsgt =
i915_ttm_resource_get_st(obj, bo->resource);
 
if (IS_ERR(src_rsgt))
-   return PTR_ERR(src_rsgt);
+   return ERR_CAST(src_rsgt);
 
src_level = i915_ttm_cache_level(i915, bo->resource, src_ttm);
intel_engine_pm_get(i915->gt.migrate.context->engine);
@@ -178,15 +200,182 @@ static int