Re: [PATCH v2 2/2] drm/i915/ttm: Failsafe migration blits

2021-11-02 Thread Thomas Hellström

Thanks for reviewing, Matt,

On 11/2/21 14:55, Matthew Auld wrote:

On 01/11/2021 18:38, Thomas Hellström wrote:

If the initial fill blit or copy blit of an object fails, the old
content of the data might be exposed and read as soon as either CPU or
GPU PTEs are set up to point at the pages.

Intercept the blit fence with an async callback that checks the
blit fence for errors and, if there are errors, performs an async cpu blit
instead. If allocation of the async dma_fence_work fails, allocate it
on the stack and wait synchronously for the blit to complete.

Add selftests that simulate gpu blit failures and failure to allocate
the async dma_fence_work.

A previous version of this patch used dma_fence_work; that is now
open-coded, which adds more code but might lower the latency
somewhat in the common non-error case.

Signed-off-by: Thomas Hellström 
---
  drivers/gpu/drm/i915/gem/i915_gem_ttm_move.c  | 322 +++---
  drivers/gpu/drm/i915/gem/i915_gem_ttm_move.h  |   5 +
  .../drm/i915/gem/selftests/i915_gem_migrate.c |  24 +-
  3 files changed, 295 insertions(+), 56 deletions(-)
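
For orientation, the intercept-and-fallback scheme described above boils down
to something like the sketch below. This is illustrative only: the names
(blit_fallback, blit_fence_cb, intercept_blit_fence) are made up, and the
actual patch open-codes its own async worker rather than using schedule_work().

/* Minimal sketch of the error-intercept idea (not the patch itself). */
#include <linux/dma-fence.h>
#include <linux/workqueue.h>
#include <linux/slab.h>

struct blit_fallback {
	struct dma_fence_cb cb;		/* attached to the blit fence */
	struct work_struct work;	/* performs the CPU copy on error */
};

static void blit_fallback_worker(struct work_struct *work)
{
	struct blit_fallback *f = container_of(work, struct blit_fallback, work);

	/* ... memcpy the pages the failed blit should have written ... */
	kfree(f);
}

static void blit_fence_cb(struct dma_fence *fence, struct dma_fence_cb *cb)
{
	struct blit_fallback *f = container_of(cb, struct blit_fallback, cb);

	if (fence->error)		/* gpu blit failed: fall back to cpu */
		schedule_work(&f->work);
	else
		kfree(f);
}

static void intercept_blit_fence(struct dma_fence *blit)
{
	struct blit_fallback *f = kmalloc(sizeof(*f), GFP_KERNEL);

	if (!f) {
		/* Allocation failed: wait synchronously and fix up inline. */
		dma_fence_wait(blit, false);
		if (blit->error) {
			/* ... synchronous memcpy fallback ... */
		}
		return;
	}

	INIT_WORK(&f->work, blit_fallback_worker);
	if (dma_fence_add_callback(blit, &f->cb, blit_fence_cb)) {
		/* Fence had already signaled; handle it right here. */
		if (blit->error)
			blit_fallback_worker(&f->work);
		else
			kfree(f);
	}
}
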

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_ttm_move.c b/drivers/gpu/drm/i915/gem/i915_gem_ttm_move.c
index 0ed6b7f2b95f..2e3328e2b48c 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_ttm_move.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_ttm_move.c
@@ -18,6 +18,29 @@
  #include "gt/intel_gt.h"
  #include "gt/intel_migrate.h"
  +/**
+ * DOC: Selftest failure modes for failsafe migration:
+ *
+ * For fail_gpu_migration, the scheduled gpu blit is always a clear blit
+ * rather than a copy blit, and then we force the failure paths as if
+ * the blit fence returned an error.
+ *
+ * For fail_work_allocation we fail the kmalloc of the async worker and
+ * instead wait for the gpu blit synchronously. If it then fails, or
+ * fail_gpu_migration is set to true, a memcpy operation is performed
+ * synchronously.
+ */
+I915_SELFTEST_DECLARE(static bool fail_gpu_migration;)
+I915_SELFTEST_DECLARE(static bool fail_work_allocation;)


Might as well move these under the CONFIG_SELFTEST below, and then 
drop the DECLARE stuff?
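
For illustration, the suggested shape would be roughly the following sketch
(not part of the patch); since the I915_SELFTEST_ONLY() users of these flags
compile away when CONFIG_DRM_I915_SELFTEST is not set, plain statics under the
#ifdef should suffice:

#ifdef CONFIG_DRM_I915_SELFTEST
static bool fail_gpu_migration;
static bool fail_work_allocation;

void i915_ttm_migrate_set_failure_modes(bool gpu_migration,
					bool work_allocation)
{
	fail_gpu_migration = gpu_migration;
	fail_work_allocation = work_allocation;
}
#endif
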



+
+#ifdef CONFIG_DRM_I915_SELFTEST
+void i915_ttm_migrate_set_failure_modes(bool gpu_migration,
+    bool work_allocation)
+{
+    fail_gpu_migration = gpu_migration;
+    fail_work_allocation = work_allocation;
+}
+#endif
+
  static enum i915_cache_level
  i915_ttm_cache_level(struct drm_i915_private *i915, struct ttm_resource *res,
   struct ttm_tt *ttm)
@@ -129,11 +152,11 @@ int i915_ttm_move_notify(struct ttm_buffer_object *bo)
  return 0;
  }
  -static int i915_ttm_accel_move(struct ttm_buffer_object *bo,
-   bool clear,
-   struct ttm_resource *dst_mem,
-   struct ttm_tt *dst_ttm,
-   struct sg_table *dst_st)
+static struct dma_fence *i915_ttm_accel_move(struct ttm_buffer_object *bo,
+ bool clear,
+ struct ttm_resource *dst_mem,
+ struct ttm_tt *dst_ttm,
+ struct sg_table *dst_st)
  {
  struct drm_i915_private *i915 = container_of(bo->bdev, typeof(*i915),
   bdev);
@@ -144,30 +167,29 @@ static int i915_ttm_accel_move(struct ttm_buffer_object *bo,
  int ret;
    if (!i915->gt.migrate.context || intel_gt_is_wedged(&i915->gt))
-    return -EINVAL;
+    return ERR_PTR(-EINVAL);
+
+    /* With fail_gpu_migration, we always perform a GPU clear. */
+    if (I915_SELFTEST_ONLY(fail_gpu_migration))
+    clear = true;
    dst_level = i915_ttm_cache_level(i915, dst_mem, dst_ttm);
  if (clear) {
-    if (bo->type == ttm_bo_type_kernel)
-    return -EINVAL;
+    if (bo->type == ttm_bo_type_kernel &&
+    !I915_SELFTEST_ONLY(fail_gpu_migration))
+    return ERR_PTR(-EINVAL);
intel_engine_pm_get(i915->gt.migrate.context->engine);
  ret = intel_context_migrate_clear(i915->gt.migrate.context, NULL,
    dst_st->sgl, dst_level,
    i915_ttm_gtt_binds_lmem(dst_mem),
    0, &rq);
-
-    if (!ret && rq) {
-    i915_request_wait(rq, 0, MAX_SCHEDULE_TIMEOUT);
-    i915_request_put(rq);
-    }
- intel_engine_pm_put(i915->gt.migrate.context->engine);
  } else {
  struct i915_refct_sgt *src_rsgt =
  i915_ttm_resource_get_st(obj, bo->resource);
    if (IS_ERR(src_rsgt))
-    return PTR_ERR(src_rsgt);
+    return ERR_CAST(src_rsgt);
    src_level = i915_ttm_cache_level(i915, bo->resource, src_ttm);
intel_engine_pm_get(i915->gt.migrate.context->engine);
@@ -178,15 +200,183 @@ static int i915_ttm_accel_move(struct ttm_buffer_object *bo,
   dst_st->sgl, dst_level,
   i915_ttm_gtt_binds_lmem(dst_mem),
   &rq);
+
  i915_refct_sgt_put(src_rsg

Re: [PATCH v2 2/2] drm/i915/ttm: Failsafe migration blits

2021-11-02 Thread Matthew Auld

On 01/11/2021 18:38, Thomas Hellström wrote:

If the initial fill blit or copy blit of an object fails, the old
content of the data might be exposed and read as soon as either CPU or
GPU PTEs are set up to point at the pages.

Intercept the blit fence with an async callback that checks the
blit fence for errors and, if there are errors, performs an async cpu blit
instead. If allocation of the async dma_fence_work fails, allocate it
on the stack and wait synchronously for the blit to complete.

Add selftests that simulate gpu blit failures and failure to allocate
the async dma_fence_work.

A previous version of this patch used dma_fence_work; that is now
open-coded, which adds more code but might lower the latency
somewhat in the common non-error case.

Signed-off-by: Thomas Hellström 
---
  drivers/gpu/drm/i915/gem/i915_gem_ttm_move.c  | 322 +++---
  drivers/gpu/drm/i915/gem/i915_gem_ttm_move.h  |   5 +
  .../drm/i915/gem/selftests/i915_gem_migrate.c |  24 +-
  3 files changed, 295 insertions(+), 56 deletions(-)

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_ttm_move.c b/drivers/gpu/drm/i915/gem/i915_gem_ttm_move.c
index 0ed6b7f2b95f..2e3328e2b48c 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_ttm_move.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_ttm_move.c
@@ -18,6 +18,29 @@
  #include "gt/intel_gt.h"
  #include "gt/intel_migrate.h"
  
+/**

+ * DOC: Selftest failure modes for failsafe migration:
+ *
+ * For fail_gpu_migration, the scheduled gpu blit is always a clear blit
+ * rather than a copy blit, and then we force the failure paths as if
+ * the blit fence returned an error.
+ *
+ * For fail_work_allocation we fail the kmalloc of the async worker and
+ * instead wait for the gpu blit synchronously. If it then fails, or
+ * fail_gpu_migration is set to true, a memcpy operation is performed
+ * synchronously.
+ */
+I915_SELFTEST_DECLARE(static bool fail_gpu_migration;)
+I915_SELFTEST_DECLARE(static bool fail_work_allocation;)


Might as well move these under the CONFIG_SELFTEST below, and then drop 
the DECLARE stuff?



+
+#ifdef CONFIG_DRM_I915_SELFTEST
+void i915_ttm_migrate_set_failure_modes(bool gpu_migration,
+   bool work_allocation)
+{
+   fail_gpu_migration = gpu_migration;
+   fail_work_allocation = work_allocation;
+}
+#endif
+
  static enum i915_cache_level
  i915_ttm_cache_level(struct drm_i915_private *i915, struct ttm_resource *res,
 struct ttm_tt *ttm)
@@ -129,11 +152,11 @@ int i915_ttm_move_notify(struct ttm_buffer_object *bo)
return 0;
  }
  
-static int i915_ttm_accel_move(struct ttm_buffer_object *bo,

-  bool clear,
-  struct ttm_resource *dst_mem,
-  struct ttm_tt *dst_ttm,
-  struct sg_table *dst_st)
+static struct dma_fence *i915_ttm_accel_move(struct ttm_buffer_object *bo,
+bool clear,
+struct ttm_resource *dst_mem,
+struct ttm_tt *dst_ttm,
+struct sg_table *dst_st)
  {
struct drm_i915_private *i915 = container_of(bo->bdev, typeof(*i915),
 bdev);
@@ -144,30 +167,29 @@ static int i915_ttm_accel_move(struct ttm_buffer_object *bo,
int ret;
  
  	if (!i915->gt.migrate.context || intel_gt_is_wedged(&i915->gt))

-   return -EINVAL;
+   return ERR_PTR(-EINVAL);
+
+   /* With fail_gpu_migration, we always perform a GPU clear. */
+   if (I915_SELFTEST_ONLY(fail_gpu_migration))
+   clear = true;
  
  	dst_level = i915_ttm_cache_level(i915, dst_mem, dst_ttm);

if (clear) {
-   if (bo->type == ttm_bo_type_kernel)
-   return -EINVAL;
+   if (bo->type == ttm_bo_type_kernel &&
+   !I915_SELFTEST_ONLY(fail_gpu_migration))
+   return ERR_PTR(-EINVAL);
  
  		intel_engine_pm_get(i915->gt.migrate.context->engine);

		ret = intel_context_migrate_clear(i915->gt.migrate.context, NULL,
						  dst_st->sgl, dst_level,
						  i915_ttm_gtt_binds_lmem(dst_mem),
						  0, &rq);
-
-   if (!ret && rq) {
-   i915_request_wait(rq, 0, MAX_SCHEDULE_TIMEOUT);
-   i915_request_put(rq);
-   }
-   intel_engine_pm_put(i915->gt.migrate.context->engine);
} else {
struct i915_refct_sgt *src_rsgt =
i915_ttm_resource_get_st(obj, bo->resource);
  
  		if (IS_ERR(src_rsgt))

-   return PTR_ERR(src_rsgt);
+   return ERR_CAST(src_rsgt);
  
  		src_level = i915_ttm_cache_level(i915, bo->resource, s

[PATCH v2 2/2] drm/i915/ttm: Failsafe migration blits

2021-11-01 Thread Thomas Hellström
If the initial fill blit or copy blit of an object fails, the old
content of the data might be exposed and read as soon as either CPU or
GPU PTEs are set up to point at the pages.

Intercept the blit fence with an async callback that checks the
blit fence for errors and, if there are errors, performs an async cpu blit
instead. If allocation of the async dma_fence_work fails, allocate it
on the stack and wait synchronously for the blit to complete.

Add selftests that simulate gpu blit failures and failure to allocate
the async dma_fence_work.

A previous version of this patch used dma_fence_work; that is now
open-coded, which adds more code but might lower the latency
somewhat in the common non-error case.

Signed-off-by: Thomas Hellström 
---
 drivers/gpu/drm/i915/gem/i915_gem_ttm_move.c  | 322 +++---
 drivers/gpu/drm/i915/gem/i915_gem_ttm_move.h  |   5 +
 .../drm/i915/gem/selftests/i915_gem_migrate.c |  24 +-
 3 files changed, 295 insertions(+), 56 deletions(-)
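
For context, the new i915_ttm_migrate_set_failure_modes() knob (declared in
i915_gem_ttm_move.h by this patch) is intended to be driven from the migration
selftests along these lines. This is a rough sketch only: the test function
name and the migrate_and_check() helper are placeholders, not the code in
i915_gem_migrate.c.

/* i915_ttm_migrate_set_failure_modes() comes from i915_gem_ttm_move.h */
static int migrate_and_check(void);	/* placeholder for the actual migration test */

static int igt_failsafe_migrate_sketch(void)
{
	int err;

	/* Pretend the gpu blit failed so the cpu fallback path is exercised. */
	i915_ttm_migrate_set_failure_modes(true, false);
	err = migrate_and_check();
	if (err)
		goto out;

	/* Additionally fail the kmalloc of the async worker. */
	i915_ttm_migrate_set_failure_modes(true, true);
	err = migrate_and_check();

out:
	/* Restore normal behaviour for subsequent tests. */
	i915_ttm_migrate_set_failure_modes(false, false);
	return err;
}
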

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_ttm_move.c b/drivers/gpu/drm/i915/gem/i915_gem_ttm_move.c
index 0ed6b7f2b95f..2e3328e2b48c 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_ttm_move.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_ttm_move.c
@@ -18,6 +18,29 @@
 #include "gt/intel_gt.h"
 #include "gt/intel_migrate.h"
 
+/**
+ * DOC: Selftest failure modes for failsafe migration:
+ *
+ * For fail_gpu_migration, the scheduled gpu blit is always a clear blit
+ * rather than a copy blit, and then we force the failure paths as if
+ * the blit fence returned an error.
+ *
+ * For fail_work_allocation we fail the kmalloc of the async worker and
+ * instead wait for the gpu blit synchronously. If it then fails, or
+ * fail_gpu_migration is set to true, a memcpy operation is performed
+ * synchronously.
+ */
+I915_SELFTEST_DECLARE(static bool fail_gpu_migration;)
+I915_SELFTEST_DECLARE(static bool fail_work_allocation;)
+
+#ifdef CONFIG_DRM_I915_SELFTEST
+void i915_ttm_migrate_set_failure_modes(bool gpu_migration,
+   bool work_allocation)
+{
+   fail_gpu_migration = gpu_migration;
+   fail_work_allocation = work_allocation;
+}
+#endif
+
 static enum i915_cache_level
 i915_ttm_cache_level(struct drm_i915_private *i915, struct ttm_resource *res,
 struct ttm_tt *ttm)
@@ -129,11 +152,11 @@ int i915_ttm_move_notify(struct ttm_buffer_object *bo)
return 0;
 }
 
-static int i915_ttm_accel_move(struct ttm_buffer_object *bo,
-  bool clear,
-  struct ttm_resource *dst_mem,
-  struct ttm_tt *dst_ttm,
-  struct sg_table *dst_st)
+static struct dma_fence *i915_ttm_accel_move(struct ttm_buffer_object *bo,
+bool clear,
+struct ttm_resource *dst_mem,
+struct ttm_tt *dst_ttm,
+struct sg_table *dst_st)
 {
struct drm_i915_private *i915 = container_of(bo->bdev, typeof(*i915),
 bdev);
@@ -144,30 +167,29 @@ static int i915_ttm_accel_move(struct ttm_buffer_object *bo,
int ret;
 
if (!i915->gt.migrate.context || intel_gt_is_wedged(&i915->gt))
-   return -EINVAL;
+   return ERR_PTR(-EINVAL);
+
+   /* With fail_gpu_migration, we always perform a GPU clear. */
+   if (I915_SELFTEST_ONLY(fail_gpu_migration))
+   clear = true;
 
dst_level = i915_ttm_cache_level(i915, dst_mem, dst_ttm);
if (clear) {
-   if (bo->type == ttm_bo_type_kernel)
-   return -EINVAL;
+   if (bo->type == ttm_bo_type_kernel &&
+   !I915_SELFTEST_ONLY(fail_gpu_migration))
+   return ERR_PTR(-EINVAL);
 
intel_engine_pm_get(i915->gt.migrate.context->engine);
		ret = intel_context_migrate_clear(i915->gt.migrate.context, NULL,
						  dst_st->sgl, dst_level,
						  i915_ttm_gtt_binds_lmem(dst_mem),
						  0, &rq);
-
-   if (!ret && rq) {
-   i915_request_wait(rq, 0, MAX_SCHEDULE_TIMEOUT);
-   i915_request_put(rq);
-   }
-   intel_engine_pm_put(i915->gt.migrate.context->engine);
} else {
struct i915_refct_sgt *src_rsgt =
i915_ttm_resource_get_st(obj, bo->resource);
 
if (IS_ERR(src_rsgt))
-   return PTR_ERR(src_rsgt);
+   return ERR_CAST(src_rsgt);
 
src_level = i915_ttm_cache_level(i915, bo->resource, src_ttm);
intel_engine_pm_get(i915->gt.migrate.context->engine);
@@ -178,15 +200,183 @@ static int i915_tt