Re: [Intel-gfx] [PATCH v7] drm/i915: Update error capture code to avoid using the current vma state

2021-12-07 Thread Thomas Hellström

Hi, John.

On 12/7/21 21:46, John Harrison wrote:

On 11/29/2021 12:22, Thomas Hellström wrote:

With asynchronous migrations, the vma state may be several migrations
ahead of the state that matches the request we're capturing.
Address that by introducing an i915_vma_snapshot structure that
can be used to snapshot relevant state at request submission.
In order to make sure we access the correct memory, the snapshots take
references on relevant sg-tables and memory regions.

Also move the capture list allocation out of the fence signaling
critical path and use the CONFIG_DRM_I915_CAPTURE_ERROR define to
avoid compiling in members and functions used for error capture
when they're not used.

Finally, Introduce lockdep annotation.

v4:
- Break out the capture allocation mode change to a separate patch.
v5:
- Fix compilation error in the !CONFIG_DRM_I915_CAPTURE_ERROR case
   (kernel test robot)
v6:
- Use #if IS_ENABLED() instead of #ifdef to match driver style.
- Move yet another change of allocation mode to the separate patch.
- Commit message rework due to patch reordering.
v7:
- Adjust for removal of region refcounting.

Signed-off-by: Thomas Hellström 
Reviewed-by: Ramalingam C 
---
  drivers/gpu/drm/i915/Makefile |   1 +
  .../gpu/drm/i915/gem/i915_gem_execbuffer.c    | 135 +++---
  drivers/gpu/drm/i915/gt/intel_engine_cs.c |   8 +-
  drivers/gpu/drm/i915/i915_gpu_error.c | 176 +-
  drivers/gpu/drm/i915/i915_request.c   |  63 +--
  drivers/gpu/drm/i915/i915_request.h   |  20 +-
  drivers/gpu/drm/i915/i915_vma_snapshot.c  | 134 +
  drivers/gpu/drm/i915/i915_vma_snapshot.h  | 112 +++
  8 files changed, 554 insertions(+), 95 deletions(-)
  create mode 100644 drivers/gpu/drm/i915/i915_vma_snapshot.c
  create mode 100644 drivers/gpu/drm/i915/i915_vma_snapshot.h

diff --git a/drivers/gpu/drm/i915/Makefile 
b/drivers/gpu/drm/i915/Makefile

index 3f57e85d4846..3b5857da4123 100644
--- a/drivers/gpu/drm/i915/Makefile
+++ b/drivers/gpu/drm/i915/Makefile
@@ -173,6 +173,7 @@ i915-y += \
    i915_trace_points.o \
    i915_ttm_buddy_manager.o \
    i915_vma.o \
+  i915_vma_snapshot.o \
    intel_wopcm.o
    # general-purpose microcontroller (GuC) support
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c 
b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c

index 9f7c6ecadb90..6a0ed537c199 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
@@ -29,6 +29,7 @@
  #include "i915_gem_ioctls.h"
  #include "i915_trace.h"
  #include "i915_user_extensions.h"
+#include "i915_vma_snapshot.h"
    struct eb_vma {
  struct i915_vma *vma;
@@ -307,11 +308,15 @@ struct i915_execbuffer {
    struct eb_fence *fences;
  unsigned long num_fences;
+#if IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR)
+    struct i915_capture_list *capture_lists[MAX_ENGINE_INSTANCE + 1];
+#endif
  };
    static int eb_parse(struct i915_execbuffer *eb);
  static int eb_pin_engine(struct i915_execbuffer *eb, bool throttle);
  static void eb_unpin_engine(struct i915_execbuffer *eb);
+static void eb_capture_release(struct i915_execbuffer *eb);
    static inline bool eb_use_cmdparser(const struct i915_execbuffer 
*eb)

  {
@@ -1054,6 +1059,7 @@ static void eb_release_vmas(struct 
i915_execbuffer *eb, bool final)

  i915_vma_put(vma);
  }
  +    eb_capture_release(eb);
  eb_unpin_engine(eb);
  }
  @@ -1891,36 +1897,113 @@ eb_find_first_request_added(struct 
i915_execbuffer *eb)

  return NULL;
  }
  -static int eb_move_to_gpu(struct i915_execbuffer *eb)
+#if IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR)
+
+/* Stage with GFP_KERNEL allocations before we enter the signaling 
critical path */

+static void eb_capture_stage(struct i915_execbuffer *eb)
  {
  const unsigned int count = eb->buffer_count;
-    unsigned int i = count;
-    int err = 0, j;
+    unsigned int i = count, j;
+    struct i915_vma_snapshot *vsnap;
    while (i--) {
  struct eb_vma *ev = >vma[i];
  struct i915_vma *vma = ev->vma;
  unsigned int flags = ev->flags;
-    struct drm_i915_gem_object *obj = vma->obj;
  -    assert_vma_held(vma);
+    if (!(flags & EXEC_OBJECT_CAPTURE))
+    continue;
  -    if (flags & EXEC_OBJECT_CAPTURE) {
+    vsnap = i915_vma_snapshot_alloc(GFP_KERNEL);
+    if (!vsnap)
+    continue;
+
+    i915_vma_snapshot_init(vsnap, vma, "user");
+    for_each_batch_create_order(eb, j) {
  struct i915_capture_list *capture;
  -    for_each_batch_create_order(eb, j) {
-    if (!eb->requests[j])
-    break;
+    capture = kmalloc(sizeof(*capture), GFP_KERNEL);
+    if (!capture)
+    continue;
  -    capture = kmalloc(sizeof(*capture), GFP_KERNEL);
-    if (capture) {
-    

Re: [Intel-gfx] [PATCH v7] drm/i915: Update error capture code to avoid using the current vma state

2021-12-07 Thread John Harrison

On 11/29/2021 12:22, Thomas Hellström wrote:

With asynchronous migrations, the vma state may be several migrations
ahead of the state that matches the request we're capturing.
Address that by introducing an i915_vma_snapshot structure that
can be used to snapshot relevant state at request submission.
In order to make sure we access the correct memory, the snapshots take
references on relevant sg-tables and memory regions.

Also move the capture list allocation out of the fence signaling
critical path and use the CONFIG_DRM_I915_CAPTURE_ERROR define to
avoid compiling in members and functions used for error capture
when they're not used.

Finally, Introduce lockdep annotation.

v4:
- Break out the capture allocation mode change to a separate patch.
v5:
- Fix compilation error in the !CONFIG_DRM_I915_CAPTURE_ERROR case
   (kernel test robot)
v6:
- Use #if IS_ENABLED() instead of #ifdef to match driver style.
- Move yet another change of allocation mode to the separate patch.
- Commit message rework due to patch reordering.
v7:
- Adjust for removal of region refcounting.

Signed-off-by: Thomas Hellström 
Reviewed-by: Ramalingam C 
---
  drivers/gpu/drm/i915/Makefile |   1 +
  .../gpu/drm/i915/gem/i915_gem_execbuffer.c| 135 +++---
  drivers/gpu/drm/i915/gt/intel_engine_cs.c |   8 +-
  drivers/gpu/drm/i915/i915_gpu_error.c | 176 +-
  drivers/gpu/drm/i915/i915_request.c   |  63 +--
  drivers/gpu/drm/i915/i915_request.h   |  20 +-
  drivers/gpu/drm/i915/i915_vma_snapshot.c  | 134 +
  drivers/gpu/drm/i915/i915_vma_snapshot.h  | 112 +++
  8 files changed, 554 insertions(+), 95 deletions(-)
  create mode 100644 drivers/gpu/drm/i915/i915_vma_snapshot.c
  create mode 100644 drivers/gpu/drm/i915/i915_vma_snapshot.h

diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile
index 3f57e85d4846..3b5857da4123 100644
--- a/drivers/gpu/drm/i915/Makefile
+++ b/drivers/gpu/drm/i915/Makefile
@@ -173,6 +173,7 @@ i915-y += \
  i915_trace_points.o \
  i915_ttm_buddy_manager.o \
  i915_vma.o \
+ i915_vma_snapshot.o \
  intel_wopcm.o
  
  # general-purpose microcontroller (GuC) support

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c 
b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
index 9f7c6ecadb90..6a0ed537c199 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
@@ -29,6 +29,7 @@
  #include "i915_gem_ioctls.h"
  #include "i915_trace.h"
  #include "i915_user_extensions.h"
+#include "i915_vma_snapshot.h"
  
  struct eb_vma {

struct i915_vma *vma;
@@ -307,11 +308,15 @@ struct i915_execbuffer {
  
  	struct eb_fence *fences;

unsigned long num_fences;
+#if IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR)
+   struct i915_capture_list *capture_lists[MAX_ENGINE_INSTANCE + 1];
+#endif
  };
  
  static int eb_parse(struct i915_execbuffer *eb);

  static int eb_pin_engine(struct i915_execbuffer *eb, bool throttle);
  static void eb_unpin_engine(struct i915_execbuffer *eb);
+static void eb_capture_release(struct i915_execbuffer *eb);
  
  static inline bool eb_use_cmdparser(const struct i915_execbuffer *eb)

  {
@@ -1054,6 +1059,7 @@ static void eb_release_vmas(struct i915_execbuffer *eb, 
bool final)
i915_vma_put(vma);
}
  
+	eb_capture_release(eb);

eb_unpin_engine(eb);
  }
  
@@ -1891,36 +1897,113 @@ eb_find_first_request_added(struct i915_execbuffer *eb)

return NULL;
  }
  
-static int eb_move_to_gpu(struct i915_execbuffer *eb)

+#if IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR)
+
+/* Stage with GFP_KERNEL allocations before we enter the signaling critical 
path */
+static void eb_capture_stage(struct i915_execbuffer *eb)
  {
const unsigned int count = eb->buffer_count;
-   unsigned int i = count;
-   int err = 0, j;
+   unsigned int i = count, j;
+   struct i915_vma_snapshot *vsnap;
  
  	while (i--) {

struct eb_vma *ev = >vma[i];
struct i915_vma *vma = ev->vma;
unsigned int flags = ev->flags;
-   struct drm_i915_gem_object *obj = vma->obj;
  
-		assert_vma_held(vma);

+   if (!(flags & EXEC_OBJECT_CAPTURE))
+   continue;
  
-		if (flags & EXEC_OBJECT_CAPTURE) {

+   vsnap = i915_vma_snapshot_alloc(GFP_KERNEL);
+   if (!vsnap)
+   continue;
+
+   i915_vma_snapshot_init(vsnap, vma, "user");
+   for_each_batch_create_order(eb, j) {
struct i915_capture_list *capture;
  
-			for_each_batch_create_order(eb, j) {

-   if (!eb->requests[j])
-   break;
+   capture = kmalloc(sizeof(*capture), GFP_KERNEL);
+   if (!capture)
+