[Intel-gfx] [PATCH] drm/i915/selftests: Follow up on increase timeout in i915_gem_contexts selftests

2021-12-06 Thread Bruce Chang
Follow up on commit 5e076529e265 ("drm/i915/selftests: Increase timeout in
i915_gem_contexts selftests")

That commit raised the timeout from 200 msec to 1 sec; this patch raises
it further, to 10 sec.

Signed-off-by: Bruce Chang 
Reviewed-by: Matthew Brost 
Cc: John Harrison 
---
 drivers/gpu/drm/i915/gem/selftests/i915_gem_context.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/gem/selftests/i915_gem_context.c 
b/drivers/gpu/drm/i915/gem/selftests/i915_gem_context.c
index b32f7fed2d9c..21b71568cd5f 100644
--- a/drivers/gpu/drm/i915/gem/selftests/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/gem/selftests/i915_gem_context.c
@@ -88,7 +88,7 @@ static int live_nop_switch(void *arg)
rq = i915_request_get(this);
i915_request_add(this);
}
-   if (i915_request_wait(rq, 0, HZ) < 0) {
+   if (i915_request_wait(rq, 0, 10 * HZ) < 0) {
pr_err("Failed to populated %d contexts\n", nctx);
intel_gt_set_wedged(&i915->gt);
i915_request_put(rq);
-- 
2.21.3



[Intel-gfx] [PATCH] drm/i915/selftests: Follow up on increase timeout in i915_gem_contexts selftests

2021-12-03 Thread Bruce Chang
Follow up on patch https://patchwork.freedesktop.org/patch/446832/

Different platforms will take a bit longer while GuC is enabled, so
increase the timeout and also add some margin in i915_gem_context
selftest.

Signed-off-by: Bruce Chang 
Cc: Matthew Brost 
Cc: John Harrison 
---
 drivers/gpu/drm/i915/gem/selftests/i915_gem_context.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/gem/selftests/i915_gem_context.c 
b/drivers/gpu/drm/i915/gem/selftests/i915_gem_context.c
index b32f7fed2d9c..ae33e8c705da 100644
--- a/drivers/gpu/drm/i915/gem/selftests/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/gem/selftests/i915_gem_context.c
@@ -88,7 +88,7 @@ static int live_nop_switch(void *arg)
rq = i915_request_get(this);
i915_request_add(this);
}
-   if (i915_request_wait(rq, 0, HZ) < 0) {
+   if (i915_request_wait(rq, 0, 10*HZ) < 0) {
pr_err("Failed to populated %d contexts\n", nctx);
intel_gt_set_wedged(&i915->gt);
i915_request_put(rq);
-- 
2.21.3



[Intel-gfx] [PATCH] drm/i915: Fix a bug calling sleep function in atomic context

2019-11-13 Thread Bruce Chang
below is the call trace when this issue is hit

<3> [113.316247] BUG: sleeping function called from invalid context at 
mm/page_alloc.c:4653
<3> [113.318190] in_atomic(): 1, irqs_disabled(): 0, pid: 678, name: 
debugfs_test
<4> [113.319900] no locks held by debugfs_test/678.
<3> [113.321002] Preemption disabled at:
<4> [113.321130] [] i915_error_object_create+0x494/0x610 
[i915]
<4> [113.327259] Call Trace:
<4> [113.327871] dump_stack+0x67/0x9b
<4> [113.328683] ___might_sleep+0x167/0x250
<4> [113.329618] __alloc_pages_nodemask+0x26b/0x1110
<4> [113.330731] ? ___slab_alloc.constprop.34+0x21c/0x380
<4> [113.331943] ? ___slab_alloc.constprop.34+0x21c/0x380
<4> [113.333169] ? __slab_alloc.isra.28.constprop.33+0x4d/0x70
<4> [113.334614] pool_alloc.constprop.19+0x14/0x60 [i915]
<4> [113.335951] compress_page+0x7c/0x100 [i915]
<4> [113.337110] i915_error_object_create+0x4bd/0x610 [i915]
<4> [113.338515] i915_capture_gpu_state+0x384/0x1680 [i915]
<4> [113.339771] ? __lock_acquire+0x4ac/0x1e90
<4> [113.340785] ? _raw_spin_lock_irqsave_nested+0x1/0x50
<4> [113.342127] i915_gpu_info_open+0x44/0x70 [i915]
<4> [113.343243] full_proxy_open+0x139/0x1b0
<4> [113.344196] ? open_proxy_open+0xc0/0xc0
<4> [113.345149] do_dentry_open+0x1ce/0x3a0
<4> [113.346084] path_openat+0x4c9/0xac0
<4> [113.346967] do_filp_open+0x96/0x110
<4> [113.347848] ? __alloc_fd+0xe0/0x1f0
<4> [113.348736] ? do_sys_open+0x1b8/0x250
<4> [113.349647] do_sys_open+0x1b8/0x250
<4> [113.350526] do_syscall_64+0x55/0x1c0
<4> [113.351418] entry_SYSCALL_64_after_hwframe+0x49/0xbe

After io_mapping_map_atomic_wc()/kmap_atomic(), the kernel is in atomic
context, but compress_page() then calls pool_alloc() with the GFP_KERNEL
flag, which can sleep. Sleeping is not allowed in atomic context, which
is what triggers this bug.

In order to fix this issue, we can either
1) avoid entering atomic context, i.e. use the non-atomic versions of
these functions (io_mapping_map_wc/kmap),
or
2) make compress_page() safe to run in atomic context.

To follow the current design of not running compression in atomic
context, this patch implements option 1) above.

Signed-off-by: Bruce Chang 
Reviewed-by: Brian Welty 
Fixes: 895d8ebeaa924 ("drm/i915: error capture with no ggtt slot")
---
 drivers/gpu/drm/i915/i915_gpu_error.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c 
b/drivers/gpu/drm/i915/i915_gpu_error.c
index 1f2f266f26af..7118ecb7f144 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -1007,67 +1007,67 @@ i915_error_object_create(struct drm_i915_private *i915,
compress->wc = i915_gem_object_is_lmem(vma->obj) ||
   drm_mm_node_allocated(&ggtt->error_capture);
 
ret = -EINVAL;
if (drm_mm_node_allocated(&ggtt->error_capture)) {
void __iomem *s;
dma_addr_t dma;
 
for_each_sgt_daddr(dma, iter, vma->pages) {
ggtt->vm.insert_page(&ggtt->vm, dma, slot,
 I915_CACHE_NONE, 0);
 
s = io_mapping_map_wc(&ggtt->iomap, slot, PAGE_SIZE);
ret = compress_page(compress, (void  __force *)s, dst);
io_mapping_unmap(s);
if (ret)
break;
}
} else if (i915_gem_object_is_lmem(vma->obj)) {
struct intel_memory_region *mem = vma->obj->mm.region;
dma_addr_t dma;
 
for_each_sgt_daddr(dma, iter, vma->pages) {
void __iomem *s;
 
-   s = io_mapping_map_atomic_wc(>iomap, dma);
+   s = io_mapping_map_wc(&mem->iomap, dma, PAGE_SIZE);
ret = compress_page(compress, (void __force *)s, dst);
-   io_mapping_unmap_atomic(s);
+   io_mapping_unmap(s);
if (ret)
break;
}
} else {
struct page *page;
 
for_each_sgt_page(page, iter, vma->pages) {
void *s;
 
drm_clflush_pages(&page, 1);
 
-   s = kmap_atomic(page);
+   s = kmap(page);
ret = compress_page(compress, s, dst);
-   kunmap_atomic(s);
+   kunmap(s);
 
drm_clflush_pages(&page, 1);
 
if (ret)
break;
}
}
 
if (ret || compress_flush(compress, dst)) {
while (dst->page_count--)
pool_free(&compress->pool, dst->pages[dst->page_count]);
   

[Intel-gfx] [PATCH] drm/i915: Fix a bug calling sleep function in atomic context

2019-11-13 Thread Bruce Chang
below is the call trace when this issue is hit

<3> [113.316247] BUG: sleeping function called from invalid context at 
mm/page_alloc.c:4653
<3> [113.318190] in_atomic(): 1, irqs_disabled(): 0, pid: 678, name: 
debugfs_test
<4> [113.319900] no locks held by debugfs_test/678.
<3> [113.321002] Preemption disabled at:
<4> [113.321130] [] i915_error_object_create+0x494/0x610 
[i915]
<4> [113.327259] Call Trace:
<4> [113.327871] dump_stack+0x67/0x9b
<4> [113.328683] ___might_sleep+0x167/0x250
<4> [113.329618] __alloc_pages_nodemask+0x26b/0x1110
<4> [113.330731] ? ___slab_alloc.constprop.34+0x21c/0x380
<4> [113.331943] ? ___slab_alloc.constprop.34+0x21c/0x380
<4> [113.333169] ? __slab_alloc.isra.28.constprop.33+0x4d/0x70
<4> [113.334614] pool_alloc.constprop.19+0x14/0x60 [i915]
<4> [113.335951] compress_page+0x7c/0x100 [i915]
<4> [113.337110] i915_error_object_create+0x4bd/0x610 [i915]
<4> [113.338515] i915_capture_gpu_state+0x384/0x1680 [i915]
<4> [113.339771] ? __lock_acquire+0x4ac/0x1e90
<4> [113.340785] ? _raw_spin_lock_irqsave_nested+0x1/0x50
<4> [113.342127] i915_gpu_info_open+0x44/0x70 [i915]
<4> [113.343243] full_proxy_open+0x139/0x1b0
<4> [113.344196] ? open_proxy_open+0xc0/0xc0
<4> [113.345149] do_dentry_open+0x1ce/0x3a0
<4> [113.346084] path_openat+0x4c9/0xac0
<4> [113.346967] do_filp_open+0x96/0x110
<4> [113.347848] ? __alloc_fd+0xe0/0x1f0
<4> [113.348736] ? do_sys_open+0x1b8/0x250
<4> [113.349647] do_sys_open+0x1b8/0x250
<4> [113.350526] do_syscall_64+0x55/0x1c0
<4> [113.351418] entry_SYSCALL_64_after_hwframe+0x49/0xbe

After the io_mapping_map_atomic_wc/kmap_atomic, the kernel enters atomic context
but after that, compress_page calls pool_alloc with GFP_KERNEL flag which can
potentially go to sleep. When the kernel is in atomic context, sleeping is not
allowed. This is why this bug got triggered.

In order to fix this issue, we either
1) not enter into atomic context, i.e., to use non atomic version of
functions like io_mapping_map_wc/kmap,
or
2) make compress_page run in atomic context.

But it is not a good idea to run slow compression in atomic context, so
option 1) above is the preferred solution, and is what this patch
implements.

Signed-off-by: Bruce Chang 
Reviewed-by: Brian Welty 
Fixes: 895d8ebeaa924 ("drm/i915: error capture with no ggtt slot")
---
 drivers/gpu/drm/i915/i915_gpu_error.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c 
b/drivers/gpu/drm/i915/i915_gpu_error.c
index 1f2f266f26af..7118ecb7f144 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -1007,67 +1007,67 @@ i915_error_object_create(struct drm_i915_private *i915,
compress->wc = i915_gem_object_is_lmem(vma->obj) ||
   drm_mm_node_allocated(>error_capture);
 
ret = -EINVAL;
if (drm_mm_node_allocated(>error_capture)) {
void __iomem *s;
dma_addr_t dma;
 
for_each_sgt_daddr(dma, iter, vma->pages) {
ggtt->vm.insert_page(>vm, dma, slot,
 I915_CACHE_NONE, 0);
 
s = io_mapping_map_wc(>iomap, slot, PAGE_SIZE);
ret = compress_page(compress, (void  __force *)s, dst);
io_mapping_unmap(s);
if (ret)
break;
}
} else if (i915_gem_object_is_lmem(vma->obj)) {
struct intel_memory_region *mem = vma->obj->mm.region;
dma_addr_t dma;
 
for_each_sgt_daddr(dma, iter, vma->pages) {
void __iomem *s;
 
-   s = io_mapping_map_atomic_wc(>iomap, dma);
+   s = io_mapping_map_wc(>iomap, dma, PAGE_SIZE);
ret = compress_page(compress, (void __force *)s, dst);
-   io_mapping_unmap_atomic(s);
+   io_mapping_unmap(s);
if (ret)
break;
}
} else {
struct page *page;
 
for_each_sgt_page(page, iter, vma->pages) {
void *s;
 
drm_clflush_pages(, 1);
 
-   s = kmap_atomic(page);
+   s = kmap(page);
ret = compress_page(compress, s, dst);
-   kunmap_atomic(s);
+   kunmap(s);
 
drm_clflush_pages(, 1);
 
if (ret)
break;
}
}
 
if (ret || compress_flush(

[Intel-gfx] [PATCH] drm/i915: Fix a bug calling sleep function in atomic context

2019-11-13 Thread Bruce Chang
There are quite a few reports regarding "BUG: sleeping function called from
invalid context at mm/page_alloc.c"

Basically after the io_mapping_map_atomic_wc/kmap_atomic, it enters atomic
context, but compress_page cannot be called in atomic context as it will
call pool_alloc with GFP_KERNEL flag which can go to sleep. This is why
the bug got reported.

So, changed to non atomic version instead.

Signed-off-by: Bruce Chang 
Reviewed-by: Brian Welty 
Fixes: 895d8ebeaa924 ("drm/i915: error capture with no ggtt slot")
---
 drivers/gpu/drm/i915/i915_gpu_error.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c 
b/drivers/gpu/drm/i915/i915_gpu_error.c
index 1f2f266f26af..7118ecb7f144 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -1007,67 +1007,67 @@ i915_error_object_create(struct drm_i915_private *i915,
compress->wc = i915_gem_object_is_lmem(vma->obj) ||
   drm_mm_node_allocated(>error_capture);
 
ret = -EINVAL;
if (drm_mm_node_allocated(>error_capture)) {
void __iomem *s;
dma_addr_t dma;
 
for_each_sgt_daddr(dma, iter, vma->pages) {
ggtt->vm.insert_page(>vm, dma, slot,
 I915_CACHE_NONE, 0);
 
s = io_mapping_map_wc(>iomap, slot, PAGE_SIZE);
ret = compress_page(compress, (void  __force *)s, dst);
io_mapping_unmap(s);
if (ret)
break;
}
} else if (i915_gem_object_is_lmem(vma->obj)) {
struct intel_memory_region *mem = vma->obj->mm.region;
dma_addr_t dma;
 
for_each_sgt_daddr(dma, iter, vma->pages) {
void __iomem *s;
 
-   s = io_mapping_map_atomic_wc(>iomap, dma);
+   s = io_mapping_map_wc(>iomap, dma, PAGE_SIZE);
ret = compress_page(compress, (void __force *)s, dst);
-   io_mapping_unmap_atomic(s);
+   io_mapping_unmap(s);
if (ret)
break;
}
} else {
struct page *page;
 
for_each_sgt_page(page, iter, vma->pages) {
void *s;
 
drm_clflush_pages(, 1);
 
-   s = kmap_atomic(page);
+   s = kmap(page);
ret = compress_page(compress, s, dst);
-   kunmap_atomic(s);
+   kunmap(s);
 
drm_clflush_pages(, 1);
 
if (ret)
break;
}
}
 
if (ret || compress_flush(compress, dst)) {
while (dst->page_count--)
pool_free(>pool, dst->pages[dst->page_count]);
kfree(dst);
dst = NULL;
}
compress_finish(compress);
 
return dst;
 }
 
 /*
  * Generate a semi-unique error code. The code is not meant to have meaning, 
The
  * code's only purpose is to try to prevent false duplicated bug reports by
  * grossly estimating a GPU error state.
  *
  * TODO Ideally, hashing the batchbuffer would be a very nice way to determine
-- 
2.24.0

___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

[Intel-gfx] [PATCH] drm/i915: Fix a bug calling sleep function in atomic context

2019-11-12 Thread Bruce Chang
There are quite a few reports regarding "BUG: sleeping function called from
invalid context at mm/page_alloc.c"

Basically after the io_mapping_map_atomic_wc/kmap_atomic, it enters atomic
context, but compress_page cannot be called in atomic context as it will
call pool_alloc with GFP_KERNEL flag which can go to sleep. This is why
the bug got reported.

So, changed to non atomic version instead.

Signed-off-by: Bruce Chang 
---
 drivers/gpu/drm/i915/i915_gpu_error.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c 
b/drivers/gpu/drm/i915/i915_gpu_error.c
index 1f2f266f26af..7118ecb7f144 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -1007,67 +1007,67 @@ i915_error_object_create(struct drm_i915_private *i915,
compress->wc = i915_gem_object_is_lmem(vma->obj) ||
   drm_mm_node_allocated(>error_capture);
 
ret = -EINVAL;
if (drm_mm_node_allocated(>error_capture)) {
void __iomem *s;
dma_addr_t dma;
 
for_each_sgt_daddr(dma, iter, vma->pages) {
ggtt->vm.insert_page(>vm, dma, slot,
 I915_CACHE_NONE, 0);
 
s = io_mapping_map_wc(>iomap, slot, PAGE_SIZE);
ret = compress_page(compress, (void  __force *)s, dst);
io_mapping_unmap(s);
if (ret)
break;
}
} else if (i915_gem_object_is_lmem(vma->obj)) {
struct intel_memory_region *mem = vma->obj->mm.region;
dma_addr_t dma;
 
for_each_sgt_daddr(dma, iter, vma->pages) {
void __iomem *s;
 
-   s = io_mapping_map_atomic_wc(>iomap, dma);
+   s = io_mapping_map_wc(>iomap, dma, PAGE_SIZE);
ret = compress_page(compress, (void __force *)s, dst);
-   io_mapping_unmap_atomic(s);
+   io_mapping_unmap(s);
if (ret)
break;
}
} else {
struct page *page;
 
for_each_sgt_page(page, iter, vma->pages) {
void *s;
 
drm_clflush_pages(, 1);
 
-   s = kmap_atomic(page);
+   s = kmap(page);
ret = compress_page(compress, s, dst);
-   kunmap_atomic(s);
+   kunmap(s);
 
drm_clflush_pages(, 1);
 
if (ret)
break;
}
}
 
if (ret || compress_flush(compress, dst)) {
while (dst->page_count--)
pool_free(>pool, dst->pages[dst->page_count]);
kfree(dst);
dst = NULL;
}
compress_finish(compress);
 
return dst;
 }
 
 /*
  * Generate a semi-unique error code. The code is not meant to have meaning, 
The
  * code's only purpose is to try to prevent false duplicated bug reports by
  * grossly estimating a GPU error state.
  *
  * TODO Ideally, hashing the batchbuffer would be a very nice way to determine
-- 
2.24.0

___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx