See the attached emails; they fixed the same issue, and either of them is enough to fix yours. Your calltrace is the same as the one in the second message.

We should have already pushed the first patch some time ago. Could you check whether the first patch is in your branch?
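
If it helps, one quick way to check (a sketch: the exact commit subject isn't quoted here, so this searches for the key identifier the first patch introduces instead):

# If the first patch is in your branch, the cached-priority field it adds
# will be present in the scheduler header:
git grep -n 's_priority' drivers/gpu/drm/amd/scheduler/gpu_scheduler.h

# Or scan the recent history of the files it touches:
git log --oneline -- drivers/gpu/drm/amd/scheduler/ | head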

Regards,

David Zhou


On 2017-11-23 16:31, Johannes Hirte wrote:
On 2017 Nov 23, Chunming Zhou wrote:
Which driver are you using?

I guess your driver is a bit old; the issue should have been fixed already.

This was with git master from Linus. But even with the latest changes
from agd5f/drm-next-4.15, both use-after-free bugs still persist. If there
are fixes for this, they're not available upstream.


--- Begin Message ---
On 20.10.2017 at 15:32, Andrey Grodzovsky wrote:
Bug: amdgpu_job_free_cb was accessing s_job->s_entity when the allocated
amdgpu_ctx (and the entity inside it) had already been deallocated from
amdgpu_cs_parser_fini.

Fix: Save the job's priority on its creation instead of accessing it from
s_entity later on.
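
For illustration, here is a minimal stand-alone sketch (hypothetical user-space C, not the driver source) of the pattern the fix relies on: sched_rq is an array indexed by priority and entity->rq points into it, so pointer subtraction recovers the priority index, which is then cached on the job so the entity never has to be dereferenced at free time:

/* Minimal sketch, not driver code: recover the priority index from
 * run-queue pointer arithmetic at init and cache it on the job. */
#include <stdio.h>

enum prio { PRIO_LOW, PRIO_NORMAL, PRIO_HIGH, PRIO_MAX };

struct rq { int unused; };

struct scheduler {
	struct rq sched_rq[PRIO_MAX];	/* one run queue per priority */
};

struct entity {
	struct rq *rq;			/* points into sched->sched_rq[] */
};

struct job {
	int priority;			/* cached at init; entity not needed later */
};

int main(void)
{
	struct scheduler sched = { { { 0 } } };
	struct entity ent = { .rq = &sched.sched_rq[PRIO_HIGH] };

	/* job init: difference of element pointers yields the array index */
	struct job j = { .priority = (int)(ent.rq - sched.sched_rq) };

	/* free callback: reads only the cached copy, never touches ent */
	printf("priority = %d\n", j.priority);	/* prints 2 (PRIO_HIGH) */
	return 0;
}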

Signed-off-by: Andrey Grodzovsky <andrey.grodzov...@amd.com>

Reviewed-by: Christian König <christian.koe...@amd.com>

---
  drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c        |  3 +--
  drivers/gpu/drm/amd/amdgpu/amdgpu_job.c       |  5 ++---
  drivers/gpu/drm/amd/scheduler/gpu_scheduler.c |  1 +
  drivers/gpu/drm/amd/scheduler/gpu_scheduler.h | 32 ++++++++++++---------------
  4 files changed, 18 insertions(+), 23 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
index f7fceb6..a760b6e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
@@ -1192,8 +1192,7 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p,
        job->uf_sequence = seq;
        amdgpu_job_free_resources(job);
-       amdgpu_ring_priority_get(job->ring,
-                                amd_sched_get_job_priority(&job->base));
+       amdgpu_ring_priority_get(job->ring, job->base.s_priority);
        trace_amdgpu_cs_ioctl(job);
        amd_sched_entity_push_job(&job->base);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index 0cfc68d..a58e3c5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -104,7 +104,7 @@ static void amdgpu_job_free_cb(struct amd_sched_job *s_job)
  {
        struct amdgpu_job *job = container_of(s_job, struct amdgpu_job, base);
-       amdgpu_ring_priority_put(job->ring, amd_sched_get_job_priority(s_job));
+       amdgpu_ring_priority_put(job->ring, s_job->s_priority);
        dma_fence_put(job->fence);
        amdgpu_sync_free(&job->sync);
        amdgpu_sync_free(&job->dep_sync);
@@ -141,8 +141,7 @@ int amdgpu_job_submit(struct amdgpu_job *job, struct amdgpu_ring *ring,
        job->fence_ctx = entity->fence_context;
        *f = dma_fence_get(&job->base.s_fence->finished);
        amdgpu_job_free_resources(job);
-       amdgpu_ring_priority_get(job->ring,
-                                amd_sched_get_job_priority(&job->base));
+       amdgpu_ring_priority_get(job->ring, job->base.s_priority);
        amd_sched_entity_push_job(&job->base);
        return 0;
diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
index e4d3b4e..1bbbce2 100644
--- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
+++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
@@ -529,6 +529,7 @@ int amd_sched_job_init(struct amd_sched_job *job,
  {
        job->sched = sched;
        job->s_entity = entity;
+       job->s_priority = entity->rq - sched->sched_rq;
        job->s_fence = amd_sched_fence_create(entity, owner);
        if (!job->s_fence)
                return -ENOMEM;
diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
index 52c8e54..3f75b45 100644
--- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
+++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
@@ -30,6 +30,19 @@
  struct amd_gpu_scheduler;
  struct amd_sched_rq;
+enum amd_sched_priority {
+       AMD_SCHED_PRIORITY_MIN,
+       AMD_SCHED_PRIORITY_LOW = AMD_SCHED_PRIORITY_MIN,
+       AMD_SCHED_PRIORITY_NORMAL,
+       AMD_SCHED_PRIORITY_HIGH_SW,
+       AMD_SCHED_PRIORITY_HIGH_HW,
+       AMD_SCHED_PRIORITY_KERNEL,
+       AMD_SCHED_PRIORITY_MAX,
+       AMD_SCHED_PRIORITY_INVALID = -1,
+       AMD_SCHED_PRIORITY_UNSET = -2
+};
+
+
  /**
   * A scheduler entity is a wrapper around a job queue or a group
   * of other entities. Entities take turns emitting jobs from their
@@ -83,6 +96,7 @@ struct amd_sched_job {
        struct delayed_work             work_tdr;
        uint64_t                        id;
        atomic_t                        karma;
+       enum amd_sched_priority s_priority;
  };
extern const struct dma_fence_ops amd_sched_fence_ops_scheduled;
@@ -114,18 +128,6 @@ struct amd_sched_backend_ops {
        void (*free_job)(struct amd_sched_job *sched_job);
  };
-enum amd_sched_priority {
-       AMD_SCHED_PRIORITY_MIN,
-       AMD_SCHED_PRIORITY_LOW = AMD_SCHED_PRIORITY_MIN,
-       AMD_SCHED_PRIORITY_NORMAL,
-       AMD_SCHED_PRIORITY_HIGH_SW,
-       AMD_SCHED_PRIORITY_HIGH_HW,
-       AMD_SCHED_PRIORITY_KERNEL,
-       AMD_SCHED_PRIORITY_MAX,
-       AMD_SCHED_PRIORITY_INVALID = -1,
-       AMD_SCHED_PRIORITY_UNSET = -2
-};
-
  /**
   * One scheduler is implemented for each hardware ring
  */
@@ -176,10 +178,4 @@ bool amd_sched_dependency_optimized(struct dma_fence* fence,
                                    struct amd_sched_entity *entity);
  void amd_sched_job_kickout(struct amd_sched_job *s_job);
-static inline enum amd_sched_priority
-amd_sched_get_job_priority(struct amd_sched_job *job)
-{
-       return (job->s_entity->rq - job->sched->sched_rq);
-}
-
  #endif


_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

--- End Message ---
--- Begin Message ---
Hi Monk,

You can enable KASAN to catch use-after-free cases by setting CONFIG_KASAN=y; the resulting calltrace is very clear and obvious.
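
A minimal sketch of the relevant options (option names as in the upstream Kconfig; exact dependencies vary by kernel version and architecture):

# .config fragment for KASAN (generic slab use-after-free detection)
CONFIG_KASAN=y
# Inline instrumentation is faster at run time but produces a larger
# image; CONFIG_KASAN_OUTLINE is the smaller alternative.
CONFIG_KASAN_INLINE=y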

Regards,

David Zhou


On 2017-10-24 18:06, Liu, Monk wrote:
Christian

Actually there are more wild-pointer issues for the entity in the scheduler's main routine ...

See the message I replied to David with.

BR Monk

-----Original Message-----
From: amd-gfx [mailto:amd-gfx-boun...@lists.freedesktop.org] On Behalf Of Christian König
Sent: 2017-10-24 18:01
To: Zhou, David(ChunMing) <david1.z...@amd.com>; amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH] drm/amd/scheduler: fix one use-after-free case for job->s_entity

Andrey already submitted a fix for this a few days ago.

Christian.

On 24.10.2017 at 11:55, Chunming Zhou wrote:
The process that s_entity represents could already be closed when
amdgpu_job_free_cb is called; s_entity becomes a dangling pointer once it
is freed. See the calltrace below:

[  355.616964] ==================================================================
[  355.617191] BUG: KASAN: use-after-free in amdgpu_job_free_cb+0x2f/0xc0 [amdgpu]
[  355.617197] Read of size 8 at addr ffff88039d593c40 by task kworker/9:1/100

[  355.617206] CPU: 9 PID: 100 Comm: kworker/9:1 Not tainted 4.13.0-custom #1
[  355.617208] Hardware name: Gigabyte Technology Co., Ltd. Default string/X99P-SLI-CF, BIOS F23 07/22/2016
[  355.617342] Workqueue: events amd_sched_job_finish [amdgpu]
[  355.617344] Call Trace:
[  355.617351]  dump_stack+0x63/0x8d
[  355.617356]  print_address_description+0x70/0x290
[  355.617474]  ? amdgpu_job_free_cb+0x2f/0xc0 [amdgpu]
[  355.617477]  kasan_report+0x265/0x350
[  355.617479]  __asan_load8+0x54/0x90
[  355.617603]  amdgpu_job_free_cb+0x2f/0xc0 [amdgpu]
[  355.617721]  amd_sched_job_finish+0x161/0x180 [amdgpu]
[  355.617725]  process_one_work+0x2ab/0x700
[  355.617727]  worker_thread+0x90/0x720
[  355.617731]  kthread+0x18c/0x1e0
[  355.617732]  ? process_one_work+0x700/0x700
[  355.617735]  ? kthread_create_on_node+0xb0/0xb0
[  355.617738]  ret_from_fork+0x25/0x30

[  355.617742] Allocated by task 1347:
[  355.617747]  save_stack_trace+0x1b/0x20
[  355.617749]  save_stack+0x46/0xd0
[  355.617751]  kasan_kmalloc+0xad/0xe0
[  355.617753]  kmem_cache_alloc_trace+0xef/0x200
[  355.617853]  amdgpu_driver_open_kms+0x98/0x290 [amdgpu]
[  355.617883]  drm_open+0x38c/0x6e0 [drm]
[  355.617908]  drm_stub_open+0x144/0x1b0 [drm]
[  355.617911]  chrdev_open+0x180/0x320
[  355.617913]  do_dentry_open+0x3a2/0x570
[  355.617915]  vfs_open+0x86/0xe0
[  355.617918]  path_openat+0x49e/0x1db0
[  355.617919]  do_filp_open+0x11c/0x1a0
[  355.617921]  do_sys_open+0x16f/0x2a0
[  355.617923]  SyS_open+0x1e/0x20
[  355.617926]  do_syscall_64+0xea/0x210
[  355.617928]  return_from_SYSCALL_64+0x0/0x6a

[  355.617931] Freed by task 1347:
[  355.617934]  save_stack_trace+0x1b/0x20
[  355.617936]  save_stack+0x46/0xd0
[  355.617937]  kasan_slab_free+0x70/0xc0
[  355.617939]  kfree+0x9d/0x1c0
[  355.618038]  amdgpu_driver_postclose_kms+0x1bc/0x3e0 [amdgpu]
[  355.618063]  drm_release+0x454/0x610 [drm]
[  355.618065]  __fput+0x177/0x350
[  355.618066]  ____fput+0xe/0x10
[  355.618068]  task_work_run+0xa0/0xc0
[  355.618070]  do_exit+0x456/0x1320
[  355.618072]  do_group_exit+0x86/0x130
[  355.618074]  SyS_exit_group+0x1d/0x20
[  355.618076]  do_syscall_64+0xea/0x210
[  355.618078]  return_from_SYSCALL_64+0x0/0x6a

[  355.618081] The buggy address belongs to the object at ffff88039d593b80
                which belongs to the cache kmalloc-2048 of size 2048
[  355.618085] The buggy address is located 192 bytes inside of
                2048-byte region [ffff88039d593b80, ffff88039d594380)
[  355.618087] The buggy address belongs to the page:
[  355.618091] page:ffffea000e756400 count:1 mapcount:0 mapping:          (null) index:0x0 compound_mapcount: 0
[  355.618095] flags: 0x2ffff0000008100(slab|head)
[  355.618099] raw: 02ffff0000008100 0000000000000000 0000000000000000 00000001000f000f
[  355.618103] raw: ffffea000edb0600 0000000200000002 ffff8803bfc0ea00 0000000000000000
[  355.618105] page dumped because: kasan: bad access detected

[  355.618108] Memory state around the buggy address:
[  355.618110]  ffff88039d593b00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
[  355.618113]  ffff88039d593b80: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
[  355.618116] >ffff88039d593c00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
[  355.618117]                                            ^
[  355.618120]  ffff88039d593c80: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
[  355.618122]  ffff88039d593d00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
[  355.618124] ==================================================================
[  355.618126] Disabling lock debugging due to kernel taint
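
Reading the trace: the job's free callback runs from a workqueue and dereferences an entity that was already freed on file close (see the "Freed by task 1347" stack, which goes through amdgpu_driver_postclose_kms). A minimal user-space sketch of that lifetime hazard (hypothetical code, not the driver's):

/* Minimal sketch, not driver code: a deferred callback must not keep a
 * borrowed pointer whose owner can free it first; cache the needed
 * value on the job instead. */
#include <stdio.h>
#include <stdlib.h>

struct entity { int priority; };

struct job {
	struct entity *s_entity;	/* borrowed; owner may free it first */
	int priority;			/* the fix: value cached at job init */
};

int main(void)
{
	struct entity *e = malloc(sizeof(*e));
	if (!e)
		return 1;
	e->priority = 3;

	struct job j = { .s_entity = e, .priority = e->priority };

	free(e);		/* process closes its context: entity is gone */
	j.s_entity = NULL;	/* dereferencing the stale pointer here would
				 * be exactly the use-after-free KASAN flags */

	printf("priority = %d\n", j.priority);	/* safe: cached copy */
	return 0;
}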

Change-Id: I8ff7122796b8cd16fc26e9c40e8d4c8153d67e0c
Signed-off-by: Chunming Zhou <david1.z...@amd.com>
---
   drivers/gpu/drm/amd/scheduler/gpu_scheduler.c |  1 +
   drivers/gpu/drm/amd/scheduler/gpu_scheduler.h | 27 ++++++++++++++-------------
   2 files changed, 15 insertions(+), 13 deletions(-)

diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
index 007fdbd..8101ed7 100644
--- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
+++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
@@ -535,6 +535,7 @@ int amd_sched_job_init(struct amd_sched_job *job,
        if (!job->s_fence)
                return -ENOMEM;
        job->id = atomic64_inc_return(&sched->job_id_count);
+       job->priority = job->s_entity->rq - job->sched->sched_rq;
        INIT_WORK(&job->finish_work, amd_sched_job_finish);
        INIT_LIST_HEAD(&job->node);
diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
index e21299c..8808eb1 100644
--- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
+++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
@@ -77,6 +77,18 @@ struct amd_sched_fence {
        void                            *owner;
   };
+enum amd_sched_priority {
+       AMD_SCHED_PRIORITY_MIN,
+       AMD_SCHED_PRIORITY_LOW = AMD_SCHED_PRIORITY_MIN,
+       AMD_SCHED_PRIORITY_NORMAL,
+       AMD_SCHED_PRIORITY_HIGH_SW,
+       AMD_SCHED_PRIORITY_HIGH_HW,
+       AMD_SCHED_PRIORITY_KERNEL,
+       AMD_SCHED_PRIORITY_MAX,
+       AMD_SCHED_PRIORITY_INVALID = -1,
+       AMD_SCHED_PRIORITY_UNSET = -2
+};
+
   struct amd_sched_job {
        struct amd_gpu_scheduler        *sched;
        struct amd_sched_entity         *s_entity;
@@ -87,6 +99,7 @@ struct amd_sched_job {
        struct delayed_work             work_tdr;
        uint64_t                        id;
        atomic_t                        karma;
+       enum amd_sched_priority         priority;
   };
 extern const struct dma_fence_ops amd_sched_fence_ops_scheduled;
@@ -118,18 +131,6 @@ struct amd_sched_backend_ops {
        void (*free_job)(struct amd_sched_job *sched_job);
   };
-enum amd_sched_priority {
-       AMD_SCHED_PRIORITY_MIN,
-       AMD_SCHED_PRIORITY_LOW = AMD_SCHED_PRIORITY_MIN,
-       AMD_SCHED_PRIORITY_NORMAL,
-       AMD_SCHED_PRIORITY_HIGH_SW,
-       AMD_SCHED_PRIORITY_HIGH_HW,
-       AMD_SCHED_PRIORITY_KERNEL,
-       AMD_SCHED_PRIORITY_MAX,
-       AMD_SCHED_PRIORITY_INVALID = -1,
-       AMD_SCHED_PRIORITY_UNSET = -2
-};
-
   /**
    * One scheduler is implemented for each hardware ring
   */
@@ -183,7 +184,7 @@ void amd_sched_job_kickout(struct amd_sched_job *s_job);
   static inline enum amd_sched_priority
   amd_sched_get_job_priority(struct amd_sched_job *job)
   {
-       return (job->s_entity->rq - job->sched->sched_rq);
+       return job->priority;
   }
#endif



--- End Message ---