On Tue, Jul 12, 2016 at 06:49:34PM +0200, Peter Wu wrote:
> The FBIOPUT_CON2FBMAP ioctl takes a console_lock(). When this is called
> while nouveau was runtime suspended, a deadlock would occur due to
> nouveau_fbcon_set_suspend also trying to obtain console_lock().
> 
> Fix this by delaying the drm_fb_helper_set_suspend call. Based on the
> i915 code (which was done for performance reasons though).
> 
> Cc: Chris Wilson <chris at chris-wilson.co.uk>
> Cc: Daniel Vetter <daniel.vetter at ffwll.ch>
> Signed-off-by: Peter Wu <peter at lekensteyn.nl>
> ---
> Tested on top of v4.7-rc5, the deadlock is gone.
> ---
>  drivers/gpu/drm/nouveau/nouveau_drm.c   |  4 +--
>  drivers/gpu/drm/nouveau/nouveau_drv.h   |  1 +
>  drivers/gpu/drm/nouveau/nouveau_fbcon.c | 54 
> ++++++++++++++++++++++++++++-----
>  drivers/gpu/drm/nouveau/nouveau_fbcon.h |  2 +-
>  4 files changed, 50 insertions(+), 11 deletions(-)
> 
> diff --git a/drivers/gpu/drm/nouveau/nouveau_drm.c 
> b/drivers/gpu/drm/nouveau/nouveau_drm.c
> index 11f8dd9..f9a2c10 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_drm.c
> +++ b/drivers/gpu/drm/nouveau/nouveau_drm.c
> @@ -552,7 +552,7 @@ nouveau_do_suspend(struct drm_device *dev, bool runtime)
>  
>       if (dev->mode_config.num_crtc) {
>               NV_INFO(drm, "suspending console...\n");
> -             nouveau_fbcon_set_suspend(dev, 1);
> +             nouveau_fbcon_set_suspend(dev, FBINFO_STATE_SUSPENDED, true);
>               NV_INFO(drm, "suspending display...\n");
>               ret = nouveau_display_suspend(dev, runtime);
>               if (ret)
> @@ -635,7 +635,7 @@ nouveau_do_resume(struct drm_device *dev, bool runtime)
>               NV_INFO(drm, "resuming display...\n");
>               nouveau_display_resume(dev, runtime);
>               NV_INFO(drm, "resuming console...\n");
> -             nouveau_fbcon_set_suspend(dev, 0);
> +             nouveau_fbcon_set_suspend(dev, FBINFO_STATE_RUNNING, false);
>       }
>  
>       return 0;
> diff --git a/drivers/gpu/drm/nouveau/nouveau_drv.h 
> b/drivers/gpu/drm/nouveau/nouveau_drv.h
> index 822a021..a743d19 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_drv.h
> +++ b/drivers/gpu/drm/nouveau/nouveau_drv.h
> @@ -147,6 +147,7 @@ struct nouveau_drm {
>       struct nouveau_channel *channel;
>       struct nvkm_gpuobj *notify;
>       struct nouveau_fbdev *fbcon;
> +     struct work_struct fbdev_suspend_work;
>       struct nvif_object nvsw;
>       struct nvif_object ntfy;
>       struct nvif_notify flip;
> diff --git a/drivers/gpu/drm/nouveau/nouveau_fbcon.c 
> b/drivers/gpu/drm/nouveau/nouveau_fbcon.c
> index d1f248f..089156a 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_fbcon.c
> +++ b/drivers/gpu/drm/nouveau/nouveau_fbcon.c
> @@ -492,19 +492,53 @@ static const struct drm_fb_helper_funcs 
> nouveau_fbcon_helper_funcs = {
>       .fb_probe = nouveau_fbcon_create,
>  };
>  
> +static void nouveau_fbcon_suspend_worker(struct work_struct *work)
> +{
> +     nouveau_fbcon_set_suspend(container_of(work,
> +                                            struct nouveau_drm,
> +                                            fbdev_suspend_work)->dev,
> +                               FBINFO_STATE_RUNNING,
> +                               true);
> +}
> +
>  void
> -nouveau_fbcon_set_suspend(struct drm_device *dev, int state)
> +nouveau_fbcon_set_suspend(struct drm_device *dev, int state, bool 
> synchronous)
>  {
>       struct nouveau_drm *drm = nouveau_drm(dev);
> -     if (drm->fbcon) {
> -             console_lock();
> -             if (state == FBINFO_STATE_RUNNING)
> -                     nouveau_fbcon_accel_restore(dev);
> -             drm_fb_helper_set_suspend(&drm->fbcon->helper, state);
> +     if (!drm->fbcon)
> +             return;
> +
> +     if (synchronous) {
> +             /* Flush any pending work to turn the console on, and then
> +              * wait to turn it off. It must be synchronous as we are
> +              * about to suspend or unload the driver.
> +              *
> +              * Note that from within the work-handler, we cannot flush
> +              * ourselves, so only flush outstanding work upon suspend!
> +              */
>               if (state != FBINFO_STATE_RUNNING)
> -                     nouveau_fbcon_accel_save_disable(dev);
> -             console_unlock();
> +                     flush_work(&drm->fbdev_suspend_work);
> +             console_lock();
> +     } else {
> +             /*
> +              * The console lock can be pretty contented on resume due
> +              * to all the printk activity.  Try to keep it out of the hot
> +              * path of resume if possible.  This also prevents a deadlock
> +              * with FBIOPUT_CON2FBMAP.
> +              */
> +             WARN_ON(state != FBINFO_STATE_RUNNING);
> +             if (!console_trylock()) {
> +                     schedule_work(&drm->fbdev_suspend_work);
> +                     return;
> +             }
>       }
> +
> +     if (state == FBINFO_STATE_RUNNING)
> +             nouveau_fbcon_accel_restore(dev);
> +     drm_fb_helper_set_suspend(&drm->fbcon->helper, state);
> +     if (state != FBINFO_STATE_RUNNING)
> +             nouveau_fbcon_accel_save_disable(dev);
> +     console_unlock();
>  }
>  
>  int
> @@ -526,6 +560,8 @@ nouveau_fbcon_init(struct drm_device *dev)
>       fbcon->dev = dev;
>       drm->fbcon = fbcon;
>  
> +     INIT_WORK(&drm->fbdev_suspend_work, nouveau_fbcon_suspend_worker);
> +
>       drm_fb_helper_prepare(dev, &fbcon->helper, &nouveau_fbcon_helper_funcs);
>  
>       ret = drm_fb_helper_init(dev, &fbcon->helper,
> @@ -571,6 +607,8 @@ nouveau_fbcon_fini(struct drm_device *dev)
>       if (!drm->fbcon)
>               return;
>  
> +     flush_work(&drm->fbdev_suspend_work);

Hmm, since suspend_work can theoretically rearm itself, this should be
cancel_work_sync().

The copy'n'paste of the code looks fine, so (other than the bug copied
across):

Reviewed-by: Chris Wilson <chris at chris-wilson.co.uk>

Whether you can convince the maintainers on the basis of this being a
deadlock fix is another matter...

I did test this patch, since I see a livelock on resume, but not the
same console deadlock. Just in case anyone is interested:

Jul 13 17:05:59 acer kernel: [24873.945839] NMI watchdog: BUG: soft lockup - 
CPU#2 stuck for 22s! [kworker/2:1:8370]
Jul 13 17:05:59 acer kernel: [24873.946563] Modules linked in: rfcomm drbg 
ansi_cprng ctr ccm arc4 bnep ath10k_pci ath10k_core snd_hda_codec_hdmi 
snd_hda_codec_realtek ath 
snd_hda_codec_generic snd_hda_intel mac80211 snd_hda_codec binfmt_misc snd_hda_core 
nls_iso8859_1 snd_hwdep btusb btrtl snd_pcm btbcm rtsx_usb_ms btintel 
x86_pkg_temp_thermal uvcvideo acer_wmi
 intel_powerclamp memstick snd_seq_midi bluetooth sparse_keymap 
snd_seq_midi_event coretemp videobuf2_vmalloc videobuf2_memops snd_rawmidi 
kvm_intel videobuf2_v4l2 kvm cfg80211 
videobuf2_core snd_seq videodev irqbypass snd_seq_device snd_timer media 
crct10dif_pclmul snd crc32_pclmul hid_multitouch ghash_clmulni_intel 
aesni_intel joydev aes_x86_64 lrw gf128mul 
glue_helper ablk_helper cryptd soundcore ie31200_edac edac_core mei_me shpchp mei 
input_leds acpi_als serio_raw kfifo_buf lpc_ich industrialio soc_button_array 
mac_hid parport_pc ppdev
 lp parport autofs4 btrfs xor raid6_pq dm_mirror dm_region_hash dm_log nouveau 
rtsx_usb_sdmmc rtsx_usb hid_generic usbhid hid i915 broadcom bcm_phy_lib 
mxm_wmi ttm i2c_algo_bit 
drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops drm tg3 ahci ptp 
libahci pps_core video wmi fjes
Jul 13 17:05:59 acer kernel: [24873.946598] CPU: 2 PID: 8370 Comm: kworker/2:1 
Tainted: G             L  4.7.0-rc6+ #4
Jul 13 17:05:59 acer kernel: [24873.946599] Hardware name: Acer Aspire 
VN7-791G/Aspire VN7-791G, BIOS V1.11 01/09/2015
Jul 13 17:05:59 acer kernel: [24873.946603] Workqueue: pm pm_runtime_work
Jul 13 17:05:59 acer kernel: [24873.946604] task: ffff880023c59c80 ti: 
ffff880024058000 task.ti: ffff880024058000
Jul 13 17:05:59 acer kernel: [24873.946605] RIP: 0010:[<ffffffff8140a6c0>]  
[<ffffffff8140a6c0>] ioread32+0x30/0x40
Jul 13 17:05:59 acer kernel: [24873.946608] RSP: 0018:ffff88002405baf0  EFLAGS: 
00000296
Jul 13 17:05:59 acer kernel: [24873.946609] RAX: 00000000ffffffff RBX: 
ffff88041734a000 RCX: 0000000000000018
Jul 13 17:05:59 acer kernel: [24873.946610] RDX: 0012230aadf99e58 RSI: 
ffffc9000410a014 RDI: ffffc90004009410
Jul 13 17:05:59 acer kernel: [24873.946610] RBP: ffff88002405bb10 R08: 
0000000000000009 R09: ffff880416ce0000
Jul 13 17:05:59 acer kernel: [24873.946611] R10: 000000000000000a R11: 
0000000000000001 R12: 00000000ffffffff
Jul 13 17:05:59 acer kernel: [24873.946612] R13: 00000000ffffffff R14: 
ffff880415de6600 R15: ffffffffffffffff
Jul 13 17:05:59 acer kernel: [24873.946613] FS:  0000000000000000(0000) 
GS:ffff88045f280000(0000) knlGS:0000000000000000
Jul 13 17:05:59 acer kernel: [24873.946614] CS:  0010 DS: 0000 ES: 0000 CR0: 
0000000080050033
Jul 13 17:05:59 acer kernel: [24873.946614] CR2: 00007f566a5ac010 CR3: 
0000000002e06000 CR4: 00000000001406e0
Jul 13 17:05:59 acer kernel: [24873.946615] Stack:
Jul 13 17:05:59 acer kernel: [24873.946616]  ffffffffc03bee71 ffff88041734a000 
0000000000000000 ffff880415de7908
Jul 13 17:05:59 acer kernel: [24873.946617]  ffff88002405bb20 ffffffffc03be9bf 
ffff88002405bb58 ffffffffc03b7340
Jul 13 17:05:59 acer kernel: [24873.946618]  ffff880415de7908 ffff88041734a000 
00000414ef5b0e40 0000000000000011
Jul 13 17:05:59 acer kernel: [24873.946620] Call Trace:
Jul 13 17:05:59 acer kernel: [24873.946645]  [<ffffffffc03bee71>] ? 
nv04_timer_read+0x51/0x70 [nouveau]
Jul 13 17:05:59 acer kernel: [24873.946661]  [<ffffffffc03be9bf>] 
nvkm_timer_read+0xf/0x20 [nouveau]
Jul 13 17:05:59 acer kernel: [24873.946676]  [<ffffffffc03b7340>] 
nvkm_pmu_init+0x50/0x450 [nouveau]
Jul 13 17:05:59 acer kernel: [24873.946685]  [<ffffffffc0371ac1>] 
nvkm_subdev_init+0x91/0x200 [nouveau]
Jul 13 17:05:59 acer kernel: [24873.946701]  [<ffffffffc03c2f26>] 
nvkm_device_init+0x146/0x280 [nouveau]
Jul 13 17:05:59 acer kernel: [24873.946715]  [<ffffffffc03c6a18>] 
nvkm_udevice_init+0x48/0x60 [nouveau]
Jul 13 17:05:59 acer kernel: [24873.946724]  [<ffffffffc0370440>] 
nvkm_object_init+0x40/0x190 [nouveau]
Jul 13 17:05:59 acer kernel: [24873.946733]  [<ffffffffc03704b4>] 
nvkm_object_init+0xb4/0x190 [nouveau]
Jul 13 17:05:59 acer kernel: [24873.946742]  [<ffffffffc036d56e>] 
nvkm_client_init+0xe/0x10 [nouveau]
Jul 13 17:05:59 acer kernel: [24873.946758]  [<ffffffffc040ac0e>] 
nvkm_client_resume+0xe/0x10 [nouveau]
Jul 13 17:05:59 acer kernel: [24873.946767]  [<ffffffffc036c7c7>] 
nvif_client_resume+0x17/0x20 [nouveau]
Jul 13 17:05:59 acer kernel: [24873.946782]  [<ffffffffc04082fb>] 
nouveau_do_resume+0x4b/0x130 [nouveau]
Jul 13 17:05:59 acer kernel: [24873.946797]  [<ffffffffc0408709>] 
nouveau_pmops_runtime_resume+0x79/0x120 [nouveau]
Jul 13 17:05:59 acer kernel: [24873.946800]  [<ffffffff814448eb>] 
pci_pm_runtime_resume+0x7b/0xa0
Jul 13 17:05:59 acer kernel: [24873.946801]  [<ffffffff815676d3>] 
__rpm_callback+0x33/0x70
Jul 13 17:05:59 acer kernel: [24873.946803]  [<ffffffff81444870>] ? 
pci_restore_standard_config+0x40/0x40
Jul 13 17:05:59 acer kernel: [24873.946804]  [<ffffffff81567734>] 
rpm_callback+0x24/0x80
Jul 13 17:05:59 acer kernel: [24873.946806]  [<ffffffff81444870>] ? 
pci_restore_standard_config+0x40/0x40
Jul 13 17:05:59 acer kernel: [24873.946807]  [<ffffffff81567ee1>] 
rpm_resume+0x491/0x690
Jul 13 17:05:59 acer kernel: [24873.946808]  [<ffffffff81568f08>] 
pm_runtime_work+0x58/0xa0
Jul 13 17:05:59 acer kernel: [24873.946811]  [<ffffffff8109adbb>] 
process_one_work+0x16b/0x480
Jul 13 17:05:59 acer kernel: [24873.946812]  [<ffffffff8109b11b>] 
worker_thread+0x4b/0x500
Jul 13 17:05:59 acer kernel: [24873.946814]  [<ffffffff8109b0d0>] ? 
process_one_work+0x480/0x480
Jul 13 17:05:59 acer kernel: [24873.946815]  [<ffffffff8109b0d0>] ? 
process_one_work+0x480/0x480
Jul 13 17:05:59 acer kernel: [24873.946816]  [<ffffffff810a1348>] 
kthread+0xd8/0xf0
Jul 13 17:05:59 acer kernel: [24873.946818]  [<ffffffff81845fdf>] 
ret_from_fork+0x1f/0x40
Jul 13 17:05:59 acer kernel: [24873.946819]  [<ffffffff810a1270>] ? 
kthread_create_on_node+0x1a0/0x1a0
Jul 13 17:05:59 acer kernel: [24873.946820] Code: 03 00 77 25 48 81 ff 00 00 01 
00 76 05 0f b7 d7 ed c3 55 48 c7 c6 e4 36 cc 81 48 89 e5 e8 19 ff ff ff b8 ff 
ff ff ff 5d c3 8b 07 <c3
> 0f 1f 44 00 00 66 2e 0f 1f 84 00 00 00 00 00 48 81 fe ff ff 
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre

Reply via email to