The following lockdep splat was observed while the kernel was
auto-onlining a CXL memory region:
[ 51.926183] ======================================================
[ 51.933441] WARNING: possible circular locking dependency detected
[ 51.940701] 6.17.0djtest+ #53 Tainted: G W
[ 51.947290] ------------------------------------------------------
[ 51.954553] systemd-udevd/3334 is trying to acquire lock:
[ 51.960938] ffffffff90346188 (hmem_resource_lock){+.+.}-{4:4}, at:
hmem_register_resource+0x31/0x50
[ 51.971429]
but task is already holding lock:
[ 51.978548] ffffffff90338890 ((node_chain).rwsem){++++}-{4:4}, at:
blocking_notifier_call_chain+0x2e/0x70
[ 51.989621]
which lock already depends on the new lock.
[ 51.999605]
the existing dependency chain (in reverse order) is:
[ 52.008539]
-> #6 ((node_chain).rwsem){++++}-{4:4}:
[ 52.016195] down_read+0x45/0x190
[ 52.020789] blocking_notifier_call_chain+0x2e/0x70
[ 52.027131] node_notify+0x1f/0x30
[ 52.031809] online_pages+0xc1/0x330
[ 52.036684] memory_subsys_online+0x22a/0x280
[ 52.042431] device_online+0x50/0x90
[ 52.047298] state_store+0x9b/0xa0
[ 52.051956] dev_attr_store+0x18/0x30
[ 52.056907] sysfs_kf_write+0x4e/0x70
[ 52.061854] kernfs_fop_write_iter+0x187/0x260
[ 52.067673] vfs_write+0x21f/0x590
[ 52.072313] ksys_write+0x73/0xf0
[ 52.076854] __x64_sys_write+0x1d/0x30
[ 52.081874] x64_sys_call+0x7d/0x1d80
[ 52.086797] do_syscall_64+0x6c/0x2f0
[ 52.091717] entry_SYSCALL_64_after_hwframe+0x76/0x7e
[ 52.098198]
-> #5 (mem_hotplug_lock){++++}-{0:0}:
[ 52.105512] percpu_down_write+0x4b/0x260
[ 52.110825] try_online_node+0x21/0x50
[ 52.115844] cpu_up+0x43/0xd0
[ 52.119989] cpuhp_bringup_mask+0x60/0xa0
[ 52.125305] bringup_nonboot_cpus+0x76/0x110
[ 52.130912] smp_init+0x2e/0x90
[ 52.135235] kernel_init_freeable+0x19a/0x300
[ 52.140930] kernel_init+0x1e/0x140
[ 52.145635] ret_from_fork+0x159/0x200
[ 52.150633] ret_from_fork_asm+0x1a/0x30
[ 52.155826]
-> #4 (cpu_hotplug_lock){++++}-{0:0}:
[ 52.163081] __cpuhp_state_add_instance+0x51/0x200
[ 52.169238] iova_domain_init_rcaches+0x1ed/0x200
[ 52.175301] iommu_setup_dma_ops+0x1b4/0x500
[ 52.180877] bus_iommu_probe+0xd2/0x180
[ 52.185954] iommu_device_register+0x9f/0xe0
[ 52.191530] intel_iommu_init+0xd3b/0xf20
[ 52.196810] pci_iommu_init+0x16/0x40
[ 52.201695] do_one_initcall+0x5c/0x2d0
[ 52.206767] kernel_init_freeable+0x281/0x300
[ 52.212432] kernel_init+0x1e/0x140
[ 52.217109] ret_from_fork+0x159/0x200
[ 52.222082] ret_from_fork_asm+0x1a/0x30
[ 52.227253]
-> #3 (&group->mutex){+.+.}-{4:4}:
[ 52.234196] __mutex_lock+0xa9/0x11e0
[ 52.239066] mutex_lock_nested+0x1f/0x30
[ 52.244236] __iommu_probe_device+0x28c/0x5e0
[ 52.249893] probe_iommu_group+0x2f/0x50
[ 52.255064] bus_for_each_dev+0x7e/0xd0
[ 52.260126] bus_iommu_probe+0x3f/0x180
[ 52.265190] iommu_device_register+0x9f/0xe0
[ 52.270751] intel_iommu_init+0xd3b/0xf20
[ 52.276016] pci_iommu_init+0x16/0x40
[ 52.280892] do_one_initcall+0x5c/0x2d0
[ 52.285956] kernel_init_freeable+0x281/0x300
[ 52.291613] kernel_init+0x1e/0x140
[ 52.296284] ret_from_fork+0x159/0x200
[ 52.301253] ret_from_fork_asm+0x1a/0x30
[ 52.306421]
-> #2 (iommu_probe_device_lock){+.+.}-{4:4}:
[ 52.314333] __mutex_lock+0xa9/0x11e0
[ 52.319201] mutex_lock_nested+0x1f/0x30
[ 52.324372] iommu_probe_device+0x21/0x70
[ 52.329638] iommu_bus_notifier+0x2c/0x80
[ 52.334903] notifier_call_chain+0x4b/0x110
[ 52.340357] blocking_notifier_call_chain+0x4a/0x70
[ 52.346594] bus_notify+0x3b/0x50
[ 52.351079] device_add+0x65d/0x8b0
[ 52.355750] platform_device_add+0xf8/0x250
[ 52.361205] platform_device_register_full+0x154/0x1f0
[ 52.367739]
platform_device_register_simple.constprop.0.isra.0+0x37/0x50
[ 52.376119] efisubsys_init+0xaf/0x570
[ 52.381090] do_one_initcall+0x5c/0x2d0
[ 52.386152] kernel_init_freeable+0x281/0x300
[ 52.391809] kernel_init+0x1e/0x140
[ 52.396481] ret_from_fork+0x159/0x200
[ 52.401450] ret_from_fork_asm+0x1a/0x30
[ 52.406620]
-> #1 (&(&priv->bus_notifier)->rwsem){++++}-{4:4}:
[ 52.415109] down_read+0x45/0x190
[ 52.419593] blocking_notifier_call_chain+0x2e/0x70
[ 52.425828] bus_notify+0x3b/0x50
[ 52.430311] device_add+0x65d/0x8b0
[ 52.434981] platform_device_add+0xf8/0x250
[ 52.440435] __hmem_register_resource+0x70/0xc0
[ 52.446279] hmem_register_resource+0x3b/0x50
[ 52.451923] hmat_register_target+0x3c/0x190
[ 52.457488] hmat_init+0x13f/0x370
[ 52.462067] do_one_initcall+0x5c/0x2d0
[ 52.467132] kernel_init_freeable+0x281/0x300
[ 52.472790] kernel_init+0x1e/0x140
[ 52.477464] ret_from_fork+0x159/0x200
[ 52.482433] ret_from_fork_asm+0x1a/0x30
[ 52.487604]
-> #0 (hmem_resource_lock){+.+.}-{4:4}:
[ 52.495030] __lock_acquire+0x14a4/0x2290
[ 52.500290] lock_acquire+0xdd/0x2f0
[ 52.505070] __mutex_lock+0xa9/0x11e0
[ 52.509944] mutex_lock_nested+0x1f/0x30
[ 52.515115] hmem_register_resource+0x31/0x50
[ 52.520771] hmat_register_target+0x3c/0x190
[ 52.526319] hmat_callback+0x6b/0x80
[ 52.531098] notifier_call_chain+0x4b/0x110
[ 52.536552] blocking_notifier_call_chain+0x4a/0x70
[ 52.542788] node_notify+0x1f/0x30
[ 52.547369] online_pages+0x288/0x330
[ 52.552246] memory_subsys_online+0x22a/0x280
[ 52.557902] device_online+0x50/0x90
[ 52.562669] state_store+0x9b/0xa0
[ 52.567247] dev_attr_store+0x18/0x30
[ 52.572123] sysfs_kf_write+0x4e/0x70
[ 52.576998] kernfs_fop_write_iter+0x187/0x260
[ 52.582750] vfs_write+0x21f/0x590
[ 52.587327] ksys_write+0x73/0xf0
[ 52.591811] __x64_sys_write+0x1d/0x30
[ 52.596779] x64_sys_call+0x7d/0x1d80
[ 52.601653] do_syscall_64+0x6c/0x2f0
[ 52.606528] entry_SYSCALL_64_after_hwframe+0x76/0x7e
[ 52.612968]
other info that might help us debug this:
[ 52.622356] Chain exists of:
hmem_resource_lock --> mem_hotplug_lock --> (node_chain).rwsem
[ 52.635550] Possible unsafe locking scenario:
[ 52.642495] CPU0 CPU1
[ 52.647752] ---- ----
[ 52.653014] rlock((node_chain).rwsem);
[ 52.657589] lock(mem_hotplug_lock);
[ 52.664701] lock((node_chain).rwsem);
[ 52.672015] lock(hmem_resource_lock);
[ 52.676497]
*** DEADLOCK ***
[ 52.683541] 8 locks held by systemd-udevd/3334:
[ 52.688801] #0: ff36b6d49fbf0410 (sb_writers#3){.+.+}-{0:0}, at:
ksys_write+0x73/0xf0
[ 52.697870] #1: ff36b6d4ece03a88 (&of->mutex#2){+.+.}-{4:4}, at:
kernfs_fop_write_iter+0x12c/0x260
[ 52.708210] #2: ff36b6d4ece1cbb8 (kn->active#62){.+.+}-{0:0}, at:
kernfs_fop_write_iter+0x141/0x260
[ 52.718645] #3: ffffffff90333cc8 (device_hotplug_lock){+.+.}-{4:4}, at:
lock_device_hotplug_sysfs+0x1b/0x50
[ 52.729863] #4: ff36b6d4ece4b108 (&dev->mutex){....}-{4:4}, at:
device_online+0x23/0x90
[ 52.739130] #5: ffffffff900664d0 (cpu_hotplug_lock){++++}-{0:0}, at:
mem_hotplug_begin+0x12/0x30
[ 52.749288] #6: ffffffff9024c810 (mem_hotplug_lock){++++}-{0:0}, at:
mem_hotplug_begin+0x1e/0x30
[ 52.759446] #7: ffffffff90338890 ((node_chain).rwsem){++++}-{4:4}, at:
blocking_notifier_call_chain+0x2e/0x70
[ 52.770860]
stack backtrace:
[ 52.776068] CPU: 0 UID: 0 PID: 3334 Comm: systemd-udevd Tainted: G W
6.17.0djtest+ #53 PREEMPT(voluntary)
[ 52.776071] Tainted: [W]=WARN
[ 52.776072] Hardware name: Intel Corporation AvenueCity/AvenueCity, BIOS
BHSDCRB1.IPC.3545.P03.2509232237 09/23/2025
[ 52.776073] Call Trace:
[ 52.776074] <TASK>
[ 52.776076] dump_stack_lvl+0x72/0xa0
[ 52.776080] dump_stack+0x14/0x1a
[ 52.776082] print_circular_bug.cold+0x188/0x1c6
[ 52.776084] check_noncircular+0x12f/0x160
[ 52.776087] ? __lock_acquire+0x486/0x2290
[ 52.776089] ? __lock_acquire+0x486/0x2290
[ 52.776091] __lock_acquire+0x14a4/0x2290
[ 52.776095] lock_acquire+0xdd/0x2f0
[ 52.776096] ? hmem_register_resource+0x31/0x50
[ 52.776100] ? hmem_register_resource+0x31/0x50
[ 52.776101] __mutex_lock+0xa9/0x11e0
[ 52.776104] ? hmem_register_resource+0x31/0x50
[ 52.776104] ? __kernfs_create_file+0xb5/0x110
[ 52.776110] mutex_lock_nested+0x1f/0x30
[ 52.776112] ? mutex_lock_nested+0x1f/0x30
[ 52.776114] hmem_register_resource+0x31/0x50
[ 52.776115] hmat_register_target+0x3c/0x190
[ 52.776119] hmat_callback+0x6b/0x80
[ 52.776120] notifier_call_chain+0x4b/0x110
[ 52.776123] blocking_notifier_call_chain+0x4a/0x70
[ 52.776125] node_notify+0x1f/0x30
[ 52.776126] online_pages+0x288/0x330
[ 52.776129] memory_subsys_online+0x22a/0x280
[ 52.776132] device_online+0x50/0x90
[ 52.776134] state_store+0x9b/0xa0
[ 52.776136] dev_attr_store+0x18/0x30
[ 52.776137] sysfs_kf_write+0x4e/0x70
[ 52.776139] kernfs_fop_write_iter+0x187/0x260
[ 52.776142] vfs_write+0x21f/0x590
[ 52.776146] ksys_write+0x73/0xf0
[ 52.776148] __x64_sys_write+0x1d/0x30
[ 52.776150] x64_sys_call+0x7d/0x1d80
[ 52.776152] do_syscall_64+0x6c/0x2f0
[ 52.776154] entry_SYSCALL_64_after_hwframe+0x76/0x7e
[ 52.776156] RIP: 0033:0x7f11142fda57
[ 52.776158] Code: 0f 00 f7 d8 64 89 02 48 c7 c0 ff ff ff ff eb b7 0f 1f 00
f3 0f 1e fa 64 8b 04 25 18 00 00 00 85 c0 75 10 b8 01 00 00 00 0f 05 <48> 3d 00
f0 ff ff 77 51 c3 48 83 ec 28 48 89 54 24 18 48 89 74 24
[ 52.776160] RSP: 002b:00007ffd0bd530f8 EFLAGS: 00000246 ORIG_RAX:
0000000000000001
[ 52.776163] RAX: ffffffffffffffda RBX: 000000000000000e RCX: 00007f11142fda57
[ 52.776164] RDX: 000000000000000e RSI: 00007ffd0bd537c0 RDI: 0000000000000006
[ 52.776166] RBP: 00007ffd0bd537c0 R08: 00007f11143f70a0 R09: 00007ffd0bd53190
[ 52.776167] R10: 0000000000000000 R11: 0000000000000246 R12: 000000000000000e
[ 52.776168] R13: 000055814e03e780 R14: 000000000000000e R15: 00007f11143f69e0
[ 52.776171] </TASK>
This lock ordering can lead to a deadlock. In the memory online path,
hmem_resource_lock is taken while (node_chain).rwsem is held, whereas the
init path holds hmem_resource_lock across platform_device_add(), which
establishes the reverse dependency through the chain shown above. Narrow
the scope of hmem_resource_lock in hmem_register_resource() to avoid the
circular locking dependency. The lock is only needed to protect
hmem_active.
Fixes: 7dab174e2e27 ("dax/hmem: Move hmem device registration to dax_hmem.ko")
Signed-off-by: Dave Jiang <[email protected]>
---
drivers/dax/hmem/device.c | 42 +++++++++++++++++++++++----------------
1 file changed, 25 insertions(+), 17 deletions(-)
diff --git a/drivers/dax/hmem/device.c b/drivers/dax/hmem/device.c
index f9e1a76a04a9..ab5977d75d1f 100644
--- a/drivers/dax/hmem/device.c
+++ b/drivers/dax/hmem/device.c
@@ -33,21 +33,37 @@ int walk_hmem_resources(struct device *host, walk_hmem_fn
fn)
}
EXPORT_SYMBOL_GPL(walk_hmem_resources);
-static void __hmem_register_resource(int target_nid, struct resource *res)
+static struct resource *hmem_request_resource(int target_nid,
+ struct resource *res)
{
- struct platform_device *pdev;
struct resource *new;
- int rc;
- new = __request_region(&hmem_active, res->start, resource_size(res), "",
- 0);
+ guard(mutex)(&hmem_resource_lock);
+ new = __request_region(&hmem_active, res->start,
+ resource_size(res), "", 0);
if (!new) {
pr_debug("hmem range %pr already active\n", res);
- return;
+ return ERR_PTR(-ENOMEM);
}
new->desc = target_nid;
+ return new;
+}
+
+void hmem_register_resource(int target_nid, struct resource *res)
+{
+ struct platform_device *pdev;
+ struct resource *new;
+ int rc;
+
+ if (nohmem)
+ return;
+
+ new = hmem_request_resource(target_nid, res);
+ if (IS_ERR(new))
+ return;
+
if (platform_initialized)
return;
@@ -58,20 +74,12 @@ static void __hmem_register_resource(int target_nid, struct
resource *res)
}
rc = platform_device_add(pdev);
- if (rc)
+ if (rc) {
platform_device_put(pdev);
- else
- platform_initialized = true;
-}
-
-void hmem_register_resource(int target_nid, struct resource *res)
-{
- if (nohmem)
return;
+ }
- mutex_lock(&hmem_resource_lock);
- __hmem_register_resource(target_nid, res);
- mutex_unlock(&hmem_resource_lock);
+ platform_initialized = true;
}
static __init int hmem_register_one(struct resource *res, void *data)
base-commit: e5f0a698b34ed76002dc5cff3804a61c80233a7a
--
2.51.0