On Mon, Dec 01, 2025 at 10:13:10PM +0100, Mikulas Patocka wrote: > There is reported 'scheduling while atomic' bug when using dm-snapshot on > real-time kernels. The reason for the bug is that the hlist_bl code does > preempt_disable() when taking the lock and the kernel attempts to take > other spinlocks while holding the hlist_bl lock. > > Fix this by converting a hlist_bl spinlock into a regular spinlock.
This looks correct, but it will either cause an increase in the memory used, or a possible performance hit due to lock contention on a smaller number of hash buckets, depending on whether the number of hash buckets calculated by the device/chunk size is larger than the new, smaller calc_max_buckets() limit. Since calc_max_buckets() limits the memory use to 2MB, I suspect that for most devices, this will result in fewer buckets (a 2G cow device with a 16K chunk size on a 64bit machine would hit the calc_max_buckets() limit of 262144 buckets before. Depending on how the kernel is configured, the limit is now likely half that, so cow devices of over 1G are likely to start having fewer buckets). I wonder if we should double the calc_max_buckets() memory limit, or perhaps play some #ifdef games so that only real-time kernels are affected by this change. -Ben > Signed-off-by: Mikulas Patocka <[email protected]> > > --- > drivers/md/dm-exception-store.h | 2 - > drivers/md/dm-snap.c | 73 > ++++++++++++++++++---------------------- > 2 files changed, 35 insertions(+), 40 deletions(-) > > Index: linux-2.6/drivers/md/dm-exception-store.h > =================================================================== > --- linux-2.6.orig/drivers/md/dm-exception-store.h 2023-09-05 > 14:45:58.000000000 +0200 > +++ linux-2.6/drivers/md/dm-exception-store.h 2025-12-01 16:46:21.000000000 > +0100 > @@ -29,7 +29,7 @@ typedef sector_t chunk_t; > * chunk within the device. 
> */ > struct dm_exception { > - struct hlist_bl_node hash_list; > + struct hlist_node hash_list; > > chunk_t old_chunk; > chunk_t new_chunk; > Index: linux-2.6/drivers/md/dm-snap.c > =================================================================== > --- linux-2.6.orig/drivers/md/dm-snap.c 2025-11-30 21:47:53.000000000 > +0100 > +++ linux-2.6/drivers/md/dm-snap.c 2025-12-01 21:48:42.000000000 +0100 > @@ -40,10 +40,15 @@ static const char dm_snapshot_merge_targ > #define DM_TRACKED_CHUNK_HASH(x) ((unsigned long)(x) & \ > (DM_TRACKED_CHUNK_HASH_SIZE - 1)) > > +struct dm_hlist_head { > + struct hlist_head head; > + spinlock_t lock; > +}; > + > struct dm_exception_table { > uint32_t hash_mask; > unsigned int hash_shift; > - struct hlist_bl_head *table; > + struct dm_hlist_head *table; > }; > > struct dm_snapshot { > @@ -628,8 +633,8 @@ static uint32_t exception_hash(struct dm > > /* Lock to protect access to the completed and pending exception hash > tables. */ > struct dm_exception_table_lock { > - struct hlist_bl_head *complete_slot; > - struct hlist_bl_head *pending_slot; > + spinlock_t *complete_slot; > + spinlock_t *pending_slot; > }; > > static void dm_exception_table_lock_init(struct dm_snapshot *s, chunk_t > chunk, > @@ -638,20 +643,20 @@ static void dm_exception_table_lock_init > struct dm_exception_table *complete = &s->complete; > struct dm_exception_table *pending = &s->pending; > > - lock->complete_slot = &complete->table[exception_hash(complete, chunk)]; > - lock->pending_slot = &pending->table[exception_hash(pending, chunk)]; > + lock->complete_slot = &complete->table[exception_hash(complete, > chunk)].lock; > + lock->pending_slot = &pending->table[exception_hash(pending, > chunk)].lock; > } > > static void dm_exception_table_lock(struct dm_exception_table_lock *lock) > { > - hlist_bl_lock(lock->complete_slot); > - hlist_bl_lock(lock->pending_slot); > + spin_lock_nested(lock->complete_slot, 1); > + spin_lock_nested(lock->pending_slot, 2); > } > > 
static void dm_exception_table_unlock(struct dm_exception_table_lock *lock) > { > - hlist_bl_unlock(lock->pending_slot); > - hlist_bl_unlock(lock->complete_slot); > + spin_unlock(lock->pending_slot); > + spin_unlock(lock->complete_slot); > } > > static int dm_exception_table_init(struct dm_exception_table *et, > @@ -661,13 +666,15 @@ static int dm_exception_table_init(struc > > et->hash_shift = hash_shift; > et->hash_mask = size - 1; > - et->table = kvmalloc_array(size, sizeof(struct hlist_bl_head), > + et->table = kvmalloc_array(size, sizeof(struct dm_hlist_head), > GFP_KERNEL); > if (!et->table) > return -ENOMEM; > > - for (i = 0; i < size; i++) > - INIT_HLIST_BL_HEAD(et->table + i); > + for (i = 0; i < size; i++) { > + INIT_HLIST_HEAD(&et->table[i].head); > + spin_lock_init(&et->table[i].lock); > + } > > return 0; > } > @@ -675,16 +682,17 @@ static int dm_exception_table_init(struc > static void dm_exception_table_exit(struct dm_exception_table *et, > struct kmem_cache *mem) > { > - struct hlist_bl_head *slot; > + struct dm_hlist_head *slot; > struct dm_exception *ex; > - struct hlist_bl_node *pos, *n; > + struct hlist_node *pos; > int i, size; > > size = et->hash_mask + 1; > for (i = 0; i < size; i++) { > slot = et->table + i; > > - hlist_bl_for_each_entry_safe(ex, pos, n, slot, hash_list) { > + hlist_for_each_entry_safe(ex, pos, &slot->head, hash_list) { > + hlist_del(&ex->hash_list); > kmem_cache_free(mem, ex); > cond_resched(); > } > @@ -700,7 +708,7 @@ static uint32_t exception_hash(struct dm > > static void dm_remove_exception(struct dm_exception *e) > { > - hlist_bl_del(&e->hash_list); > + hlist_del(&e->hash_list); > } > > /* > @@ -710,12 +718,11 @@ static void dm_remove_exception(struct d > static struct dm_exception *dm_lookup_exception(struct dm_exception_table > *et, > chunk_t chunk) > { > - struct hlist_bl_head *slot; > - struct hlist_bl_node *pos; > + struct hlist_head *slot; > struct dm_exception *e; > > - slot = &et->table[exception_hash(et, 
chunk)]; > - hlist_bl_for_each_entry(e, pos, slot, hash_list) > + slot = &et->table[exception_hash(et, chunk)].head; > + hlist_for_each_entry(e, slot, hash_list) > if (chunk >= e->old_chunk && > chunk <= e->old_chunk + dm_consecutive_chunk_count(e)) > return e; > @@ -762,18 +769,17 @@ static void free_pending_exception(struc > static void dm_insert_exception(struct dm_exception_table *eh, > struct dm_exception *new_e) > { > - struct hlist_bl_head *l; > - struct hlist_bl_node *pos; > + struct hlist_head *l; > struct dm_exception *e = NULL; > > - l = &eh->table[exception_hash(eh, new_e->old_chunk)]; > + l = &eh->table[exception_hash(eh, new_e->old_chunk)].head; > > /* Add immediately if this table doesn't support consecutive chunks */ > if (!eh->hash_shift) > goto out; > > /* List is ordered by old_chunk */ > - hlist_bl_for_each_entry(e, pos, l, hash_list) { > + hlist_for_each_entry(e, l, hash_list) { > /* Insert after an existing chunk? */ > if (new_e->old_chunk == (e->old_chunk + > dm_consecutive_chunk_count(e) + 1) && > @@ -804,13 +810,13 @@ out: > * Either the table doesn't support consecutive chunks or slot > * l is empty. 
> */ > - hlist_bl_add_head(&new_e->hash_list, l); > + hlist_add_head(&new_e->hash_list, l); > } else if (new_e->old_chunk < e->old_chunk) { > /* Add before an existing exception */ > - hlist_bl_add_before(&new_e->hash_list, &e->hash_list); > + hlist_add_before(&new_e->hash_list, &e->hash_list); > } else { > /* Add to l's tail: e is the last exception in this slot */ > - hlist_bl_add_behind(&new_e->hash_list, &e->hash_list); > + hlist_add_behind(&new_e->hash_list, &e->hash_list); > } > } > > @@ -820,7 +826,6 @@ out: > */ > static int dm_add_exception(void *context, chunk_t old, chunk_t new) > { > - struct dm_exception_table_lock lock; > struct dm_snapshot *s = context; > struct dm_exception *e; > > @@ -833,17 +838,7 @@ static int dm_add_exception(void *contex > /* Consecutive_count is implicitly initialised to zero */ > e->new_chunk = new; > > - /* > - * Although there is no need to lock access to the exception tables > - * here, if we don't then hlist_bl_add_head(), called by > - * dm_insert_exception(), will complain about accessing the > - * corresponding list without locking it first. > - */ > - dm_exception_table_lock_init(s, old, &lock); > - > - dm_exception_table_lock(&lock); > dm_insert_exception(&s->complete, e); > - dm_exception_table_unlock(&lock); > > return 0; > } > @@ -873,7 +868,7 @@ static int calc_max_buckets(void) > /* use a fixed size of 2MB */ > unsigned long mem = 2 * 1024 * 1024; > > - mem /= sizeof(struct hlist_bl_head); > + mem /= sizeof(struct dm_hlist_head); > > return mem; > }
