offline_and_remove_memory() handles a single contiguous range.
Callers that manage a device composed of several ranges (dax/kmem)
currently have to call it in a loop, which gives up atomicity.
In addition to pushing rollback logic into the driver, the lack
of atomicity creates a race condition between system daemons trying
to manage the same resource:
- Manager 1: Offlines memory blocks. Removes device.
^^^^
- Manager 2: Detects offline memory blocks, re-onlines them.
Add offline_and_remove_memory_ranges(), which takes an array of ranges
and processes them as one operation under a single lock_device_hotplug():
- Phase 1 offlines every block of every range.
- Phase 2 removes the ranges only if all ranges are offline.
- If any offline fails, the whole operation is reverted.
This gives callers all-or-nothing semantics for the offline step, so a
failed or interrupted unplug leaves the device in a consistent state.
This also resolves the battling managers race - the second manager's
operation simply fails when the block is destroyed / cannot be onlined.
offline_and_remove_memory() becomes a thin wrapper that passes its single
range to the new helper, so the offline/rollback logic lives in one place.
Suggested-by: David Hildenbrand (Arm) <[email protected]>
Signed-off-by: Gregory Price <[email protected]>
---
include/linux/memory_hotplug.h | 8 +++
mm/memory_hotplug.c | 93 ++++++++++++++++++++++++----------
2 files changed, 73 insertions(+), 28 deletions(-)
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index ff3b865ea7e7..db10d50f30ae 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -268,6 +268,8 @@ extern int offline_pages(unsigned long start_pfn, unsigned
long nr_pages,
extern int remove_memory(u64 start, u64 size);
extern void __remove_memory(u64 start, u64 size);
extern int offline_and_remove_memory(u64 start, u64 size);
+int offline_and_remove_memory_ranges(const struct range *ranges,
+ unsigned int nr_ranges);
#else
static inline void try_offline_node(int nid) {}
@@ -284,6 +286,12 @@ static inline int remove_memory(u64 start, u64 size)
}
static inline void __remove_memory(u64 start, u64 size) {}
+
+static inline int offline_and_remove_memory_ranges(const struct range *ranges,
+ unsigned int nr_ranges)
+{
+ return -EBUSY;
+}
#endif /* CONFIG_MEMORY_HOTREMOVE */
#ifdef CONFIG_MEMORY_HOTPLUG
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index a66346def504..3225364bec2f 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -2429,58 +2429,95 @@ static int try_reonline_memory_block(struct
memory_block *mem, void *arg)
*/
int offline_and_remove_memory(u64 start, u64 size)
{
- const unsigned long mb_count = size / memory_block_size_bytes();
+ struct range range = {
+ .start = start,
+ .end = start + size - 1,
+ };
+
+ return offline_and_remove_memory_ranges(&range, 1);
+}
+EXPORT_SYMBOL_GPL(offline_and_remove_memory);
+
+/**
+ * offline_and_remove_memory_ranges - offline and remove multiple memory ranges
+ * @ranges: array of physical address ranges to offline and remove
+ * @nr_ranges: number of entries in @ranges
+ *
+ * Offline and remove several memory ranges as one operation, serialized
+ * against other hotplug operations by a single lock_device_hotplug().
+ *
+ * This offlines all ranges before removing any of them. If offlining any
+ * range fails, the entire process is reverted and nothing is removed.
+ * This provides a fully atomic semantic for unplugging an entire device.
+ *
+ * Each range must be memory-block aligned in start and size.
+ *
+ * Return: 0 on success, negative errno otherwise. On failure no range has
+ * been removed.
+ */
+int offline_and_remove_memory_ranges(const struct range *ranges,
+ unsigned int nr_ranges)
+{
+ unsigned long mb_count = 0;
uint8_t *online_types, *tmp;
- int rc;
+ unsigned int i;
+ int rc = 0;
- if (!IS_ALIGNED(start, memory_block_size_bytes()) ||
- !IS_ALIGNED(size, memory_block_size_bytes()) || !size)
+ if (!ranges || !nr_ranges)
return -EINVAL;
+ for (i = 0; i < nr_ranges; i++) {
+ const u64 start = ranges[i].start;
+ const u64 size = range_len(&ranges[i]);
+
+ if (!IS_ALIGNED(start, memory_block_size_bytes()) ||
+ !IS_ALIGNED(size, memory_block_size_bytes()) || !size)
+ return -EINVAL;
+ mb_count += size / memory_block_size_bytes();
+ }
+
/*
- * We'll remember the old online type of each memory block, so we can
- * try to revert whatever we did when offlining one memory block fails
- * after offlining some others succeeded.
+ * Remember the old online type of every memory block across all ranges,
+ * so we can revert if offlining a later block fails. All entries start
+ * as MMOP_OFFLINE so blocks we never touched are skipped on rollback.
*/
online_types = kmalloc_array(mb_count, sizeof(*online_types),
GFP_KERNEL);
if (!online_types)
return -ENOMEM;
- /*
- * Initialize all states to MMOP_OFFLINE, so when we abort processing in
- * try_offline_memory_block(), we'll skip all unprocessed blocks in
- * try_reonline_memory_block().
- */
memset(online_types, MMOP_OFFLINE, mb_count);
lock_device_hotplug();
+ /* Phase 1: offline every block in every range. */
tmp = online_types;
- rc = walk_memory_blocks(start, size, &tmp, try_offline_memory_block);
-
- /*
- * In case we succeeded to offline all memory, remove it.
- * This cannot fail as it cannot get onlined in the meantime.
- */
- if (!rc) {
- rc = try_remove_memory(start, size);
+ for (i = 0; i < nr_ranges; i++) {
+ rc = walk_memory_blocks(ranges[i].start, range_len(&ranges[i]),
+ &tmp, try_offline_memory_block);
if (rc)
- pr_err("%s: Failed to remove memory: %d", __func__, rc);
+ break;
}
- /*
- * Rollback what we did. While memory onlining might theoretically fail
- * (nacked by a notifier), it barely ever happens.
- */
+ /* If any failure occurred at all, rollback any changes and bail */
if (rc) {
tmp = online_types;
- walk_memory_blocks(start, size, &tmp,
- try_reonline_memory_block);
+ for (i = 0; i < nr_ranges; i++)
+ walk_memory_blocks(ranges[i].start,
+ range_len(&ranges[i]), &tmp,
+ try_reonline_memory_block);
+ goto out_unlock;
}
+
+ /* Phase 2: Remove. This should never fail holding the hotplug lock */
+ for (i = 0; i < nr_ranges; i++)
+ WARN_ON_ONCE(try_remove_memory(ranges[i].start,
+ range_len(&ranges[i])));
+
+out_unlock:
unlock_device_hotplug();
kfree(online_types);
return rc;
}
-EXPORT_SYMBOL_GPL(offline_and_remove_memory);
+EXPORT_SYMBOL_GPL(offline_and_remove_memory_ranges);
#endif /* CONFIG_MEMORY_HOTREMOVE */
--
2.53.0-Meta