This patch adds support for Specific Purpose Memory (SPM) through the
NUMA node configuration. When 'spm=on' is specified for a NUMA node,
QEMU will:

1. Set the RAM_SPM flag in the RAM block of the corresponding memory region
2. Update the overlapping E820 RAM entries before adding E820_SOFT_RESERVED
3. Set the E820 type to E820_SOFT_RESERVED for this memory region

This allows guest operating systems to recognize the memory as soft
reserved memory, which can be used for device-specific memory management
without E820 table conflicts.

Usage:
  -numa node,nodeid=0,memdev=m1,spm=on

Signed-off-by: fanhuang <[email protected]>
---
 hw/core/numa.c               |  3 ++
 hw/i386/e820_memory_layout.c | 74 ++++++++++++++++++++++++++++++++++++
 hw/i386/e820_memory_layout.h |  2 +
 hw/i386/pc.c                 | 37 ++++++++++++++++++
 include/exec/cpu-common.h    |  1 +
 include/system/memory.h      |  3 ++
 include/system/numa.h        |  1 +
 qapi/machine.json            |  6 +++
 system/physmem.c             |  7 +++-
 9 files changed, 133 insertions(+), 1 deletion(-)

diff --git a/hw/core/numa.c b/hw/core/numa.c
index 218576f745..e680130460 100644
--- a/hw/core/numa.c
+++ b/hw/core/numa.c
@@ -163,6 +163,9 @@ static void parse_numa_node(MachineState *ms, NumaNodeOptions *node,
         numa_info[nodenr].node_memdev = MEMORY_BACKEND(o);
     }
 
+    /* Store spm configuration for later processing */
+    numa_info[nodenr].is_spm = node->has_spm && node->spm;
+
     numa_info[nodenr].present = true;
     max_numa_nodeid = MAX(max_numa_nodeid, nodenr + 1);
     ms->numa_state->num_nodes++;
diff --git a/hw/i386/e820_memory_layout.c b/hw/i386/e820_memory_layout.c
index 3e848fb69c..5b090ac6df 100644
--- a/hw/i386/e820_memory_layout.c
+++ b/hw/i386/e820_memory_layout.c
@@ -46,3 +46,77 @@ bool e820_get_entry(int idx, uint32_t type, uint64_t *address, uint64_t *length)
     }
     return false;
 }
+
+bool e820_update_entry_type(uint64_t start, uint64_t length, uint32_t new_type)
+{
+    uint64_t end = start + length;
+    bool updated = false;
+
+    assert(!e820_done);
+
+    /* For E820_SOFT_RESERVED, validate that the range is within E820_RAM */
+    if (new_type == E820_SOFT_RESERVED) {
+        bool range_in_ram = false;
+
+        for (size_t j = 0; j < e820_entries; j++) {
+            uint64_t ram_start = le64_to_cpu(e820_table[j].address);
+            uint64_t ram_end = ram_start + le64_to_cpu(e820_table[j].length);
+            uint32_t ram_type = le32_to_cpu(e820_table[j].type);
+
+            if (ram_type == E820_RAM && ram_start <= start && ram_end >= end) {
+                range_in_ram = true;
+                break;
+            }
+        }
+        if (!range_in_ram) {
+            return false;
+        }
+    }
+
+    /* Find the entry containing the target range and split it in place */
+    for (size_t i = 0; i < e820_entries; i++) {
+        uint64_t entry_start = le64_to_cpu(e820_table[i].address);
+        uint64_t entry_length = le64_to_cpu(e820_table[i].length);
+        uint64_t entry_end = entry_start + entry_length;
+
+        if (entry_start <= start && entry_end >= end) {
+            /* Raw little-endian value; written back verbatim below */
+            uint32_t original_type = e820_table[i].type;
+            size_t extra = (entry_start < start) + (end < entry_end);
+
+            /*
+             * Grow the table once and open a gap at index i so the
+             * split parts replace the original entry in place, keeping
+             * the table sorted by address.
+             */
+            e820_table = g_renew(struct e820_entry, e820_table,
+                                 e820_entries + extra);
+            memmove(&e820_table[i + 1 + extra], &e820_table[i + 1],
+                    (e820_entries - i - 1) * sizeof(struct e820_entry));
+            e820_entries += extra;
+
+            if (entry_start < start) {
+                e820_table[i].address = cpu_to_le64(entry_start);
+                e820_table[i].length = cpu_to_le64(start - entry_start);
+                e820_table[i].type = original_type;
+                i++;
+            }
+
+            e820_table[i].address = cpu_to_le64(start);
+            e820_table[i].length = cpu_to_le64(length);
+            e820_table[i].type = cpu_to_le32(new_type);
+            i++;
+
+            if (end < entry_end) {
+                e820_table[i].address = cpu_to_le64(end);
+                e820_table[i].length = cpu_to_le64(entry_end - end);
+                e820_table[i].type = original_type;
+            }
+
+            updated = true;
+            break;
+        }
+    }
+
+    return updated;
+}
diff --git a/hw/i386/e820_memory_layout.h b/hw/i386/e820_memory_layout.h
index b50acfa201..657cc679e2 100644
--- a/hw/i386/e820_memory_layout.h
+++ b/hw/i386/e820_memory_layout.h
@@ -15,6 +15,7 @@
 #define E820_ACPI 3
 #define E820_NVS 4
 #define E820_UNUSABLE 5
+#define E820_SOFT_RESERVED 0xEFFFFFFF
 
 struct e820_entry {
     uint64_t address;
@@ -26,5 +27,6 @@ void e820_add_entry(uint64_t address, uint64_t length, uint32_t type);
 bool e820_get_entry(int index, uint32_t type, uint64_t *address,
                     uint64_t *length);
 int e820_get_table(struct e820_entry **table);
+bool e820_update_entry_type(uint64_t start, uint64_t length, uint32_t new_type);
 
 #endif
diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index bc048a6d13..3e50570484 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -26,6 +26,7 @@
 #include "qemu/units.h"
 #include "exec/target_page.h"
 #include "hw/i386/pc.h"
+#include "system/ramblock.h"
 #include "hw/char/serial-isa.h"
 #include "hw/char/parallel.h"
 #include "hw/hyperv/hv-balloon.h"
@@ -787,6 +788,41 @@ static hwaddr pc_max_used_gpa(PCMachineState *pcms, uint64_t pci_hole64_size)
     return pc_above_4g_end(pcms) - 1;
 }
 
+static int pc_update_spm_memory(RAMBlock *rb, void *opaque)
+{
+    X86MachineState *x86ms = opaque;
+    MachineState *ms = MACHINE(x86ms);
+    ram_addr_t offset;
+    ram_addr_t length;
+    bool is_spm = false;
+
+    /* Check if this RAM block belongs to a NUMA node with spm=on */
+    for (int i = 0; i < ms->numa_state->num_nodes; i++) {
+        NodeInfo *numa_info = &ms->numa_state->nodes[i];
+        if (numa_info->is_spm && numa_info->node_memdev) {
+            MemoryRegion *mr = &numa_info->node_memdev->mr;
+            if (mr->ram_block == rb) {
+                /* Mark this RAM block as SPM and set the flag */
+                rb->flags |= RAM_SPM;
+                is_spm = true;
+                break;
+            }
+        }
+    }
+
+    if (is_spm) {
+        offset = qemu_ram_get_offset(rb) +
+                 (0x100000000ULL - x86ms->below_4g_mem_size);
+        length = qemu_ram_get_used_length(rb);
+        if (!e820_update_entry_type(offset, length, E820_SOFT_RESERVED)) {
+            warn_report("Failed to update E820 entry for SPM at 0x%" PRIx64
+                        " length 0x%" PRIx64, offset, length);
+        }
+    }
+
+    return 0;
+}
+
 /*
  * AMD systems with an IOMMU have an additional hole close to the
  * 1Tb, which are special GPAs that cannot be DMA mapped. Depending
@@ -901,6 +937,7 @@ void pc_memory_init(PCMachineState *pcms,
     if (pcms->sgx_epc.size != 0) {
         e820_add_entry(pcms->sgx_epc.base, pcms->sgx_epc.size, E820_RESERVED);
     }
+    qemu_ram_foreach_block(pc_update_spm_memory, x86ms);
 
     if (!pcmc->has_reserved_memory &&
         (machine->ram_slots ||
diff --git a/include/exec/cpu-common.h b/include/exec/cpu-common.h
index 9b658a3f48..9b437eaa10 100644
--- a/include/exec/cpu-common.h
+++ b/include/exec/cpu-common.h
@@ -89,6 +89,7 @@ ram_addr_t qemu_ram_get_fd_offset(RAMBlock *rb);
 ram_addr_t qemu_ram_get_used_length(RAMBlock *rb);
 ram_addr_t qemu_ram_get_max_length(RAMBlock *rb);
 bool qemu_ram_is_shared(RAMBlock *rb);
+bool qemu_ram_is_spm(RAMBlock *rb);
 bool qemu_ram_is_noreserve(RAMBlock *rb);
 bool qemu_ram_is_uf_zeroable(RAMBlock *rb);
 void qemu_ram_set_uf_zeroable(RAMBlock *rb);
diff --git a/include/system/memory.h b/include/system/memory.h
index aa85fc27a1..0d36cbd30d 100644
--- a/include/system/memory.h
+++ b/include/system/memory.h
@@ -275,6 +275,9 @@ typedef struct IOMMUTLBEvent {
  */
 #define RAM_PRIVATE (1 << 13)
 
+/* RAM is Specific Purpose Memory */
+#define RAM_SPM (1 << 14)
+
 static inline void iommu_notifier_init(IOMMUNotifier *n, IOMMUNotify fn,
                                        IOMMUNotifierFlag flags,
                                        hwaddr start, hwaddr end,
diff --git a/include/system/numa.h b/include/system/numa.h
index 1044b0eb6e..438511a756 100644
--- a/include/system/numa.h
+++ b/include/system/numa.h
@@ -41,6 +41,7 @@ typedef struct NodeInfo {
     bool present;
     bool has_cpu;
     bool has_gi;
+    bool is_spm;
     uint8_t lb_info_provided;
     uint16_t initiator;
     uint8_t distance[MAX_NODES];
diff --git a/qapi/machine.json b/qapi/machine.json
index 038eab281c..1fa31b0224 100644
--- a/qapi/machine.json
+++ b/qapi/machine.json
@@ -500,6 +500,11 @@
 # @memdev: memory backend object.  If specified for one node, it must
 #     be specified for all nodes.
 #
+# @spm: if true, mark the memory region of this node as Specific
+#     Purpose Memory (SPM).  This will set the RAM_SPM flag for the
+#     corresponding memory region and set the E820 type to
+#     E820_SOFT_RESERVED.  (default: false) (since 9.2)
+#
 # @initiator: defined in ACPI 6.3 Chapter 5.2.27.3 Table 5-145, points
 #     to the nodeid which has the memory controller responsible for
 #     this NUMA node.  This field provides additional information as
@@ -514,6 +519,7 @@
     '*cpus': ['uint16'],
     '*mem': 'size',
     '*memdev': 'str',
+    '*spm': 'bool',
     '*initiator': 'uint16' }}
 
 ##
diff --git a/system/physmem.c b/system/physmem.c
index ae8ecd50ea..0090d9955d 100644
--- a/system/physmem.c
+++ b/system/physmem.c
@@ -1611,6 +1611,11 @@ bool qemu_ram_is_noreserve(RAMBlock *rb)
     return rb->flags & RAM_NORESERVE;
 }
 
+bool qemu_ram_is_spm(RAMBlock *rb)
+{
+    return rb->flags & RAM_SPM;
+}
+
 /* Note: Only set at the start of postcopy */
 bool qemu_ram_is_uf_zeroable(RAMBlock *rb)
 {
@@ -2032,7 +2037,7 @@ RAMBlock *qemu_ram_alloc_from_fd(ram_addr_t size, ram_addr_t max_size,
     ram_flags &= ~RAM_PRIVATE;
 
     /* Just support these ram flags by now. */
-    assert((ram_flags & ~(RAM_SHARED | RAM_PMEM | RAM_NORESERVE |
+    assert((ram_flags & ~(RAM_SHARED | RAM_PMEM | RAM_SPM | RAM_NORESERVE |
                           RAM_PROTECTED | RAM_NAMED_FILE | RAM_READONLY |
                           RAM_READONLY_FD | RAM_GUEST_MEMFD |
                           RAM_RESIZEABLE)) == 0);
-- 
2.34.1