Add a 'memmap-type' option to '-numa node' that selects how the node's memory is described in the guest's E820 memory map.
Supported values: - normal: Regular system RAM (E820 type 1, default) - spm: Specific Purpose Memory (E820 type 0xEFFFFFFF) - reserved: Reserved memory (E820 type 2) The 'spm' type indicates Specific Purpose Memory - a hint to the guest that this memory might be managed by device drivers based on guest policy. The 'reserved' type marks memory as not usable as RAM. Note: This option is only supported on x86 platforms. Usage: -numa node,nodeid=1,memdev=m1,memmap-type=spm Signed-off-by: fanhuang <[email protected]> --- hw/core/numa.c | 19 ++++++++++ hw/i386/e820_memory_layout.c | 72 ++++++++++++++++++++++++++++++++++++ hw/i386/e820_memory_layout.h | 12 +++--- hw/i386/pc.c | 61 ++++++++++++++++++++++++++++++ include/system/numa.h | 7 ++++ qapi/machine.json | 24 ++++++++++++ qemu-options.hx | 14 ++++++- 7 files changed, 202 insertions(+), 7 deletions(-) diff --git a/hw/core/numa.c b/hw/core/numa.c index f462883c87..409b2e2bb9 100644 --- a/hw/core/numa.c +++ b/hw/core/numa.c @@ -38,6 +38,7 @@ #include "hw/mem/pc-dimm.h" #include "hw/core/boards.h" #include "hw/mem/memory-device.h" +#include "hw/i386/x86.h" #include "qemu/option.h" #include "qemu/config-file.h" #include "qemu/cutils.h" @@ -164,6 +165,24 @@ static void parse_numa_node(MachineState *ms, NumaNodeOptions *node, numa_info[nodenr].node_memdev = MEMORY_BACKEND(o); } + if (node->has_memmap_type && node->memmap_type != NUMA_MEMMAP_TYPE_NORMAL) { + if (!object_dynamic_cast(OBJECT(ms), TYPE_X86_MACHINE)) { + error_setg(errp, "memmap-type=%s is only supported on x86 machines", + NumaMemmapType_str(node->memmap_type)); + return; + } + switch (node->memmap_type) { + case NUMA_MEMMAP_TYPE_SPM: + numa_info[nodenr].memmap_type = NUMA_MEMMAP_SPM; + break; + case NUMA_MEMMAP_TYPE_RESERVED: + numa_info[nodenr].memmap_type = NUMA_MEMMAP_RESERVED; + break; + default: + break; + } + } + numa_info[nodenr].present = true; max_numa_nodeid = MAX(max_numa_nodeid, nodenr + 1); ms->numa_state->num_nodes++; diff --git 
a/hw/i386/e820_memory_layout.c b/hw/i386/e820_memory_layout.c
index 3e848fb69c..4c62b5ddea 100644
--- a/hw/i386/e820_memory_layout.c
+++ b/hw/i386/e820_memory_layout.c
@@ -46,3 +46,75 @@ bool e820_get_entry(int idx, uint32_t type, uint64_t *address, uint64_t *length)
     }
     return false;
 }
+
+/*
+ * Retype the guest-physical range [start, start + length) in the e820 table.
+ *
+ * The range must lie entirely inside one existing entry.  That entry is
+ * replaced, in place, by up to three entries: the untouched head (if any),
+ * the retyped range, and the untouched tail (if any), so the relative order
+ * of the table is preserved.  For E820_SOFT_RESERVED the containing entry
+ * must be E820_RAM.
+ *
+ * Must be called before the table is finalized (asserts !e820_done).
+ *
+ * Returns true on success; false if the range is empty, wraps around the
+ * 64-bit address space, or no suitable containing entry exists.  The table
+ * is left unmodified on failure.
+ */
+bool e820_update_entry_type(uint64_t start, uint64_t length, uint32_t new_type)
+{
+    uint64_t end = start + length;
+    assert(!e820_done);
+
+    /* Reject empty ranges and ranges whose end wraps past UINT64_MAX */
+    if (end <= start) {
+        return false;
+    }
+
+    for (size_t i = 0; i < e820_entries; i++) {
+        uint64_t entry_start = le64_to_cpu(e820_table[i].address);
+        uint64_t entry_end = entry_start + le64_to_cpu(e820_table[i].length);
+        /* Kept in table (little-endian) byte order for the split parts */
+        uint32_t entry_type_le = e820_table[i].type;
+        struct e820_entry parts[3];
+        size_t nparts = 0;
+
+        if (entry_start > start || entry_end < end) {
+            continue;
+        }
+        /* Soft-reserved memory may only be carved out of ordinary RAM */
+        if (new_type == E820_SOFT_RESERVED &&
+            entry_type_le != cpu_to_le32(E820_RAM)) {
+            continue;
+        }
+
+        if (entry_start < start) {
+            parts[nparts].address = cpu_to_le64(entry_start);
+            parts[nparts].length = cpu_to_le64(start - entry_start);
+            parts[nparts].type = entry_type_le;
+            nparts++;
+        }
+        parts[nparts].address = cpu_to_le64(start);
+        parts[nparts].length = cpu_to_le64(length);
+        parts[nparts].type = cpu_to_le32(new_type);
+        nparts++;
+        if (end < entry_end) {
+            parts[nparts].address = cpu_to_le64(end);
+            parts[nparts].length = cpu_to_le64(entry_end - end);
+            parts[nparts].type = entry_type_le;
+            nparts++;
+        }
+
+        /* Make room, shift the tail up, then place the parts at slot i */
+        e820_table = g_renew(struct e820_entry, e820_table,
+                             e820_entries + nparts - 1);
+        memmove(&e820_table[i + nparts], &e820_table[i + 1],
+                (e820_entries - i - 1) * sizeof(struct e820_entry));
+        memcpy(&e820_table[i], parts, nparts * sizeof(struct e820_entry));
+        e820_entries += nparts - 1;
+        return true;
+    }
+
+    return false;
+}
diff --git a/hw/i386/e820_memory_layout.h b/hw/i386/e820_memory_layout.h
index b50acfa201..a85b4fd14c 100644
--- a/hw/i386/e820_memory_layout.h
+++ b/hw/i386/e820_memory_layout.h
@@ -10,11 +10,12 @@
 #define HW_I386_E820_MEMORY_LAYOUT_H
 
 /* e820 types */
-#define E820_RAM        1
-#define E820_RESERVED   2
-#define E820_ACPI       3
-#define E820_NVS        4
-#define E820_UNUSABLE   5
+#define E820_RAM            1
+#define E820_RESERVED       2
+#define E820_ACPI           3
+#define E820_NVS            4
+#define E820_UNUSABLE       5
+#define E820_SOFT_RESERVED  0xEFFFFFFF
 
 struct e820_entry {
     uint64_t address;
@@ -26,5 +27,6 @@ void e820_add_entry(uint64_t address, uint64_t length, uint32_t type);
 bool e820_get_entry(int index, uint32_t type, uint64_t *address,
                     uint64_t *length);
 int e820_get_table(struct e820_entry **table);
+bool e820_update_entry_type(uint64_t start, uint64_t length, uint32_t new_type);
 
 #endif
diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index 5cb074c0a0..d2230966f9 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -794,6 +794,64 @@ static hwaddr pc_max_used_gpa(PCMachineState *pcms, uint64_t pci_hole64_size)
     return pc_above_4g_end(pcms) - 1;
 }
 
+/*
+ * Update E820 entries for NUMA nodes with non-default memory types.
+ *
+ * A node's offset into guest RAM is the sum of the sizes of the
+ * memdev-backed nodes before it.  Offsets past below_4g_mem_size are
+ * mapped relative to above_4g_mem_start rather than a fixed 4 GiB.  A
+ * typed node straddling the below-4G boundary is a fatal error.
+ */
+static void pc_update_numa_memory_types(X86MachineState *x86ms)
+{
+    MachineState *ms = MACHINE(x86ms);
+    uint64_t next_offset = 0;
+
+    for (int i = 0; i < ms->numa_state->num_nodes; i++) {
+        NodeInfo *numa_info = &ms->numa_state->nodes[i];
+        uint64_t node_size = numa_info->node_mem;
+        uint64_t offset = next_offset;
+        uint64_t guest_addr;
+        uint32_t e820_type;
+
+        /* Nodes without a memdev are neither retyped nor counted */
+        if (!numa_info->node_memdev) {
+            continue;
+        }
+        next_offset += node_size;
+
+        switch (numa_info->memmap_type) {
+        case NUMA_MEMMAP_SPM:
+            e820_type = E820_SOFT_RESERVED;
+            break;
+        case NUMA_MEMMAP_RESERVED:
+            e820_type = E820_RESERVED;
+            break;
+        default:
+            /* NUMA_MEMMAP_NORMAL: leave the E820_RAM entry alone */
+            continue;
+        }
+
+        if (offset >= x86ms->below_4g_mem_size) {
+            guest_addr = x86ms->above_4g_mem_start +
+                         (offset - x86ms->below_4g_mem_size);
+        } else if (node_size <= x86ms->below_4g_mem_size - offset) {
+            guest_addr = offset;
+        } else {
+            /* Would need one entry on each side of the PCI hole */
+            error_report("NUMA node %d with memmap-type spans across "
+                         "4GB boundary, not supported", i);
+            exit(EXIT_FAILURE);
+        }
+
+        if (!e820_update_entry_type(guest_addr, node_size, e820_type)) {
+            warn_report("Failed to update E820 entry for node %d "
+                        "at 0x%" PRIx64 " length 0x%" PRIx64,
+                        i, guest_addr, node_size);
+        }
+    }
+}
+
 /*
  * AMD systems with an IOMMU have an additional hole close to the
  * 1Tb, which are special GPAs that cannot be DMA mapped.
Depending @@ -910,6 +968,9 @@ void pc_memory_init(PCMachineState *pcms, e820_add_entry(pcms->sgx_epc.base, pcms->sgx_epc.size, E820_RESERVED); } + /* Update E820 for NUMA nodes with special memory types */ + pc_update_numa_memory_types(x86ms); + if (!pcmc->has_reserved_memory && (machine->ram_slots || (machine->maxram_size > machine->ram_size))) { diff --git a/include/system/numa.h b/include/system/numa.h index 1044b0eb6e..64e8f63736 100644 --- a/include/system/numa.h +++ b/include/system/numa.h @@ -35,12 +35,19 @@ enum { #define UINT16_BITS 16 +typedef enum { + NUMA_MEMMAP_NORMAL = 0, + NUMA_MEMMAP_SPM, + NUMA_MEMMAP_RESERVED, +} NumaMemmapTypeInternal; + typedef struct NodeInfo { uint64_t node_mem; struct HostMemoryBackend *node_memdev; bool present; bool has_cpu; bool has_gi; + NumaMemmapTypeInternal memmap_type; uint8_t lb_info_provided; uint16_t initiator; uint8_t distance[MAX_NODES]; diff --git a/qapi/machine.json b/qapi/machine.json index 907cb25f75..b7fc8c564f 100644 --- a/qapi/machine.json +++ b/qapi/machine.json @@ -464,6 +464,22 @@ { 'enum': 'NumaOptionsType', 'data': [ 'node', 'dist', 'cpu', 'hmat-lb', 'hmat-cache' ] } +## +# @NumaMemmapType: +# +# Memory mapping type for a NUMA node. +# +# @normal: Normal system RAM (E820 type 1) +# +# @spm: Specific Purpose Memory (E820 type 0xEFFFFFFF) +# +# @reserved: Reserved memory (E820 type 2) +# +# Since: 10.2 +## +{ 'enum': 'NumaMemmapType', + 'data': ['normal', 'spm', 'reserved'] } + ## # @NumaOptions: # @@ -500,6 +516,13 @@ # @memdev: memory backend object. If specified for one node, it must # be specified for all nodes. # +# @memmap-type: specifies the memory type for this NUMA node. +# 'normal' (default) is regular system RAM. +# 'spm' is Specific Purpose Memory - a hint to the guest that +# this memory might be managed by device drivers based on policy. +# 'reserved' is reserved memory, not usable as RAM. +# Currently only supported on x86. 
(since 10.2) +# # @initiator: defined in ACPI 6.3 Chapter 5.2.27.3 Table 5-145, points # to the nodeid which has the memory controller responsible for # this NUMA node. This field provides additional information as @@ -514,6 +537,7 @@ '*cpus': ['uint16'], '*mem': 'size', '*memdev': 'str', + '*memmap-type': 'NumaMemmapType', '*initiator': 'uint16' }} ## diff --git a/qemu-options.hx b/qemu-options.hx index ec92723f10..4da17cbefb 100644 --- a/qemu-options.hx +++ b/qemu-options.hx @@ -433,7 +433,7 @@ ERST DEF("numa", HAS_ARG, QEMU_OPTION_numa, "-numa node[,mem=size][,cpus=firstcpu[-lastcpu]][,nodeid=node][,initiator=node]\n" - "-numa node[,memdev=id][,cpus=firstcpu[-lastcpu]][,nodeid=node][,initiator=node]\n" + "-numa node[,memdev=id][,cpus=firstcpu[-lastcpu]][,nodeid=node][,initiator=node][,memmap-type=normal|spm|reserved]\n" "-numa dist,src=source,dst=destination,val=distance\n" "-numa cpu,node-id=node[,socket-id=x][,core-id=y][,thread-id=z]\n" "-numa hmat-lb,initiator=node,target=node,hierarchy=memory|first-level|second-level|third-level,data-type=access-latency|read-latency|write-latency[,latency=lat][,bandwidth=bw]\n" @@ -442,7 +442,7 @@ DEF("numa", HAS_ARG, QEMU_OPTION_numa, SRST ``-numa node[,mem=size][,cpus=firstcpu[-lastcpu]][,nodeid=node][,initiator=initiator]`` \ -``-numa node[,memdev=id][,cpus=firstcpu[-lastcpu]][,nodeid=node][,initiator=initiator]`` +``-numa node[,memdev=id][,cpus=firstcpu[-lastcpu]][,nodeid=node][,initiator=initiator][,memmap-type=type]`` \ ``-numa dist,src=source,dst=destination,val=distance`` \ @@ -510,6 +510,16 @@ SRST largest bandwidth) to this NUMA node. Note that this option can be set only when the machine property 'hmat' is set to 'on'. + '\ ``memmap-type``\ ' specifies the memory type for this NUMA node: + + - ``normal`` (default): Regular system RAM (E820 type 1) + - ``spm``: Specific Purpose Memory (E820 type 0xEFFFFFFF). This is a + hint to the guest that the memory might be managed by device drivers + based on guest policy. 
+ - ``reserved``: Reserved memory (E820 type 2), not usable as RAM. + + This option is only supported on x86 platforms. + Following example creates a machine with 2 NUMA nodes, node 0 has CPU. node 1 has only memory, and its initiator is node 0. Note that because node 0 has CPU, by default the initiator of node 0 is itself -- 2.34.1
