Based on the NUMA topology passed via the QEMU firmware configuration interface, the BIOS code generates an SRAT (System Resource Affinity Table) describing which (V)CPUs and which parts of memory are assigned to each node. The guest OS will then read and, hopefully, honor this table.

Signed-off-by: Andre Przywara <[EMAIL PROTECTED]>

--
Andre Przywara
AMD-Operating System Research Center (OSRC), Dresden, Germany
Tel: +49 351 277-84917
----to satisfy European Law for business letters:
AMD Saxony Limited Liability Company & Co. KG,
Wilschdorfer Landstr. 101, 01109 Dresden, Germany
Register Court Dresden: HRA 4896, General Partner authorized
to represent: AMD Saxony LLC (Wilmington, Delaware, US)
General Manager of AMD Saxony LLC: Dr. Hans-R. Deppe, Thomas McCoy
commit 24fce48662f201903bca101e90ccca386428e764
Author: Andre Przywara <[EMAIL PROTECTED]>
Date:   Fri Dec 5 14:18:16 2008 +0100

    generate appropriate SRAT ACPI table

diff --git a/bios/rombios32.c b/bios/rombios32.c
index 3c9a2d7..878690d 100755
--- a/bios/rombios32.c
+++ b/bios/rombios32.c
@@ -455,12 +455,30 @@ void wrmsr_smp(uint32_t index, uint64_t val)
     p->ecx = 0;
 }
 
+static inline uint16_t le16_to_cpu(uint16_t x) /* little-endian 16-bit value -> CPU byte order */
+{
+    return x; /* no byte swap: the CPU is taken to be little-endian */
+}
+
+static inline uint32_t le32_to_cpu(uint32_t x) /* little-endian 32-bit value -> CPU byte order */
+{
+    return x; /* no byte swap: the CPU is taken to be little-endian */
+}
+
+static inline uint64_t le64_to_cpu(uint64_t x) /* little-endian 64-bit value -> CPU byte order */
+{
+    return x; /* no byte swap: the CPU is taken to be little-endian */
+}
+
 #ifdef BX_QEMU
 #define QEMU_CFG_CTL_PORT 0x510
 #define QEMU_CFG_DATA_PORT 0x511
 #define QEMU_CFG_SIGNATURE  0x00
 #define QEMU_CFG_ID         0x01
 #define QEMU_CFG_UUID       0x02
+#define QEMU_CFG_NUMA_NODES 0x07 /* selector: NUMA node count, read as 16 bits by get_numa_nodes() */
+#define QEMU_CFG_NUMA_VCPUS 0x08 /* selector: one 64-bit VCPU bitmask per node */
+#define QEMU_CFG_NUMA_MEM   0x09 /* selector: one 64-bit memory length per node */
 
 int qemu_cfg_port;
 
@@ -488,6 +506,23 @@ void qemu_cfg_read(uint8_t *buf, int len)
     while (len--)
         *(buf++) = inb(QEMU_CFG_DATA_PORT);
 }
+
+/* Read the next 4 bytes via qemu_cfg_read() and return them as a little-endian 32-bit value. */
+uint32_t qemu_cfg_get32 (void)
+{
+    uint32_t value;
+    qemu_cfg_read((uint8_t *)&value, sizeof(value));
+    return le32_to_cpu(value);
+}
+
+/* Read the next 8 bytes via qemu_cfg_read() and return them as a little-endian 64-bit value. */
+uint64_t qemu_cfg_get64 (void)
+{
+    uint64_t value;
+    qemu_cfg_read((uint8_t *)&value, sizeof(value));
+    return le64_to_cpu(value);
+}
+
 #endif
 
 void uuid_probe(void)
@@ -502,6 +537,18 @@ void uuid_probe(void)
     memset(bios_uuid, 0, 16);
 }
 
+int get_numa_nodes(void) /* NUMA node count taken from QEMU_CFG_NUMA_NODES, or 0 */
+{
+    uint16_t count = 0; /* stays 0 when qemu_cfg_port is 0 or BX_QEMU is undefined */
+#ifdef BX_QEMU
+    if (qemu_cfg_port != 0) {
+        qemu_cfg_select(QEMU_CFG_NUMA_NODES);
+        qemu_cfg_read((uint8_t *)&count, sizeof(count));
+    }
+#endif
+    return le16_to_cpu(count);
+}
+
 void cpu_probe(void)
 {
     uint32_t eax, ebx, ecx, edx;
@@ -1232,7 +1279,7 @@ struct rsdp_descriptor         /* Root System Descriptor Pointer */
 struct rsdt_descriptor_rev1
 {
 	ACPI_TABLE_HEADER_DEF                           /* ACPI common table header */
-	uint32_t                             table_offset_entry [2]; /* Array of pointers to other */
+	uint32_t                             table_offset_entry [3]; /* Array of pointers to other */
 			 /* ACPI tables */
 };
 
@@ -1350,6 +1397,9 @@ struct multiple_apic_table
 #define APIC_XRUPT_SOURCE       8
 #define APIC_RESERVED           9           /* 9 and greater are reserved */
 
+#define SRAT_PROCESSOR          0 /* value of the type byte in struct srat_processor_affinity */
+#define SRAT_MEMORY             1 /* value of the type byte in struct srat_memory_affinity */
+
 /*
  * MADT sub-structures (Follow MULTIPLE_APIC_DESCRIPTION_TABLE)
  */
@@ -1357,6 +1407,40 @@ struct multiple_apic_table
 	uint8_t                              type; \
 	uint8_t                              length;
 
+/*
+ * SRAT (NUMA topology description) table
+ */
+struct system_resource_affinity_table /* fixed-size head of the SRAT; the affinity entries follow it */
+{
+    ACPI_TABLE_HEADER_DEF /* common ACPI table header */
+    uint32_t    reserved1; /* acpi_bios_init() writes 1 here */
+    uint32_t    reserved2[2]; /* left zero by acpi_bios_init() */
+};
+
+struct srat_processor_affinity /* SRAT entry tying one local APIC ID to a NUMA node */
+{
+APIC_HEADER_DEF /* supplies the type and length bytes; type is SRAT_PROCESSOR */
+	uint8_t     proximity_lo; /* node number of the CPU (the only proximity byte that is set) */
+	uint8_t     local_apic_id; /* acpi_bios_init() stores the CPU index here */
+	uint32_t    flags; /* acpi_bios_init() writes 1; presumably bit 0 = entry enabled */
+	uint8_t     local_sapic_eid; /* written as 0 */
+	uint8_t     proximity_hi[3]; /* written as 0 */
+	uint32_t    reserved;
+};
+
+struct srat_memory_affinity /* SRAT entry tying one memory range to a NUMA node */
+{
+	APIC_HEADER_DEF /* supplies the type and length bytes; type is SRAT_MEMORY */
+	uint8_t     proximity[4]; /* byte 0 holds the node number, bytes 1-3 are zeroed */
+	uint16_t    reserved1;
+	uint32_t    base_addr_low,base_addr_high; /* 64-bit range start, split into 32-bit halves */
+	uint32_t    length_low,length_high; /* 64-bit range length, split into 32-bit halves */
+	uint32_t    reserved2;
+	uint32_t    flags; /* 1 if the range is enabled, 0 otherwise */
+	uint32_t    reserved3[2];
+};
+	
+
 /* Sub-structures for MADT */
 
 struct madt_processor_apic
@@ -1411,6 +1495,26 @@ static int acpi_checksum(const uint8_t *data, int len)
     return (-sum) & 0xff;
 }
 
+/* Map each VCPU to its NUMA node: nodes[] must hold 64 entries; CPUs in no mask get node 0. */
+static void read_config_numa_vcpus (uint32_t *nodes, int numnodes)
+{
+#ifdef BX_QEMU
+    int node, cpu;
+#endif
+    memset(nodes, 0, 64 * sizeof(*nodes)); /* the caller reads entries that no mask covers */
+#ifdef BX_QEMU
+    qemu_cfg_select(QEMU_CFG_NUMA_VCPUS);
+    for (node = 0; node < numnodes; node++) {
+        uint64_t cpumask = qemu_cfg_get64(); /* bit N set => VCPU N belongs to this node */
+        for (cpu = 0; cpu < 64 && cpumask != 0; cpu++) {
+            if (cpumask & 1)
+                nodes[cpu] = node;
+            cpumask >>= 1;
+        }
+    }
+#endif
+}
+
 static void acpi_build_table_header(struct acpi_table_header *h,
                                     char *sig, int len, uint8_t rev)
 {
@@ -1435,6 +1539,21 @@ static void acpi_build_table_header(struct acpi_table_header *h,
     h->checksum = acpi_checksum((void *)h, len);
 }
 
+/* Fill one SRAT memory entry: range [base, base+len) on node; reserved fields are not written. */
+static void acpi_build_srat_memory(struct srat_memory_affinity *numamem,
+    uint64_t base, uint64_t len, int node, int enabled)
+{
+    numamem->type = SRAT_MEMORY;
+    numamem->length = sizeof(*numamem);
+    memset(numamem->proximity, 0, sizeof(numamem->proximity));
+    numamem->proximity[0] = node; /* only the low byte of node is recorded */
+    numamem->flags = cpu_to_le32(!!enabled); /* every multi-byte field is stored little-endian */
+    numamem->base_addr_low = cpu_to_le32((uint32_t)(base & 0xFFFFFFFF));
+    numamem->base_addr_high = cpu_to_le32((uint32_t)(base >> 32));
+    numamem->length_low = cpu_to_le32((uint32_t)(len & 0xFFFFFFFF));
+    numamem->length_high = cpu_to_le32((uint32_t)(len >> 32));
+}
+
 /* base_addr must be a multiple of 4KB */
 void acpi_bios_init(void)
 {
@@ -1443,10 +1562,12 @@ void acpi_bios_init(void)
     struct fadt_descriptor_rev1 *fadt;
     struct facs_descriptor_rev1 *facs;
     struct multiple_apic_table *madt;
+    struct system_resource_affinity_table *srat;
     uint8_t *dsdt;
     uint32_t base_addr, rsdt_addr, fadt_addr, addr, facs_addr, dsdt_addr;
     uint32_t acpi_tables_size, madt_addr, madt_size;
-    int i;
+    uint32_t srat_addr, srat_size;
+    int i, numanodes;
 
     /* reserve memory space for tables */
 #ifdef BX_USE_EBDA_TABLES
@@ -1478,6 +1599,21 @@ void acpi_bios_init(void)
     dsdt = (void *)(addr);
     addr += sizeof(AmlCode);
 
+    numanodes = get_numa_nodes();
+    if (numanodes > 0) {
+        addr = (addr + 7) & ~7;
+        srat_addr = addr;
+        srat_size = sizeof(*srat) +
+            sizeof(struct srat_processor_affinity) * smp_cpus +
+            sizeof(struct srat_memory_affinity) * (numanodes + 2);
+        srat = (void *)(addr);
+        addr += srat_size;
+    } else {
+        srat_addr = addr;
+        srat = (void*)(addr);
+        srat_size = 0;
+    }
+
     addr = (addr + 7) & ~7;
     madt_addr = addr;
     madt_size = sizeof(*madt) +
@@ -1507,8 +1643,10 @@ void acpi_bios_init(void)
     memset(rsdt, 0, sizeof(*rsdt));
     rsdt->table_offset_entry[0] = cpu_to_le32(fadt_addr);
     rsdt->table_offset_entry[1] = cpu_to_le32(madt_addr);
-    acpi_build_table_header((struct acpi_table_header *)rsdt,
-                            "RSDT", sizeof(*rsdt), 1);
+    if (numanodes > 0)
+        rsdt->table_offset_entry[2] = cpu_to_le32(srat_addr);
+    acpi_build_table_header((struct acpi_table_header *)rsdt, "RSDT",
+        sizeof(*rsdt) - (numanodes > 0? 0: sizeof(uint32_t)), 1);
 
     /* FADT */
     memset(fadt, 0, sizeof(*fadt));
@@ -1590,6 +1728,67 @@ void acpi_bios_init(void)
         acpi_build_table_header((struct acpi_table_header *)madt,
                                 "APIC", madt_size, 1);
     }
+
+    /* SRAT */
+    if (numanodes > 0) {
+        struct srat_processor_affinity *core;
+        struct srat_memory_affinity *numamem;
+        int slots;
+        uint64_t mem_len, mem_base, next_base = 0;
+        uint32_t nodes[64];
+
+        memset (srat, 0 , srat_size);
+        srat->reserved1=1;
+
+        read_config_numa_vcpus (nodes, numanodes);
+        core = (void*)(srat + 1);
+        for (i = 0; i < smp_cpus; ++i) {
+            core->type = SRAT_PROCESSOR;
+            core->length = sizeof(*core);
+            core->local_apic_id = i;
+            core->proximity_lo = nodes[i];
+            memset (core->proximity_hi, 0, 3);
+            core->local_sapic_eid = 0;
+            if (i < smp_cpus)
+                core->flags = cpu_to_le32(1);
+            else
+                core->flags = 0;
+            core++;
+        }
+        /* the memory map is a bit tricky, it contains at least one hole
+           from 640k-1M and possibly another one from 3.5G-4G. */
+        numamem = (void*)core; slots = 0;
+        qemu_cfg_select (QEMU_CFG_NUMA_MEM);
+        acpi_build_srat_memory(numamem, 0, 640*1024, 0, 1);
+        next_base = 1024 * 1024; numamem++;slots++;
+        for (i = 1; i < numanodes + 1; ++i) {
+            mem_base = next_base;
+            mem_len = qemu_cfg_get64();
+            if (i == 1) mem_len -= 1024 * 1024;
+            next_base = mem_base + mem_len;
+
+            /* Cut out the PCI hole */
+            if (mem_base <= ram_size && next_base > ram_size) {
+                mem_len -= next_base - ram_size;
+                if (mem_len > 0) {
+                    acpi_build_srat_memory(numamem, mem_base, mem_len, i-1, 1);
+                    numamem++; slots++;
+                }
+                mem_base = 1ULL << 32;
+                mem_len = next_base - ram_size;
+                next_base += (1ULL << 32) - ram_size;
+            }
+            acpi_build_srat_memory(numamem, mem_base, mem_len, i-1, 1);
+            numamem++; slots++;
+        }
+        for (; slots < numanodes + 2; slots++) {
+            acpi_build_srat_memory(numamem, 0, 0, 0, 0);
+            numamem++;
+        }
+
+        acpi_build_table_header((struct acpi_table_header *)srat,
+                                "SRAT", srat_size, 1);
+    }
 }
 
 /* SMBIOS entry point -- must be written to a 16-bit aligned address

Reply via email to