On Thu, Jan 08, 2015 at 11:40:19AM +0530, Bharata B Rao wrote: > Parse ibm,architecture.vec table obtained from the guest and enable > memory node configuration via ibm,dynamic-reconfiguration-memory if guest > supports it. This is in preparation to support memory hotplug for > sPAPR guests. > > This changes the way memory node configuration is done. Currently all > memory nodes are built upfront. But after this patch, only memory@0 node > for RMA is built upfront. Guest kernel boots with just that and rest of > the memory nodes (via memory@XXX or ibm,dynamic-reconfiguration-memory) > are built when guest does ibm,client-architecture-support call. > > Note: This patch was tested with an enhancement to SLOF that supports > addition of device tree nodes from ibm,client-architecture-support call. > > TODO: Enforce lmb-size alignment for node memory.
I think this todo needs to be done before you're ready to merge. > Signed-off-by: Bharata B Rao <bhar...@linux.vnet.ibm.com> > --- > hw/ppc/spapr.c | 232 > ++++++++++++++++++++++++++++++++++++++++--------- > hw/ppc/spapr_hcall.c | 51 +++++++++-- > include/hw/ppc/spapr.h | 12 ++- > 3 files changed, 246 insertions(+), 49 deletions(-) > > diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c > index 9ff08ff..6964b06 100644 > --- a/hw/ppc/spapr.c > +++ b/hw/ppc/spapr.c > @@ -631,42 +631,6 @@ static void *spapr_create_fdt_skel(hwaddr initrd_base, > return fdt; > } > > -int spapr_h_cas_compose_response(target_ulong addr, target_ulong size) > -{ > - void *fdt, *fdt_skel; > - sPAPRDeviceTreeUpdateHeader hdr = { .version_id = 1 }; > - > - size -= sizeof(hdr); > - > - /* Create sceleton */ > - fdt_skel = g_malloc0(size); > - _FDT((fdt_create(fdt_skel, size))); > - _FDT((fdt_begin_node(fdt_skel, ""))); > - _FDT((fdt_end_node(fdt_skel))); > - _FDT((fdt_finish(fdt_skel))); > - fdt = g_malloc0(size); > - _FDT((fdt_open_into(fdt_skel, fdt, size))); > - g_free(fdt_skel); > - > - /* Fix skeleton up */ > - _FDT((spapr_fixup_cpu_dt(fdt, spapr))); > - > - /* Pack resulting tree */ > - _FDT((fdt_pack(fdt))); > - > - if (fdt_totalsize(fdt) + sizeof(hdr) > size) { > - trace_spapr_cas_failed(size); > - return -1; > - } > - > - cpu_physical_memory_write(addr, &hdr, sizeof(hdr)); > - cpu_physical_memory_write(addr + sizeof(hdr), fdt, fdt_totalsize(fdt)); > - trace_spapr_cas_continue(fdt_totalsize(fdt) + sizeof(hdr)); > - g_free(fdt); > - > - return 0; > -} > - > static void spapr_populate_memory_node(void *fdt, int nodeid, hwaddr start, > hwaddr size) > { > @@ -720,7 +684,6 @@ static int spapr_populate_memory(sPAPREnvironment *spapr, > void *fdt) > } > if (!mem_start) { > /* ppc_spapr_init() checks for rma_size <= node0_size already */ > - spapr_populate_memory_node(fdt, i, 0, spapr->rma_size); > mem_start += spapr->rma_size; > node_size -= spapr->rma_size; > } > @@ -741,6 +704,190 @@ static int spapr_populate_memory(sPAPREnvironment > *spapr, void *fdt) > return 0; > } > > +/* > + * TODO: Take care of sparsemem configuration ? > + */ > +static uint64_t numa_node_end(uint32_t nodeid) > +{ > + uint32_t i = 0; > + uint64_t addr = 0; > + > + do { > + addr += numa_info[i].node_mem; > + } while (++i <= nodeid); > + > + return addr; > +} > + > +static uint64_t numa_node_start(uint32_t nodeid) > +{ > + if (!nodeid) { > + return 0; > + } else { > + return numa_node_end(nodeid - 1); > + } > +} There's really no better generic way of finding the addresses of numa nodes? > +/* > + * Given the addr, return the NUMA node to which the address belongs to. > + */ > +static uint32_t get_numa_node(uint64_t addr) > +{ > + uint32_t i; > + > + for (i = 0; i < nb_numa_nodes; i++) { > + if ((addr >= numa_node_start(i)) && (addr < numa_node_end(i))) { This is O(nb_numa_nodes^2) which is kind of nasty, althoguh that's unlikely to be large enough to be a real problem. > + return i; > + } > + } > + > + /* Unassigned memory goes to node 0 by default */ > + return 0; > +} > + > +/* Adds ibm,dynamic-reconfiguration-memory node */ > +static int spapr_populate_drconf_memory(sPAPREnvironment *spapr, void *fdt) > +{ > + int root_offset, ret, i, offset; > + uint32_t lmb_size = SPAPR_MIN_MEMORY_BLOCK_SIZE; > + uint32_t prop_lmb_size[] = {0, cpu_to_be32(lmb_size)}; > + uint32_t dynamic_memory[DR_LMB_LIST_ENTRY_SIZE]; > + uint32_t nr_rma_lmbs = spapr->rma_size/lmb_size; > + uint32_t nr_lmbs = spapr->maxram_limit/lmb_size - nr_rma_lmbs; > + uint32_t nr_assigned_lmbs = spapr->ram_limit/lmb_size - nr_rma_lmbs; > + uint32_t *int_buf, *cur_index, buf_len; > + > + /* Allocate enough buffer size to fit in ibm,dynamic-memory */ > + buf_len = nr_lmbs * DR_LMB_LIST_ENTRY_SIZE * sizeof(uint32_t) + > + sizeof(uint32_t); > + cur_index = int_buf = g_malloc0(buf_len); > + root_offset = fdt_path_offset(fdt, "/"); You don't need this, the fdt offset of / is always 0 and you're allowed to count on that. > + > + > + offset = fdt_add_subnode(fdt, root_offset, > + "ibm,dynamic-reconfiguration-memory"); > + > + ret = fdt_setprop(fdt, offset, "ibm,lmb-size", prop_lmb_size, > + sizeof(prop_lmb_size)); > + if (ret < 0) { > + goto out; > + } Current versions of libfdt have fdt_setprop_u64 which you could use for this. > + ret = fdt_setprop_cell(fdt, offset, "ibm,memory-flags-mask", > + cpu_to_be32(0xff)); fdt_setprop_cell has the byteswap built-in, so adding your own as well will make it incorrect for an LE host. > + if (ret < 0) { > + goto out; > + } > + > + ret = fdt_setprop_cell(fdt, offset, "ibm,memory-preservation-time", > + cpu_to_be32(0x0)); > + if (ret < 0) { > + goto out; > + } > + > + /* ibm,dynamic-memory */ > + int_buf[0] = cpu_to_be32(nr_lmbs); > + cur_index++; > + for (i = 0; i < nr_lmbs; i++) { > + sPAPRDRConnector *drc; > + sPAPRDRConnectorClass *drck; > + uint64_t addr; > + > + if (i < nr_assigned_lmbs) { > + addr = (i + nr_rma_lmbs) * lmb_size; > + } else { > + addr = (i - nr_assigned_lmbs) * lmb_size + > + SPAPR_MACHINE(qdev_get_machine())->hotplug_memory_base; > + } > + drc = spapr_dr_connector_new(qdev_get_machine(), > + SPAPR_DR_CONNECTOR_TYPE_LMB, addr/lmb_size); > + drck = SPAPR_DR_CONNECTOR_GET_CLASS(drc); > + > + dynamic_memory[0] = cpu_to_be32(addr >> 32); > + dynamic_memory[1] = cpu_to_be32(addr & 0xffffffff); > + dynamic_memory[2] = cpu_to_be32(drck->get_index(drc)); > + dynamic_memory[3] = cpu_to_be32(0); /* reserved */ > + dynamic_memory[4] = cpu_to_be32(get_numa_node(addr)); As noted above your current get_numa_node() implementation is O(N^2) making this routine O(N^3). > + dynamic_memory[5] = (addr < spapr->ram_limit) ? > + cpu_to_be32(LMB_FLAGS_ASSIGNED) : > + cpu_to_be32(0); > + > + memcpy(cur_index, dynamic_memory, sizeof(dynamic_memory)); You could just use uint32_t *dynamic_memory = cur_index at the beginning of the loop block to avoid this memcpy(). > + cur_index += DR_LMB_LIST_ENTRY_SIZE; > + } > + ret = fdt_setprop(fdt, offset, "ibm,dynamic-memory", int_buf, buf_len); > + if (ret < 0) { > + goto out; > + } > + > + /* ibm,associativity-lookup-arrays */ > + cur_index = int_buf; > + int_buf[0] = cpu_to_be32(nb_numa_nodes); > + int_buf[1] = cpu_to_be32(4); Something explaining the significance of this 4 for those of us that don't have access to PAPR+ would be nice. > + cur_index += 2; > + for (i = 0; i < nb_numa_nodes; i++) { > + uint32_t associativity[] = { > + cpu_to_be32(0x0), > + cpu_to_be32(0x0), > + cpu_to_be32(0x0), > + cpu_to_be32(i) > + }; > + memcpy(cur_index, associativity, sizeof(associativity)); > + cur_index += 4; > + } > + ret = fdt_setprop(fdt, offset, "ibm,associativity-lookup-arrays", > int_buf, > + (cur_index - int_buf) * sizeof(uint32_t)); > +out: > + g_free(int_buf); > + return ret; > +} > + > +int spapr_h_cas_compose_response(target_ulong addr, target_ulong size, > + bool cpu_update, bool memory_update) > +{ > + void *fdt, *fdt_skel; > + sPAPRDeviceTreeUpdateHeader hdr = { .version_id = 1 }; > + > + size -= sizeof(hdr); > + > + /* Create sceleton */ > + fdt_skel = g_malloc0(size); > + _FDT((fdt_create(fdt_skel, size))); > + _FDT((fdt_begin_node(fdt_skel, ""))); > + _FDT((fdt_end_node(fdt_skel))); > + _FDT((fdt_finish(fdt_skel))); > + fdt = g_malloc0(size); > + _FDT((fdt_open_into(fdt_skel, fdt, size))); > + g_free(fdt_skel); > + > + /* Fixup cpu nodes */ > + if (cpu_update) { > + _FDT((spapr_fixup_cpu_dt(fdt, spapr))); > + } > + > + /* Generate memory nodes or ibm,dynamic-reconfiguration-memory node */ > + if (memory_update) { > + _FDT((spapr_populate_drconf_memory(spapr, fdt))); > + } else { > + _FDT((spapr_populate_memory(spapr, fdt))); > + } > + > + /* Pack resulting tree */ > + _FDT((fdt_pack(fdt))); > + > + if (fdt_totalsize(fdt) + sizeof(hdr) > size) { You could just set the correct size (size - sizeof(hdr)) to fdt_create() and fdt_open_into() and avoid this failure case. > + trace_spapr_cas_failed(size); > + return -1; > + } > + > + cpu_physical_memory_write(addr, &hdr, sizeof(hdr)); > + cpu_physical_memory_write(addr + sizeof(hdr), fdt, fdt_totalsize(fdt)); > + trace_spapr_cas_continue(fdt_totalsize(fdt) + sizeof(hdr)); > + g_free(fdt); > + > + return 0; > +} > + > static void spapr_finalize_fdt(sPAPREnvironment *spapr, > hwaddr fdt_addr, > hwaddr rtas_addr, > @@ -757,11 +904,12 @@ static void spapr_finalize_fdt(sPAPREnvironment *spapr, > /* open out the base tree into a temp buffer for the final tweaks */ > _FDT((fdt_open_into(spapr->fdt_skel, fdt, FDT_MAX_SIZE))); > > - ret = spapr_populate_memory(spapr, fdt); > - if (ret < 0) { > - fprintf(stderr, "couldn't setup memory nodes in fdt\n"); > - exit(1); > - } > + /* > + * Add memory@0 node to represent RMA. Rest of the memory is either > + * represented by memory nodes or ibm,dynamic-reconfiguration-memory > + * node later during ibm,client-architecture-support call. > + */ > + spapr_populate_memory_node(fdt, 0, 0, spapr->rma_size); > > ret = spapr_populate_vdevice(spapr->vio_bus, fdt); > if (ret < 0) { > diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c > index 8651447..10f05f4 100644 > --- a/hw/ppc/spapr_hcall.c > +++ b/hw/ppc/spapr_hcall.c > @@ -805,6 +805,32 @@ static target_ulong h_set_mode(PowerPCCPU *cpu, > sPAPREnvironment *spapr, > return ret; > } > > +/* > + * Return the offset to the requested option vector @vector in the > + * option vector table @table. > + */ > +static target_ulong cas_get_option_vector(int vector, target_ulong table) > +{ > + int i; > + char nr_vectors, nr_entries; > + > + if (!table) { > + return 0; > + } > + > + nr_vectors = (rtas_ld(table, 0) >> 24) + 1; This is kind of abusing the rtas_ld() function. It's really only intended for accessing rtas arguments, not an arbitrary array of u32s. > + if (!vector || vector > nr_vectors) { > + return 0; > + } > + table++; /* skip nr option vectors */ > + > + for (i = 0; i < vector - 1; i++) { > + nr_entries = rtas_ld(table, 0) >> 24; > + table += nr_entries + 2; > + } > + return table; > +} > + > typedef struct { > PowerPCCPU *cpu; > uint32_t cpu_version; > @@ -825,19 +851,22 @@ static void do_set_compat(void *arg) > ((cpuver) == CPU_POWERPC_LOGICAL_2_06_PLUS) ? 2061 : \ > ((cpuver) == CPU_POWERPC_LOGICAL_2_07) ? 2070 : 0) > > +#define OV5_DRCONF_MEMORY 0x20 > + > static target_ulong h_client_architecture_support(PowerPCCPU *cpu_, > sPAPREnvironment *spapr, > target_ulong opcode, > target_ulong *args) > { > - target_ulong list = args[0]; > + target_ulong list = args[0], ov_table; > PowerPCCPUClass *pcc_ = POWERPC_CPU_GET_CLASS(cpu_); > CPUState *cs; > - bool cpu_match = false; > + bool cpu_match = false, cpu_update = true, memory_update = false; > unsigned old_cpu_version = cpu_->cpu_version; > unsigned compat_lvl = 0, cpu_version = 0; > unsigned max_lvl = get_compat_level(cpu_->max_compat); > int counter; > + char ov5_byte2; > > /* Parse PVR list */ > for (counter = 0; counter < 512; ++counter) { > @@ -887,8 +916,6 @@ static target_ulong > h_client_architecture_support(PowerPCCPU *cpu_, > } > } > > - /* For the future use: here @list points to the first capability */ > - > /* Parsing finished */ > trace_spapr_cas_pvr(cpu_->cpu_version, cpu_match, > cpu_version, pcc_->pcr_mask); > @@ -912,14 +939,26 @@ static target_ulong > h_client_architecture_support(PowerPCCPU *cpu_, > } > > if (!cpu_version) { > - return H_SUCCESS; > + cpu_update = false; > } > > + /* For the future use: here @ov_table points to the first option vector > */ > + ov_table = list; > + > + list = cas_get_option_vector(5, ov_table); What's the literal 5 mean? > if (!list) { > return H_SUCCESS; > } > > - if (spapr_h_cas_compose_response(args[1], args[2])) { > + /* @list now points to OV 5 */ > + list += 2; > + ov5_byte2 = rtas_ld(list, 0) >> 24; > + if (ov5_byte2 & OV5_DRCONF_MEMORY) { > + memory_update = true; > + } > + > + if (spapr_h_cas_compose_response(args[1], args[2], cpu_update, > + memory_update)) { > qemu_system_reset_request(); > } > > diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h > index 64681c4..10283f9 100644 > --- a/include/hw/ppc/spapr.h > +++ b/include/hw/ppc/spapr.h > @@ -485,9 +485,19 @@ struct sPAPRTCETable { > /* Support a min of 1TB hotplug memory assuming 256MB per slot */ > #define SPAPR_MAX_RAM_SLOTS (1ULL << 12) > > +/* > + * Number of 32 bit words in each LMB list entry in ibm,dynamic-memory > + * property under ibm,dynamic-reconfiguration-memory node. > + */ > +#define DR_LMB_LIST_ENTRY_SIZE 6 > + > +/* Flag values in Option Vector 5 ibm architecture vector table. */ > +#define LMB_FLAGS_ASSIGNED 0x00000008 Things declared in a public header should have some sort of namespacing, so, SPAPR_DR_LBM_LIST_ENTRY_SIZE for example. > void spapr_events_init(sPAPREnvironment *spapr); > void spapr_events_fdt_skel(void *fdt, uint32_t epow_irq); > -int spapr_h_cas_compose_response(target_ulong addr, target_ulong size); > +int spapr_h_cas_compose_response(target_ulong addr, target_ulong size, > + bool cpu_update, bool memory_update); > sPAPRTCETable *spapr_tce_new_table(DeviceState *owner, uint32_t liobn, > uint64_t bus_offset, > uint32_t page_shift, -- David Gibson | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ | _way_ _around_! http://www.ozlabs.org/~dgibson
pgpv6Inyv7key.pgp
Description: PGP signature