Re: [Qemu-devel] [RFC PATCH v1 12/13] spapr: Support ibm, dynamic-reconfiguration-memory

David Gibson Wed, 11 Feb 2015 22:11:43 -0800

On Thu, Jan 08, 2015 at 11:40:19AM +0530, Bharata B Rao wrote:
> Parse ibm,architecture.vec table obtained from the guest and enable
> memory node configuration via ibm,dynamic-reconfiguration-memory if guest
> supports it. This is in preparation to support memory hotplug for
> sPAPR guests.
> 
> This changes the way memory node configuration is done. Currently all
> memory nodes are built upfront. But after this patch, only memory@0 node
> for RMA is built upfront. Guest kernel boots with just that and rest of
> the memory nodes (via memory@XXX or ibm,dynamic-reconfiguration-memory)
> are built when guest does ibm,client-architecture-support call.
> 
> Note: This patch was tested with an enhancement to SLOF that supports
> addition of device tree nodes from ibm,client-architecture-support call.
> 
> TODO: Enforce lmb-size alignment for node memory.


I think this todo needs to be done before you're ready to merge.

> Signed-off-by: Bharata B Rao <bhar...@linux.vnet.ibm.com>
> ---
>  hw/ppc/spapr.c         | 232 
> ++++++++++++++++++++++++++++++++++++++++---------
>  hw/ppc/spapr_hcall.c   |  51 +++++++++--
>  include/hw/ppc/spapr.h |  12 ++-
>  3 files changed, 246 insertions(+), 49 deletions(-)
> 
> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> index 9ff08ff..6964b06 100644
> --- a/hw/ppc/spapr.c
> +++ b/hw/ppc/spapr.c
> @@ -631,42 +631,6 @@ static void *spapr_create_fdt_skel(hwaddr initrd_base,
>      return fdt;
>  }
>  
> -int spapr_h_cas_compose_response(target_ulong addr, target_ulong size)
> -{
> -    void *fdt, *fdt_skel;
> -    sPAPRDeviceTreeUpdateHeader hdr = { .version_id = 1 };
> -
> -    size -= sizeof(hdr);
> -
> -    /* Create sceleton */
> -    fdt_skel = g_malloc0(size);
> -    _FDT((fdt_create(fdt_skel, size)));
> -    _FDT((fdt_begin_node(fdt_skel, "")));
> -    _FDT((fdt_end_node(fdt_skel)));
> -    _FDT((fdt_finish(fdt_skel)));
> -    fdt = g_malloc0(size);
> -    _FDT((fdt_open_into(fdt_skel, fdt, size)));
> -    g_free(fdt_skel);
> -
> -    /* Fix skeleton up */
> -    _FDT((spapr_fixup_cpu_dt(fdt, spapr)));
> -
> -    /* Pack resulting tree */
> -    _FDT((fdt_pack(fdt)));
> -
> -    if (fdt_totalsize(fdt) + sizeof(hdr) > size) {
> -        trace_spapr_cas_failed(size);
> -        return -1;
> -    }
> -
> -    cpu_physical_memory_write(addr, &hdr, sizeof(hdr));
> -    cpu_physical_memory_write(addr + sizeof(hdr), fdt, fdt_totalsize(fdt));
> -    trace_spapr_cas_continue(fdt_totalsize(fdt) + sizeof(hdr));
> -    g_free(fdt);
> -
> -    return 0;
> -}
> -
>  static void spapr_populate_memory_node(void *fdt, int nodeid, hwaddr start,
>                                         hwaddr size)
>  {
> @@ -720,7 +684,6 @@ static int spapr_populate_memory(sPAPREnvironment *spapr, 
> void *fdt)
>          }
>          if (!mem_start) {
>              /* ppc_spapr_init() checks for rma_size <= node0_size already */
> -            spapr_populate_memory_node(fdt, i, 0, spapr->rma_size);
>              mem_start += spapr->rma_size;
>              node_size -= spapr->rma_size;
>          }
> @@ -741,6 +704,190 @@ static int spapr_populate_memory(sPAPREnvironment 
> *spapr, void *fdt)
>      return 0;
>  }
>  
> +/*
> + * TODO: Take care of sparsemem configuration ?
> + */
> +static uint64_t numa_node_end(uint32_t nodeid)
> +{
> +    uint32_t i = 0;
> +    uint64_t addr = 0;
> +
> +    do {
> +        addr += numa_info[i].node_mem;
> +    } while (++i <= nodeid);
> +
> +    return addr;
> +}
> +
> +static uint64_t numa_node_start(uint32_t nodeid)
> +{
> +    if (!nodeid) {
> +        return 0;
> +    } else {
> +        return numa_node_end(nodeid - 1);
> +    }
> +}

There's really no better generic way of finding the addresses of numa nodes?

> +/*
> + * Given the addr, return the NUMA node to which the address belongs to.
> + */
> +static uint32_t get_numa_node(uint64_t addr)
> +{
> +    uint32_t i;
> +
> +    for (i = 0; i < nb_numa_nodes; i++) {
> +        if ((addr >= numa_node_start(i)) && (addr < numa_node_end(i))) {

This is O(nb_numa_nodes^2) which is kind of nasty, althoguh that's
unlikely to be large enough to be a real problem.

> +            return i;
> +        }
> +    }
> +
> +    /* Unassigned memory goes to node 0 by default */
> +    return 0;
> +}
> +
> +/* Adds ibm,dynamic-reconfiguration-memory node */
> +static int spapr_populate_drconf_memory(sPAPREnvironment *spapr, void *fdt)
> +{
> +    int root_offset, ret, i, offset;
> +    uint32_t lmb_size = SPAPR_MIN_MEMORY_BLOCK_SIZE;
> +    uint32_t prop_lmb_size[] = {0, cpu_to_be32(lmb_size)};
> +    uint32_t dynamic_memory[DR_LMB_LIST_ENTRY_SIZE];
> +    uint32_t nr_rma_lmbs = spapr->rma_size/lmb_size;
> +    uint32_t nr_lmbs = spapr->maxram_limit/lmb_size - nr_rma_lmbs;
> +    uint32_t nr_assigned_lmbs = spapr->ram_limit/lmb_size - nr_rma_lmbs;
> +    uint32_t *int_buf, *cur_index, buf_len;
> +
> +    /* Allocate enough buffer size to fit in ibm,dynamic-memory */
> +    buf_len = nr_lmbs * DR_LMB_LIST_ENTRY_SIZE * sizeof(uint32_t) +
> +                sizeof(uint32_t);
> +    cur_index = int_buf = g_malloc0(buf_len);
> +    root_offset = fdt_path_offset(fdt, "/");

You don't need this, the fdt offset of / is always 0 and you're
allowed to count on that.

> +
> +
> +    offset = fdt_add_subnode(fdt, root_offset,
> +                   "ibm,dynamic-reconfiguration-memory");
> +
> +    ret = fdt_setprop(fdt, offset, "ibm,lmb-size", prop_lmb_size,
> +            sizeof(prop_lmb_size));
> +    if (ret < 0) {
> +        goto out;
> +    }

Current versions of libfdt have fdt_setprop_u64 which you could use
for this.

> +    ret = fdt_setprop_cell(fdt, offset, "ibm,memory-flags-mask",
> +                            cpu_to_be32(0xff));

fdt_setprop_cell has the byteswap built-in, so adding your own as well
will make it incorrect for an LE host.

> +    if (ret < 0) {
> +        goto out;
> +    }
> +
> +    ret = fdt_setprop_cell(fdt, offset, "ibm,memory-preservation-time",
> +                            cpu_to_be32(0x0));
> +    if (ret < 0) {
> +        goto out;
> +    }
> +
> +    /* ibm,dynamic-memory */
> +    int_buf[0] = cpu_to_be32(nr_lmbs);
> +    cur_index++;
> +    for (i = 0; i < nr_lmbs; i++) {
> +        sPAPRDRConnector *drc;
> +        sPAPRDRConnectorClass *drck;
> +        uint64_t addr;
> +
> +        if (i < nr_assigned_lmbs) {
> +            addr = (i + nr_rma_lmbs) * lmb_size;
> +        } else {
> +            addr = (i - nr_assigned_lmbs) * lmb_size +
> +                SPAPR_MACHINE(qdev_get_machine())->hotplug_memory_base;
> +        }
> +        drc = spapr_dr_connector_new(qdev_get_machine(),
> +                SPAPR_DR_CONNECTOR_TYPE_LMB, addr/lmb_size);
> +        drck = SPAPR_DR_CONNECTOR_GET_CLASS(drc);
> +
> +        dynamic_memory[0] = cpu_to_be32(addr >> 32);
> +        dynamic_memory[1] = cpu_to_be32(addr & 0xffffffff);
> +        dynamic_memory[2] = cpu_to_be32(drck->get_index(drc));
> +        dynamic_memory[3] = cpu_to_be32(0); /* reserved */
> +        dynamic_memory[4] = cpu_to_be32(get_numa_node(addr));

As noted above your current get_numa_node() implementation is O(N^2)
making this routine O(N^3).

> +        dynamic_memory[5] = (addr < spapr->ram_limit) ?
> +                            cpu_to_be32(LMB_FLAGS_ASSIGNED) :
> +                            cpu_to_be32(0);
> +
> +        memcpy(cur_index, dynamic_memory, sizeof(dynamic_memory));

You could just use uint32_t *dynamic_memory = cur_index at the
beginning of the loop block to avoid this memcpy().

> +        cur_index += DR_LMB_LIST_ENTRY_SIZE;
> +    }
> +    ret = fdt_setprop(fdt, offset, "ibm,dynamic-memory", int_buf, buf_len);
> +    if (ret < 0) {
> +        goto out;
> +    }
> +
> +    /* ibm,associativity-lookup-arrays */
> +    cur_index = int_buf;
> +    int_buf[0] = cpu_to_be32(nb_numa_nodes);
> +    int_buf[1] = cpu_to_be32(4);

Something explaining the significance of this 4 for those of us that
don't have access to PAPR+ would be nice.

> +    cur_index += 2;
> +    for (i = 0; i < nb_numa_nodes; i++) {
> +        uint32_t associativity[] = {
> +            cpu_to_be32(0x0),
> +            cpu_to_be32(0x0),
> +            cpu_to_be32(0x0),
> +            cpu_to_be32(i)
> +        };
> +        memcpy(cur_index, associativity, sizeof(associativity));
> +        cur_index += 4;
> +    }
> +    ret = fdt_setprop(fdt, offset, "ibm,associativity-lookup-arrays", 
> int_buf,
> +            (cur_index - int_buf) * sizeof(uint32_t));
> +out:
> +    g_free(int_buf);
> +    return ret;
> +}
> +
> +int spapr_h_cas_compose_response(target_ulong addr, target_ulong size,
> +                                bool cpu_update, bool memory_update)
> +{
> +    void *fdt, *fdt_skel;
> +    sPAPRDeviceTreeUpdateHeader hdr = { .version_id = 1 };
> +
> +    size -= sizeof(hdr);
> +
> +    /* Create sceleton */
> +    fdt_skel = g_malloc0(size);
> +    _FDT((fdt_create(fdt_skel, size)));
> +    _FDT((fdt_begin_node(fdt_skel, "")));
> +    _FDT((fdt_end_node(fdt_skel)));
> +    _FDT((fdt_finish(fdt_skel)));
> +    fdt = g_malloc0(size);
> +    _FDT((fdt_open_into(fdt_skel, fdt, size)));
> +    g_free(fdt_skel);
> +
> +    /* Fixup cpu nodes */
> +    if (cpu_update) {
> +        _FDT((spapr_fixup_cpu_dt(fdt, spapr)));
> +    }
> +
> +    /* Generate memory nodes or ibm,dynamic-reconfiguration-memory node */
> +    if (memory_update) {
> +        _FDT((spapr_populate_drconf_memory(spapr, fdt)));
> +    } else {
> +        _FDT((spapr_populate_memory(spapr, fdt)));
> +    }
> +
> +    /* Pack resulting tree */
> +    _FDT((fdt_pack(fdt)));
> +
> +    if (fdt_totalsize(fdt) + sizeof(hdr) > size) {

You could just set the correct size (size - sizeof(hdr)) to
fdt_create() and fdt_open_into() and avoid this failure case.

> +        trace_spapr_cas_failed(size);
> +        return -1;
> +    }
> +
> +    cpu_physical_memory_write(addr, &hdr, sizeof(hdr));
> +    cpu_physical_memory_write(addr + sizeof(hdr), fdt, fdt_totalsize(fdt));
> +    trace_spapr_cas_continue(fdt_totalsize(fdt) + sizeof(hdr));
> +    g_free(fdt);
> +
> +    return 0;
> +}
> +
>  static void spapr_finalize_fdt(sPAPREnvironment *spapr,
>                                 hwaddr fdt_addr,
>                                 hwaddr rtas_addr,
> @@ -757,11 +904,12 @@ static void spapr_finalize_fdt(sPAPREnvironment *spapr,
>      /* open out the base tree into a temp buffer for the final tweaks */
>      _FDT((fdt_open_into(spapr->fdt_skel, fdt, FDT_MAX_SIZE)));
>  
> -    ret = spapr_populate_memory(spapr, fdt);
> -    if (ret < 0) {
> -        fprintf(stderr, "couldn't setup memory nodes in fdt\n");
> -        exit(1);
> -    }
> +    /*
> +     * Add memory@0 node to represent RMA. Rest of the memory is either
> +     * represented by memory nodes or ibm,dynamic-reconfiguration-memory
> +     * node later during ibm,client-architecture-support call.
> +     */
> +    spapr_populate_memory_node(fdt, 0, 0, spapr->rma_size);
>  
>      ret = spapr_populate_vdevice(spapr->vio_bus, fdt);
>      if (ret < 0) {
> diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c
> index 8651447..10f05f4 100644
> --- a/hw/ppc/spapr_hcall.c
> +++ b/hw/ppc/spapr_hcall.c
> @@ -805,6 +805,32 @@ static target_ulong h_set_mode(PowerPCCPU *cpu, 
> sPAPREnvironment *spapr,
>      return ret;
>  }
>  
> +/*
> + * Return the offset to the requested option vector @vector in the
> + * option vector table @table.
> + */
> +static target_ulong cas_get_option_vector(int vector, target_ulong table)
> +{
> +    int i;
> +    char nr_vectors, nr_entries;
> +
> +    if (!table) {
> +        return 0;
> +    }
> +
> +    nr_vectors = (rtas_ld(table, 0) >> 24) + 1;

This is kind of abusing the rtas_ld() function.  It's really only
intended for accessing rtas arguments, not an arbitrary array of u32s.

> +    if (!vector || vector > nr_vectors) {
> +        return 0;
> +    }
> +    table++; /* skip nr option vectors */
> +
> +    for (i = 0; i < vector - 1; i++) {
> +        nr_entries = rtas_ld(table, 0) >> 24;
> +        table += nr_entries + 2;
> +    }
> +    return table;
> +}
> +
>  typedef struct {
>      PowerPCCPU *cpu;
>      uint32_t cpu_version;
> @@ -825,19 +851,22 @@ static void do_set_compat(void *arg)
>      ((cpuver) == CPU_POWERPC_LOGICAL_2_06_PLUS) ? 2061 : \
>      ((cpuver) == CPU_POWERPC_LOGICAL_2_07) ? 2070 : 0)
>  
> +#define OV5_DRCONF_MEMORY 0x20
> +
>  static target_ulong h_client_architecture_support(PowerPCCPU *cpu_,
>                                                    sPAPREnvironment *spapr,
>                                                    target_ulong opcode,
>                                                    target_ulong *args)
>  {
> -    target_ulong list = args[0];
> +    target_ulong list = args[0], ov_table;
>      PowerPCCPUClass *pcc_ = POWERPC_CPU_GET_CLASS(cpu_);
>      CPUState *cs;
> -    bool cpu_match = false;
> +    bool cpu_match = false, cpu_update = true, memory_update = false;
>      unsigned old_cpu_version = cpu_->cpu_version;
>      unsigned compat_lvl = 0, cpu_version = 0;
>      unsigned max_lvl = get_compat_level(cpu_->max_compat);
>      int counter;
> +    char ov5_byte2;
>  
>      /* Parse PVR list */
>      for (counter = 0; counter < 512; ++counter) {
> @@ -887,8 +916,6 @@ static target_ulong 
> h_client_architecture_support(PowerPCCPU *cpu_,
>          }
>      }
>  
> -    /* For the future use: here @list points to the first capability */
> -
>      /* Parsing finished */
>      trace_spapr_cas_pvr(cpu_->cpu_version, cpu_match,
>                          cpu_version, pcc_->pcr_mask);
> @@ -912,14 +939,26 @@ static target_ulong 
> h_client_architecture_support(PowerPCCPU *cpu_,
>      }
>  
>      if (!cpu_version) {
> -        return H_SUCCESS;
> +        cpu_update = false;
>      }
>  
> +    /* For the future use: here @ov_table points to the first option vector 
> */
> +    ov_table = list;
> +
> +    list = cas_get_option_vector(5, ov_table);

What's the literal 5 mean?

>      if (!list) {
>          return H_SUCCESS;
>      }
>  
> -    if (spapr_h_cas_compose_response(args[1], args[2])) {
> +    /* @list now points to OV 5 */
> +    list += 2;
> +    ov5_byte2 = rtas_ld(list, 0) >> 24;
> +    if (ov5_byte2 & OV5_DRCONF_MEMORY) {
> +        memory_update = true;
> +    }
> +
> +    if (spapr_h_cas_compose_response(args[1], args[2], cpu_update,
> +                                    memory_update)) {
>          qemu_system_reset_request();
>      }
>  
> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
> index 64681c4..10283f9 100644
> --- a/include/hw/ppc/spapr.h
> +++ b/include/hw/ppc/spapr.h
> @@ -485,9 +485,19 @@ struct sPAPRTCETable {
>  /* Support a min of 1TB hotplug memory assuming 256MB per slot */
>  #define SPAPR_MAX_RAM_SLOTS     (1ULL << 12)
>  
> +/*
> + * Number of 32 bit words in each LMB list entry in ibm,dynamic-memory
> + * property under ibm,dynamic-reconfiguration-memory node.
> + */
> +#define DR_LMB_LIST_ENTRY_SIZE 6
> +
> +/* Flag values in Option Vector 5 ibm architecture vector table. */
> +#define LMB_FLAGS_ASSIGNED 0x00000008

Things declared in a public header should have some sort of
namespacing, so, SPAPR_DR_LBM_LIST_ENTRY_SIZE for example.

>  void spapr_events_init(sPAPREnvironment *spapr);
>  void spapr_events_fdt_skel(void *fdt, uint32_t epow_irq);
> -int spapr_h_cas_compose_response(target_ulong addr, target_ulong size);
> +int spapr_h_cas_compose_response(target_ulong addr, target_ulong size,
> +                                bool cpu_update, bool memory_update);
>  sPAPRTCETable *spapr_tce_new_table(DeviceState *owner, uint32_t liobn,
>                                     uint64_t bus_offset,
>                                     uint32_t page_shift,

-- 
David Gibson                    | I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
                                | _way_ _around_!
http://www.ozlabs.org/~dgibson

pgpv6Inyv7key.pgp
Description: PGP signature

Re: [Qemu-devel] [RFC PATCH v1 12/13] spapr: Support ibm, dynamic-reconfiguration-memory

Reply via email to