On Thu, Jul 22, 2021 at 12:37:46PM +0530, Aneesh Kumar K.V wrote:
> David Gibson <da...@gibson.dropbear.id.au> writes:
> 
> > On Mon, Jun 28, 2021 at 08:41:15PM +0530, Aneesh Kumar K.V wrote:
> >> The associativity details of the newly added resources are collected from
> >> the hypervisor via "ibm,configure-connector" rtas call. Update the numa
> >> distance details of the newly added numa node after the above call.
> >> 
> >> Instead of updating NUMA distance every time we lookup a node id
> >> from the associativity property, add helpers that can be used
> >> during boot which do this only once. Also remove the distance
> >> update from node id lookup helpers.
> >> 
> >> Signed-off-by: Aneesh Kumar K.V <aneesh.ku...@linux.ibm.com>
> >> ---
> >>  arch/powerpc/mm/numa.c                        | 173 +++++++++++++-----
> >>  arch/powerpc/platforms/pseries/hotplug-cpu.c  |   2 +
> >>  .../platforms/pseries/hotplug-memory.c        |   2 +
> >>  arch/powerpc/platforms/pseries/pseries.h      |   1 +
> >>  4 files changed, 132 insertions(+), 46 deletions(-)
> >> 
> >> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
> >> index 0ec16999beef..7b142f79d600 100644
> >> --- a/arch/powerpc/mm/numa.c
> >> +++ b/arch/powerpc/mm/numa.c
> >> @@ -208,22 +208,6 @@ int __node_distance(int a, int b)
> >>  }
> >>  EXPORT_SYMBOL(__node_distance);
> >>  
> >> -static void initialize_distance_lookup_table(int nid,
> >> -          const __be32 *associativity)
> >> -{
> >> -  int i;
> >> -
> >> -  if (affinity_form != FORM1_AFFINITY)
> >> -          return;
> >> -
> >> -  for (i = 0; i < max_associativity_domain_index; i++) {
> >> -          const __be32 *entry;
> >> -
> >> -          entry = &associativity[be32_to_cpu(distance_ref_points[i]) - 1];
> >> -          distance_lookup_table[nid][i] = of_read_number(entry, 1);
> >> -  }
> >> -}
> >> -
> >>  /*
> >>   * Returns nid in the range [0..nr_node_ids], or -1 if no useful NUMA
> >>   * info is found.
> >> @@ -241,15 +225,6 @@ static int associativity_to_nid(const __be32 
> >> *associativity)
> >>    /* POWER4 LPAR uses 0xffff as invalid node */
> >>    if (nid == 0xffff || nid >= nr_node_ids)
> >>            nid = NUMA_NO_NODE;
> >> -
> >> -  if (nid > 0 &&
> >> -          of_read_number(associativity, 1) >= 
> >> max_associativity_domain_index) {
> >> -          /*
> >> -           * Skip the length field and send start of associativity array
> >> -           */
> >> -          initialize_distance_lookup_table(nid, associativity + 1);
> >> -  }
> >> -
> >>  out:
> >>    return nid;
> >>  }
> >> @@ -287,6 +262,49 @@ int of_node_to_nid(struct device_node *device)
> >>  }
> >>  EXPORT_SYMBOL(of_node_to_nid);
> >>  
> >> +static void __initialize_form1_numa_distance(const __be32 *associativity)
> >> +{
> >> +  int i, nid;
> >> +
> >> +  if (affinity_form != FORM1_AFFINITY)
> >
> > Since this shouldn't be called on a !form1 system, this could be a 
> > WARN_ON().
> 
> The way we call functions currently, instead of doing
> 
> if (affinity_form == FORM1_AFFINITY)
>     __initialize_form1_numa_distance()
> 
> We avoid doing the if check in multiple places. For example
> parse_numa_properties will fetch the associativity array to find the
> details of online node and set it online. We use the same code path to
> initialize distance.
> 
>               if (__vphn_get_associativity(i, vphn_assoc) == 0) {
>                       nid = associativity_to_nid(vphn_assoc);
>                       __initialize_form1_numa_distance(vphn_assoc);
>               } else {
> 
>                       cpu = of_get_cpu_node(i, NULL);
>                       BUG_ON(!cpu);
> 
>                       associativity = of_get_associativity(cpu);
>                       if (associativity) {
>                               nid = associativity_to_nid(associativity);
>                               __initialize_form1_numa_distance(associativity);
>                       }
> 
> We avoid the if (affinity_form == FORM1_AFFINITY) check there by
> moving the check inside __initialize_form1_numa_distance().

Oh.. ok.  The only caller I spotted was already doing a test against
affinity_form.

> >> +          return;
> >> +
> >> +  if (of_read_number(associativity, 1) >= primary_domain_index) {
> >> +          nid = of_read_number(&associativity[primary_domain_index], 1);
> >
> > This computes the nid from the assoc array independently of
> > associativity_to_nid, which doesn't seem like a good idea.  Wouldn't
> > it be better to call associativity_to_nid(), then make the next bit
> > conditional on nid != NUMA_NO_NODE?
> 
> @@ -302,9 +302,8 @@ static void __initialize_form1_numa_distance(const __be32 
> *associativity)
>       if (affinity_form != FORM1_AFFINITY)
>               return;
>  
> -     if (of_read_number(associativity, 1) >= primary_domain_index) {
> -             nid = of_read_number(&associativity[primary_domain_index], 1);
> -
> +     nid = associativity_to_nid(associativity);
> +     if (nid != NUMA_NO_NODE) {
>               for (i = 0; i < distance_ref_points_depth; i++) {
>                       const __be32 *entry;

Right.

> >
> >> +
> >> +          for (i = 0; i < max_associativity_domain_index; i++) {
> >> +                  const __be32 *entry;
> >> +
> >> +                  entry = 
> >> &associativity[be32_to_cpu(distance_ref_points[i])];
> >> +                  distance_lookup_table[nid][i] = of_read_number(entry, 
> >> 1);
> >> +          }
> >> +  }
> >> +}
> >> +
> >> +static void initialize_form1_numa_distance(struct device_node *node)
> >> +{
> >> +  const __be32 *associativity;
> >> +
> >> +  associativity = of_get_associativity(node);
> >> +  if (!associativity)
> >> +          return;
> >> +
> >> +  __initialize_form1_numa_distance(associativity);
> >> +}
> >> +
> >> +/*
> >> + * Used to update distance information w.r.t newly added node.
> >> + */
> >> +void update_numa_distance(struct device_node *node)
> >> +{
> >> +  if (affinity_form == FORM0_AFFINITY)
> >> +          return;
> >> +  else if (affinity_form == FORM1_AFFINITY) {
> >> +          initialize_form1_numa_distance(node);
> >> +          return;
> >> +  }
> >> +}
> >> +
> >>  static int __init find_primary_domain_index(void)
> >>  {
> >>    int index;
> >> @@ -433,6 +451,48 @@ static int of_get_assoc_arrays(struct assoc_arrays 
> >> *aa)
> >>    return 0;
> >>  }
> >>  
> >> +static int get_nid_and_numa_distance(struct drmem_lmb *lmb)
> >> +{
> >> +  struct assoc_arrays aa = { .arrays = NULL };
> >> +  int default_nid = NUMA_NO_NODE;
> >> +  int nid = default_nid;
> >> +  int rc, index;
> >> +
> >> +  if ((primary_domain_index < 0) || !numa_enabled)
> >
> > Under what circumstances could you get primary_domain_index < 0?
> 
> IIUC that is to handle failure to parse device tree.
> ea9f5b702fe0215188fba2eda117419e4ae90a67

Ok.

> >
> >> +          return default_nid;

Returning NUMA_NO_NODE explicitly, rather than an alias to it, might be
clearer here, but it's not a big detail.

> >> +
> >> +  rc = of_get_assoc_arrays(&aa);
> >> +  if (rc)
> >> +          return default_nid;
> >> +
> >> +  if (primary_domain_index <= aa.array_sz &&
> >> +      !(lmb->flags & DRCONF_MEM_AI_INVALID) && lmb->aa_index < 
> >> aa.n_arrays) {
> >> +          index = lmb->aa_index * aa.array_sz + primary_domain_index - 1;
> >
> > Does anywhere verify that primary_domain_index <= aa.array_sz?
> 
> That is the first part of the check?

Oh, sorry, missed that.  I think I was expecting it to be an early
exit, rather than folded into the rest of this complex condition.

> 
> >
> >> +          nid = of_read_number(&aa.arrays[index], 1);
> >> +
> >> +          if (nid == 0xffff || nid >= nr_node_ids)
> >> +                  nid = default_nid;
> >> +          if (nid > 0 && affinity_form == FORM1_AFFINITY) {
> >> +                  int i;
> >> +                  const __be32 *associativity;
> >> +
> >> +                  index = lmb->aa_index * aa.array_sz;
> >> +                  associativity = &aa.arrays[index];
> >> +                  /*
> >> +                   * lookup array associativity entries have different 
> >> format
> >> +                   * There is no length of the array as the first element.
> >
> > The difference is very small, and this is not a hot path.  Couldn't
> > you reduce a chunk of code by prepending aa.array_sz, then re-using
> > __initialize_form1_numa_distance()?  Or even making
> > __initialize_form1_numa_distance() take the length as a parameter?
> 
> The changes are small but confusing w.r.t how we look at the
> associativity-lookup-arrays. The way we interpret associativity array
> and associativity lookup array using primary_domain_index is different.
> Hence the '-1' in the node lookup here.

They're really not, though.  It's exactly the same interpretation of
the associativity array itself - it's just that one of them has the
array prepended with a (redundant) length.  So you can make
__initialize_form1_numa_distance() work on the "bare" associativity
array, with a given length.  Here you call it with aa.array_sz as the
length, and in the other place you call it with prop[0] as the length.

> 
>       index = lmb->aa_index * aa.array_sz + primary_domain_index - 1;
>       nid = of_read_number(&aa.arrays[index], 1);
> 
> 
> >
> >> +                   */
> >> +                  for (i = 0; i < max_associativity_domain_index; i++) {
> >> +                          const __be32 *entry;
> >> +
> >> +                          entry = 
> >> &associativity[be32_to_cpu(distance_ref_points[i]) - 1];
> >
> > Does anywhere verify that distance_ref_points[i] <= aa.array_sz for
> > every i?
> 
> We do check for 
> 
>       if (primary_domain_index <= aa.array_sz &&

Right, but that doesn't check the other distance_ref_points entries.
Not that there's any reason to have extra entries with Form2, but we
still don't want stray array accesses.

> 
> >
> >> +                          distance_lookup_table[nid][i] = 
> >> of_read_number(entry, 1);
> >> +                  }
> >> +          }
> >> +  }
> >> +  return nid;
> >> +}
> >> +
> >>  /*
> >>   * This is like of_node_to_nid_single() for memory represented in the
> >>   * ibm,dynamic-reconfiguration-memory node.
> >> @@ -458,21 +518,14 @@ int of_drconf_to_nid_single(struct drmem_lmb *lmb)
> >>  
> >>            if (nid == 0xffff || nid >= nr_node_ids)
> >>                    nid = default_nid;
> >> -
> >> -          if (nid > 0) {
> >> -                  index = lmb->aa_index * aa.array_sz;
> >> -                  initialize_distance_lookup_table(nid,
> >> -                                                  &aa.arrays[index]);
> >> -          }
> >>    }
> >> -
> >>    return nid;
> >>  }
> >>  
> >>  #ifdef CONFIG_PPC_SPLPAR
> >> -static int vphn_get_nid(long lcpu)
> >> +
> >> +static int __vphn_get_associativity(long lcpu, __be32 *associativity)
> >>  {
> >> -  __be32 associativity[VPHN_ASSOC_BUFSIZE] = {0};
> >>    long rc, hwid;
> >>  
> >>    /*
> >> @@ -492,10 +545,22 @@ static int vphn_get_nid(long lcpu)
> >>  
> >>            rc = hcall_vphn(hwid, VPHN_FLAG_VCPU, associativity);
> >>            if (rc == H_SUCCESS)
> >> -                  return associativity_to_nid(associativity);
> >> +                  return 0;
> >>    }
> >>  
> >> +  return -1;
> >> +}
> >> +
> >> +static int vphn_get_nid(long lcpu)
> >> +{
> >> +  __be32 associativity[VPHN_ASSOC_BUFSIZE] = {0};
> >> +
> >> +
> >> +  if (!__vphn_get_associativity(lcpu, associativity))
> >> +          return associativity_to_nid(associativity);
> >> +
> >>    return NUMA_NO_NODE;
> >> +
> >>  }
> >>  #else
> >>  static int vphn_get_nid(long unused)
> >> @@ -692,7 +757,7 @@ static int __init numa_setup_drmem_lmb(struct 
> >> drmem_lmb *lmb,
> >>                    size = read_n_cells(n_mem_size_cells, usm);
> >>            }
> >>  
> >> -          nid = of_drconf_to_nid_single(lmb);
> >> +          nid = get_nid_and_numa_distance(lmb);
> >>            fake_numa_create_new_node(((base + size) >> PAGE_SHIFT),
> >>                                      &nid);
> >>            node_set_online(nid);
> >> @@ -709,6 +774,7 @@ static int __init parse_numa_properties(void)
> >>    struct device_node *memory;
> >>    int default_nid = 0;
> >>    unsigned long i;
> >> +  const __be32 *associativity;
> >>  
> >>    if (numa_enabled == 0) {
> >>            printk(KERN_WARNING "NUMA disabled by user\n");
> >> @@ -734,18 +800,30 @@ static int __init parse_numa_properties(void)
> >>     * each node to be onlined must have NODE_DATA etc backing it.
> >>     */
> >>    for_each_present_cpu(i) {
> >> +          __be32 vphn_assoc[VPHN_ASSOC_BUFSIZE];
> >>            struct device_node *cpu;
> >> -          int nid = vphn_get_nid(i);
> >> +          int nid = NUMA_NO_NODE;
> >>  
> >> -          /*
> >> -           * Don't fall back to default_nid yet -- we will plug
> >> -           * cpus into nodes once the memory scan has discovered
> >> -           * the topology.
> >> -           */
> >> -          if (nid == NUMA_NO_NODE) {
> >> +          memset(vphn_assoc, 0, VPHN_ASSOC_BUFSIZE * sizeof(__be32));
> >
> > What's the memset() for?  AFAICT you only look at vphn_assoc in the
> > branch where __vphn_get_associativity() succeeds.
> 
> That was done to match the existing code. We do use a zero filled array
> when making that hcall in this code path. I don't see us doing that
> everywhere. But didn't want to change that behaviour in this patch.
> 
> -static int vphn_get_nid(long lcpu)
> +
> +static int __vphn_get_associativity(long lcpu, __be32 *associativity)
>  {
>  -    __be32 associativity[VPHN_ASSOC_BUFSIZE] = {0};
>       long rc, hwid;

Ok, that makes sense.

> 
> >
> >> +
> >> +          if (__vphn_get_associativity(i, vphn_assoc) == 0) {
> >> +                  nid = associativity_to_nid(vphn_assoc);
> >> +                  __initialize_form1_numa_distance(vphn_assoc);
> >> +          } else {
> >> +
> >> +                  /*
> >> +                   * Don't fall back to default_nid yet -- we will plug
> >> +                   * cpus into nodes once the memory scan has discovered
> >> +                   * the topology.
> >> +                   */
> >>                    cpu = of_get_cpu_node(i, NULL);
> >>                    BUG_ON(!cpu);
> >> -                  nid = of_node_to_nid_single(cpu);
> >> +
> >> +                  associativity = of_get_associativity(cpu);
> >> +                  if (associativity) {
> >> +                          nid = associativity_to_nid(associativity);
> >> +                          __initialize_form1_numa_distance(associativity);
> >> +                  }
> >>                    of_node_put(cpu);
> >>            }
> >>  
> >> @@ -781,8 +859,11 @@ static int __init parse_numa_properties(void)
> >>             * have associativity properties.  If none, then
> >>             * everything goes to default_nid.
> >>             */
> >> -          nid = of_node_to_nid_single(memory);
> >> -          if (nid < 0)
> >> +          associativity = of_get_associativity(memory);
> >> +          if (associativity) {
> >> +                  nid = associativity_to_nid(associativity);
> >> +                  __initialize_form1_numa_distance(associativity);
> >> +          } else
> >>                    nid = default_nid;
> >>  
> >>            fake_numa_create_new_node(((start + size) >> PAGE_SHIFT), &nid);
> >> diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c 
> >> b/arch/powerpc/platforms/pseries/hotplug-cpu.c
> >> index 7e970f81d8ff..778b6ab35f0d 100644
> >> --- a/arch/powerpc/platforms/pseries/hotplug-cpu.c
> >> +++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c
> >> @@ -498,6 +498,8 @@ static ssize_t dlpar_cpu_add(u32 drc_index)
> >>            return saved_rc;
> >>    }
> >>  
> >> +  update_numa_distance(dn);
> >> +
> >>    rc = dlpar_online_cpu(dn);
> >>    if (rc) {
> >>            saved_rc = rc;
> >> diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c 
> >> b/arch/powerpc/platforms/pseries/hotplug-memory.c
> >> index 36f66556a7c6..40d350f31a34 100644
> >> --- a/arch/powerpc/platforms/pseries/hotplug-memory.c
> >> +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
> >> @@ -180,6 +180,8 @@ static int update_lmb_associativity_index(struct 
> >> drmem_lmb *lmb)
> >>            return -ENODEV;
> >>    }
> >>  
> >> +  update_numa_distance(lmb_node);
> >> +
> >>    dr_node = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
> >>    if (!dr_node) {
> >>            dlpar_free_cc_nodes(lmb_node);
> >> diff --git a/arch/powerpc/platforms/pseries/pseries.h 
> >> b/arch/powerpc/platforms/pseries/pseries.h
> >> index 1f051a786fb3..663a0859cf13 100644
> >> --- a/arch/powerpc/platforms/pseries/pseries.h
> >> +++ b/arch/powerpc/platforms/pseries/pseries.h
> >> @@ -113,4 +113,5 @@ extern u32 pseries_security_flavor;
> >>  void pseries_setup_security_mitigations(void);
> >>  void pseries_lpar_read_hblkrm_characteristics(void);
> >>  
> >> +void update_numa_distance(struct device_node *node);
> >>  #endif /* _PSERIES_PSERIES_H */
> >
> 

-- 
David Gibson                    | I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
                                | _way_ _around_!
http://www.ozlabs.org/~dgibson

Attachment: signature.asc
Description: PGP signature

Reply via email to