Re: [PATCH] powerpc/book3s64/radix: make tlb_single_page_flush_ceiling a debugfs entry

2021-08-09 Thread Christophe Leroy




Le 10/08/2021 à 06:53, Aneesh Kumar K.V a écrit :

Similar to x86/s390 add a debugfs file to tune tlb_single_page_flush_ceiling.

Signed-off-by: Aneesh Kumar K.V 
---
  arch/powerpc/mm/book3s64/radix_tlb.c | 48 
  1 file changed, 48 insertions(+)

diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c 
b/arch/powerpc/mm/book3s64/radix_tlb.c
index aefc100d79a7..5cca0fe130e7 100644
--- a/arch/powerpc/mm/book3s64/radix_tlb.c
+++ b/arch/powerpc/mm/book3s64/radix_tlb.c
@@ -17,6 +17,7 @@
  #include 
  #include 
  #include 
+#include 
  
  #include "internal.h"
  
@@ -1524,3 +1525,50 @@ void do_h_rpt_invalidate_prt(unsigned long pid, unsigned long lpid,

  EXPORT_SYMBOL_GPL(do_h_rpt_invalidate_prt);
  
  #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */

+
+static ssize_t tlbflush_read_file(struct file *file, char __user *user_buf,
+size_t count, loff_t *ppos)
+{
+   char buf[32];
+   unsigned int len;
+
+   len = sprintf(buf, "%ld\n", tlb_single_page_flush_ceiling);
+   return simple_read_from_buffer(user_buf, count, ppos, buf, len);
+}
+
+static ssize_t tlbflush_write_file(struct file *file,
+const char __user *user_buf, size_t count, loff_t *ppos)
+{
+   char buf[32];
+   ssize_t len;
+   int ceiling;
+
+   len = min(count, sizeof(buf) - 1);
+   if (copy_from_user(buf, user_buf, len))
+   return -EFAULT;
+
+   buf[len] = '\0';
+   if (kstrtoint(buf, 0, &ceiling))
+   return -EINVAL;
+
+   if (ceiling < 0)
+   return -EINVAL;
+
+   tlb_single_page_flush_ceiling = ceiling;
+   return count;
+}
+
+static const struct file_operations fops_tlbflush = {
+   .read = tlbflush_read_file,
+   .write = tlbflush_write_file,
+   .llseek = default_llseek,
+};
+
+static int __init create_tlb_single_page_flush_ceiling(void)
+{
+   debugfs_create_file("tlb_single_page_flush_ceiling", S_IRUSR | S_IWUSR,
+   powerpc_debugfs_root, NULL, &fops_tlbflush);


Could you just use debugfs_create_u32() instead of re-implementing simple read 
and write ?

Or at least use DEFINE_DEBUGFS_ATTRIBUTE() if you need something a bit more 
elaborated ?


+   return 0;
+}
+late_initcall(create_tlb_single_page_flush_ceiling);
+



[PATCH] powerpc: rename powerpc_debugfs_root to arch_debugfs_dir

2021-08-09 Thread Aneesh Kumar K.V
No functional change in this patch. arch_debugfs_dir is the generic kernel
name declared in linux/debugfs.h for arch-specific debugfs directory.
Architectures like x86/s390 already use the name. Rename powerpc
specific powerpc_debugfs_root to arch_debugfs_dir.

Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/include/asm/debugfs.h  | 13 -
 arch/powerpc/kernel/Makefile|  3 ++-
 arch/powerpc/kernel/dawr.c  |  3 +--
 arch/powerpc/kernel/eeh.c   | 16 
 arch/powerpc/kernel/eeh_cache.c |  4 ++--
 arch/powerpc/kernel/fadump.c|  4 ++--
 arch/powerpc/kernel/hw_breakpoint.c |  1 -
 arch/powerpc/kernel/kdebugfs.c  | 14 ++
 arch/powerpc/kernel/security.c  | 16 
 arch/powerpc/kernel/setup-common.c  | 13 -
 arch/powerpc/kernel/setup_64.c  |  1 -
 arch/powerpc/kernel/traps.c |  4 ++--
 arch/powerpc/kvm/book3s_xics.c  |  6 +++---
 arch/powerpc/kvm/book3s_xive.c  |  3 +--
 arch/powerpc/kvm/book3s_xive_native.c   |  3 +--
 arch/powerpc/mm/book3s64/hash_utils.c   |  4 ++--
 arch/powerpc/mm/book3s64/pgtable.c  |  4 ++--
 arch/powerpc/mm/book3s64/radix_tlb.c|  4 ++--
 arch/powerpc/mm/ptdump/bats.c   |  4 ++--
 arch/powerpc/mm/ptdump/segment_regs.c   |  4 ++--
 arch/powerpc/platforms/cell/axon_msi.c  |  4 ++--
 arch/powerpc/platforms/powernv/memtrace.c   |  3 +--
 arch/powerpc/platforms/powernv/opal-imc.c   |  4 ++--
 arch/powerpc/platforms/powernv/opal-lpc.c   |  4 ++--
 arch/powerpc/platforms/powernv/opal-xscom.c |  4 ++--
 arch/powerpc/platforms/powernv/pci-ioda.c   |  4 ++--
 arch/powerpc/platforms/pseries/dtl.c|  4 ++--
 arch/powerpc/platforms/pseries/lpar.c   |  5 +++--
 arch/powerpc/sysdev/xive/common.c   |  3 +--
 arch/powerpc/xmon/xmon.c|  6 +++---
 30 files changed, 74 insertions(+), 91 deletions(-)
 delete mode 100644 arch/powerpc/include/asm/debugfs.h
 create mode 100644 arch/powerpc/kernel/kdebugfs.c

diff --git a/arch/powerpc/include/asm/debugfs.h 
b/arch/powerpc/include/asm/debugfs.h
deleted file mode 100644
index 2c5c48571d75..000000000000
--- a/arch/powerpc/include/asm/debugfs.h
+++ /dev/null
@@ -1,13 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-#ifndef _ASM_POWERPC_DEBUGFS_H
-#define _ASM_POWERPC_DEBUGFS_H
-
-/*
- * Copyright 2017, Michael Ellerman, IBM Corporation.
- */
-
-#include 
-
-extern struct dentry *powerpc_debugfs_root;
-
-#endif /* _ASM_POWERPC_DEBUGFS_H */
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index f66b63e81c3b..7be36c1e1db6 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -46,7 +46,8 @@ obj-y := cputable.o syscalls.o \
   prom.o traps.o setup-common.o \
   udbg.o misc.o io.o misc_$(BITS).o \
   of_platform.o prom_parse.o firmware.o \
-  hw_breakpoint_constraints.o interrupt.o
+  hw_breakpoint_constraints.o interrupt.o \
+  kdebugfs.o
 obj-y  += ptrace/
 obj-$(CONFIG_PPC64)+= setup_64.o \
   paca.o nvram_64.o note.o
diff --git a/arch/powerpc/kernel/dawr.c b/arch/powerpc/kernel/dawr.c
index cdc2dccb987d..64e423d2fe0f 100644
--- a/arch/powerpc/kernel/dawr.c
+++ b/arch/powerpc/kernel/dawr.c
@@ -9,7 +9,6 @@
 #include 
 #include 
 #include 
-#include 
 #include 
 #include 
 
@@ -101,7 +100,7 @@ static int __init dawr_force_setup(void)
if (PVR_VER(mfspr(SPRN_PVR)) == PVR_POWER9) {
/* Turn DAWR off by default, but allow admin to turn it on */
debugfs_create_file_unsafe("dawr_enable_dangerous", 0600,
-  powerpc_debugfs_root,
+  arch_debugfs_dir,
   &dawr_force_enable,
   &dawr_enable_fops);
}
diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index 3bbdcc86d01b..e9b597ed423c 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -21,9 +21,9 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
-#include 
 #include 
 #include 
 #include 
@@ -1901,24 +1901,24 @@ static int __init eeh_init_proc(void)
proc_create_single("powerpc/eeh", 0, NULL, proc_eeh_show);
 #ifdef CONFIG_DEBUG_FS
debugfs_create_file_unsafe("eeh_enable", 0600,
-  powerpc_debugfs_root, NULL,
+  arch_debugfs_dir, NULL,
   &eeh_enable_dbgfs_ops);
debugfs_

[PATCH] powerpc/book3s64/radix: make tlb_single_page_flush_ceiling a debugfs entry

2021-08-09 Thread Aneesh Kumar K.V
Similar to x86/s390 add a debugfs file to tune tlb_single_page_flush_ceiling.

Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/mm/book3s64/radix_tlb.c | 48 
 1 file changed, 48 insertions(+)

diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c 
b/arch/powerpc/mm/book3s64/radix_tlb.c
index aefc100d79a7..5cca0fe130e7 100644
--- a/arch/powerpc/mm/book3s64/radix_tlb.c
+++ b/arch/powerpc/mm/book3s64/radix_tlb.c
@@ -17,6 +17,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "internal.h"
 
@@ -1524,3 +1525,50 @@ void do_h_rpt_invalidate_prt(unsigned long pid, unsigned 
long lpid,
 EXPORT_SYMBOL_GPL(do_h_rpt_invalidate_prt);
 
 #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
+
+static ssize_t tlbflush_read_file(struct file *file, char __user *user_buf,
+size_t count, loff_t *ppos)
+{
+   char buf[32];
+   unsigned int len;
+
+   len = sprintf(buf, "%ld\n", tlb_single_page_flush_ceiling);
+   return simple_read_from_buffer(user_buf, count, ppos, buf, len);
+}
+
+static ssize_t tlbflush_write_file(struct file *file,
+const char __user *user_buf, size_t count, loff_t *ppos)
+{
+   char buf[32];
+   ssize_t len;
+   int ceiling;
+
+   len = min(count, sizeof(buf) - 1);
+   if (copy_from_user(buf, user_buf, len))
+   return -EFAULT;
+
+   buf[len] = '\0';
+   if (kstrtoint(buf, 0, &ceiling))
+   return -EINVAL;
+
+   if (ceiling < 0)
+   return -EINVAL;
+
+   tlb_single_page_flush_ceiling = ceiling;
+   return count;
+}
+
+static const struct file_operations fops_tlbflush = {
+   .read = tlbflush_read_file,
+   .write = tlbflush_write_file,
+   .llseek = default_llseek,
+};
+
+static int __init create_tlb_single_page_flush_ceiling(void)
+{
+   debugfs_create_file("tlb_single_page_flush_ceiling", S_IRUSR | S_IWUSR,
+   powerpc_debugfs_root, NULL, &fops_tlbflush);
+   return 0;
+}
+late_initcall(create_tlb_single_page_flush_ceiling);
+
-- 
2.31.1



Re: [PATCH v7 3/6] powerpc/pseries: Consolidate different NUMA distance update code paths

2021-08-09 Thread David Gibson
On Mon, Aug 09, 2021 at 10:54:31AM +0530, Aneesh Kumar K.V wrote:
> The associativity details of the newly added resourced are collected from
> the hypervisor via "ibm,configure-connector" rtas call. Update the numa
> distance details of the newly added numa node after the above call.
> 
> Instead of updating NUMA distance every time we lookup a node id
> from the associativity property, add helpers that can be used
> during boot which does this only once. Also remove the distance
> update from node id lookup helpers.
> 
> Signed-off-by: Aneesh Kumar K.V 

It would make review easier if you'd folded the fixes/cleanups from
6/6 in here, rather than having them separate.

> ---
>  arch/powerpc/include/asm/topology.h   |   2 +
>  arch/powerpc/mm/numa.c| 178 +-
>  arch/powerpc/platforms/pseries/hotplug-cpu.c  |   2 +
>  .../platforms/pseries/hotplug-memory.c|   2 +
>  4 files changed, 138 insertions(+), 46 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/topology.h 
> b/arch/powerpc/include/asm/topology.h
> index e4db64c0e184..a6425a70c37b 100644
> --- a/arch/powerpc/include/asm/topology.h
> +++ b/arch/powerpc/include/asm/topology.h
> @@ -64,6 +64,7 @@ static inline int early_cpu_to_node(int cpu)
>  }
>  
>  int of_drconf_to_nid_single(struct drmem_lmb *lmb);
> +void update_numa_distance(struct device_node *node);
>  
>  #else
>  
> @@ -93,6 +94,7 @@ static inline int of_drconf_to_nid_single(struct drmem_lmb 
> *lmb)
>   return first_online_node;
>  }
>  
> +static inline void update_numa_distance(struct device_node *node) {}
>  #endif /* CONFIG_NUMA */
>  
>  #if defined(CONFIG_NUMA) && defined(CONFIG_PPC_SPLPAR)
> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
> index 368719b14dcc..c695faf67d68 100644
> --- a/arch/powerpc/mm/numa.c
> +++ b/arch/powerpc/mm/numa.c
> @@ -208,22 +208,6 @@ int __node_distance(int a, int b)
>  }
>  EXPORT_SYMBOL(__node_distance);
>  
> -static void initialize_distance_lookup_table(int nid,
> - const __be32 *associativity)
> -{
> - int i;
> -
> - if (affinity_form != FORM1_AFFINITY)
> - return;
> -
> - for (i = 0; i < distance_ref_points_depth; i++) {
> - const __be32 *entry;
> -
> - entry = &associativity[be32_to_cpu(distance_ref_points[i]) - 1];
> - distance_lookup_table[nid][i] = of_read_number(entry, 1);
> - }
> -}
> -
>  /*
>   * Returns nid in the range [0..nr_node_ids], or -1 if no useful NUMA
>   * info is found.
> @@ -241,15 +225,6 @@ static int associativity_to_nid(const __be32 
> *associativity)
>   /* POWER4 LPAR uses 0xffff as invalid node */
>   if (nid == 0xffff || nid >= nr_node_ids)
>   nid = NUMA_NO_NODE;
> -
> - if (nid > 0 &&
> - of_read_number(associativity, 1) >= distance_ref_points_depth) {
> - /*
> -  * Skip the length field and send start of associativity array

So.. this highlights a point.  In your comments you're treating
"associativity array" as meaning the value including the length plus
the assoc IDs, but this comment you've removed is treating it as just
the assoc IDs.  Personally I think things become clearer to talk about
if you treat the canonical form of "associativity array" as just the
assoc IDs (with implicit length), then treat the ibm,associativity
properties as containing (length + associativity array).

> -  */
> - initialize_distance_lookup_table(nid, associativity + 1);
> - }
> -
>  out:
>   return nid;
>  }
> @@ -287,6 +262,48 @@ int of_node_to_nid(struct device_node *device)
>  }
>  EXPORT_SYMBOL(of_node_to_nid);
>  
> +static void __initialize_form1_numa_distance(const __be32 *associativity)
> +{
> + int i, nid;
> +
> + if (affinity_form != FORM1_AFFINITY)
> + return;
> +
> + nid = associativity_to_nid(associativity);
> + if (nid != NUMA_NO_NODE) {
> + for (i = 0; i < distance_ref_points_depth; i++) {
> + const __be32 *entry;
> +
> + entry = 
> &associativity[be32_to_cpu(distance_ref_points[i])];

So, here you've now removed all checks that distance_ref_points[i]
doesn't overflow the length of associativity.  It comes back in 6/6,
but it's still not ideal.

> + distance_lookup_table[nid][i] = of_read_number(entry, 
> 1);
> + }
> + }
> +}
> +
> +static void initialize_form1_numa_distance(struct device_node *node)
> +{
> + const __be32 *associativity;
> +
> + associativity = of_get_associativity(node);
> + if (!associativity)
> + return;
> +
> + __initialize_form1_numa_distance(associativity);
> +}
> +
> +/*
> + * Used to update distance information w.r.t newly added node.
> + */
> +void update_numa_distance(struct device_node *node)
> +{
> + if (affinity_form == FORM0_AFFINITY)
> + return;
> + else if (affinity_form == FORM1_AFFIN

Re: [PATCH v7 5/6] powerpc/pseries: Add support for FORM2 associativity

2021-08-09 Thread David Gibson
On Mon, Aug 09, 2021 at 10:54:33AM +0530, Aneesh Kumar K.V wrote:
> PAPR interface currently supports two different ways of communicating resource
> grouping details to the OS. These are referred to as Form 0 and Form 1
> associativity grouping. Form 0 is the older format and is now considered
> deprecated. This patch adds another resource grouping named FORM2.
> 
> Signed-off-by: Daniel Henrique Barboza 
> Signed-off-by: Aneesh Kumar K.V 

LGTM, with the exception of some minor nits noted below.

> ---
>  Documentation/powerpc/associativity.rst   | 103 +
>  arch/powerpc/include/asm/firmware.h   |   3 +-
>  arch/powerpc/include/asm/prom.h   |   1 +
>  arch/powerpc/kernel/prom_init.c   |   3 +-
>  arch/powerpc/mm/numa.c| 168 ++
>  arch/powerpc/platforms/pseries/firmware.c |   1 +
>  6 files changed, 252 insertions(+), 27 deletions(-)
>  create mode 100644 Documentation/powerpc/associativity.rst
> 
> diff --git a/Documentation/powerpc/associativity.rst 
> b/Documentation/powerpc/associativity.rst
> new file mode 100644
> index ..b6c89706ca03
> --- /dev/null
> +++ b/Documentation/powerpc/associativity.rst
> @@ -0,0 +1,103 @@
> +
> +NUMA resource associativity
> +=
> +
> +Associativity represents the groupings of the various platform resources into
> +domains of substantially similar mean performance relative to resources 
> outside
> +of that domain. Resources subsets of a given domain that exhibit better
> +performance relative to each other than relative to other resources subsets
> +are represented as being members of a sub-grouping domain. This performance
> +characteristic is presented in terms of NUMA node distance within the Linux 
> kernel.
> +From the platform view, these groups are also referred to as domains.
> +
> +PAPR interface currently supports different ways of communicating these 
> resource
> +grouping details to the OS. These are referred to as Form 0, Form 1 and Form2
> +associativity grouping. Form 0 is the oldest format and is now considered 
> deprecated.
> +
> +Hypervisor indicates the type/form of associativity used via 
> "ibm,architecture-vec-5 property".
> +Bit 0 of byte 5 in the "ibm,architecture-vec-5" property indicates usage of 
> Form 0 or Form 1.
> +A value of 1 indicates the usage of Form 1 associativity. For Form 2 
> associativity
> +bit 2 of byte 5 in the "ibm,architecture-vec-5" property is used.
> +
> +Form 0
> +-
> +Form 0 associativity supports only two NUMA distances (LOCAL and REMOTE).
> +
> +Form 1
> +-
> +With Form 1 a combination of ibm,associativity-reference-points, and 
> ibm,associativity
> +device tree properties are used to determine the NUMA distance between 
> resource groups/domains.
> +
> +The “ibm,associativity” property contains a list of one or more numbers 
> (domainID)
> +representing the resource’s platform grouping domains.
> +
> +The “ibm,associativity-reference-points” property contains a list of one or 
> more numbers
> +(domainID index) that represents the 1 based ordinal in the associativity 
> lists.
> +The list of domainID indexes represents an increasing hierarchy of resource 
> grouping.
> +
> +ex:
> +{ primary domainID index, secondary domainID index, tertiary domainID 
> index.. }
> +
> +Linux kernel uses the domainID at the primary domainID index as the NUMA 
> node id.
> +Linux kernel computes NUMA distance between two domains by recursively 
> comparing
> +if they belong to the same higher-level domains. For mismatch at every higher
> +level of the resource group, the kernel doubles the NUMA distance between the
> +comparing domains.
> +
> +Form 2
> +---
> +Form 2 associativity format adds separate device tree properties 
> representing NUMA node distance
> +thereby making the node distance computation flexible. Form 2 also allows 
> flexible primary
> +domain numbering. With numa distance computation now detached from the index 
> value in
> +"ibm,associativity-reference-points" property, Form 2 allows a large number 
> of primary domain
> +ids at the same domainID index representing resource groups of different 
> performance/latency
> +characteristics.
> +
> +Hypervisor indicates the usage of FORM2 associativity using bit 2 of byte 5 
> in the
> +"ibm,architecture-vec-5" property.
> +
> +"ibm,numa-lookup-index-table" property contains a list of one or more 
> numbers representing
> +the domainIDs present in the system. The offset of the domainID in this 
> property is
> +used as an index while computing numa distance information via 
> "ibm,numa-distance-table".
> +
> +prop-encoded-array: The number N of the domainIDs encoded as with 
> encode-int, followed by
> +N domainID encoded as with encode-int
> +
> +For ex:
> +"ibm,numa-lookup-index-table" =  {4, 0, 8, 250, 252}. The offset of domainID 
> 8 (2) is used when
> +computing the distance of domain 8 from other domains present 

Re: [PATCH v7 6/6] powerpc/pseries: Consolidate form1 distance initialization into a helper

2021-08-09 Thread David Gibson
On Mon, Aug 09, 2021 at 10:54:34AM +0530, Aneesh Kumar K.V wrote:
> Currently, we duplicate parsing code for ibm,associativity and
> ibm,associativity-lookup-arrays in the kernel. The associativity array 
> provided
> by these device tree properties are very similar and hence can use
> a helper to parse the node id and numa distance details.
> 
> Signed-off-by: Aneesh Kumar K.V 

Reviewed-by: David Gibson 

Though I'd prefer to see these fixes folded in with the earlier patch.

> ---
>  arch/powerpc/mm/numa.c | 104 +++--
>  1 file changed, 58 insertions(+), 46 deletions(-)
> 
> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
> index fffb3c40f595..e6d47fcba335 100644
> --- a/arch/powerpc/mm/numa.c
> +++ b/arch/powerpc/mm/numa.c
> @@ -171,26 +171,36 @@ static void unmap_cpu_from_node(unsigned long cpu)
>  }
>  #endif /* CONFIG_HOTPLUG_CPU || CONFIG_PPC_SPLPAR */
>  
> -/*
> - * Returns nid in the range [0..nr_node_ids], or -1 if no useful NUMA
> - * info is found.
> - */
> -static int associativity_to_nid(const __be32 *associativity)
> +static int __associativity_to_nid(const __be32 *associativity,
> +   int max_array_sz)
>  {
> - int nid = NUMA_NO_NODE;
> + int nid;
> + /*
> +  * primary_domain_index is 1 based array index.
> +  */
> + int index = primary_domain_index  - 1;
>  
> - if (!numa_enabled)
> - goto out;
> + if (!numa_enabled || index >= max_array_sz)
> + return NUMA_NO_NODE;
>  
> - if (of_read_number(associativity, 1) >= primary_domain_index)
> - nid = of_read_number(&associativity[primary_domain_index], 1);
> + nid = of_read_number(&associativity[index], 1);
>  
>   /* POWER4 LPAR uses 0xffff as invalid node */
>   if (nid == 0xffff || nid >= nr_node_ids)
>   nid = NUMA_NO_NODE;
> -out:
>   return nid;
>  }
> +/*
> + * Returns nid in the range [0..nr_node_ids], or -1 if no useful NUMA
> + * info is found.
> + */
> +static int associativity_to_nid(const __be32 *associativity)
> +{
> + int array_sz = of_read_number(associativity, 1);
> +
> + /* Skip the first element in the associativity array */
> + return __associativity_to_nid((associativity + 1), array_sz);
> +}
>  
>  static int __cpu_form2_relative_distance(__be32 *cpu1_assoc, __be32 
> *cpu2_assoc)
>  {
> @@ -295,33 +305,39 @@ int of_node_to_nid(struct device_node *device)
>  }
>  EXPORT_SYMBOL(of_node_to_nid);
>  
> -static void __initialize_form1_numa_distance(const __be32 *associativity)
> +static void __initialize_form1_numa_distance(const __be32 *associativity,
> +  int max_array_sz)
>  {
>   int i, nid;
>  
>   if (affinity_form != FORM1_AFFINITY)
>   return;
>  
> - nid = associativity_to_nid(associativity);
> + nid = __associativity_to_nid(associativity, max_array_sz);
>   if (nid != NUMA_NO_NODE) {
>   for (i = 0; i < distance_ref_points_depth; i++) {
>   const __be32 *entry;
> + int index = be32_to_cpu(distance_ref_points[i]) - 1;
> +
> + /*
> +  * broken hierarchy, return with broken distance table
> +  */
> + if (WARN(index >= max_array_sz, "Broken 
> ibm,associativity property"))
> + return;
>  
> - entry = 
> &associativity[be32_to_cpu(distance_ref_points[i])];
> + entry = &associativity[index];
>   distance_lookup_table[nid][i] = of_read_number(entry, 
> 1);
>   }
>   }
>  }
>  
> -static void initialize_form1_numa_distance(struct device_node *node)
> +static void initialize_form1_numa_distance(const __be32 *associativity)
>  {
> - const __be32 *associativity;
> -
> - associativity = of_get_associativity(node);
> - if (!associativity)
> - return;
> + int array_sz;
>  
> - __initialize_form1_numa_distance(associativity);
> + array_sz = of_read_number(associativity, 1);
> + /* Skip the first element in the associativity array */
> + __initialize_form1_numa_distance(associativity + 1, array_sz);
>  }
>  
>  /*
> @@ -334,7 +350,13 @@ void update_numa_distance(struct device_node *node)
>   if (affinity_form == FORM0_AFFINITY)
>   return;
>   else if (affinity_form == FORM1_AFFINITY) {
> - initialize_form1_numa_distance(node);
> + const __be32 *associativity;
> +
> + associativity = of_get_associativity(node);
> + if (!associativity)
> + return;
> +
> + initialize_form1_numa_distance(associativity);
>   return;
>   }
>  
> @@ -586,27 +608,17 @@ static int get_nid_and_numa_distance(struct drmem_lmb 
> *lmb)
>  
>   if (primary_domain_index <= aa.array_sz &&
>   !(lmb->flags &

Re: [PATCH v2] scripts/Makefile.clang: default to LLVM_IAS=1

2021-08-09 Thread Masahiro Yamada
On Sat, Aug 7, 2021 at 4:53 AM Nathan Chancellor  wrote:
>
> On Fri, Aug 06, 2021 at 10:27:01AM -0700, Nick Desaulniers wrote:
> > LLVM_IAS=1 controls enabling clang's integrated assembler via
> > -integrated-as. This was an explicit opt in until we could enable
> > assembler support in Clang for more architecures. Now we have support
> > and CI coverage of LLVM_IAS=1 for all architecures except a few more
> > bugs affecting s390 and powerpc.
>
> The powerpc and s390 folks have been testing with clang, I think they
> should have been on CC for this change (done now).
>
> > This commit flips the default from opt in via LLVM_IAS=1 to opt out via
> > LLVM_IAS=0.  CI systems or developers that were previously doing builds
> > with CC=clang or LLVM=1 without explicitly setting LLVM_IAS must now
> > explicitly opt out via LLVM_IAS=0, otherwise they will be implicitly
> > opted-in.
> >
> > This finally shortens the command line invocation when cross compiling
> > with LLVM to simply:
> >
> > $ make ARCH=arm64 LLVM=1
> >
> > Signed-off-by: Nick Desaulniers 
>
> I am still not really sure how I feel about this. I would prefer not to
> break people's builds but I suppose this is inevitabile eventually.
>
> A little support matrix that I drafted up where based on ARCH and clang
> version for LLVM_IAS=1 support:
>
>  | 10.x | 11.x | 12.x | 13.x | 14.x |
> ARCH=arm |  NO  |  NO  |  NO  |  YES |  YES |
> ARCH=arm64   |  NO  |  YES |  YES |  YES |  YES |
> ARCH=i386|  YES |  YES |  YES |  YES |  YES |
> ARCH=mips*   |  YES |  YES |  YES |  YES |  YES |
> ARCH=powerpc |  NO  |  NO  |  NO  |  NO  |  NO  |
> ARCH=s390|  NO  |  NO  |  NO  |  NO  |  NO  |
> ARCH=x86_64  |  NO  |  YES |  YES |  YES |  YES |
>
> The main issue that I have with this change is that all of these
> architectures work fine with CC=clang and their build commands that used
> to work fine will not with this change, as they will have to specify
> LLVM_IAS=0. I think that making this change for LLVM=1 makes sense but
> changing the default for just CC=clang feels like a bit much at this
> point in time. I would love to hear from others on this though, I am not
> going to object much further than this.
>
> Regardless of that concern, this patch does what it says so:
>
> Reviewed-by: Nathan Chancellor 


Applied to linux-kbuild.
Thanks.


-- 
Best Regards
Masahiro Yamada


Re: [PATCH 00/11] Implement generic prot_guest_has() helper function

2021-08-09 Thread Tom Lendacky
On 8/8/21 8:41 PM, Kuppuswamy, Sathyanarayanan wrote:
> Hi Tom,
> 
> On 7/27/21 3:26 PM, Tom Lendacky wrote:
>> This patch series provides a generic helper function, prot_guest_has(),
>> to replace the sme_active(), sev_active(), sev_es_active() and
>> mem_encrypt_active() functions.
>>
>> It is expected that as new protected virtualization technologies are
>> added to the kernel, they can all be covered by a single function call
>> instead of a collection of specific function calls all called from the
>> same locations.
>>
>> The powerpc and s390 patches have been compile tested only. Can the
>> folks copied on this series verify that nothing breaks for them.
> 
> With this patch set, select ARCH_HAS_PROTECTED_GUEST and set
> CONFIG_AMD_MEM_ENCRYPT=n, creates following error.
> 
> ld: arch/x86/mm/ioremap.o: in function `early_memremap_is_setup_data':
> arch/x86/mm/ioremap.c:672: undefined reference to `early_memremap_decrypted'
> 
> It looks like early_memremap_is_setup_data() is not protected with
> appropriate config.

Ok, thanks for finding that. I'll fix that.

Thanks,
Tom

> 
> 
>>
>> Cc: Andi Kleen 
>> Cc: Andy Lutomirski 
>> Cc: Ard Biesheuvel 
>> Cc: Baoquan He 
>> Cc: Benjamin Herrenschmidt 
>> Cc: Borislav Petkov 
>> Cc: Christian Borntraeger 
>> Cc: Daniel Vetter 
>> Cc: Dave Hansen 
>> Cc: Dave Young 
>> Cc: David Airlie 
>> Cc: Heiko Carstens 
>> Cc: Ingo Molnar 
>> Cc: Joerg Roedel 
>> Cc: Maarten Lankhorst 
>> Cc: Maxime Ripard 
>> Cc: Michael Ellerman 
>> Cc: Paul Mackerras 
>> Cc: Peter Zijlstra 
>> Cc: Thomas Gleixner 
>> Cc: Thomas Zimmermann 
>> Cc: Vasily Gorbik 
>> Cc: VMware Graphics 
>> Cc: Will Deacon 
>>
>> ---
>>
>> Patches based on:
>>   
>> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fgit.kernel.org%2Fpub%2Fscm%2Flinux%2Fkernel%2Fgit%2Ftip%2Ftip.git&data=04%7C01%7Cthomas.lendacky%40amd.com%7C563b5e30a3254f6739aa08d95ad6e242%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637640701228434514%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C1000&sdata=vx9v4EmYqVTsJ7KSr97gQaBWJ%2Fq%2BE9NOzXMhe3Fp7T8%3D&reserved=0
>> master
>>    commit 79e920060fa7 ("Merge branch 'WIP/fixes'")
>>
>> Tom Lendacky (11):
>>    mm: Introduce a function to check for virtualization protection
>>  features
>>    x86/sev: Add an x86 version of prot_guest_has()
>>    powerpc/pseries/svm: Add a powerpc version of prot_guest_has()
>>    x86/sme: Replace occurrences of sme_active() with prot_guest_has()
>>    x86/sev: Replace occurrences of sev_active() with prot_guest_has()
>>    x86/sev: Replace occurrences of sev_es_active() with prot_guest_has()
>>    treewide: Replace the use of mem_encrypt_active() with
>>  prot_guest_has()
>>    mm: Remove the now unused mem_encrypt_active() function
>>    x86/sev: Remove the now unused mem_encrypt_active() function
>>    powerpc/pseries/svm: Remove the now unused mem_encrypt_active()
>>  function
>>    s390/mm: Remove the now unused mem_encrypt_active() function
>>
>>   arch/Kconfig   |  3 ++
>>   arch/powerpc/include/asm/mem_encrypt.h |  5 --
>>   arch/powerpc/include/asm/protected_guest.h | 30 +++
>>   arch/powerpc/platforms/pseries/Kconfig |  1 +
>>   arch/s390/include/asm/mem_encrypt.h    |  2 -
>>   arch/x86/Kconfig   |  1 +
>>   arch/x86/include/asm/kexec.h   |  2 +-
>>   arch/x86/include/asm/mem_encrypt.h | 13 +
>>   arch/x86/include/asm/protected_guest.h | 27 ++
>>   arch/x86/kernel/crash_dump_64.c    |  4 +-
>>   arch/x86/kernel/head64.c   |  4 +-
>>   arch/x86/kernel/kvm.c  |  3 +-
>>   arch/x86/kernel/kvmclock.c |  4 +-
>>   arch/x86/kernel/machine_kexec_64.c | 19 +++
>>   arch/x86/kernel/pci-swiotlb.c  |  9 ++--
>>   arch/x86/kernel/relocate_kernel_64.S   |  2 +-
>>   arch/x86/kernel/sev.c  |  6 +--
>>   arch/x86/kvm/svm/svm.c |  3 +-
>>   arch/x86/mm/ioremap.c  | 16 +++---
>>   arch/x86/mm/mem_encrypt.c  | 60 +++---
>>   arch/x86/mm/mem_encrypt_identity.c |  3 +-
>>   arch/x86/mm/pat/set_memory.c   |  3 +-
>>   arch/x86/platform/efi/efi_64.c |  9 ++--
>>   arch/x86/realmode/init.c   |  8 +--
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c    |  4 +-
>>   drivers/gpu/drm/drm_cache.c    |  4 +-
>>   drivers/gpu/drm/vmwgfx/vmwgfx_drv.c    |  4 +-
>>   drivers/gpu/drm/vmwgfx/vmwgfx_msg.c    |  6 +--
>>   drivers/iommu/amd/init.c   |  7 +--
>>   drivers/iommu/amd/iommu.c  |  3 +-
>>   drivers/iommu/amd/iommu_v2.c   |  3 +-
>>   drivers/iommu/iommu.c  |  3 +-
>>   fs/proc/vmcore.c   |  6 +--
>>   include/linux/mem_encrypt.h    |  4 --
>>   incl

Re: [PATCH 06/11] x86/sev: Replace occurrences of sev_es_active() with prot_guest_has()

2021-08-09 Thread Kuppuswamy, Sathyanarayanan




On 8/9/21 2:59 PM, Tom Lendacky wrote:

Not sure how TDX will handle AP booting, are you sure it needs this
special setup as well? Otherwise a check for SEV-ES would be better
instead of the generic PATTR_GUEST_PROT_STATE.

Yes, I'm not sure either. I figure that change can be made, if needed, as
part of the TDX support.


We don't plan to set PROT_STATE. So it does not affect TDX.
For SMP, we use MADT ACPI table for AP booting.

--
Sathyanarayanan Kuppuswamy
Linux Kernel Developer


Re: [PATCH 07/11] treewide: Replace the use of mem_encrypt_active() with prot_guest_has()

2021-08-09 Thread Tom Lendacky
On 8/2/21 7:42 AM, Christophe Leroy wrote:
> 
> 
> Le 28/07/2021 à 00:26, Tom Lendacky a écrit :
>> Replace occurrences of mem_encrypt_active() with calls to prot_guest_has()
>> with the PATTR_MEM_ENCRYPT attribute.
> 
> 
> What about
> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fpatchwork.ozlabs.org%2Fproject%2Flinuxppc-dev%2Fpatch%2F20210730114231.23445-1-will%40kernel.org%2F&data=04%7C01%7Cthomas.lendacky%40amd.com%7C1198d62463e04a27be5908d955b30433%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637635049667233612%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C1000&sdata=Erpu4Du05sVYkYuAfTkXdLvq48%2FlfLS2q%2FZW8DG3tFw%3D&reserved=0>
>  ?

Ah, looks like that just went into the PPC tree and isn't part of the tip
tree. I'll have to look into how to handle that one.

Thanks,
Tom

> 
> Christophe
> 
> 
>>
>> Cc: Thomas Gleixner 
>> Cc: Ingo Molnar 
>> Cc: Borislav Petkov 
>> Cc: Dave Hansen 
>> Cc: Andy Lutomirski 
>> Cc: Peter Zijlstra 
>> Cc: David Airlie 
>> Cc: Daniel Vetter 
>> Cc: Maarten Lankhorst 
>> Cc: Maxime Ripard 
>> Cc: Thomas Zimmermann 
>> Cc: VMware Graphics 
>> Cc: Joerg Roedel 
>> Cc: Will Deacon 
>> Cc: Dave Young 
>> Cc: Baoquan He 
>> Signed-off-by: Tom Lendacky 
>> ---
>>   arch/x86/kernel/head64.c    | 4 ++--
>>   arch/x86/mm/ioremap.c   | 4 ++--
>>   arch/x86/mm/mem_encrypt.c   | 5 ++---
>>   arch/x86/mm/pat/set_memory.c    | 3 ++-
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 4 +++-
>>   drivers/gpu/drm/drm_cache.c | 4 ++--
>>   drivers/gpu/drm/vmwgfx/vmwgfx_drv.c | 4 ++--
>>   drivers/gpu/drm/vmwgfx/vmwgfx_msg.c | 6 +++---
>>   drivers/iommu/amd/iommu.c   | 3 ++-
>>   drivers/iommu/amd/iommu_v2.c    | 3 ++-
>>   drivers/iommu/iommu.c   | 3 ++-
>>   fs/proc/vmcore.c    | 6 +++---
>>   kernel/dma/swiotlb.c    | 4 ++--
>>   13 files changed, 29 insertions(+), 24 deletions(-)
>>
>> diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
>> index de01903c3735..cafed6456d45 100644
>> --- a/arch/x86/kernel/head64.c
>> +++ b/arch/x86/kernel/head64.c
>> @@ -19,7 +19,7 @@
>>   #include 
>>   #include 
>>   #include 
>> -#include 
>> +#include 
>>   #include 
>>     #include 
>> @@ -285,7 +285,7 @@ unsigned long __head __startup_64(unsigned long
>> physaddr,
>>    * there is no need to zero it after changing the memory encryption
>>    * attribute.
>>    */
>> -    if (mem_encrypt_active()) {
>> +    if (prot_guest_has(PATTR_MEM_ENCRYPT)) {
>>   vaddr = (unsigned long)__start_bss_decrypted;
>>   vaddr_end = (unsigned long)__end_bss_decrypted;
>>   for (; vaddr < vaddr_end; vaddr += PMD_SIZE) {
>> diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
>> index 0f2d5ace5986..5e1c1f5cbbe8 100644
>> --- a/arch/x86/mm/ioremap.c
>> +++ b/arch/x86/mm/ioremap.c
>> @@ -693,7 +693,7 @@ static bool __init
>> early_memremap_is_setup_data(resource_size_t phys_addr,
>>   bool arch_memremap_can_ram_remap(resource_size_t phys_addr, unsigned
>> long size,
>>    unsigned long flags)
>>   {
>> -    if (!mem_encrypt_active())
>> +    if (!prot_guest_has(PATTR_MEM_ENCRYPT))
>>   return true;
>>     if (flags & MEMREMAP_ENC)
>> @@ -723,7 +723,7 @@ pgprot_t __init
>> early_memremap_pgprot_adjust(resource_size_t phys_addr,
>>   {
>>   bool encrypted_prot;
>>   -    if (!mem_encrypt_active())
>> +    if (!prot_guest_has(PATTR_MEM_ENCRYPT))
>>   return prot;
>>     encrypted_prot = true;
>> diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c
>> index 451de8e84fce..0f1533dbe81c 100644
>> --- a/arch/x86/mm/mem_encrypt.c
>> +++ b/arch/x86/mm/mem_encrypt.c
>> @@ -364,8 +364,7 @@ int __init early_set_memory_encrypted(unsigned long
>> vaddr, unsigned long size)
>>   /*
>>    * SME and SEV are very similar but they are not the same, so there are
>>    * times that the kernel will need to distinguish between SME and SEV.
>> The
>> - * sme_active() and sev_active() functions are used for this.  When a
>> - * distinction isn't needed, the mem_encrypt_active() function can be
>> used.
>> + * sme_active() and sev_active() functions are used for this.
>>    *
>>    * The trampoline code is a good example for this requirement.  Before
>>    * paging is activated, SME will access all memory as decrypted, but SEV
>> @@ -451,7 +450,7 @@ void __init mem_encrypt_free_decrypted_mem(void)
>>    * The unused memory range was mapped decrypted, change the
>> encryption
>>    * attribute from decrypted to encrypted before freeing it.
>>    */
>> -    if (mem_encrypt_active()) {
>> +    if (sme_me_mask) {
>>   r = set_memory_encrypted(vaddr, npages);
>>   if (r) {
>>   pr_warn("failed to free unused decrypted pages\n");
>> diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c

Re: [PATCH 06/11] x86/sev: Replace occurrences of sev_es_active() with prot_guest_has()

2021-08-09 Thread Tom Lendacky
On 8/2/21 5:45 AM, Joerg Roedel wrote:
> On Tue, Jul 27, 2021 at 05:26:09PM -0500, Tom Lendacky wrote:
>> @@ -48,7 +47,7 @@ static void sme_sev_setup_real_mode(struct 
>> trampoline_header *th)
>>  if (prot_guest_has(PATTR_HOST_MEM_ENCRYPT))
>>  th->flags |= TH_FLAGS_SME_ACTIVE;
>>  
>> -if (sev_es_active()) {
>> +if (prot_guest_has(PATTR_GUEST_PROT_STATE)) {
>>  /*
>>   * Skip the call to verify_cpu() in secondary_startup_64 as it
>>   * will cause #VC exceptions when the AP can't handle them yet.
> 
> Not sure how TDX will handle AP booting, are you sure it needs this
> special setup as well? Otherwise a check for SEV-ES would be better
> instead of the generic PATTR_GUEST_PROT_STATE.

Yes, I'm not sure either. I figure that change can be made, if needed, as
part of the TDX support.

Thanks,
Tom

> 
> Regards,
> 
> Joerg
> 


Re: [PATCH 07/11] treewide: Replace the use of mem_encrypt_active() with prot_guest_has()

2021-08-09 Thread Tom Lendacky
On 7/30/21 5:34 PM, Sean Christopherson wrote:
> On Tue, Jul 27, 2021, Tom Lendacky wrote:
>> @@ -451,7 +450,7 @@ void __init mem_encrypt_free_decrypted_mem(void)
>>   * The unused memory range was mapped decrypted, change the encryption
>>   * attribute from decrypted to encrypted before freeing it.
>>   */
>> -if (mem_encrypt_active()) {
>> +if (sme_me_mask) {
> 
> Any reason this uses sme_me_mask?  The helper it calls, 
> __set_memory_enc_dec(),
> uses prot_guest_has(PATTR_MEM_ENCRYPT) so I assume it's available?

Probably just a slip on my part. I was debating at one point calling the
helper vs. referencing the variables/functions directly in the
mem_encrypt.c file.

Thanks,
Tom

> 
>>  r = set_memory_encrypted(vaddr, npages);
>>  if (r) {
>>  pr_warn("failed to free unused decrypted pages\n");
> 


[PATCH v2 7/9] usb: phy: fsl-usb: add IRQ check

2021-08-09 Thread Sergey Shtylyov
The driver neglects to check the result of platform_get_irq()'s call and
blithely passes the negative error codes to request_irq() (which takes
*unsigned* IRQ #), causing it to fail with -EINVAL, overriding an original
error code. Stop calling request_irq() with the invalid IRQ #s.

Fixes: 0807c500a1a6 ("USB: add Freescale USB OTG Transceiver driver")
Signed-off-by: Sergey Shtylyov 

---
 drivers/usb/phy/phy-fsl-usb.c |2 ++
 1 file changed, 2 insertions(+)

Index: usb/drivers/usb/phy/phy-fsl-usb.c
===
--- usb.orig/drivers/usb/phy/phy-fsl-usb.c
+++ usb/drivers/usb/phy/phy-fsl-usb.c
@@ -873,6 +873,8 @@ int usb_otg_start(struct platform_device
 
/* request irq */
p_otg->irq = platform_get_irq(pdev, 0);
+   if (p_otg->irq < 0)
+   return p_otg->irq;
status = request_irq(p_otg->irq, fsl_otg_isr,
IRQF_SHARED, driver_name, p_otg);
if (status) {


Re: [PATCH v2 0/6] PCI: Drop duplicated tracking of a pci_dev's bound driver

2021-08-09 Thread Bjorn Helgaas
On Sat, Aug 07, 2021 at 11:26:45AM +0200, Uwe Kleine-König wrote:
> On Fri, Aug 06, 2021 at 04:24:52PM -0500, Bjorn Helgaas wrote:
> > On Fri, Aug 06, 2021 at 08:46:23AM +0200, Uwe Kleine-König wrote:
> > > On Thu, Aug 05, 2021 at 06:42:34PM -0500, Bjorn Helgaas wrote:
> > 
> > > > I looked at all the bus_type.probe() methods, it looks like pci_dev is
> > > > not the only offender here.  At least the following also have a driver
> > > > pointer in the device struct:
> > > > 
> > > >   parisc_device.driver
> > > >   acpi_device.driver
> > > >   dio_dev.driver
> > > >   hid_device.driver
> > > >   pci_dev.driver
> > > >   pnp_dev.driver
> > > >   rio_dev.driver
> > > >   zorro_dev.driver
> > > 
> > > Right, when I converted zorro_dev it was pointed out that the code was
> > > copied from pci and the latter has the same construct. :-)
> > > See
> > > https://lore.kernel.org/r/20210730191035.1455248-5-u.kleine-koe...@pengutronix.de
> > > for the patch, I don't find where pci was pointed out, maybe it was on
> > > irc only.
> > 
> > Oh, thanks!  I looked to see if you'd done something similar
> > elsewhere, but I missed this one.
> > 
> > > > Looking through the places that care about pci_dev.driver (the ones
> > > > updated by patch 5/6), many of them are ... a little dubious to begin
> > > > with.  A few need the "struct pci_error_handlers *err_handler"
> > > > pointer, so that's probably legitimate.  But many just need a name,
> > > > and should probably be using dev_driver_string() instead.
> > > 
> > > Yeah, I considered adding a function to get the driver name from a
> > > pci_dev and a function to get the error handlers. Maybe it's an idea to
> > > introduce these two and then use to_pci_driver(pdev->dev.driver) for the
> > > few remaining users? Maybe doing that on top of my current series makes
> > > sense to have a clean switch from pdev->driver to pdev->dev.driver?!
> > 
> > I'd propose using dev_driver_string() for these places:
> > 
> >   eeh_driver_name() (could change callers to use dev_driver_string())
> >   bcma_host_pci_probe()
> >   qm_alloc_uacce()
> >   hns3_get_drvinfo()
> >   prestera_pci_probe()
> >   mlxsw_pci_probe()
> >   nfp_get_drvinfo()
> >   ssb_pcihost_probe()
> 
> So the idea is:
> 
>   PCI: Simplify pci_device_remove()
>   PCI: Drop useless check from pci_device_probe()
>   xen/pci: Drop some checks that are always true
> 
> are kept as is as preparation. (Do you want to take them from this v2,
> or should I include them again in v3?)

Easiest if you include them until we merge the series.

> Then convert the list of functions above to use dev_driver_string() in a
> 4th patch.
> 
> > The use in mpt_device_driver_register() looks unnecessary: it's only
> > to get a struct pci_device_id *, which is passed to ->probe()
> > functions that don't need it.
> 
> This is patch #5.
> 
> > The use in adf_enable_aer() looks wrong: it sets the err_handler
> > pointer in one of the adf_driver structs.  I think those structs
> > should be basically immutable, and the drivers that call
> > adf_enable_aer() from their .probe() methods should set
> > ".err_handler = &adf_err_handler" in their static adf_driver
> > definitions instead.
> 
> I don't understand that one without some research, probably this yields
> at least one patch.

Yeah, it's a little messy because you'd have to make adf_err_handler
non-static and add an extern for it.  Sample below.

> > I think that basically leaves these:
> > 
> >   uncore_pci_probe() # .id_table, custom driver "registration"
> >   match_id() # .id_table, arch/x86/kernel/probe_roms.c
> >   xhci_pci_quirks()  # .id_table
> >   pci_error_handlers()   # roll-your-own AER handling, 
> > drivers/misc/cxl/guest.c
> > 
> > I think it would be fine to use to_pci_driver(pdev->dev.driver) for
> > these few.
> 
> Converting these will be patch 7 then and patch 8 can then drop the
> duplicated handling.
> 
> Sounds reasonable?

Sounds good to me.  Thanks for working on this!

Bjorn


diff --git a/drivers/crypto/qat/qat_4xxx/adf_drv.c 
b/drivers/crypto/qat/qat_4xxx/adf_drv.c
index a8805c815d16..75e6c5540523 100644
--- a/drivers/crypto/qat/qat_4xxx/adf_drv.c
+++ b/drivers/crypto/qat/qat_4xxx/adf_drv.c
@@ -310,6 +310,7 @@ static struct pci_driver adf_driver = {
.probe = adf_probe,
.remove = adf_remove,
.sriov_configure = adf_sriov_configure,
+   .err_handler = adf_err_handler,
 };
 
 module_pci_driver(adf_driver);
diff --git a/drivers/crypto/qat/qat_common/adf_aer.c 
b/drivers/crypto/qat/qat_common/adf_aer.c
index d2ae293d0df6..701c3c5f8b9b 100644
--- a/drivers/crypto/qat/qat_common/adf_aer.c
+++ b/drivers/crypto/qat/qat_common/adf_aer.c
@@ -166,7 +166,7 @@ static void adf_resume(struct pci_dev *pdev)
dev_info(&pdev->dev, "Device is up and running\n");
 }
 
-static const struct pci_error_handlers adf_err_handler = {
+const struct pci_error_handlers adf_err_handler = {
.error_detected = adf_error_de

Re: [PATCH v7 0/6] Add support for FORM2 associativity

2021-08-09 Thread Daniel Henrique Barboza




On 8/9/21 2:24 AM, Aneesh Kumar K.V wrote:

Form2 associativity adds a much more flexible NUMA topology layout
than what is provided by Form1. More details can be found in patch 7.

$ numactl -H
...
node distances:
node   0   1   2   3
   0:  10  11  222  33
   1:  44  10  55  66
   2:  77  88  10  99
   3:  101  121  132  10
$

After DAX kmem memory add
# numactl -H
available: 5 nodes (0-4)
...
node distances:
node   0   1   2   3   4
   0:  10  11  222  33  240
   1:  44  10  55  66  255
   2:  77  88  10  99  255
   3:  101  121  132  10  230
   4:  255  255  255  230  10


PAPR SCM now use the numa distance details to find the numa_node and target_node
for the device.

kvaneesh@ubuntu-guest:~$ ndctl  list -N -v
[
   {
 "dev":"namespace0.0",
 "mode":"devdax",
 "map":"dev",
 "size":1071644672,
 "uuid":"d333d867-3f57-44c8-b386-d4d3abdc2bf2",
 "raw_uuid":"915361ad-fe6a-42dd-848f-d6dc9f5af362",
 "daxregion":{
   "id":0,
   "size":1071644672,
   "devices":[
 {
   "chardev":"dax0.0",
   "size":1071644672,
   "target_node":4,
   "mode":"devdax"
 }
   ]
 },
 "align":2097152,
 "numa_node":3
   }
]
kvaneesh@ubuntu-guest:~$


The above output is with a Qemu command line

-numa node,nodeid=4 \
-numa dist,src=0,dst=1,val=11 -numa dist,src=0,dst=2,val=222 -numa 
dist,src=0,dst=3,val=33 -numa dist,src=0,dst=4,val=240 \
-numa dist,src=1,dst=0,val=44 -numa dist,src=1,dst=2,val=55 -numa 
dist,src=1,dst=3,val=66 -numa dist,src=1,dst=4,val=255 \
-numa dist,src=2,dst=0,val=77 -numa dist,src=2,dst=1,val=88 -numa 
dist,src=2,dst=3,val=99 -numa dist,src=2,dst=4,val=255 \
-numa dist,src=3,dst=0,val=101 -numa dist,src=3,dst=1,val=121 -numa 
dist,src=3,dst=2,val=132 -numa dist,src=3,dst=4,val=230 \
-numa dist,src=4,dst=0,val=255 -numa dist,src=4,dst=1,val=255 -numa 
dist,src=4,dst=2,val=255 -numa dist,src=4,dst=3,val=230 \
-object 
memory-backend-file,id=memnvdimm1,prealloc=yes,mem-path=$PMEM_DISK,share=yes,size=${PMEM_SIZE}
  \
-device 
nvdimm,label-size=128K,memdev=memnvdimm1,id=nvdimm1,slot=4,uuid=72511b67-0b3b-42fd-8d1d-5be3cae8bcaa,node=4

Qemu changes can be found at 
https://lore.kernel.org/qemu-devel/20210616011944.2996399-1-danielhb...@gmail.com/



Series tested with the aforementioned QEMU build. All patches:


Tested-by: Daniel Henrique Barboza 



Changes from v6:
* Address review feedback

Changes from v5:
* Fix build error reported by kernel test robot
* Address review feedback

Changes from v4:
* Drop DLPAR related device tree property for now because both Qemu and PowerVM
   will provide the distance details of all possible NUMA nodes during boot.
* Rework numa distance code based on review feedback.

Changes from v3:
* Drop PAPR SCM specific changes and depend completely on NUMA distance 
information.

Changes from v2:
* Add nvdimm list to Cc:
* update PATCH 8 commit message.

Changes from v1:
* Update FORM2 documentation.
* rename max_domain_index to max_associativity_domain_index


Aneesh Kumar K.V (6):
   powerpc/pseries: rename min_common_depth to primary_domain_index
   powerpc/pseries: Rename TYPE1_AFFINITY to FORM1_AFFINITY
   powerpc/pseries: Consolidate different NUMA distance update code paths
   powerpc/pseries: Add a helper for form1 cpu distance
   powerpc/pseries: Add support for FORM2 associativity
   powerpc/pseries: Consolidate form1 distance initialization into a
 helper

  Documentation/powerpc/associativity.rst   | 103 +
  arch/powerpc/include/asm/firmware.h   |   7 +-
  arch/powerpc/include/asm/prom.h   |   3 +-
  arch/powerpc/include/asm/topology.h   |   6 +-
  arch/powerpc/kernel/prom_init.c   |   3 +-
  arch/powerpc/mm/numa.c| 433 ++
  arch/powerpc/platforms/pseries/firmware.c |   3 +-
  arch/powerpc/platforms/pseries/hotplug-cpu.c  |   2 +
  .../platforms/pseries/hotplug-memory.c|   2 +
  arch/powerpc/platforms/pseries/lpar.c |   4 +-
  10 files changed, 455 insertions(+), 111 deletions(-)
  create mode 100644 Documentation/powerpc/associativity.rst



Re: [PATCH v3 00/21] .map_sg() error cleanup

2021-08-09 Thread Christoph Hellwig
Thanks,

I've applied this to the dma-mapping tree with a few minor cosmetic
tweaks.


Re: [PATCH v2 1/2] sched/topology: Skip updating masks for non-online nodes

2021-08-09 Thread Valentin Schneider
On 09/08/21 12:22, Srikar Dronamraju wrote:
> * Valentin Schneider  [2021-08-08 16:56:47]:
>> Wait, doesn't the distance matrix (without any offline node) say
>>
>>   distance(0, 3) == 40
>>
>> ? We should have at the very least:
>>
>>   node   0   1   2   3
>> 0:  10  20  ??  40
>> 1:  20  20  ??  40
>> 2:  ??  ??  ??  ??
>> 3:  40  40  ??  10
>>
>
> Before onlining node 3 and CPU 3 (node/CPU 0 and 1 are already online)
> Note: Node 2-7 and CPU 2-7 are still offline.
>
> node   0   1   2   3
>   0:  10  20  40  10
>   1:  20  20  40  10
>   2:  40  40  10  10
>   3:  10  10  10  10
>
> NODE->mask(0) == 0
> NODE->mask(1) == 1
> NODE->mask(2) == 0
> NODE->mask(3) == 0
>
> Note: This is with updating Node 2's distance as 40 for figuring out
> the number of numa levels. Since we have all possible distances, we
> dont update Node 3 distance, so it will be as if its local to node 0.
>
> Now when Node 3 and CPU 3 are onlined
> Note: Node 2, 3-7 and CPU 2, 3-7 are still offline.
>
> node   0   1   2   3
>   0:  10  20  40  40
>   1:  20  20  40  40
>   2:  40  40  10  40
>   3:  40  40  40  10
>
> NODE->mask(0) == 0
> NODE->mask(1) == 1
> NODE->mask(2) == 0
> NODE->mask(3) == 0,3
>
> CPU 0 continues to be part of Node->mask(3) because when we online and
> we find the right distance, there is no API to reset the numa mask of
> 3 to remove CPU 0 from the numa masks.
>
> If we had an API to clear/set sched_domains_numa_masks[node][] when
> the node state changes, we could probably plug-in to clear/set the
> node masks whenever node state changes.
>

Gotcha, this is now coming back to me...

[...]

>> Ok, so it looks like we really can't do without that part - even if we get
>> "sensible" distance values for the online nodes, we can't divine values for
>> the offline ones.
>>
>
> Yes
>

Argh, while your approach does take care of the masks, it leaves
sched_numa_topology_type unchanged. You *can* force an update of it, but
yuck :(

I got to the below...

---
From: Srikar Dronamraju 
Date: Thu, 1 Jul 2021 09:45:51 +0530
Subject: [PATCH 1/1] sched/topology: Skip updating masks for non-online nodes

The scheduler currently expects NUMA node distances to be stable from init
onwards, and as a consequence builds the related data structures
once-and-for-all at init (see sched_init_numa()).

Unfortunately, on some architectures node distance is unreliable for
offline nodes and may very well change upon onlining.

Skip over offline nodes during sched_init_numa(). Track nodes that have
been onlined at least once, and trigger a build of a node's NUMA masks when
it is first onlined post-init.

Reported-by: Geetika Moolchandani 
Signed-off-by: Srikar Dronamraju 
Signed-off-by: Valentin Schneider 
---
 kernel/sched/topology.c | 65 +
 1 file changed, 65 insertions(+)

diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index b77ad49dc14f..cba95793a9b7 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1482,6 +1482,8 @@ int   sched_max_numa_distance;
 static int *sched_domains_numa_distance;
 static struct cpumask  ***sched_domains_numa_masks;
 int __read_mostly  node_reclaim_distance = RECLAIM_DISTANCE;
+
+static unsigned long __read_mostly *sched_numa_onlined_nodes;
 #endif
 
 /*
@@ -1833,6 +1835,16 @@ void sched_init_numa(void)
sched_domains_numa_masks[i][j] = mask;
 
for_each_node(k) {
+   /*
+* Distance information can be unreliable for
+* offline nodes, defer building the node
+* masks to its bringup.
+* This relies on all unique distance values
+* still being visible at init time.
+*/
+   if (!node_online(j))
+   continue;
+
if (sched_debug() && (node_distance(j, k) != 
node_distance(k, j)))
sched_numa_warn("Node-distance not 
symmetric");
 
@@ -1886,6 +1898,53 @@ void sched_init_numa(void)
sched_max_numa_distance = sched_domains_numa_distance[nr_levels - 1];
 
init_numa_topology_type();
+
+   sched_numa_onlined_nodes = bitmap_alloc(nr_node_ids, GFP_KERNEL);
+   if (!sched_numa_onlined_nodes)
+   return;
+
+   bitmap_zero(sched_numa_onlined_nodes, nr_node_ids);
+   for_each_online_node(i)
+   bitmap_set(sched_numa_onlined_nodes, i, 1);
+}
+
+void __sched_domains_numa_masks_set(unsigned int node)
+{
+   int i, j;
+
+   /*
+* NUMA masks are not built for offline nodes in sched_init_numa().
+* Thus, when a CPU of a never-onlined-before node gets plugged in,
+* adding that new CPU to the right NU