[PATCH v4 3/7] powerpc/pseries: Rename TYPE1_AFFINITY to FORM1_AFFINITY

2021-06-17 Thread Aneesh Kumar K.V
Also do related code cleanup that will allow adding FORM2_AFFINITY in
later patches. No functional change in this patch.
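
For context, the two models this rename distinguishes differ in how node
distance is computed: Form0 only knows local vs. remote, while Form1 doubles
the distance for each associativity level at which two nodes diverge. A
standalone C sketch mirroring the __node_distance() hunk below (node count and
table contents are invented for illustration):

#include <stdio.h>

#define LOCAL_DISTANCE  10
#define REMOTE_DISTANCE 20
#define NR_LEVELS       2  /* entries in ibm,associativity-reference-points */

/* Invented per-node domain ids at each reference-point level */
static int distance_lookup_table[3][NR_LEVELS] = {
        { 10, 100 },    /* node 0 */
        { 11, 100 },    /* node 1: differs from node 0 at level 0 only */
        { 12, 200 },    /* node 2: differs from node 0 at both levels */
};

static int node_distance(int a, int b, int form1)
{
        int i, distance = LOCAL_DISTANCE;

        if (!form1)     /* Form0: local vs. remote is all we know */
                return (a == b) ? LOCAL_DISTANCE : REMOTE_DISTANCE;

        /* Form1: double the distance for each non-matching level */
        for (i = 0; i < NR_LEVELS; i++) {
                if (distance_lookup_table[a][i] == distance_lookup_table[b][i])
                        break;
                distance *= 2;
        }
        return distance;
}

int main(void)
{
        printf("%d %d %d\n", node_distance(0, 0, 1),    /* 10 */
               node_distance(0, 1, 1),                  /* 20 */
               node_distance(0, 2, 1));                 /* 40 */
        return 0;
}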

Reviewed-by: David Gibson 
Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/include/asm/firmware.h   |  4 +--
 arch/powerpc/include/asm/prom.h   |  2 +-
 arch/powerpc/kernel/prom_init.c   |  2 +-
 arch/powerpc/mm/numa.c| 35 ++-
 arch/powerpc/platforms/pseries/firmware.c |  2 +-
 5 files changed, 26 insertions(+), 19 deletions(-)

diff --git a/arch/powerpc/include/asm/firmware.h 
b/arch/powerpc/include/asm/firmware.h
index 7604673787d6..60b631161360 100644
--- a/arch/powerpc/include/asm/firmware.h
+++ b/arch/powerpc/include/asm/firmware.h
@@ -44,7 +44,7 @@
 #define FW_FEATURE_OPAL		ASM_CONST(0x0000000010000000)
 #define FW_FEATURE_SET_MODE	ASM_CONST(0x0000000040000000)
 #define FW_FEATURE_BEST_ENERGY	ASM_CONST(0x0000000080000000)
-#define FW_FEATURE_TYPE1_AFFINITY ASM_CONST(0x0000000100000000)
+#define FW_FEATURE_FORM1_AFFINITY ASM_CONST(0x0000000100000000)
 #define FW_FEATURE_PRRN		ASM_CONST(0x0000000200000000)
 #define FW_FEATURE_DRMEM_V2	ASM_CONST(0x0000000400000000)
 #define FW_FEATURE_DRC_INFO	ASM_CONST(0x0000000800000000)
@@ -69,7 +69,7 @@ enum {
FW_FEATURE_SPLPAR | FW_FEATURE_LPAR |
FW_FEATURE_CMO | FW_FEATURE_VPHN | FW_FEATURE_XCMO |
FW_FEATURE_SET_MODE | FW_FEATURE_BEST_ENERGY |
-   FW_FEATURE_TYPE1_AFFINITY | FW_FEATURE_PRRN |
+   FW_FEATURE_FORM1_AFFINITY | FW_FEATURE_PRRN |
FW_FEATURE_HPT_RESIZE | FW_FEATURE_DRMEM_V2 |
FW_FEATURE_DRC_INFO | FW_FEATURE_BLOCK_REMOVE |
FW_FEATURE_PAPR_SCM | FW_FEATURE_ULTRAVISOR |
diff --git a/arch/powerpc/include/asm/prom.h b/arch/powerpc/include/asm/prom.h
index 324a13351749..df9fec9d232c 100644
--- a/arch/powerpc/include/asm/prom.h
+++ b/arch/powerpc/include/asm/prom.h
@@ -147,7 +147,7 @@ extern int of_read_drc_info_cell(struct property **prop,
 #define OV5_MSI		0x0201	/* PCIe/MSI support */
 #define OV5_CMO		0x0480	/* Cooperative Memory Overcommitment */
 #define OV5_XCMO	0x0440	/* Page Coalescing */
-#define OV5_TYPE1_AFFINITY 0x0580  /* Type 1 NUMA affinity */
+#define OV5_FORM1_AFFINITY 0x0580  /* FORM1 NUMA affinity */
 #define OV5_PRRN   0x0540  /* Platform Resource Reassignment */
 #define OV5_HP_EVT 0x0604  /* Hot Plug Event support */
 #define OV5_RESIZE_HPT 0x0601  /* Hash Page Table resizing */
diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c
index 41ed7e33d897..64b9593038a7 100644
--- a/arch/powerpc/kernel/prom_init.c
+++ b/arch/powerpc/kernel/prom_init.c
@@ -1070,7 +1070,7 @@ static const struct ibm_arch_vec 
ibm_architecture_vec_template __initconst = {
 #else
0,
 #endif
-   .associativity = OV5_FEAT(OV5_TYPE1_AFFINITY) | 
OV5_FEAT(OV5_PRRN),
+   .associativity = OV5_FEAT(OV5_FORM1_AFFINITY) | 
OV5_FEAT(OV5_PRRN),
.bin_opts = OV5_FEAT(OV5_RESIZE_HPT) | OV5_FEAT(OV5_HP_EVT),
.micro_checkpoint = 0,
.reserved0 = 0,
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 132813dd1a6c..0ec16999beef 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -53,7 +53,10 @@ EXPORT_SYMBOL(node_data);
 
 static int primary_domain_index;
 static int n_mem_addr_cells, n_mem_size_cells;
-static int form1_affinity;
+
+#define FORM0_AFFINITY 0
+#define FORM1_AFFINITY 1
+static int affinity_form;
 
 #define MAX_DISTANCE_REF_POINTS 4
 static int max_associativity_domain_index;
@@ -190,7 +193,7 @@ int __node_distance(int a, int b)
int i;
int distance = LOCAL_DISTANCE;
 
-   if (!form1_affinity)
+   if (affinity_form == FORM0_AFFINITY)
return ((a == b) ? LOCAL_DISTANCE : REMOTE_DISTANCE);
 
for (i = 0; i < max_associativity_domain_index; i++) {
@@ -210,7 +213,7 @@ static void initialize_distance_lookup_table(int nid,
 {
int i;
 
-   if (!form1_affinity)
+   if (affinity_form != FORM1_AFFINITY)
return;
 
for (i = 0; i < max_associativity_domain_index; i++) {
@@ -289,6 +292,17 @@ static int __init find_primary_domain_index(void)
int index;
struct device_node *root;
 
+   /*
+* Check for which form of affinity.
+*/
+   if (firmware_has_feature(FW_FEATURE_OPAL)) {
+   affinity_form = FORM1_AFFINITY;
+   } else if (firmware_has_feature(FW_FEATURE_FORM1_AFFINITY)) {
+   dbg("Using form 1 affinity\n");
+   affinity_form = FORM1_AFFINITY;
+   } else
+   affinity_form = FORM0_AFFINITY;
+
if (firmware_has_feature(FW_FEATURE_OPAL))
root = of_find_node_by_path("/ibm,opal");
else
@@ -318,23 

[PATCH v4 1/7] powerpc/pseries: rename min_common_depth to primary_domain_index

2021-06-17 Thread Aneesh Kumar K.V
No functional change in this patch.
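
For context (property contents invented): an ibm,associativity array is
length-prefixed, and the index being renamed selects which entry becomes the
Linux node id, mirroring associativity_to_nid() in the hunk below:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        /* Illustrative ibm,associativity cells, host-endian for brevity
         * (the kernel reads big-endian __be32 via of_read_number()) */
        uint32_t associativity[] = { 4, 0, 0, 0, 2 };
        uint32_t primary_domain_index = 4; /* from the reference points */
        int nid = -1;                      /* NUMA_NO_NODE */

        /* Cell 0 holds the number of entries that follow */
        if (associativity[0] >= primary_domain_index)
                nid = associativity[primary_domain_index];

        printf("nid = %d\n", nid);      /* prints: nid = 2 */
        return 0;
}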

Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/mm/numa.c | 38 +++---
 1 file changed, 19 insertions(+), 19 deletions(-)

diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index f2bf98bdcea2..8365b298ec48 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -51,7 +51,7 @@ EXPORT_SYMBOL(numa_cpu_lookup_table);
 EXPORT_SYMBOL(node_to_cpumask_map);
 EXPORT_SYMBOL(node_data);
 
-static int min_common_depth;
+static int primary_domain_index;
 static int n_mem_addr_cells, n_mem_size_cells;
 static int form1_affinity;
 
@@ -232,8 +232,8 @@ static int associativity_to_nid(const __be32 *associativity)
if (!numa_enabled)
goto out;
 
-   if (of_read_number(associativity, 1) >= min_common_depth)
-   nid = of_read_number(&associativity[min_common_depth], 1);
+   if (of_read_number(associativity, 1) >= primary_domain_index)
+   nid = of_read_number(&associativity[primary_domain_index], 1);

/* POWER4 LPAR uses 0xffff as invalid node */
if (nid == 0xffff || nid >= nr_node_ids)
@@ -284,9 +284,9 @@ int of_node_to_nid(struct device_node *device)
 }
 EXPORT_SYMBOL(of_node_to_nid);
 
-static int __init find_min_common_depth(void)
+static int __init find_primary_domain_index(void)
 {
-   int depth;
+   int index;
struct device_node *root;
 
if (firmware_has_feature(FW_FEATURE_OPAL))
@@ -326,7 +326,7 @@ static int __init find_min_common_depth(void)
}
 
if (form1_affinity) {
-   depth = of_read_number(distance_ref_points, 1);
+   index = of_read_number(distance_ref_points, 1);
} else {
if (distance_ref_points_depth < 2) {
printk(KERN_WARNING "NUMA: "
@@ -334,7 +334,7 @@ static int __init find_min_common_depth(void)
goto err;
}
 
-   depth = of_read_number(&distance_ref_points[1], 1);
+   index = of_read_number(&distance_ref_points[1], 1);
}
 
/*
@@ -348,7 +348,7 @@ static int __init find_min_common_depth(void)
}
 
of_node_put(root);
-   return depth;
+   return index;
 
 err:
of_node_put(root);
@@ -437,16 +437,16 @@ int of_drconf_to_nid_single(struct drmem_lmb *lmb)
int nid = default_nid;
int rc, index;
 
-   if ((min_common_depth < 0) || !numa_enabled)
+   if ((primary_domain_index < 0) || !numa_enabled)
return default_nid;
 
rc = of_get_assoc_arrays(&aa);
if (rc)
return default_nid;
 
-   if (min_common_depth <= aa.array_sz &&
+   if (primary_domain_index <= aa.array_sz &&
!(lmb->flags & DRCONF_MEM_AI_INVALID) && lmb->aa_index < 
aa.n_arrays) {
-   index = lmb->aa_index * aa.array_sz + min_common_depth - 1;
+   index = lmb->aa_index * aa.array_sz + primary_domain_index - 1;
nid = of_read_number(&aa.arrays[index], 1);

if (nid == 0xffff || nid >= nr_node_ids)
@@ -708,18 +708,18 @@ static int __init parse_numa_properties(void)
return -1;
}
 
-   min_common_depth = find_min_common_depth();
+   primary_domain_index = find_primary_domain_index();
 
-   if (min_common_depth < 0) {
+   if (primary_domain_index < 0) {
/*
-* if we fail to parse min_common_depth from device tree
+* if we fail to parse primary_domain_index from device tree
 * mark the numa disabled, boot with numa disabled.
 */
numa_enabled = false;
-   return min_common_depth;
+   return primary_domain_index;
}
 
-   dbg("NUMA associativity depth for CPU/Memory: %d\n", min_common_depth);
+   dbg("NUMA associativity depth for CPU/Memory: %d\n", 
primary_domain_index);
 
/*
 * Even though we connect cpus to numa domains later in SMP
@@ -919,14 +919,14 @@ static void __init find_possible_nodes(void)
goto out;
}
 
-   max_nodes = of_read_number(&domains[min_common_depth], 1);
+   max_nodes = of_read_number(&domains[primary_domain_index], 1);
for (i = 0; i < max_nodes; i++) {
if (!node_possible(i))
node_set(i, node_possible_map);
}
 
prop_length /= sizeof(int);
-   if (prop_length > min_common_depth + 2)
+   if (prop_length > primary_domain_index + 2)
coregroup_enabled = 1;
 
 out:
@@ -1259,7 +1259,7 @@ int cpu_to_coregroup_id(int cpu)
goto out;
 
index = of_read_number(associativity, 1);
-   if (index > min_common_depth + 1)
+   if (index > primary_domain_index + 1)
return of_read_number(&associativity[index - 1], 1);
 
 out:
-- 
2.31.1



[PATCH v4 2/7] powerpc/pseries: rename distance_ref_points_depth to max_associativity_domain_index

2021-06-17 Thread Aneesh Kumar K.V
No functional change in this patch.
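
For context: the variable being renamed holds the element count of
ibm,associativity-reference-points, converted from the byte length returned by
of_get_property() and capped at MAX_DISTANCE_REF_POINTS, as the hunks below
show. A trivial standalone illustration (property size invented):

#include <stdio.h>

#define MAX_DISTANCE_REF_POINTS 4

int main(void)
{
        /* e.g. ibm,associativity-reference-points = <0x4 0x2> is 8 bytes */
        int prop_len = 8;
        int max_associativity_domain_index = prop_len / (int)sizeof(int);

        /* Warn and cap, as find_primary_domain_index() does */
        if (max_associativity_domain_index > MAX_DISTANCE_REF_POINTS)
                max_associativity_domain_index = MAX_DISTANCE_REF_POINTS;

        printf("%d associativity levels\n", max_associativity_domain_index);
        return 0;
}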

Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/mm/numa.c | 20 ++--
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 8365b298ec48..132813dd1a6c 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -56,7 +56,7 @@ static int n_mem_addr_cells, n_mem_size_cells;
 static int form1_affinity;
 
 #define MAX_DISTANCE_REF_POINTS 4
-static int distance_ref_points_depth;
+static int max_associativity_domain_index;
 static const __be32 *distance_ref_points;
 static int distance_lookup_table[MAX_NUMNODES][MAX_DISTANCE_REF_POINTS];
 
@@ -169,7 +169,7 @@ int cpu_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc)
 
int i, index;
 
-   for (i = 0; i < distance_ref_points_depth; i++) {
+   for (i = 0; i < max_associativity_domain_index; i++) {
index = be32_to_cpu(distance_ref_points[i]);
if (cpu1_assoc[index] == cpu2_assoc[index])
break;
@@ -193,7 +193,7 @@ int __node_distance(int a, int b)
if (!form1_affinity)
return ((a == b) ? LOCAL_DISTANCE : REMOTE_DISTANCE);
 
-   for (i = 0; i < distance_ref_points_depth; i++) {
+   for (i = 0; i < max_associativity_domain_index; i++) {
if (distance_lookup_table[a][i] == distance_lookup_table[b][i])
break;
 
@@ -213,7 +213,7 @@ static void initialize_distance_lookup_table(int nid,
if (!form1_affinity)
return;
 
-   for (i = 0; i < distance_ref_points_depth; i++) {
+   for (i = 0; i < max_associativity_domain_index; i++) {
const __be32 *entry;
 
entry = &associativity[be32_to_cpu(distance_ref_points[i]) - 1];
@@ -240,7 +240,7 @@ static int associativity_to_nid(const __be32 *associativity)
nid = NUMA_NO_NODE;
 
if (nid > 0 &&
-   of_read_number(associativity, 1) >= distance_ref_points_depth) {
+   of_read_number(associativity, 1) >= 
max_associativity_domain_index) {
/*
 * Skip the length field and send start of associativity array
 */
@@ -310,14 +310,14 @@ static int __init find_primary_domain_index(void)
 */
distance_ref_points = of_get_property(root,
"ibm,associativity-reference-points",
-   &distance_ref_points_depth);
+   &max_associativity_domain_index);
 
if (!distance_ref_points) {
dbg("NUMA: ibm,associativity-reference-points not found.\n");
goto err;
}
 
-   distance_ref_points_depth /= sizeof(int);
+   max_associativity_domain_index /= sizeof(int);
 
if (firmware_has_feature(FW_FEATURE_OPAL) ||
firmware_has_feature(FW_FEATURE_TYPE1_AFFINITY)) {
@@ -328,7 +328,7 @@ static int __init find_primary_domain_index(void)
if (form1_affinity) {
index = of_read_number(distance_ref_points, 1);
} else {
-   if (distance_ref_points_depth < 2) {
+   if (max_associativity_domain_index < 2) {
printk(KERN_WARNING "NUMA: "
"short ibm,associativity-reference-points\n");
goto err;
@@ -341,10 +341,10 @@ static int __init find_primary_domain_index(void)
 * Warn and cap if the hardware supports more than
 * MAX_DISTANCE_REF_POINTS domains.
 */
-   if (distance_ref_points_depth > MAX_DISTANCE_REF_POINTS) {
+   if (max_associativity_domain_index > MAX_DISTANCE_REF_POINTS) {
printk(KERN_WARNING "NUMA: distance array capped at "
"%d entries\n", MAX_DISTANCE_REF_POINTS);
-   distance_ref_points_depth = MAX_DISTANCE_REF_POINTS;
+   max_associativity_domain_index = MAX_DISTANCE_REF_POINTS;
}
 
of_node_put(root);
-- 
2.31.1



[PATCH v4 0/7] Add support for FORM2 associativity

2021-06-17 Thread Aneesh Kumar K.V
Form2 associativity adds a much more flexible NUMA topology layout
than what is provided by Form1. More details can be found in patch 7.

$ numactl -H
...
node distances:
node   0   1   2   3 
  0:  10  11  222  33 
  1:  44  10  55  66 
  2:  77  88  10  99 
  3:  101  121  132  10 
$
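
Note the asymmetry above (e.g. 33 from node 0 to node 3 but 101 back), which
Form1's reference-point scheme cannot express. Form2 carries distances as a
flat per-pair matrix; a standalone sketch of the lookup using the values above
(the real property layout and parser are in patch 7):

#include <stdio.h>

#define N 4     /* nodes in the example above */

/* Flattened N*N matrix, as a Form2-style distance table carries it */
static const int numa_distance_table[N * N] = {
         10,  11, 222,  33,
         44,  10,  55,  66,
         77,  88,  10,  99,
        101, 121, 132,  10,
};

static int node_distance(int a, int b)
{
        return numa_distance_table[a * N + b];
}

int main(void)
{
        /* prints: d(0,2) = 222, d(3,1) = 121 */
        printf("d(0,2) = %d, d(3,1) = %d\n",
               node_distance(0, 2), node_distance(3, 1));
        return 0;
}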

After DAX kmem memory add
# numactl -H
available: 5 nodes (0-4)
...
node distances:
node   0   1   2   3   4 
  0:  10  11  222  33  240 
  1:  44  10  55  66  255 
  2:  77  88  10  99  255 
  3:  101  121  132  10  230 
  4:  255  255  255  230  10 


PAPR SCM now uses the NUMA distance details to find the numa_node and
target_node for the device.

kvaneesh@ubuntu-guest:~$ ndctl  list -N -v 
[
  {
"dev":"namespace0.0",
"mode":"devdax",
"map":"dev",
"size":1071644672,
"uuid":"d333d867-3f57-44c8-b386-d4d3abdc2bf2",
"raw_uuid":"915361ad-fe6a-42dd-848f-d6dc9f5af362",
"daxregion":{
  "id":0,
  "size":1071644672,
  "devices":[
{
  "chardev":"dax0.0",
  "size":1071644672,
  "target_node":4,
  "mode":"devdax"
}
  ]
},
"align":2097152,
"numa_node":3
  }
]
kvaneesh@ubuntu-guest:~$ 


The above output is with the following QEMU command line:

-numa node,nodeid=4 \
-numa dist,src=0,dst=1,val=11 -numa dist,src=0,dst=2,val=222 -numa 
dist,src=0,dst=3,val=33 -numa dist,src=0,dst=4,val=240 \
-numa dist,src=1,dst=0,val=44 -numa dist,src=1,dst=2,val=55 -numa 
dist,src=1,dst=3,val=66 -numa dist,src=1,dst=4,val=255 \
-numa dist,src=2,dst=0,val=77 -numa dist,src=2,dst=1,val=88 -numa 
dist,src=2,dst=3,val=99 -numa dist,src=2,dst=4,val=255 \
-numa dist,src=3,dst=0,val=101 -numa dist,src=3,dst=1,val=121 -numa 
dist,src=3,dst=2,val=132 -numa dist,src=3,dst=4,val=230 \
-numa dist,src=4,dst=0,val=255 -numa dist,src=4,dst=1,val=255 -numa 
dist,src=4,dst=2,val=255 -numa dist,src=4,dst=3,val=230 \
-object 
memory-backend-file,id=memnvdimm1,prealloc=yes,mem-path=$PMEM_DISK,share=yes,size=${PMEM_SIZE}
  \
-device 
nvdimm,label-size=128K,memdev=memnvdimm1,id=nvdimm1,slot=4,uuid=72511b67-0b3b-42fd-8d1d-5be3cae8bcaa,node=4

Qemu changes can be found at 
https://lore.kernel.org/qemu-devel/20210616011944.2996399-1-danielhb...@gmail.com/

Changes from v3:
* Drop PAPR SCM specific changes and depend completely on NUMA distance 
information.

Changes from v2:
* Add nvdimm list to Cc:
* update PATCH 8 commit message.

Changes from v1:
* Update FORM2 documentation.
* rename max_domain_index to max_associativity_domain_index

Aneesh Kumar K.V (7):
  powerpc/pseries: rename min_common_depth to primary_domain_index
  powerpc/pseries: rename distance_ref_points_depth to
max_associativity_domain_index
  powerpc/pseries: Rename TYPE1_AFFINITY to FORM1_AFFINITY
  powerpc/pseries: Consolidate DLPAR NUMA distance update
  powerpc/pseries: Consolidate NUMA distance update during boot
  powerpc/pseries: Add a helper for form1 cpu distance
  powerpc/pseries: Add support for FORM2 associativity

 Documentation/powerpc/associativity.rst   | 135 ++
 arch/powerpc/include/asm/firmware.h   |   7 +-
 arch/powerpc/include/asm/prom.h   |   3 +-
 arch/powerpc/kernel/prom_init.c   |   3 +-
 arch/powerpc/mm/numa.c| 410 ++
 arch/powerpc/platforms/pseries/firmware.c |   3 +-
 arch/powerpc/platforms/pseries/hotplug-cpu.c  |   2 +
 .../platforms/pseries/hotplug-memory.c|   2 +
 arch/powerpc/platforms/pseries/pseries.h  |   1 +
 9 files changed, 474 insertions(+), 92 deletions(-)
 create mode 100644 Documentation/powerpc/associativity.rst

-- 
2.31.1



Re: Oops (NULL pointer) with 'perf record' of selftest 'null_syscall'

2021-06-17 Thread Christophe Leroy




On 17/06/2021 at 08:36, Athira Rajeev wrote:




On 16-Jun-2021, at 11:56 AM, Christophe Leroy  
wrote:



On 16/06/2021 at 05:40, Athira Rajeev wrote:

On 16-Jun-2021, at 8:53 AM, Madhavan Srinivasan  wrote:


On 6/15/21 8:35 PM, Christophe Leroy wrote:
For your information, I'm getting the following Oops. Detected with 5.13-rc6; it also oopses on
5.12 and 5.11.

Runs ok on 5.10. I'm starting bisecting now.



Thanks for reporting; got the issue. What has happened in this case is that no
pmu device is registered, and trying to access the instruction pointer lands in
perf_instruction_pointer(). Recently I added a workaround patch for power10 DD1
which has caused this breakage. My bad. We are working on a fix patch for the
same and will post it out. Sorry again.


Hi Christophe,
Can you please try the below patch in your environment and test if it works
for you?
From 55d3afc9369dfbe28a7152c8e9f856c11c7fe43d Mon Sep 17 00:00:00 2001
From: Athira Rajeev 
Date: Tue, 15 Jun 2021 22:28:11 -0400
Subject: [PATCH] powerpc/perf: Fix crash with 'perf_instruction_pointer' when
pmu is not set
On systems without any specific PMU driver support registered, running
perf record causes an oops:
[   38.841073] NIP [c013af54] perf_instruction_pointer+0x24/0x100
[   38.841079] LR [c03c7358] perf_prepare_sample+0x4e8/0x820
[   38.841085] --- interrupt: 300
[   38.841088] [c0001cf03440] [c03c6ef8] 
perf_prepare_sample+0x88/0x820 (unreliable)
[   38.841096] [c0001cf034a0] [c03c76d0] 
perf_event_output_forward+0x40/0xc0
[   38.841104] [c0001cf03520] [c03b45e8] 
__perf_event_overflow+0x88/0x1b0
[   38.841112] [c0001cf03570] [c03b480c] 
perf_swevent_hrtimer+0xfc/0x1a0
[   38.841119] [c0001cf03740] [c02399cc] 
__hrtimer_run_queues+0x17c/0x380
[   38.841127] [c0001cf037c0] [c023a5f8] 
hrtimer_interrupt+0x128/0x2f0
[   38.841135] [c0001cf03870] [c002962c] timer_interrupt+0x13c/0x370
[   38.841143] [c0001cf038d0] [c0009ba4] 
decrementer_common_virt+0x1a4/0x1b0
[   38.841151] --- interrupt: 900 at copypage_power7+0xd4/0x1c0
During a perf record session, perf_instruction_pointer() is called to
capture the sample ip. This function in core-book3s accesses ppmu->flags.
If a platform-specific PMU driver is not registered, ppmu is set to NULL,
and accessing its members results in a crash. Fix this crash by checking
if ppmu is set.
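
A hedged sketch of the shape of the fix (not the exact patch): guard the ppmu
dereference that the P10 DD1 workaround introduced, falling back to regs->nip.

/* Sketch only -- details of the real function differ */
unsigned long perf_instruction_pointer(struct pt_regs *regs)
{
        unsigned long siar = mfspr(SPRN_SIAR);

        /* ppmu stays NULL until a platform PMU driver registers itself */
        if (ppmu && (ppmu->flags & PPMU_P10_DD1))
                return siar ? siar : regs->nip;

        /* ... pre-existing SIAR/regs->nip selection logic follows ... */
        return regs->nip;
}
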
Signed-off-by: Athira Rajeev 
Reported-by: Christophe Leroy 


Fixes: 2ca13a4cc56c ("powerpc/perf: Use regs->nip when SIAR is zero")
Cc: sta...@vger.kernel.org
Tested-by: Christophe Leroy 


Hi Christophe,

Thanks for testing with the change. I have a newer version where I have added 
braces around the check.
Can you please check once, and can I add your Tested-by to the below patch?


Yes it works, you can add my Tested-by:
Please also add Cc: sta...@vger.kernel.org; this needs to be backported as
soon as possible.

Thanks
Christophe


[PATCH v4 17/17] powerpc/interrupt: Remove prep_irq_for_user_exit()

2021-06-17 Thread Nicholas Piggin
From: Christophe Leroy 

prep_irq_for_user_exit() has only one caller; squash it
into that caller.

Signed-off-by: Christophe Leroy 
Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/kernel/interrupt.c | 16 +++-
 1 file changed, 3 insertions(+), 13 deletions(-)

diff --git a/arch/powerpc/kernel/interrupt.c b/arch/powerpc/kernel/interrupt.c
index 138c450b01bd..09b8304a7011 100644
--- a/arch/powerpc/kernel/interrupt.c
+++ b/arch/powerpc/kernel/interrupt.c
@@ -75,18 +75,6 @@ static notrace __always_inline bool 
prep_irq_for_enabled_exit(bool restartable)
return true;
 }
 
-static notrace __always_inline bool prep_irq_for_user_exit(void)
-{
-   bool ret;
-
-   user_enter_irqoff();
-   ret = prep_irq_for_enabled_exit(true);
-   if (!ret)
-   user_exit_irqoff();
-
-   return ret;
-}
-
 /* Has to run notrace because it is entered not completely "reconciled" */
 notrace long system_call_exception(long r3, long r4, long r5,
   long r6, long r7, long r8,
@@ -276,7 +264,9 @@ interrupt_exit_user_prepare_main(unsigned long ret, struct 
pt_regs *regs)
}
}
 
-   if (!prep_irq_for_user_exit()) {
+   user_enter_irqoff();
+   if (!prep_irq_for_enabled_exit(true)) {
+   user_exit_irqoff();
local_irq_enable();
local_irq_disable();
goto again;
-- 
2.23.0



[PATCH v4 16/17] powerpc/interrupt: Refactor prep_irq_for_{user/kernel_enabled}_exit()

2021-06-17 Thread Nicholas Piggin
From: Christophe Leroy 

prep_irq_for_user_exit() is a superset of
prep_irq_for_kernel_enabled_exit().

Rename prep_irq_for_kernel_enabled_exit() as prep_irq_for_enabled_exit()
and have prep_irq_for_user_exit() use it.

Signed-off-by: Christophe Leroy 
Reviewed-by: Nicholas Piggin 
Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/kernel/interrupt.c | 29 +++--
 1 file changed, 7 insertions(+), 22 deletions(-)

diff --git a/arch/powerpc/kernel/interrupt.c b/arch/powerpc/kernel/interrupt.c
index 6ba693f99e5a..138c450b01bd 100644
--- a/arch/powerpc/kernel/interrupt.c
+++ b/arch/powerpc/kernel/interrupt.c
@@ -50,7 +50,7 @@ static inline bool exit_must_hard_disable(void)
  * restartable is true then EE/RI can be left on because interrupts are handled
  * with a restart sequence.
  */
-static notrace __always_inline bool prep_irq_for_kernel_enabled_exit(bool 
restartable)
+static notrace __always_inline bool prep_irq_for_enabled_exit(bool restartable)
 {
/* This must be done with RI=1 because tracing may touch vmaps */
trace_hardirqs_on();
@@ -77,29 +77,14 @@ static notrace __always_inline bool 
prep_irq_for_kernel_enabled_exit(bool restar
 
 static notrace __always_inline bool prep_irq_for_user_exit(void)
 {
-   user_enter_irqoff();
-   /* This must be done with RI=1 because tracing may touch vmaps */
-   trace_hardirqs_on();
-
-#ifdef CONFIG_PPC32
-   __hard_EE_RI_disable();
-#else
-   if (exit_must_hard_disable())
-   __hard_EE_RI_disable();
+   bool ret;
 
-   /* This pattern matches prep_irq_for_idle */
-   if (unlikely(lazy_irq_pending_nocheck())) {
-   if (exit_must_hard_disable()) {
-   local_paca->irq_happened |= PACA_IRQ_HARD_DIS;
-   __hard_RI_enable();
-   }
-   trace_hardirqs_off();
+   user_enter_irqoff();
+   ret = prep_irq_for_enabled_exit(true);
+   if (!ret)
user_exit_irqoff();
 
-   return false;
-   }
-#endif
-   return true;
+   return ret;
 }
 
 /* Has to run notrace because it is entered not completely "reconciled" */
@@ -469,7 +454,7 @@ notrace unsigned long interrupt_exit_kernel_prepare(struct 
pt_regs *regs)
 * Stack store exit can't be restarted because the interrupt
 * stack frame might have been clobbered.
 */
-   if (!prep_irq_for_kernel_enabled_exit(unlikely(stack_store))) {
+   if (!prep_irq_for_enabled_exit(unlikely(stack_store))) {
/*
 * Replay pending soft-masked interrupts now. Don't
 * just local_irq_enabe(); local_irq_disable(); because
-- 
2.23.0



[PATCH v4 15/17] powerpc/interrupt: Interchange prep_irq_for_{kernel_enabled/user}_exit()

2021-06-17 Thread Nicholas Piggin
From: Christophe Leroy 

prep_irq_for_user_exit() is a superset of
prep_irq_for_kernel_enabled_exit(). In order to allow refactoring in the
following patch, interchange the two. This will allow
prep_irq_for_user_exit() to call a renamed version of
prep_irq_for_kernel_enabled_exit().

Signed-off-by: Christophe Leroy 
Reviewed-by: Nicholas Piggin 
Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/kernel/interrupt.c | 23 +++
 1 file changed, 11 insertions(+), 12 deletions(-)

diff --git a/arch/powerpc/kernel/interrupt.c b/arch/powerpc/kernel/interrupt.c
index e946084d4906..6ba693f99e5a 100644
--- a/arch/powerpc/kernel/interrupt.c
+++ b/arch/powerpc/kernel/interrupt.c
@@ -46,27 +46,28 @@ static inline bool exit_must_hard_disable(void)
  * This should be called with local irqs disabled, but if they were previously
  * enabled when the interrupt handler returns (indicating a process-context /
  * synchronous interrupt) then irqs_enabled should be true.
+ *
+ * restartable is true then EE/RI can be left on because interrupts are handled
+ * with a restart sequence.
  */
-static notrace __always_inline bool prep_irq_for_user_exit(void)
+static notrace __always_inline bool prep_irq_for_kernel_enabled_exit(bool 
restartable)
 {
-   user_enter_irqoff();
/* This must be done with RI=1 because tracing may touch vmaps */
trace_hardirqs_on();
 
 #ifdef CONFIG_PPC32
__hard_EE_RI_disable();
 #else
-   if (exit_must_hard_disable())
+   if (exit_must_hard_disable() || !restartable)
__hard_EE_RI_disable();
 
/* This pattern matches prep_irq_for_idle */
if (unlikely(lazy_irq_pending_nocheck())) {
-   if (exit_must_hard_disable()) {
+   if (exit_must_hard_disable() || !restartable) {
local_paca->irq_happened |= PACA_IRQ_HARD_DIS;
__hard_RI_enable();
}
trace_hardirqs_off();
-   user_exit_irqoff();
 
return false;
}
@@ -74,28 +75,26 @@ static notrace __always_inline bool 
prep_irq_for_user_exit(void)
return true;
 }
 
-/*
- * restartable is true then EE/RI can be left on because interrupts are handled
- * with a restart sequence.
- */
-static notrace __always_inline bool prep_irq_for_kernel_enabled_exit(bool 
restartable)
+static notrace __always_inline bool prep_irq_for_user_exit(void)
 {
+   user_enter_irqoff();
/* This must be done with RI=1 because tracing may touch vmaps */
trace_hardirqs_on();
 
 #ifdef CONFIG_PPC32
__hard_EE_RI_disable();
 #else
-   if (exit_must_hard_disable() || !restartable)
+   if (exit_must_hard_disable())
__hard_EE_RI_disable();
 
/* This pattern matches prep_irq_for_idle */
if (unlikely(lazy_irq_pending_nocheck())) {
-   if (exit_must_hard_disable() || !restartable) {
+   if (exit_must_hard_disable()) {
local_paca->irq_happened |= PACA_IRQ_HARD_DIS;
__hard_RI_enable();
}
trace_hardirqs_off();
+   user_exit_irqoff();
 
return false;
}
-- 
2.23.0



[PATCH v4 14/17] powerpc/interrupt: Refactor interrupt_exit_user_prepare()

2021-06-17 Thread Nicholas Piggin
From: Christophe Leroy 

interrupt_exit_user_prepare() is a superset of
interrupt_exit_user_prepare_main().

Refactor to avoid code duplication.

Signed-off-by: Christophe Leroy 
Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/kernel/interrupt.c | 57 ++---
 1 file changed, 3 insertions(+), 54 deletions(-)

diff --git a/arch/powerpc/kernel/interrupt.c b/arch/powerpc/kernel/interrupt.c
index 5bfdf8f9b130..e946084d4906 100644
--- a/arch/powerpc/kernel/interrupt.c
+++ b/arch/powerpc/kernel/interrupt.c
@@ -404,9 +404,7 @@ notrace unsigned long syscall_exit_restart(unsigned long 
r3, struct pt_regs *reg
 
 notrace unsigned long interrupt_exit_user_prepare(struct pt_regs *regs)
 {
-   unsigned long ti_flags;
-   unsigned long flags;
-   unsigned long ret = 0;
+   unsigned long ret;
 
if (!IS_ENABLED(CONFIG_BOOKE) && !IS_ENABLED(CONFIG_40x))
BUG_ON(!(regs->msr & MSR_RI));
@@ -420,63 +418,14 @@ notrace unsigned long interrupt_exit_user_prepare(struct 
pt_regs *regs)
 */
kuap_assert_locked();
 
-   local_irq_save(flags);
-
-again:
-   ti_flags = READ_ONCE(current_thread_info()->flags);
-   while (unlikely(ti_flags & (_TIF_USER_WORK_MASK & ~_TIF_RESTORE_TM))) {
-   local_irq_enable(); /* returning to user: may enable */
-   if (ti_flags & _TIF_NEED_RESCHED) {
-   schedule();
-   } else {
-   if (ti_flags & _TIF_SIGPENDING)
-   ret |= _TIF_RESTOREALL;
-   do_notify_resume(regs, ti_flags);
-   }
-   local_irq_disable();
-   ti_flags = READ_ONCE(current_thread_info()->flags);
-   }
-
-   if (IS_ENABLED(CONFIG_PPC_BOOK3S_64) && IS_ENABLED(CONFIG_PPC_FPU)) {
-   if (IS_ENABLED(CONFIG_PPC_TRANSACTIONAL_MEM) &&
-   unlikely((ti_flags & _TIF_RESTORE_TM))) {
-   restore_tm_state(regs);
-   } else {
-   unsigned long mathflags = MSR_FP;
-
-   if (cpu_has_feature(CPU_FTR_VSX))
-   mathflags |= MSR_VEC | MSR_VSX;
-   else if (cpu_has_feature(CPU_FTR_ALTIVEC))
-   mathflags |= MSR_VEC;
-
-   /* See above restore_math comment */
-   if ((regs->msr & mathflags) != mathflags)
-   restore_math(regs);
-   }
-   }
-
-   if (!prep_irq_for_user_exit()) {
-   local_irq_enable();
-   local_irq_disable();
-   goto again;
-   }
-
-   booke_load_dbcr0();
-
-#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
-   local_paca->tm_scratch = regs->msr;
-#endif
+   local_irq_disable();
 
-   account_cpu_user_exit();
+   ret = interrupt_exit_user_prepare_main(0, regs);
 
 #ifdef CONFIG_PPC64
regs->exit_result = ret;
 #endif
 
-   /* Restore user access locks last */
-   kuap_user_restore(regs);
-   kuep_unlock();
-
return ret;
 }
 
-- 
2.23.0



[PATCH v4 13/17] powerpc/interrupt: Rename and lightly change syscall_exit_prepare_main()

2021-06-17 Thread Nicholas Piggin
From: Christophe Leroy 

Rename syscall_exit_prepare_main() into interrupt_exit_prepare_main()

Pass it the 'ret' so that it can OR it in directly instead of
ORing twice, once inside the function and once outside.

Also remove the 'r3' parameter, which is not used.

Signed-off-by: Christophe Leroy 
Reviewed-by: Nicholas Piggin 
[np: split out some changes into other patches]
Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/kernel/interrupt.c | 9 -
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/kernel/interrupt.c b/arch/powerpc/kernel/interrupt.c
index 6dc9b7536511..5bfdf8f9b130 100644
--- a/arch/powerpc/kernel/interrupt.c
+++ b/arch/powerpc/kernel/interrupt.c
@@ -243,11 +243,10 @@ static notrace void booke_load_dbcr0(void)
 #endif
 }
 
-static notrace unsigned long syscall_exit_prepare_main(unsigned long r3,
-  struct pt_regs *regs)
+static notrace unsigned long
+interrupt_exit_user_prepare_main(unsigned long ret, struct pt_regs *regs)
 {
unsigned long ti_flags;
-   unsigned long ret = 0;
 
 again:
ti_flags = READ_ONCE(current_thread_info()->flags);
@@ -365,7 +364,7 @@ notrace unsigned long syscall_exit_prepare(unsigned long r3,
}
 
local_irq_disable();
-   ret |= syscall_exit_prepare_main(r3, regs);
+   ret = interrupt_exit_user_prepare_main(ret, regs);
 
 #ifdef CONFIG_PPC64
regs->exit_result = ret;
@@ -397,7 +396,7 @@ notrace unsigned long syscall_exit_restart(unsigned long 
r3, struct pt_regs *reg
 
BUG_ON(!user_mode(regs));
 
-   regs->exit_result |= syscall_exit_prepare_main(r3, regs);
+   regs->exit_result = interrupt_exit_user_prepare_main(regs->exit_result, 
regs);
 
return regs->exit_result;
 }
-- 
2.23.0



[PATCH v4 12/17] powerpc/64: use interrupt restart table to speed up return from interrupt

2021-06-17 Thread Nicholas Piggin
Use the restart table facility to return from interrupt or system calls
without disabling MSR[EE] or MSR[RI].

Interrupt return asm is put into the low soft-masked region, to prevent
interrupts being processed here, although they are still taken as masked
interrupts which causes SRRs to be clobbered, and a pending soft-masked
interrupt to require replaying.

The return code uses restart table regions to redirect to a fixup handler
rather than continue with the exit, if such an interrupt happens. In
this case the interrupt return is redirected to a fixup handler which
reloads r1 for the interrupt stack and reloads registers and sets state
up to replay the soft-masked interrupt and try the exit again.

Some types of security exit fallback flushes and barriers are currently
unable to cope with reentrant interrupts, e.g., because they store some
state in the scratch SPR which would be clobbered even by masked
interrupts. For now the interrupts-enabled exits are disabled when these
flushes are used.
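
For reference, the pattern a restart region boils down to, condensed from the
arch_local_irq_restore() rework in patch 10 (shown further down), with
comments added: a soft-masked interrupt taken anywhere in [1b,2b) is
redirected to restart at 1b, which makes the test-and-store atomic with
respect to such interrupts.

        asm_volatile_goto(
"1:\n"
"       lbz     9,%0(13)        \n"     /* load paca->irq_happened */
"       cmpwi   9,0             \n"     /* anything pending? */
"       bne     %l[happened]    \n"     /* yes: go replay it */
"       stb     9,%1(13)        \n"     /* no: clear irq_soft_mask (unmask) */
"2:\n"
        RESTART_TABLE(1b, 2b, 1b)       /* region [1b,2b) restarts at 1b */
        : : "i" (offsetof(struct paca_struct, irq_happened)),
            "i" (offsetof(struct paca_struct, irq_soft_mask))
        : "cr0", "r9"
        : happened);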

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/include/asm/asm-prototypes.h |   5 +
 arch/powerpc/include/asm/hw_irq.h |  13 +-
 arch/powerpc/include/asm/interrupt.h  |   2 +
 arch/powerpc/include/asm/paca.h   |   3 +
 arch/powerpc/include/asm/ptrace.h |   1 +
 arch/powerpc/kernel/asm-offsets.c |   3 +
 arch/powerpc/kernel/interrupt.c   | 393 ++
 arch/powerpc/kernel/interrupt_64.S| 143 ++--
 arch/powerpc/lib/feature-fixups.c |  52 ++-
 9 files changed, 457 insertions(+), 158 deletions(-)

diff --git a/arch/powerpc/include/asm/asm-prototypes.h 
b/arch/powerpc/include/asm/asm-prototypes.h
index 95492655462e..89765b2b3be0 100644
--- a/arch/powerpc/include/asm/asm-prototypes.h
+++ b/arch/powerpc/include/asm/asm-prototypes.h
@@ -73,6 +73,11 @@ long system_call_exception(long r3, long r4, long r5, long 
r6, long r7, long r8,
 notrace unsigned long syscall_exit_prepare(unsigned long r3, struct pt_regs 
*regs, long scv);
 notrace unsigned long interrupt_exit_user_prepare(struct pt_regs *regs);
 notrace unsigned long interrupt_exit_kernel_prepare(struct pt_regs *regs);
+#ifdef CONFIG_PPC64
+unsigned long syscall_exit_restart(unsigned long r3, struct pt_regs *regs);
+unsigned long interrupt_exit_user_restart(struct pt_regs *regs);
+unsigned long interrupt_exit_kernel_restart(struct pt_regs *regs);
+#endif
 
 long ppc_fadvise64_64(int fd, int advice, u32 offset_high, u32 offset_low,
  u32 len_high, u32 len_low);
diff --git a/arch/powerpc/include/asm/hw_irq.h 
b/arch/powerpc/include/asm/hw_irq.h
index 19bcef666cf6..21cc571ea9c2 100644
--- a/arch/powerpc/include/asm/hw_irq.h
+++ b/arch/powerpc/include/asm/hw_irq.h
@@ -18,8 +18,17 @@
  * PACA flags in paca->irq_happened.
  *
  * This bits are set when interrupts occur while soft-disabled
- * and allow a proper replay. Additionally, PACA_IRQ_HARD_DIS
- * is set whenever we manually hard disable.
+ * and allow a proper replay.
+ *
+ * The PACA_IRQ_HARD_DIS is set whenever we hard disable. It is almost
+ * always in synch with the MSR[EE] state, except:
+ * - A window in interrupt entry, where hardware disables MSR[EE] and that
+ *   must be "reconciled" with the soft mask state.
+ * - NMI interrupts that hit in awkward places, until they fix the state.
+ * - When local irqs are being enabled and state is being fixed up.
+ * - When returning from an interrupt there are some windows where this
+ *   can become out of synch, but gets fixed before the RFI or before
+ *   executing the next user instruction (see arch/powerpc/kernel/interrupt.c).
  */
 #define PACA_IRQ_HARD_DIS  0x01
 #define PACA_IRQ_DBELL 0x02
diff --git a/arch/powerpc/include/asm/interrupt.h 
b/arch/powerpc/include/asm/interrupt.h
index 88043e46442a..c7b4adbe2dfe 100644
--- a/arch/powerpc/include/asm/interrupt.h
+++ b/arch/powerpc/include/asm/interrupt.h
@@ -79,6 +79,8 @@ unsigned long search_kernel_restart_table(unsigned long addr);
 #endif
 
 #ifdef CONFIG_PPC_BOOK3S_64
+DECLARE_STATIC_KEY_FALSE(interrupt_exit_not_reentrant);
+
 static inline void srr_regs_clobbered(void)
 {
local_paca->srr_valid = 0;
diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
index dfc984b0e640..ca0b52e107a8 100644
--- a/arch/powerpc/include/asm/paca.h
+++ b/arch/powerpc/include/asm/paca.h
@@ -167,6 +167,9 @@ struct paca_struct {
u64 kstack; /* Saved Kernel stack addr */
u64 saved_r1;   /* r1 save for RTAS calls or PM or EE=0 
*/
u64 saved_msr;  /* MSR saved here by enter_rtas */
+#ifdef CONFIG_PPC64
+   u64 exit_save_r1;   /* Syscall/interrupt R1 save */
+#endif
 #ifdef CONFIG_PPC_BOOK3E
u16 trap_save;  /* Used when bad stack is encountered */
 #endif
diff --git a/arch/powerpc/include/asm/ptrace.h 
b/arch/powerpc/include/asm/ptrace.h
index 516117bba4e6..cb2b093dee13 100644

[PATCH v4 11/17] powerpc/64: treat low kernel text as irqs soft-masked

2021-06-17 Thread Nicholas Piggin
Treat code below __end_soft_masked as soft-masked for the purpose
of alternate return. 64s already mostly does this for scv entry.

This will be used to exit from interrupts without disabling MSR[EE].

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/include/asm/interrupt.h | 11 ---
 arch/powerpc/kernel/exceptions-64e.S | 12 +++-
 arch/powerpc/kernel/exceptions-64s.S | 19 +++
 arch/powerpc/kernel/interrupt_64.S   |  6 +-
 4 files changed, 35 insertions(+), 13 deletions(-)

diff --git a/arch/powerpc/include/asm/interrupt.h 
b/arch/powerpc/include/asm/interrupt.h
index 49d9a6fd1bb9..88043e46442a 100644
--- a/arch/powerpc/include/asm/interrupt.h
+++ b/arch/powerpc/include/asm/interrupt.h
@@ -158,8 +158,13 @@ static inline void interrupt_enter_prepare(struct pt_regs 
*regs, struct interrup
 * CT_WARN_ON comes here via program_check_exception,
 * so avoid recursion.
 */
-   if (TRAP(regs) != INTERRUPT_PROGRAM)
+   if (TRAP(regs) != INTERRUPT_PROGRAM) {
CT_WARN_ON(ct_state() != CONTEXT_KERNEL);
+   BUG_ON(regs->nip < (unsigned long)__end_soft_masked);
+   }
+   /* Move this under a debugging check */
+   if (arch_irq_disabled_regs(regs))
+   BUG_ON(search_kernel_restart_table(regs->nip));
}
 #endif
 
@@ -254,8 +259,8 @@ static inline void interrupt_nmi_enter_prepare(struct 
pt_regs *regs, struct inte
local_paca->irq_happened |= PACA_IRQ_HARD_DIS;
 
if (IS_ENABLED(CONFIG_PPC_BOOK3S_64) && !(regs->msr & MSR_PR) &&
-   regs->nip < (unsigned long)__end_interrupts) {
-   // Kernel code running below __end_interrupts is
+   regs->nip < (unsigned long)__end_soft_masked) {
+   // Kernel code running below __end_soft_masked is
// implicitly soft-masked.
regs->softe = IRQS_ALL_DISABLED;
}
diff --git a/arch/powerpc/kernel/exceptions-64e.S 
b/arch/powerpc/kernel/exceptions-64e.S
index 1b79f8a75298..22fcd95dd8dc 100644
--- a/arch/powerpc/kernel/exceptions-64e.S
+++ b/arch/powerpc/kernel/exceptions-64e.S
@@ -342,7 +342,17 @@ ret_from_mc_except:
 #define PROLOG_ADDITION_MASKABLE_GEN(n)
\
lbz r10,PACAIRQSOFTMASK(r13);   /* are irqs soft-masked? */ \
andi.   r10,r10,IRQS_DISABLED;  /* yes -> go out of line */ \
-   bne masked_interrupt_book3e_##n
+   bne masked_interrupt_book3e_##n;\
+   /* Kernel code below __end_soft_masked is implicitly masked */  \
+   andi.   r10,r11,MSR_PR; \
+   bne 1f; /* user -> not masked */\
+   std r14,PACA_EXGEN+EX_R14(r13); \
+   LOAD_REG_IMMEDIATE_SYM(r14, r10, __end_soft_masked);\
+   mfspr   r10,SPRN_SRR0;  \
+   cmpld   r10,r14;\
+   ld  r14,PACA_EXGEN+EX_R14(r13); \
+   blt masked_interrupt_book3e_##n;\
+1:
 
 /*
  * Additional regs must be re-loaded from paca before EXCEPTION_COMMON* is
diff --git a/arch/powerpc/kernel/exceptions-64s.S 
b/arch/powerpc/kernel/exceptions-64s.S
index 17a213f25c92..2d980addc88c 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -508,10 +508,13 @@ DEFINE_FIXED_SYMBOL(\name\()_common_real)
andi.   r10,r12,MSR_PR
bne 2f
 
-   /* Kernel code running below __end_interrupts is implicitly
-* soft-masked */
-   LOAD_HANDLER(r10, __end_interrupts)
+   /*
+* Kernel code running below __end_soft_masked is implicitly
+* soft-masked
+*/
+   LOAD_HANDLER(r10, __end_soft_masked)
cmpld   r11,r10
+
li  r10,IMASK
blt-1f
 
@@ -824,17 +827,17 @@ __start_interrupts:
  * scv instructions enter the kernel without changing EE, RI, ME, or HV.
  * In particular, this means we can take a maskable interrupt at any point
  * in the scv handler, which is unlike any other interrupt. This is solved
- * by treating the instruction addresses below __end_interrupts as being
+ * by treating the instruction addresses below __end_soft_masked as being
  * soft-masked.
  *
  * AIL-0 mode scv exceptions go to 0x17000-0x17fff, but we set AIL-3 and
  * ensure scv is never executed with relocation off, which means AIL-0
  * should never happen.
  *
- * Before leaving the below __end_interrupts text, at least of the following
- * must be true:
+ * Before leaving the 

[PATCH v4 10/17] powerpc/64: interrupt soft-enable race fix

2021-06-17 Thread Nicholas Piggin
Prevent interrupt restore from allowing racing hard interrupts going
ahead of previous soft-pending ones, by using the soft-masked restart
handler to allow a store to clear the soft-mask while knowing nothing
is soft-pending.

This probably doesn't matter much in practice, but it's a simple
demonstrator / test case to exercise the restart table logic.
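
The race being closed, spelled out (illustrative pseudocode, not patch text):

/*
 *      if (!local_paca->irq_happened)     <- a masked interrupt lands here
 *                                            and sets irq_happened
 *      local_paca->irq_soft_mask = 0;     <- now unmasked, but the pending
 *                                            interrupt is never replayed
 *
 * Putting the load/compare/store in a restart region instead makes the
 * sequence restart from the load if such an interrupt occurs.
 */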

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/kernel/irq.c | 95 +++
 1 file changed, 95 insertions(+)

diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
index 72cb45393ef2..8428caf3194e 100644
--- a/arch/powerpc/kernel/irq.c
+++ b/arch/powerpc/kernel/irq.c
@@ -217,6 +217,100 @@ static inline void replay_soft_interrupts_irqrestore(void)
 #define replay_soft_interrupts_irqrestore() replay_soft_interrupts()
 #endif
 
+#ifdef CONFIG_CC_HAS_ASM_GOTO
+notrace void arch_local_irq_restore(unsigned long mask)
+{
+   unsigned char irq_happened;
+
+   /* Write the new soft-enabled value if it is a disable */
+   if (mask) {
+   irq_soft_mask_set(mask);
+   return;
+   }
+
+   /*
+* After the stb, interrupts are unmasked and there are no interrupts
+* pending replay. The restart sequence makes this atomic with
+* respect to soft-masked interrupts. If this was just a simple code
+* sequence, a soft-masked interrupt could become pending right after
+* the comparison and before the stb.
+*
+* This allows interrupts to be unmasked without hard disabling, and
+* also without new hard interrupts coming in ahead of pending ones.
+*/
+   asm_volatile_goto(
+"1:\n"
+"  lbz 9,%0(13)\n"
+"  cmpwi   9,0 \n"
+"  bne %l[happened]\n"
+"  stb 9,%1(13)\n"
+"2:\n"
+   RESTART_TABLE(1b, 2b, 1b)
+   : : "i" (offsetof(struct paca_struct, irq_happened)),
+   "i" (offsetof(struct paca_struct, irq_soft_mask))
+   : "cr0", "r9"
+   : happened);
+
+   if (IS_ENABLED(CONFIG_PPC_IRQ_SOFT_MASK_DEBUG))
+   WARN_ON_ONCE(!(mfmsr() & MSR_EE));
+
+   return;
+
+happened:
+   irq_happened = get_irq_happened();
+   if (IS_ENABLED(CONFIG_PPC_IRQ_SOFT_MASK_DEBUG))
+   WARN_ON_ONCE(!irq_happened);
+
+   if (irq_happened == PACA_IRQ_HARD_DIS) {
+   if (IS_ENABLED(CONFIG_PPC_IRQ_SOFT_MASK_DEBUG))
+   WARN_ON_ONCE(mfmsr() & MSR_EE);
+   irq_soft_mask_set(IRQS_ENABLED);
+   local_paca->irq_happened = 0;
+   __hard_irq_enable();
+   return;
+   }
+
+   /* Have interrupts to replay, need to hard disable first */
+   if (!(irq_happened & PACA_IRQ_HARD_DIS)) {
+   if (IS_ENABLED(CONFIG_PPC_IRQ_SOFT_MASK_DEBUG)) {
+   if (!(mfmsr() & MSR_EE)) {
+   /*
+* An interrupt could have come in and cleared
+* MSR[EE] and set IRQ_HARD_DIS, so check
+* IRQ_HARD_DIS again and warn if it is still
+* clear.
+*/
+   irq_happened = get_irq_happened();
+   WARN_ON_ONCE(!(irq_happened & 
PACA_IRQ_HARD_DIS));
+   }
+   }
+   __hard_irq_disable();
+   local_paca->irq_happened |= PACA_IRQ_HARD_DIS;
+   } else {
+   if (IS_ENABLED(CONFIG_PPC_IRQ_SOFT_MASK_DEBUG)) {
+   if (WARN_ON_ONCE(mfmsr() & MSR_EE))
+   __hard_irq_disable();
+   }
+   }
+
+   /*
+* Disable preempt here, so that the below preempt_enable will
+* perform resched if required (a replayed interrupt may set
+* need_resched).
+*/
+   preempt_disable();
+   irq_soft_mask_set(IRQS_ALL_DISABLED);
+   trace_hardirqs_off();
+
+   replay_soft_interrupts_irqrestore();
+   local_paca->irq_happened = 0;
+
+   trace_hardirqs_on();
+   irq_soft_mask_set(IRQS_ENABLED);
+   __hard_irq_enable();
+   preempt_enable();
+}
+#else
 notrace void arch_local_irq_restore(unsigned long mask)
 {
unsigned char irq_happened;
@@ -288,6 +382,7 @@ notrace void arch_local_irq_restore(unsigned long mask)
__hard_irq_enable();
preempt_enable();
 }
+#endif
 EXPORT_SYMBOL(arch_local_irq_restore);
 
 /*
-- 
2.23.0



[PATCH v4 09/17] powerpc/64: allow alternate return locations for soft-masked interrupts

2021-06-17 Thread Nicholas Piggin
The exception table fixup adjusts a failed page fault's interrupt return
location if it was taken at an address specified in the exception table,
to a corresponding fixup handler address.

Introduce a variation of that idea which adds a fixup table for NMIs and
soft-masked asynchronous interrupts. This will be used to protect
certain critical sections that are sensitive to being clobbered by
interrupts coming in (due to using the same SPRs and/or irq soft-mask
state).
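
In C terms the fixup table is a linear scan over (start, end, fixup) triples
emitted by the RESTART_TABLE macro added below; a sketch of
search_kernel_restart_table() along those lines (the series also provides an
asm SEARCH_RESTART_TABLE for contexts where C cannot be called):

struct restart_table_entry {
        unsigned long start;
        unsigned long end;
        unsigned long fixup;
};

extern struct restart_table_entry __start___restart_table[];
extern struct restart_table_entry __stop___restart_table[];

/* Return the fixup address if addr falls inside a registered restart
 * region, or 0 if it does not. */
unsigned long search_kernel_restart_table(unsigned long addr)
{
        struct restart_table_entry *rte = __start___restart_table;

        while (rte < __stop___restart_table) {
                if (addr >= rte->start && addr < rte->end)
                        return rte->fixup;
                rte++;
        }
        return 0;
}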

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/include/asm/interrupt.h | 13 +
 arch/powerpc/include/asm/ppc_asm.h   |  8 ++
 arch/powerpc/kernel/exceptions-64e.S | 37 +++--
 arch/powerpc/kernel/exceptions-64s.S | 41 
 arch/powerpc/kernel/vmlinux.lds.S| 10 +++
 arch/powerpc/lib/Makefile|  2 +-
 arch/powerpc/lib/restart_table.c | 30 
 7 files changed, 138 insertions(+), 3 deletions(-)
 create mode 100644 arch/powerpc/lib/restart_table.c

diff --git a/arch/powerpc/include/asm/interrupt.h 
b/arch/powerpc/include/asm/interrupt.h
index b9c510187b58..49d9a6fd1bb9 100644
--- a/arch/powerpc/include/asm/interrupt.h
+++ b/arch/powerpc/include/asm/interrupt.h
@@ -73,6 +73,11 @@
 #include 
 #include 
 
+#ifdef CONFIG_PPC64
+extern char __end_soft_masked[];
+unsigned long search_kernel_restart_table(unsigned long addr);
+#endif
+
 #ifdef CONFIG_PPC_BOOK3S_64
 static inline void srr_regs_clobbered(void)
 {
@@ -287,6 +292,14 @@ static inline void interrupt_nmi_exit_prepare(struct 
pt_regs *regs, struct inter
 * new work to do (must use irq_work for that).
 */
 
+#ifdef CONFIG_PPC64
+   if (arch_irq_disabled_regs(regs)) {
+   unsigned long rst = search_kernel_restart_table(regs->nip);
+   if (rst)
+   regs_set_return_ip(regs, rst);
+   }
+#endif
+
 #ifdef CONFIG_PPC64
if (nmi_disables_ftrace(regs))
this_cpu_set_ftrace_enabled(state->ftrace_enabled);
diff --git a/arch/powerpc/include/asm/ppc_asm.h 
b/arch/powerpc/include/asm/ppc_asm.h
index d6739d700f0a..c9c2c36c1f8f 100644
--- a/arch/powerpc/include/asm/ppc_asm.h
+++ b/arch/powerpc/include/asm/ppc_asm.h
@@ -762,6 +762,14 @@ END_FTR_SECTION_NESTED(CPU_FTR_CELL_TB_BUG, 
CPU_FTR_CELL_TB_BUG, 96)
stringify_in_c(.long (_target) - . ;)   \
stringify_in_c(.previous)
 
+#define RESTART_TABLE(_start, _end, _target)   \
+   stringify_in_c(.section __restart_table,"a";)\
+   stringify_in_c(.balign 8;)  \
+   stringify_in_c(.llong (_start);)\
+   stringify_in_c(.llong (_end);)  \
+   stringify_in_c(.llong (_target);)   \
+   stringify_in_c(.previous)
+
 #ifdef CONFIG_PPC_FSL_BOOK3E
 #define BTB_FLUSH(reg) \
lis reg,BUCSR_INIT@h;   \
diff --git a/arch/powerpc/kernel/exceptions-64e.S 
b/arch/powerpc/kernel/exceptions-64e.S
index b35c97c7082f..1b79f8a75298 100644
--- a/arch/powerpc/kernel/exceptions-64e.S
+++ b/arch/powerpc/kernel/exceptions-64e.S
@@ -901,6 +901,28 @@ kernel_dbg_exc:
bl  unknown_exception
b   interrupt_return
 
+.macro SEARCH_RESTART_TABLE
+   LOAD_REG_IMMEDIATE_SYM(r14, r11, __start___restart_table)
+   LOAD_REG_IMMEDIATE_SYM(r15, r11, __stop___restart_table)
+300:
+   cmpdr14,r15
+   beq 302f
+   ld  r11,0(r14)
+   cmpld   r10,r11
+   blt 301f
+   ld  r11,8(r14)
+   cmpld   r10,r11
+   bge 301f
+   ld  r11,16(r14)
+   b   303f
+301:
+   addir14,r14,24
+   b   300b
+302:
+   li  r11,0
+303:
+.endm
+
 /*
  * An interrupt came in while soft-disabled; We mark paca->irq_happened
  * accordingly and if the interrupt is level sensitive, we hard disable
@@ -909,6 +931,9 @@ kernel_dbg_exc:
  */
 
 .macro masked_interrupt_book3e paca_irq full_mask
+   std r14,PACA_EXGEN+EX_R14(r13)
+   std r15,PACA_EXGEN+EX_R15(r13)
+
lbz r10,PACAIRQHAPPENED(r13)
.if \full_mask == 1
ori r10,r10,\paca_irq | PACA_IRQ_HARD_DIS
@@ -918,15 +943,23 @@ kernel_dbg_exc:
stb r10,PACAIRQHAPPENED(r13)
 
.if \full_mask == 1
-   rldicl  r10,r11,48,1/* clear MSR_EE */
-   rotldi  r11,r10,16
+   xorir11,r11,MSR_EE  /* clear MSR_EE */
mtspr   SPRN_SRR1,r11
.endif
 
+   mfspr   r10,SPRN_SRR0
+   SEARCH_RESTART_TABLE
+   cmpdi   r11,0
+   beq 1f
+   mtspr   SPRN_SRR0,r11   /* return to restart address */
+1:
+
lwz r11,PACA_EXGEN+EX_CR(r13)
mtcrr11
ld  r10,PACA_EXGEN+EX_R10(r13)
ld  r11,PACA_EXGEN+EX_R11(r13)
+   ld  r14,PACA_EXGEN+EX_R14(r13)
+   ld  r15,PACA_EXGEN+EX_R15(r13)
mfspr   r13,SPRN_SPRG_GEN_SCRATCH
rfi
b   .
diff --git a/arch/powerpc/kernel/exceptions-64s.S 

[PATCH v4 08/17] powerpc/64s: save one more register in the masked interrupt handler

2021-06-17 Thread Nicholas Piggin
This frees up one more register (and takes advantage of that to
clean things up a little bit).

This register will be used in the following patch.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/kernel/exceptions-64s.S | 34 
 1 file changed, 20 insertions(+), 14 deletions(-)

diff --git a/arch/powerpc/kernel/exceptions-64s.S 
b/arch/powerpc/kernel/exceptions-64s.S
index b6e1c46c97d0..0ba8c2387aac 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -2758,7 +2758,6 @@ INT_DEFINE_END(soft_nmi)
  * and run it entirely with interrupts hard disabled.
  */
 EXC_COMMON_BEGIN(soft_nmi_common)
-   mfspr   r11,SPRN_SRR0
mr  r10,r1
ld  r1,PACAEMERGSP(r13)
subir1,r1,INT_FRAME_SIZE
@@ -2793,19 +2792,24 @@ masked_Hinterrupt:
.else
 masked_interrupt:
.endif
-   lbz r11,PACAIRQHAPPENED(r13)
-   or  r11,r11,r10
-   stb r11,PACAIRQHAPPENED(r13)
+   stw r9,PACA_EXGEN+EX_CCR(r13)
+   lbz r9,PACAIRQHAPPENED(r13)
+   or  r9,r9,r10
+   stb r9,PACAIRQHAPPENED(r13)
+
+   .if ! \hsrr
cmpwi   r10,PACA_IRQ_DEC
bne 1f
-   lis r10,0x7fff
-   ori r10,r10,0xffff
-   mtspr   SPRN_DEC,r10
+   LOAD_REG_IMMEDIATE(r9, 0x7fffffff)
+   mtspr   SPRN_DEC,r9
 #ifdef CONFIG_PPC_WATCHDOG
+   lwz r9,PACA_EXGEN+EX_CCR(r13)
b   soft_nmi_common
 #else
b   2f
 #endif
+   .endif
+
 1: andi.   r10,r10,PACA_IRQ_MUST_HARD_MASK
beq 2f
xorir12,r12,MSR_EE  /* clear MSR_EE */
@@ -2814,17 +2818,19 @@ masked_interrupt:
.else
mtspr   SPRN_SRR1,r12
.endif
-   ori r11,r11,PACA_IRQ_HARD_DIS
-   stb r11,PACAIRQHAPPENED(r13)
+   ori r9,r9,PACA_IRQ_HARD_DIS
+   stb r9,PACAIRQHAPPENED(r13)
 2: /* done */
-   li  r10,0
+   li  r9,0
.if \hsrr
-   stb r10,PACAHSRR_VALID(r13)
+   stb r9,PACAHSRR_VALID(r13)
.else
-   stb r10,PACASRR_VALID(r13)
+   stb r9,PACASRR_VALID(r13)
.endif
-   ld  r10,PACA_EXGEN+EX_CTR(r13)
-   mtctr   r10
+
+   ld  r9,PACA_EXGEN+EX_CTR(r13)
+   mtctr   r9
+   lwz r9,PACA_EXGEN+EX_CCR(r13)
mtcrf   0x80,r9
std r1,PACAR1(r13)
ld  r9,PACA_EXGEN+EX_R9(r13)
-- 
2.23.0



[PATCH v4 07/17] powerpc/64s: system call avoid setting MSR[RI] until we set MSR[EE]

2021-06-17 Thread Nicholas Piggin
This extends the MSR[RI]=0 window a little further into the system
call in order to pair RI and EE enabling with a single mtmsrd.
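
Why the single mtmsrd in the hunk below suffices: the L=1 form of mtmsrd
updates only MSR[EE] and MSR[RI] from the source register, so loading -1 (all
ones) sets exactly those two bits in one instruction:

        li      r12,-1          /* all ones; only EE and RI are consumed */
        mtmsrd  r12,1           /* L=1: MSR[EE]=1, MSR[RI]=1, rest untouched */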

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/kernel/exceptions-64s.S | 2 --
 arch/powerpc/kernel/interrupt_64.S   | 4 ++--
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/kernel/exceptions-64s.S 
b/arch/powerpc/kernel/exceptions-64s.S
index a2ae14d0600e..b6e1c46c97d0 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -1942,8 +1942,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE)
mtctr   r10
bctr
.else
-   li  r10,MSR_RI
-   mtmsrd  r10,1   /* Set RI (EE=0) */
 #ifdef CONFIG_RELOCATABLE
__LOAD_HANDLER(r10, system_call_common)
mtctr   r10
diff --git a/arch/powerpc/kernel/interrupt_64.S 
b/arch/powerpc/kernel/interrupt_64.S
index e17a77a2c1a1..ab6b99609d0e 100644
--- a/arch/powerpc/kernel/interrupt_64.S
+++ b/arch/powerpc/kernel/interrupt_64.S
@@ -283,9 +283,9 @@ END_BTB_FLUSH_SECTION
 * trace_hardirqs_off().
 */
li  r11,IRQS_ALL_DISABLED
-   li  r12,PACA_IRQ_HARD_DIS
+   li  r12,-1 /* Set MSR_EE and MSR_RI */
stb r11,PACAIRQSOFTMASK(r13)
-   stb r12,PACAIRQHAPPENED(r13)
+   mtmsrd  r12,1
 
/* Calling convention has r9 = orig r0, r10 = regs */
mr  r9,r0
-- 
2.23.0



[PATCH v4 06/17] powerpc/64: move interrupt return asm to interrupt_64.S

2021-06-17 Thread Nicholas Piggin
The next patch would like to move the interrupt return assembly code to a low
location before general text, so move it into its own file and include it via
head_64.S.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/include/asm/head-64.h |   2 +-
 arch/powerpc/kernel/entry_64.S | 623 
 arch/powerpc/kernel/head_64.S  |   5 +-
 arch/powerpc/kernel/interrupt_64.S | 635 +
 4 files changed, 640 insertions(+), 625 deletions(-)
 create mode 100644 arch/powerpc/kernel/interrupt_64.S

diff --git a/arch/powerpc/include/asm/head-64.h 
b/arch/powerpc/include/asm/head-64.h
index 4cb9efa2eb21..242204e12993 100644
--- a/arch/powerpc/include/asm/head-64.h
+++ b/arch/powerpc/include/asm/head-64.h
@@ -16,7 +16,7 @@
.section ".head.data.\name\()","a",@progbits
 .endm
 .macro use_ftsec name
-   .section ".head.text.\name\()"
+   .section ".head.text.\name\()","ax",@progbits
 .endm
 
 /*
diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
index 9a1d5e5599d3..15720f8661a1 100644
--- a/arch/powerpc/kernel/entry_64.S
+++ b/arch/powerpc/kernel/entry_64.S
@@ -32,7 +32,6 @@
 #include 
 #include 
 #include 
-#include 
 #include 
 #include 
 #include 
@@ -48,410 +47,7 @@
 /*
  * System calls.
  */
-   .section".toc","aw"
-SYS_CALL_TABLE:
-   .tc sys_call_table[TC],sys_call_table
-
-#ifdef CONFIG_COMPAT
-COMPAT_SYS_CALL_TABLE:
-   .tc compat_sys_call_table[TC],compat_sys_call_table
-#endif
-
-/* This value is used to mark exception frames on the stack. */
-exception_marker:
-   .tc ID_EXC_MARKER[TC],STACK_FRAME_REGS_MARKER
-
.section".text"
-   .align 7
-
-.macro DEBUG_SRR_VALID srr
-#ifdef CONFIG_PPC_RFI_SRR_DEBUG
-   .ifc \srr,srr
-   mfspr   r11,SPRN_SRR0
-   ld  r12,_NIP(r1)
-100:   tdner11,r12
-   EMIT_BUG_ENTRY 100b,__FILE__,__LINE__,(BUGFLAG_WARNING | BUGFLAG_ONCE)
-   mfspr   r11,SPRN_SRR1
-   ld  r12,_MSR(r1)
-100:   tdner11,r12
-   EMIT_BUG_ENTRY 100b,__FILE__,__LINE__,(BUGFLAG_WARNING | BUGFLAG_ONCE)
-   .else
-   mfspr   r11,SPRN_HSRR0
-   ld  r12,_NIP(r1)
-100:   tdner11,r12
-   EMIT_BUG_ENTRY 100b,__FILE__,__LINE__,(BUGFLAG_WARNING | BUGFLAG_ONCE)
-   mfspr   r11,SPRN_HSRR1
-   ld  r12,_MSR(r1)
-100:   tdner11,r12
-   EMIT_BUG_ENTRY 100b,__FILE__,__LINE__,(BUGFLAG_WARNING | BUGFLAG_ONCE)
-   .endif
-#endif
-.endm
-
-#ifdef CONFIG_PPC_BOOK3S
-.macro system_call_vectored name trapnr
-   .globl system_call_vectored_\name
-system_call_vectored_\name:
-_ASM_NOKPROBE_SYMBOL(system_call_vectored_\name)
-#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
-BEGIN_FTR_SECTION
-   extrdi. r10, r12, 1, (63-MSR_TS_T_LG) /* transaction active? */
-   bne .Ltabort_syscall
-END_FTR_SECTION_IFSET(CPU_FTR_TM)
-#endif
-   SCV_INTERRUPT_TO_KERNEL
-   mr  r10,r1
-   ld  r1,PACAKSAVE(r13)
-   std r10,0(r1)
-   std r11,_NIP(r1)
-   std r12,_MSR(r1)
-   std r0,GPR0(r1)
-   std r10,GPR1(r1)
-   std r2,GPR2(r1)
-   ld  r2,PACATOC(r13)
-   mfcrr12
-   li  r11,0
-   /* Can we avoid saving r3-r8 in common case? */
-   std r3,GPR3(r1)
-   std r4,GPR4(r1)
-   std r5,GPR5(r1)
-   std r6,GPR6(r1)
-   std r7,GPR7(r1)
-   std r8,GPR8(r1)
-   /* Zero r9-r12, this should only be required when restoring all GPRs */
-   std r11,GPR9(r1)
-   std r11,GPR10(r1)
-   std r11,GPR11(r1)
-   std r11,GPR12(r1)
-   std r9,GPR13(r1)
-   SAVE_NVGPRS(r1)
-   std r11,_XER(r1)
-   std r11,_LINK(r1)
-   std r11,_CTR(r1)
-
-   li  r11,\trapnr
-   std r11,_TRAP(r1)
-   std r12,_CCR(r1)
-   addir10,r1,STACK_FRAME_OVERHEAD
-   ld  r11,exception_marker@toc(r2)
-   std r11,-16(r10)/* "regshere" marker */
-
-BEGIN_FTR_SECTION
-   HMT_MEDIUM
-END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
-
-   /*
-* scv enters with MSR[EE]=1 and is immediately considered soft-masked.
-* The entry vector already sets PACAIRQSOFTMASK to IRQS_ALL_DISABLED,
-* and interrupts may be masked and pending already.
-* system_call_exception() will call trace_hardirqs_off() which means
-* interrupts could already have been blocked before trace_hardirqs_off,
-* but this is the best we can do.
-*/
-
-   /* Calling convention has r9 = orig r0, r10 = regs */
-   mr  r9,r0
-   bl  system_call_exception
-
-.Lsyscall_vectored_\name\()_exit:
-   addir4,r1,STACK_FRAME_OVERHEAD
-   li  r5,1 /* scv */
-   bl  syscall_exit_prepare
-
-   ld  r2,_CCR(r1)
-   ld  r4,_NIP(r1)
-   ld  r5,_MSR(r1)
-
-BEGIN_FTR_SECTION
-   stdcx.  r0,0,r1 /* to clear the reservation */

[PATCH v4 05/17] powerpc/64: handle MSR EE and RI in interrupt entry wrapper

2021-06-17 Thread Nicholas Piggin
Similarly to the system call change in the previous patch, the mtmsrd to
enable RI can be combined with the mtmsrd to enable EE for interrupts
which enable the latter, which tends to be the important synchronous
interrupts (e.g., page faults).

Do this by enabling EE and RI together at the beginning of the entry
wrapper if PACA_IRQ_HARD_DIS is clear, and just enabling RI if it is set
(which means something wanted EE=0).

Asynchronous interrupts set PACA_IRQ_HARD_DIS, but synchronous ones
leave it unchanged, so by default they always get EE=1 unless they
interrupt a caller that has hard disabled. When the sync interrupt
later calls interrupt_cond_local_irq_enable(), that will not require
another mtmsrd because we already enabled here.

This tends to save one mtmsrd L=1 for synchronous interrupts on 64s.
64e is conceptually unchanged, but it also sets MSR[EE]=1 now in the
interrupt wrapper for synchronous interrupts with the same code.

From: Nicholas Piggin 
---
 arch/powerpc/include/asm/interrupt.h | 22 ++--
 arch/powerpc/kernel/exceptions-64s.S | 30 
 2 files changed, 20 insertions(+), 32 deletions(-)

diff --git a/arch/powerpc/include/asm/interrupt.h 
b/arch/powerpc/include/asm/interrupt.h
index 6e9d18838d56..b9c510187b58 100644
--- a/arch/powerpc/include/asm/interrupt.h
+++ b/arch/powerpc/include/asm/interrupt.h
@@ -126,9 +126,21 @@ static inline void interrupt_enter_prepare(struct pt_regs 
*regs, struct interrup
 #endif
 
 #ifdef CONFIG_PPC64
-   if (irq_soft_mask_set_return(IRQS_ALL_DISABLED) == IRQS_ENABLED)
+   bool trace_enable = false;
+
+   if (IS_ENABLED(CONFIG_TRACE_IRQFLAGS)) {
+   if (irq_soft_mask_set_return(IRQS_DISABLED) == IRQS_ENABLED)
+   trace_enable = true;
+   } else {
+   irq_soft_mask_set(IRQS_DISABLED);
+   }
+   /* If the interrupt was taken with HARD_DIS set, don't enable MSR[EE] */
+   if (local_paca->irq_happened & PACA_IRQ_HARD_DIS)
+   __hard_RI_enable();
+   else
+   __hard_irq_enable();
+   if (trace_enable)
trace_hardirqs_off();
-   local_paca->irq_happened |= PACA_IRQ_HARD_DIS;
 
if (user_mode(regs)) {
CT_WARN_ON(ct_state() != CONTEXT_USER);
@@ -175,6 +187,10 @@ static inline void interrupt_async_enter_prepare(struct pt_regs *regs, struct in
__ppc64_runlatch_on();
 #endif
 
+#ifdef CONFIG_PPC64
+   /* Ensure interrupt_enter_prepare does not enable MSR[EE] */
+   local_paca->irq_happened |= PACA_IRQ_HARD_DIS;
+#endif
interrupt_enter_prepare(regs, state);
irq_enter();
 }
@@ -239,6 +255,8 @@ static inline void interrupt_nmi_enter_prepare(struct pt_regs *regs, struct inte
regs->softe = IRQS_ALL_DISABLED;
}
 
+   __hard_RI_enable();
+
/* Don't do any per-CPU operations until interrupt state is fixed */
 
if (nmi_disables_ftrace(regs)) {
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 5c18a2a3058d..a2ae14d0600e 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -129,7 +129,6 @@ name:
 #define IISIDE .L_IISIDE_\name\()  /* Uses SRR0/1 not DAR/DSISR */
 #define IDAR   .L_IDAR_\name\()/* Uses DAR (or SRR0) */
 #define IDSISR .L_IDSISR_\name\()  /* Uses DSISR (or SRR1) */
-#define ISET_RI	.L_ISET_RI_\name\()	/* Run common code w/ MSR[RI]=1 */
 #define IBRANCH_TO_COMMON	.L_IBRANCH_TO_COMMON_\name\() /* ENTRY branch to common */
 #define IREALMODE_COMMON	.L_IREALMODE_COMMON_\name\() /* Common runs in realmode */
 #define IMASK  .L_IMASK_\name\()   /* IRQ soft-mask bit */
@@ -174,9 +173,6 @@ do_define_int n
.ifndef IDSISR
IDSISR=0
.endif
-   .ifndef ISET_RI
-   ISET_RI=1
-   .endif
.ifndef IBRANCH_TO_COMMON
IBRANCH_TO_COMMON=1
.endif
@@ -581,11 +577,6 @@ DEFINE_FIXED_SYMBOL(\name\()_common_real)
stb r10,PACASRR_VALID(r13)
.endif
 
-   .if ISET_RI
-   li  r10,MSR_RI
-   mtmsrd  r10,1   /* Set MSR_RI */
-   .endif
-
.if ISTACK
.if IKUAP
kuap_save_amr_and_lock r9, r10, cr1, cr0
@@ -906,11 +897,6 @@ INT_DEFINE_BEGIN(system_reset)
IVEC=0x100
IAREA=PACA_EXNMI
IVIRT=0 /* no virt entry point */
-   /*
-* MSR_RI is not enabled, because PACA_EXNMI and nmi stack is
-* being used, so a nested NMI exception would corrupt it.
-*/
-   ISET_RI=0
ISTACK=0
IKVM_REAL=1
 INT_DEFINE_END(system_reset)
@@ -991,8 +977,6 @@ EXC_COMMON_BEGIN(system_reset_common)
lhz r10,PACA_IN_NMI(r13)
addir10,r10,1
sth r10,PACA_IN_NMI(r13)
-   li  r10,MSR_RI
-   mtmsrd  r10,1
 
mr  

[PATCH v4 04/17] powerpc/64s: avoid reloading (H)SRR registers if they are still valid

2021-06-17 Thread Nicholas Piggin
When an interrupt is taken, the SRR registers are set to return to where
it left off. Unless they are modified in the meantime, or the return
address or MSR are modified, there is no need to reload these registers
when returning from interrupt.

Introduce per-CPU flags that track the validity of SRR and HSRR
registers. These are cleared when returning from interrupt, when
using the registers for something else (e.g., OPAL calls), when
adjusting the return address or MSR of a context, and when context
switching (which changes the return address and MSR).

This improves the performance of interrupt returns.
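
As a hedged sketch, the reload-avoidance protocol looks roughly like
this on the C side (simplified; the real logic is split between the asm
and C changes below):

	/* Only reload the SRRs if something invalidated them since entry */
	if (!local_paca->srr_valid) {
		mtspr(SPRN_SRR0, regs->nip);
		mtspr(SPRN_SRR1, regs->msr);
		local_paca->srr_valid = 1;
	}
	/*
	 * Users of the registers (e.g. OPAL/RTAS calls) and anything that
	 * changes regs->nip/msr call srr_regs_clobbered() to clear the flag.
	 */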

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/Kconfig.debug |  5 ++
 arch/powerpc/include/asm/hw_irq.h  | 10 ++-
 arch/powerpc/include/asm/interrupt.h   | 12 +++
 arch/powerpc/include/asm/paca.h|  4 +
 arch/powerpc/include/asm/ptrace.h  | 74 -
 arch/powerpc/kernel/asm-offsets.c  |  4 +
 arch/powerpc/kernel/entry_64.S | 92 --
 arch/powerpc/kernel/exceptions-64s.S   | 27 +++
 arch/powerpc/kernel/fpu.S  |  4 +
 arch/powerpc/kernel/kgdb.c |  2 +-
 arch/powerpc/kernel/kprobes-ftrace.c   |  2 +-
 arch/powerpc/kernel/kprobes.c  | 10 +--
 arch/powerpc/kernel/process.c  | 26 +++---
 arch/powerpc/kernel/prom_init.c|  3 +
 arch/powerpc/kernel/rtas.c | 14 +++-
 arch/powerpc/kernel/signal.c   |  2 +-
 arch/powerpc/kernel/signal_64.c|  9 +++
 arch/powerpc/kernel/syscalls.c |  3 +-
 arch/powerpc/kernel/traps.c| 18 ++---
 arch/powerpc/kernel/vector.S   |  6 ++
 arch/powerpc/kvm/book3s_hv.c   |  3 +
 arch/powerpc/kvm/book3s_pr.c   |  2 +
 arch/powerpc/lib/sstep.c   |  4 +-
 arch/powerpc/math-emu/math.c   |  2 +-
 arch/powerpc/platforms/powernv/opal-call.c |  4 +
 arch/powerpc/platforms/pseries/hvCall.S| 29 +++
 arch/powerpc/sysdev/fsl_pci.c  |  2 +-
 27 files changed, 310 insertions(+), 63 deletions(-)

diff --git a/arch/powerpc/Kconfig.debug b/arch/powerpc/Kconfig.debug
index 6342f9da4545..205cd77f321f 100644
--- a/arch/powerpc/Kconfig.debug
+++ b/arch/powerpc/Kconfig.debug
@@ -84,6 +84,11 @@ config MSI_BITMAP_SELFTEST
 
 config PPC_IRQ_SOFT_MASK_DEBUG
bool "Include extra checks for powerpc irq soft masking"
+   depends on PPC64
+
+config PPC_RFI_SRR_DEBUG
+   bool "Include extra checks for RFI SRR register validity"
+   depends on PPC_BOOK3S_64
 
 config XMON
bool "Include xmon kernel debugger"
diff --git a/arch/powerpc/include/asm/hw_irq.h b/arch/powerpc/include/asm/hw_irq.h
index 56a98936a6a9..19bcef666cf6 100644
--- a/arch/powerpc/include/asm/hw_irq.h
+++ b/arch/powerpc/include/asm/hw_irq.h
@@ -389,7 +389,15 @@ static inline bool arch_irq_disabled_regs(struct pt_regs *regs)
return !(regs->msr & MSR_EE);
 }
 
-static inline void may_hard_irq_enable(void) { }
+static inline bool may_hard_irq_enable(void)
+{
+   return false;
+}
+
+static inline void do_hard_irq_enable(void)
+{
+   BUILD_BUG();
+}
 
 static inline void irq_soft_mask_regs_set_state(struct pt_regs *regs, unsigned long val)
 {
diff --git a/arch/powerpc/include/asm/interrupt.h b/arch/powerpc/include/asm/interrupt.h
index 59f704408d65..6e9d18838d56 100644
--- a/arch/powerpc/include/asm/interrupt.h
+++ b/arch/powerpc/include/asm/interrupt.h
@@ -73,6 +73,18 @@
 #include 
 #include 
 
+#ifdef CONFIG_PPC_BOOK3S_64
+static inline void srr_regs_clobbered(void)
+{
+   local_paca->srr_valid = 0;
+   local_paca->hsrr_valid = 0;
+}
+#else
+static inline void srr_regs_clobbered(void)
+{
+}
+#endif
+
 static inline void nap_adjust_return(struct pt_regs *regs)
 {
 #ifdef CONFIG_PPC_970_NAP
diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
index ec18ac818e3a..dfc984b0e640 100644
--- a/arch/powerpc/include/asm/paca.h
+++ b/arch/powerpc/include/asm/paca.h
@@ -169,6 +169,10 @@ struct paca_struct {
u64 saved_msr;  /* MSR saved here by enter_rtas */
 #ifdef CONFIG_PPC_BOOK3E
u16 trap_save;  /* Used when bad stack is encountered */
+#endif
+#ifdef CONFIG_PPC_BOOK3S_64
+   u8 hsrr_valid;  /* HSRRs set for HRFID */
+   u8 srr_valid;   /* SRRs set for RFID */
 #endif
u8 irq_soft_mask;   /* mask for irq soft masking */
u8 irq_happened;/* irq happened while soft-disabled */
diff --git a/arch/powerpc/include/asm/ptrace.h b/arch/powerpc/include/asm/ptrace.h
index 9c9ab2746168..516117bba4e6 100644
--- a/arch/powerpc/include/asm/ptrace.h
+++ b/arch/powerpc/include/asm/ptrace.h
@@ -121,27 +121,7 @@ struct pt_regs
 #endif /* __powerpc64__ */
 
 #ifndef __ASSEMBLY__
-
-static inline unsigned long instruction_pointer(struct pt_regs *regs)
-{

[PATCH v4 03/17] powerpc/64s: introduce different functions to return from SRR vs HSRR interrupts

2021-06-17 Thread Nicholas Piggin
This makes no real difference yet except that HSRR type interrupts will
use hrfid to return. This is important for the next patch.
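
Concretely, the .ifc \srr,srr conditionals in the macro below expand to
either

	mtspr	SPRN_SRR0,r11
	mtspr	SPRN_SRR1,r12
	...
	RFI_TO_USER

for the srr variant, or the equivalent SPRN_HSRR0/SPRN_HSRR1 moves
followed by HRFI_TO_USER for the hsrr variant (and likewise
RFI_TO_KERNEL vs HRFI_TO_KERNEL on the kernel return paths).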

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/kernel/entry_64.S   | 65 +---
 arch/powerpc/kernel/exceptions-64e.S |  4 ++
 arch/powerpc/kernel/exceptions-64s.S | 76 +++-
 arch/powerpc/kernel/vector.S |  2 +-
 4 files changed, 92 insertions(+), 55 deletions(-)

diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
index 03727308d8cc..03a45a88b4b8 100644
--- a/arch/powerpc/kernel/entry_64.S
+++ b/arch/powerpc/kernel/entry_64.S
@@ -635,51 +635,57 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
 * touched, no exit work created, then this can be used.
 */
.balign IFETCH_ALIGN_BYTES
-   .globl fast_interrupt_return
-fast_interrupt_return:
-_ASM_NOKPROBE_SYMBOL(fast_interrupt_return)
+   .globl fast_interrupt_return_srr
+fast_interrupt_return_srr:
+_ASM_NOKPROBE_SYMBOL(fast_interrupt_return_srr)
kuap_check_amr r3, r4
ld  r5,_MSR(r1)
andi.   r0,r5,MSR_PR
 #ifdef CONFIG_PPC_BOOK3S
-   bne .Lfast_user_interrupt_return_amr
+   bne .Lfast_user_interrupt_return_amr_srr
kuap_kernel_restore r3, r4
andi.   r0,r5,MSR_RI
li  r3,0 /* 0 return value, no EMULATE_STACK_STORE */
-   bne+.Lfast_kernel_interrupt_return
+   bne+.Lfast_kernel_interrupt_return_srr
addir3,r1,STACK_FRAME_OVERHEAD
bl  unrecoverable_exception
b   . /* should not get here */
 #else
-   bne .Lfast_user_interrupt_return
-   b   .Lfast_kernel_interrupt_return
+   bne .Lfast_user_interrupt_return_srr
+   b   .Lfast_kernel_interrupt_return_srr
 #endif
 
+.macro interrupt_return_macro srr
.balign IFETCH_ALIGN_BYTES
-   .globl interrupt_return
-interrupt_return:
-_ASM_NOKPROBE_SYMBOL(interrupt_return)
+   .globl interrupt_return_\srr
+interrupt_return_\srr\():
+_ASM_NOKPROBE_SYMBOL(interrupt_return_\srr\())
ld  r4,_MSR(r1)
andi.   r0,r4,MSR_PR
-   beq .Lkernel_interrupt_return
+   beq .Lkernel_interrupt_return_\srr
addir3,r1,STACK_FRAME_OVERHEAD
bl  interrupt_exit_user_prepare
cmpdi   r3,0
-   bne-.Lrestore_nvgprs
+   bne-.Lrestore_nvgprs_\srr
 
 #ifdef CONFIG_PPC_BOOK3S
-.Lfast_user_interrupt_return_amr:
+.Lfast_user_interrupt_return_amr_\srr\():
kuap_user_restore r3, r4
 #endif
-.Lfast_user_interrupt_return:
+.Lfast_user_interrupt_return_\srr\():
ld  r11,_NIP(r1)
ld  r12,_MSR(r1)
 BEGIN_FTR_SECTION
ld  r10,_PPR(r1)
mtspr   SPRN_PPR,r10
 END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
+   .ifc \srr,srr
mtspr   SPRN_SRR0,r11
mtspr   SPRN_SRR1,r12
+   .else
+   mtspr   SPRN_HSRR0,r11
+   mtspr   SPRN_HSRR1,r12
+   .endif
 
 BEGIN_FTR_SECTION
stdcx.  r0,0,r1 /* to clear the reservation */
@@ -706,24 +712,33 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_STCX_CHECKS_ADDRESS)
REST_GPR(6, r1)
REST_GPR(0, r1)
REST_GPR(1, r1)
+   .ifc \srr,srr
RFI_TO_USER
+   .else
+   HRFI_TO_USER
+   .endif
b   .   /* prevent speculative execution */
 
-.Lrestore_nvgprs:
+.Lrestore_nvgprs_\srr\():
REST_NVGPRS(r1)
-   b   .Lfast_user_interrupt_return
+   b   .Lfast_user_interrupt_return_\srr
 
.balign IFETCH_ALIGN_BYTES
-.Lkernel_interrupt_return:
+.Lkernel_interrupt_return_\srr\():
addir3,r1,STACK_FRAME_OVERHEAD
bl  interrupt_exit_kernel_prepare
 
-.Lfast_kernel_interrupt_return:
+.Lfast_kernel_interrupt_return_\srr\():
cmpdi   cr1,r3,0
ld  r11,_NIP(r1)
ld  r12,_MSR(r1)
+   .ifc \srr,srr
mtspr   SPRN_SRR0,r11
mtspr   SPRN_SRR1,r12
+   .else
+   mtspr   SPRN_HSRR0,r11
+   mtspr   SPRN_HSRR1,r12
+   .endif
 
 BEGIN_FTR_SECTION
stdcx.  r0,0,r1 /* to clear the reservation */
@@ -757,7 +772,11 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_STCX_CHECKS_ADDRESS)
REST_GPR(6, r1)
REST_GPR(0, r1)
REST_GPR(1, r1)
+   .ifc \srr,srr
RFI_TO_KERNEL
+   .else
+   HRFI_TO_KERNEL
+   .endif
b   .   /* prevent speculative execution */
 
 1: /*
@@ -777,8 +796,18 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_STCX_CHECKS_ADDRESS)
std r9,0(r1) /* perform store component of stdu */
ld  r9,PACA_EXGEN+0(r13)
 
+   .ifc \srr,srr
RFI_TO_KERNEL
+   .else
+   HRFI_TO_KERNEL
+   .endif
b   .   /* prevent speculative execution */
+.endm
+
+interrupt_return_macro srr
+#ifdef CONFIG_PPC_BOOK3S
+interrupt_return_macro hsrr
+#endif
 
 #ifdef CONFIG_PPC_RTAS
 /*
diff --git a/arch/powerpc/kernel/exceptions-64e.S 

[PATCH v4 02/17] powerpc: remove interrupt exit helpers unused argument

2021-06-17 Thread Nicholas Piggin
The msr argument is not used, remove it.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/include/asm/asm-prototypes.h | 4 ++--
 arch/powerpc/kernel/interrupt.c   | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/include/asm/asm-prototypes.h b/arch/powerpc/include/asm/asm-prototypes.h
index 1c7b75834e04..95492655462e 100644
--- a/arch/powerpc/include/asm/asm-prototypes.h
+++ b/arch/powerpc/include/asm/asm-prototypes.h
@@ -71,8 +71,8 @@ void __init machine_init(u64 dt_ptr);
 #endif
 long system_call_exception(long r3, long r4, long r5, long r6, long r7, long r8, unsigned long r0, struct pt_regs *regs);
 notrace unsigned long syscall_exit_prepare(unsigned long r3, struct pt_regs *regs, long scv);
-notrace unsigned long interrupt_exit_user_prepare(struct pt_regs *regs, unsigned long msr);
-notrace unsigned long interrupt_exit_kernel_prepare(struct pt_regs *regs, unsigned long msr);
+notrace unsigned long interrupt_exit_user_prepare(struct pt_regs *regs);
+notrace unsigned long interrupt_exit_kernel_prepare(struct pt_regs *regs);
 
 long ppc_fadvise64_64(int fd, int advice, u32 offset_high, u32 offset_low,
  u32 len_high, u32 len_low);
diff --git a/arch/powerpc/kernel/interrupt.c b/arch/powerpc/kernel/interrupt.c
index f2baeee437f7..05fa3ae56e25 100644
--- a/arch/powerpc/kernel/interrupt.c
+++ b/arch/powerpc/kernel/interrupt.c
@@ -352,7 +352,7 @@ notrace unsigned long syscall_exit_prepare(unsigned long r3,
return ret;
 }
 
-notrace unsigned long interrupt_exit_user_prepare(struct pt_regs *regs, unsigned long msr)
+notrace unsigned long interrupt_exit_user_prepare(struct pt_regs *regs)
 {
unsigned long ti_flags;
unsigned long flags;
@@ -431,7 +431,7 @@ notrace unsigned long interrupt_exit_user_prepare(struct pt_regs *regs, unsigned
 
 void preempt_schedule_irq(void);
 
-notrace unsigned long interrupt_exit_kernel_prepare(struct pt_regs *regs, unsigned long msr)
+notrace unsigned long interrupt_exit_kernel_prepare(struct pt_regs *regs)
 {
unsigned long flags;
unsigned long ret = 0;
-- 
2.23.0



[PATCH v4 01/17] powerpc/interrupt: Fix CONFIG ifdef typo

2021-06-17 Thread Nicholas Piggin
From: Christophe Leroy 

CONFIG_PPC_BOOK3S should be CONFIG_PPC_BOOK3S_64. restore_math is a
no-op for other configurations.

Signed-off-by: Christophe Leroy 
[np: split from another patch]
Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/kernel/interrupt.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/interrupt.c b/arch/powerpc/kernel/interrupt.c
index e0938ba298f2..f2baeee437f7 100644
--- a/arch/powerpc/kernel/interrupt.c
+++ b/arch/powerpc/kernel/interrupt.c
@@ -303,7 +303,7 @@ notrace unsigned long syscall_exit_prepare(unsigned long r3,
ti_flags = READ_ONCE(current_thread_info()->flags);
}
 
-   if (IS_ENABLED(CONFIG_PPC_BOOK3S) && IS_ENABLED(CONFIG_PPC_FPU)) {
+   if (IS_ENABLED(CONFIG_PPC_BOOK3S_64) && IS_ENABLED(CONFIG_PPC_FPU)) {
if (IS_ENABLED(CONFIG_PPC_TRANSACTIONAL_MEM) &&
unlikely((ti_flags & _TIF_RESTORE_TM))) {
restore_tm_state(regs);
-- 
2.23.0



[PATCH v4 00/17] powerpc/64: fast interrupt exits

2021-06-17 Thread Nicholas Piggin
This series attempts to improve the speed of interrupts and system calls
in three major ways.

Firstly, the SRR/HSRR registers do not need to be reloaded if they were
not clobbered for the duration of the interrupt and the return NIP and
MSR did not change. 64e does not implement this part, but it could
quite easily.

Secondly, mtmsrd and mtspr are reduced by various means; this is mostly
specific to 64s.

Lastly, an alternate interrupt return location facility is added for
soft-masked asynchronous interrupts, and then that's used to set
everything up for return without having to disable MSR RI or EE.

After this series, the entire system call / interrupt handler fast path
can execute with no mtsprs and one mtmsrd to enable interrupts
initially, and the system call vectored path doesn't even need to do
that. This gives a decent performance benefit. On POWER9 with a
powernv_defconfig without VIRT_CPU_ACCOUNTING_NATIVE, no meltdown
workarounds, gettid sc system call goes from 481 -> 344 cycles, gettid
scv 345->299 cycles, and page fault 1225->1064 cycles.
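
(For reference, numbers of this kind can be reproduced with a trivial
syscall loop; this is an illustrative sketch, not the benchmark used for
the measurements above:

	#include <stdio.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	int main(void)
	{
		unsigned long start, end, i, iters = 1000000;

		asm volatile("mftb %0" : "=r"(start));	/* timebase, not cycles */
		for (i = 0; i < iters; i++)
			syscall(SYS_gettid);
		asm volatile("mftb %0" : "=r"(end));
		printf("%lu timebase ticks per syscall\n", (end - start) / iters);
		return 0;
	}

Scale by the timebase and CPU clock frequencies to convert ticks to
cycles.)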

Since v3:
- Added Christophe's series to the end (with one broken out patch
  at the start and minor tweaks).
- Fix lbz of SOFTE that broke on BE, should be loaded with ld.
- Fix case where fast_interrupt_return was not marking SRRs as
  clobbered correctly, causing random userspace segfaults etc.
- Fix cpu lock inversion problem in the workaround static branch
  switching code.
- Removed the .L local name from some labels for the restart
  handlers, which makes the disassembly easier to read.
- Fixed several nitpicks found by kbuild robot (duplicate include,
  missing prototypes, etc).
- Fix 64e compile bug [from mpe].
- Fix KUAP re-locking in some restart cases.

Christophe Leroy (6):
  powerpc/interrupt: Fix CONFIG ifdef typo
  powerpc/interrupt: Rename and lightly change
syscall_exit_prepare_main()
  powerpc/interrupt: Refactor interrupt_exit_user_prepare()
  powerpc/interrupt: Interchange
prep_irq_for_{kernel_enabled/user}_exit()
  powerpc/interrupt: Refactor prep_irq_for_{user/kernel_enabled}_exit()
  powerpc/interrupt: Remove prep_irq_for_user_exit()

Nicholas Piggin (11):
  powerpc: remove interrupt exit helpers unused argument
  powerpc/64s: introduce different functions to return from SRR vs HSRR
interrupts
  powerpc/64s: avoid reloading (H)SRR registers if they are still valid
  powerpc/64: handle MSR EE and RI in interrupt entry wrapper
  powerpc/64: move interrupt return asm to interrupt_64.S
  powerpc/64s: system call avoid setting MSR[RI] until we set MSR[EE]
  powerpc/64s: save one more register in the masked interrupt handler
  powerpc/64: allow alternate return locations for soft-masked
interrupts
  powerpc/64: interrupt soft-enable race fix
  powerpc/64: treat low kernel text as irqs soft-masked
  powerpc/64: use interrupt restart table to speed up return from
interrupt

 arch/powerpc/Kconfig.debug |   5 +
 arch/powerpc/include/asm/asm-prototypes.h  |   9 +-
 arch/powerpc/include/asm/head-64.h |   2 +-
 arch/powerpc/include/asm/hw_irq.h  |  23 +-
 arch/powerpc/include/asm/interrupt.h   |  60 +-
 arch/powerpc/include/asm/paca.h|   7 +
 arch/powerpc/include/asm/ppc_asm.h |   8 +
 arch/powerpc/include/asm/ptrace.h  |  75 ++-
 arch/powerpc/kernel/asm-offsets.c  |   7 +
 arch/powerpc/kernel/entry_64.S | 516 --
 arch/powerpc/kernel/exceptions-64e.S   |  53 +-
 arch/powerpc/kernel/exceptions-64s.S   | 219 +++---
 arch/powerpc/kernel/fpu.S  |   4 +
 arch/powerpc/kernel/head_64.S  |   5 +-
 arch/powerpc/kernel/interrupt.c| 413 +++-
 arch/powerpc/kernel/interrupt_64.S | 742 +
 arch/powerpc/kernel/irq.c  |  95 +++
 arch/powerpc/kernel/kgdb.c |   2 +-
 arch/powerpc/kernel/kprobes-ftrace.c   |   2 +-
 arch/powerpc/kernel/kprobes.c  |  10 +-
 arch/powerpc/kernel/process.c  |  26 +-
 arch/powerpc/kernel/prom_init.c|   3 +
 arch/powerpc/kernel/rtas.c |  14 +-
 arch/powerpc/kernel/signal.c   |   2 +-
 arch/powerpc/kernel/signal_64.c|   9 +
 arch/powerpc/kernel/syscalls.c |   3 +-
 arch/powerpc/kernel/traps.c|  18 +-
 arch/powerpc/kernel/vector.S   |   8 +-
 arch/powerpc/kernel/vmlinux.lds.S  |  10 +
 arch/powerpc/kvm/book3s_hv.c   |   3 +
 arch/powerpc/kvm/book3s_pr.c   |   2 +
 arch/powerpc/lib/Makefile  |   2 +-
 arch/powerpc/lib/feature-fixups.c  |  52 +-
 arch/powerpc/lib/restart_table.c   |  30 +
 arch/powerpc/lib/sstep.c   |   4 +-
 arch/powerpc/math-emu/math.c   |   2 +-
 arch/powerpc/platforms/powernv/opal-call.c |   4 +
 arch/powerpc/platforms/pseries/hvCall.S|  29 +
 arch/powerpc/sysdev/fsl_pci.c

Re: [PATCH 1/1] ALSA: aoa: remove unnecessary oom message

2021-06-17 Thread Takashi Iwai
On Thu, 17 Jun 2021 12:27:45 +0200,
Zhen Lei wrote:
> 
> Fixes scripts/checkpatch.pl warning:
> WARNING: Possible unnecessary 'out of memory' message
> 
> Removing it can help us save a bit of memory.
> 
> Signed-off-by: Zhen Lei 

Thanks, applied.


Takashi


Re: [PATCH 8/8] membarrier: Rewrite sync_core_before_usermode() and improve documentation

2021-06-17 Thread Mathieu Desnoyers
- On Jun 15, 2021, at 11:21 PM, Andy Lutomirski l...@kernel.org wrote:

[...]

> +# An architecture that wants to support
> +# MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE needs to define precisely what it
> +# is supposed to do and implement membarrier_sync_core_before_usermode() to
> +# make it do that.  Then it can select ARCH_HAS_MEMBARRIER_SYNC_CORE via
> +# Kconfig.  Unfortunately, MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE is not a
> +# fantastic API and may not make sense on all architectures.  Once an
> +# architecture meets these requirements,

Can we please remove the editorial comment about the quality of the
membarrier sync-core API?

At least it's better than having all userspace rely on undocumented
mprotect() side-effects to perform something which typically works, until
it won't, or until it prevents mprotect's implementation from being
improved because that would start breaking JITs all over the place.

We can simply state that the definition of what membarrier sync-core does
is defined per-architecture, and document the sequence of operations to
perform when doing cross-modifying code specifically for each
architecture.
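
For example, such per-architecture documentation could spell out a
sequence along these lines (an illustrative sketch for the arm64/powerpc
style of flow, not proposed wording):

	/* 0. once: register with MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE */
	/* 1. write the new instructions */
	memcpy(code, new_insns, len);
	/* 2. flush the icache for the modified range */
	__builtin___clear_cache(code, code + len);
	/* 3. context-synchronize every CPU that may run this process */
	syscall(__NR_membarrier,
		MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE, 0, 0);
	/* 4. it is now safe to execute from 'code' (or an alias) */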

Now if there are weird architectures where membarrier is an odd fit (I've
heard that riscv might need address ranges to which the core sync needs to
apply), then those might need to implement their own arch-specific system
call, which is all fine.

> +#
> +# On x86, a program can safely modify code, issue
> +# MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE, and then execute that code, via
> +# the modified address or an alias, from any thread in the calling process.
> +#
> +# On arm64, a program can modify code, flush the icache as needed, and issue
> +# MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE to force a "context synchronizing
> +# event", aka pipeline flush on all CPUs that might run the calling process.
> +# Then the program can execute the modified code as long as it is executed
> +# from an address consistent with the icache flush and the CPU's cache type.
> +#
> +# On powerpc, a program can use MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE
> +# similarly to arm64.  It would be nice if the powerpc maintainers could
> +# add a clearer explanation.

We should document the requirements on ARMv7 as well.

Thanks,

Mathieu

-- 
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com


Re: [PATCH 8/8] membarrier: Rewrite sync_core_before_usermode() and improve documentation

2021-06-17 Thread Mathieu Desnoyers
- On Jun 15, 2021, at 11:21 PM, Andy Lutomirski l...@kernel.org wrote:

> The old sync_core_before_usermode() comments suggested that a non-icache-syncing
> return-to-usermode instruction is x86-specific and that all other
> architectures automatically notice cross-modified code on return to
> userspace.
> 
> This is misleading.  The incantation needed to modify code from one
> CPU and execute it on another CPU is highly architecture dependent.
> On x86, according to the SDM, one must modify the code, issue SFENCE
> if the modification was WC or nontemporal, and then issue a "serializing
> instruction" on the CPU that will execute the code.  membarrier() can do
> the latter.
> 
> On arm64 and powerpc, one must flush the icache and then flush the pipeline
> on the target CPU, although the CPU manuals don't necessarily use this
> language.
> 
> So let's drop any pretense that we can have a generic way to define or
> implement membarrier's SYNC_CORE operation and instead require all
> architectures to define the helper and supply their own documentation as to
> how to use it.

Agreed. Documentation of the sequence of operations that need to be performed
when cross-modifying code on SMP should be per-architecture. The documentation
of the architectural effects of membarrier sync-core should be per-arch as well.

> This means x86, arm64, and powerpc for now.

And also arm32, as discussed in the other leg of the patchset's email thread.

> Let's also
> rename the function from sync_core_before_usermode() to
> membarrier_sync_core_before_usermode() because the precise flushing details
> may very well be specific to membarrier, and even the concept of
> "sync_core" in the kernel is mostly an x86-ism.

OK

> 
[...]
> 
> static void ipi_rseq(void *info)
> {
> @@ -368,12 +373,14 @@ static int membarrier_private_expedited(int flags, int cpu_id)
>   smp_call_func_t ipi_func = ipi_mb;
> 
>   if (flags == MEMBARRIER_FLAG_SYNC_CORE) {
> - if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
> +#ifndef CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE
>   return -EINVAL;
> +#else
>   if (!(atomic_read(&mm->membarrier_state) &
> MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY))
>   return -EPERM;
>   ipi_func = ipi_sync_core;
> +#endif

Please change this #ifndef / #else / #endif within the function back to

if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE)) {
  ...
} else {
  ...
}

I don't think mixing up preprocessor and code logic makes it more readable.
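
i.e., keeping the whole block as plain C, something like (a sketch of
the suggested shape, based on the hunk quoted above):

	if (flags == MEMBARRIER_FLAG_SYNC_CORE) {
		if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
			return -EINVAL;
		if (!(atomic_read(&mm->membarrier_state) &
		      MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY))
			return -EPERM;
		ipi_func = ipi_sync_core;
	}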

Thanks,

Mathieu

>   } else if (flags == MEMBARRIER_FLAG_RSEQ) {
>   if (!IS_ENABLED(CONFIG_RSEQ))
>   return -EINVAL;
> --
> 2.31.1

-- 
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com


Re: [PATCH v15 0/4] KASAN core changes for ppc64 radix KASAN

2021-06-17 Thread Balbir Singh
On Thu, Jun 17, 2021 at 07:30:28PM +1000, Daniel Axtens wrote:
> Building on the work of Christophe, Aneesh and Balbir, I've ported
> KASAN to 64-bit Book3S kernels running on the Radix MMU. I've been
> trying this for a while, but we keep having collisions between the
> kasan code in the mm tree and the code I want to put in to the ppc
> tree.
> 
> This series just contains the kasan core changes that we need. These
> can go in via the mm tree. I will then propose the powerpc changes for
> a later cycle. (The most recent RFC for the powerpc changes is in the
> v12 series at
> https://lore.kernel.org/linux-mm/20210615014705.2234866-1-...@axtens.net/
> )
> 
> v15 applies to next-20210611. There should be no noticeable changes to
> other platforms.
> 
> Changes since v14: Included a bunch of Reviewed-by:s, thanks
> Christophe and Marco. Cleaned up the build time error #ifdefs, thanks
> Christophe.
> 
> Changes since v13: move the MAX_PTR_PER_* definitions out of kasan and
> into pgtable.h. Add a build time error to hopefully prevent any
> confusion about when the new hook is applicable. Thanks Marco and
> Christophe.
> 
> Changes since v12: respond to Marco's review comments - clean up the
> help for ARCH_DISABLE_KASAN_INLINE, and add an arch readiness check to
> the new granule poisoning function. Thanks Marco.
> 
> Daniel Axtens (4):
>   kasan: allow an architecture to disable inline instrumentation
>   kasan: allow architectures to provide an outline readiness check
>   mm: define default MAX_PTRS_PER_* in include/pgtable.h
>   kasan: use MAX_PTRS_PER_* for early shadow tables
> 

The series seems reasonable

Reviewed-by: Balbir Singh 


Re: [RFC PATCH 8/8] powerpc/papr_scm: Use FORM2 associativity details

2021-06-17 Thread Aneesh Kumar K.V
Aneesh Kumar K.V  writes:

> David Gibson  writes:
>
>> On Tue, Jun 15, 2021 at 12:35:17PM +0530, Aneesh Kumar K.V wrote:
>>> David Gibson  writes:
>>> 
>>> > On Tue, Jun 15, 2021 at 11:27:50AM +0530, Aneesh Kumar K.V wrote:
>>> >> David Gibson  writes:
>>> >> 
>>> >> > On Mon, Jun 14, 2021 at 10:10:03PM +0530, Aneesh Kumar K.V wrote:
> .
>
>> I'm still not understanding why the latency we care about is different
>> in the two cases.  Can you give an example of when this would result
>> in different actual node assignments for the two different cases?
>
> How about the below update?
>
> With Form2 "ibm,associativity" for resources is listed as below:
>
> "ibm,associativity" property for resources in node 0, 8 and 40
> { 3, 6, 7, 0 }
> { 3, 6, 9, 8 }
> { 4, 6, 7, 0, 40}
>
> With "ibm,associativity-reference-points"  { 0x3, 0x2 }
>
> Form2 adds an additional property which can be used with devices like
> persistent memory devices which would also like to be presented as
> memory-only NUMA nodes.
>
> The "ibm,associativity-memory-node-reference-point" property contains a
> number representing the domainID index to be used to find the domainID
> that should be used when using the resource as a memory-only NUMA node.
> The NUMA distance information w.r.t this domainID will take into
> consideration the latency of the media. A high latency memory device
> will have a large NUMA distance value assigned w.r.t the domainID found
> at the "ibm,associativity-memory-node-reference-point" domainID index.
>
> prop-encoded-array: An integer encoded as with encode-int specifying the
> domainID index
>
> In the above example:
> "ibm,associativity-memory-node-reference-point"  { 0x4 }
>
> ex:
>
>  ---------------------
> |     NUMA node0      |
> |  ProcA -----> MEMA  |
> |    |                |
> |    |                |
> |    -------> PMEMB   |
> |                     |
>  ---------------------
>
>  ---------------------
> |     NUMA node1      |
> |                     |
> |  ProcB -----> MEMC  |
> |    |                |
> |    -------> PMEMD   |
> |                     |
>  ---------------------
>
>  --------------------------------------------------------------
> |                         domainID 20                          |
> |   ---------------------                                      |
> |  |     NUMA node0      |                                     |
> |  |                     |                                     |
> |  |  ProcA -----> MEMA  |      ----------------               |
> |  |    |                |     |  NUMA node40   |              |
> |  |    ---------------------->|     PMEMB      |              |
> |  |                     |      ----------------               |
> |   ---------------------                                      |
> |                                                              |
> |   ---------------------                                      |
> |  |     NUMA node1      |                                     |
> |  |                     |                                     |
> |  |  ProcB -----> MEMC  |      ----------------               |
> |  |    |                |     |  NUMA node41   |              |
> |  |    ---------------------->|     PMEMD      |              |
> |  |                     |      ----------------               |
> |   ---------------------                                      |
>  --------------------------------------------------------------
>
> For a topology like the above, an application running on ProcA wants to
> find the persistent memory mount local to its NUMA node. Hence when
> using it as a pmem fsdax mount or devdax device we want PMEMB to have
> the associativity of NUMA node0 and PMEMD that of NUMA node1. But when
> we want to use them as memory via the dax kmem driver, we want both
> PMEMB and PMEMD to appear as memory-only NUMA nodes at a distance that
> is derived from the latency of the media.
>
> 

Re: [PATCH v3 1/5] powerpc/interrupt: Rename and lightly change syscall_exit_prepare_main()

2021-06-17 Thread Christophe Leroy




On 17/06/2021 at 13:25, Nicholas Piggin wrote:

Excerpts from Christophe Leroy's message of June 15, 2021 6:33 pm:

Rename syscall_exit_prepare_main() into interrupt_exit_prepare_main()

Make it static as it is not used anywhere else.

Pass it the 'ret' so that it can 'or' it directly instead of
oring twice, once inside the function and once outside.

And remove 'r3' parameter which is not used.

Also fix a typo where CONFIG_PPC_BOOK3S should be CONFIG_PPC_BOOK3S_64.

Signed-off-by: Christophe Leroy 
Reviewed-by: Nicholas Piggin 
---
This series applies on top of Nic's series speeding up interrupt return on 64s

  arch/powerpc/kernel/interrupt.c | 11 +--
  1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/kernel/interrupt.c b/arch/powerpc/kernel/interrupt.c
index 74c995a42399..ba2d602d2da6 100644
--- a/arch/powerpc/kernel/interrupt.c
+++ b/arch/powerpc/kernel/interrupt.c
@@ -243,11 +243,10 @@ static notrace void booke_load_dbcr0(void)
  #endif
  }
  
-notrace unsigned long syscall_exit_prepare_main(unsigned long r3,
-						 struct pt_regs *regs)
+static notrace unsigned long
+interrupt_exit_user_prepare_main(struct pt_regs *regs, unsigned long ret)


Hmm, I tried switching the order of the arguments thinking it would
match caller and return registers better but didn't seem to help
generated code. Yet I think I will make that change to your patch if
you don't mind.


That's a static function that most likely gets inlined, so the order of
parameters makes no difference. I tend to like that almost all functions
dealing with interrupts take regs as the first param, but I have no
strong opinion about it, so you can change it if that's better for you.


Christophe


Re: [RFC PATCH 8/8] powerpc/papr_scm: Use FORM2 associativity details

2021-06-17 Thread Aneesh Kumar K.V
David Gibson  writes:

> On Tue, Jun 15, 2021 at 12:35:17PM +0530, Aneesh Kumar K.V wrote:
>> David Gibson  writes:
>> 
>> > On Tue, Jun 15, 2021 at 11:27:50AM +0530, Aneesh Kumar K.V wrote:
>> >> David Gibson  writes:
>> >> 
>> >> > On Mon, Jun 14, 2021 at 10:10:03PM +0530, Aneesh Kumar K.V wrote:
.

> I'm still not understanding why the latency we care about is different
> in the two cases.  Can you give an example of when this would result
> in different actual node assignments for the two different cases?

How about the below update?

With Form2 "ibm,associativity" for resources is listed as below:

"ibm,associativity" property for resources in node 0, 8 and 40
{ 3, 6, 7, 0 }
{ 3, 6, 9, 8 }
{ 4, 6, 7, 0, 40}

With "ibm,associativity-reference-points"  { 0x3, 0x2 }

Form2 adds an additional property which can be used with devices like
persistent memory devices which would also like to be presented as
memory-only NUMA nodes.

The "ibm,associativity-memory-node-reference-point" property contains a
number representing the domainID index to be used to find the domainID
that should be used when using the resource as a memory-only NUMA node.
The NUMA distance information w.r.t this domainID will take into
consideration the latency of the media. A high latency memory device
will have a large NUMA distance value assigned w.r.t the domainID found
at the "ibm,associativity-memory-node-reference-point" domainID index.

prop-encoded-array: An integer encoded as with encode-int specifying the
domainID index

In the above example:
"ibm,associativity-memory-node-reference-point"  { 0x4 }

ex:

 ---------------------
|     NUMA node0      |
|  ProcA -----> MEMA  |
|    |                |
|    |                |
|    -------> PMEMB   |
|                     |
 ---------------------

 ---------------------
|     NUMA node1      |
|                     |
|  ProcB -----> MEMC  |
|    |                |
|    -------> PMEMD   |
|                     |
 ---------------------

 --------------------------------------------------------------
|                         domainID 20                          |
|   ---------------------                                      |
|  |     NUMA node0      |                                     |
|  |                     |                                     |
|  |  ProcA -----> MEMA  |      ----------------               |
|  |    |                |     |  NUMA node40   |              |
|  |    ---------------------->|     PMEMB      |              |
|  |                     |      ----------------               |
|   ---------------------                                      |
|                                                              |
|   ---------------------                                      |
|  |     NUMA node1      |                                     |
|  |                     |                                     |
|  |  ProcB -----> MEMC  |      ----------------               |
|  |    |                |     |  NUMA node41   |              |
|  |    ---------------------->|     PMEMD      |              |
|  |                     |      ----------------               |
|   ---------------------                                      |
 --------------------------------------------------------------

For a topology like the above, an application running on ProcA wants to
find the persistent memory mount local to its NUMA node. Hence when
using it as a pmem fsdax mount or devdax device we want PMEMB to have
the associativity of NUMA node0 and PMEMD that of NUMA node1. But when
we want to use them as memory via the dax kmem driver, we want both
PMEMB and PMEMD to appear as memory-only NUMA nodes at a distance that
is derived from the latency of the media.

"ibm,associativity":
PROCA/MEMA -> { 2, 20, 0 } 
PROCB/MEMC -> { 2, 20, 1 } 
PMEMB  -> { 3, 20, 0, 40}
PMEMD  -> { 3, 20, 1, 41}

"ibm,associativity-reference-points" -> { 2, 1 }
"ibm,associativity-memory-node-reference-points" -> { 3 }


Re: [PATCH v6 0/3] Introduce 64b relocatable kernel

2021-06-17 Thread Alex Ghiti

On 18/05/2021 at 12:12, Alexandre Ghiti wrote:

After multiple attempts, this patchset is now based on the fact that the
64b kernel mapping was moved outside the linear mapping.

The first patch allows to build relocatable kernels but is not selected
by default. That patch should ease KASLR implementation a lot.
The second and third patches take advantage of an already existing powerpc
script that checks relocations at compile-time, and uses it for riscv.


@Palmer, any thoughts about that? There are no users for now; do you want
to wait for a KASLR implementation to use it before merging this? If so,
I can work on a KASLR implementation based on the older implementation
from Zong.


Thanks,



This patchset was tested on:

* kernel:
- rv32: OK
- rv64 with RELOCATABLE: OK and checked that "suspicious" relocations are caught.
- rv64 without RELOCATABLE: OK
- powerpc: build only and checked that "suspicious" relocations are caught.
  
* xipkernel:

- rv32: build only
- rv64: OK

* nommukernel:
- rv64: build only

Changes in v6:
   * Remove the kernel move to vmalloc zone
   * Rebased on top of for-next
   * Remove relocatable property from 32b kernel as the kernel is mapped in
 the linear mapping and would then need to be copied physically too
   * CONFIG_RELOCATABLE depends on !XIP_KERNEL
   * Remove Reviewed-by from first patch as it changed a bit

Changes in v5:
   * Add "static __init" to create_kernel_page_table function as reported by
 Kbuild test robot
   * Add reviewed-by from Zong
   * Rebase onto v5.7

Changes in v4:
   * Fix BPF region that overlapped with kernel's as suggested by Zong
   * Fix end of module region that could be larger than 2GB as suggested by Zong
   * Fix the size of the vm area reserved for the kernel as we could lose
 PMD_SIZE if the size was already aligned on PMD_SIZE
   * Split compile time relocations check patch into 2 patches as suggested by Anup
   * Applied Reviewed-by from Zong and Anup

Changes in v3:
   * Move kernel mapping to vmalloc

Changes in v2:
   * Make RELOCATABLE depend on MMU as suggested by Anup
   * Rename kernel_load_addr into kernel_virt_addr as suggested by Anup
   * Use __pa_symbol instead of __pa, as suggested by Zong
   * Rebased on top of v5.6-rc3
   * Tested with sv48 patchset
   * Add Reviewed/Tested-by from Zong and Anup

Alexandre Ghiti (3):
   riscv: Introduce CONFIG_RELOCATABLE
   powerpc: Move script to check relocations at compile time in scripts/
   riscv: Check relocations at compile time

  arch/powerpc/tools/relocs_check.sh | 18 ++
  arch/riscv/Kconfig | 12 +++
  arch/riscv/Makefile|  5 ++-
  arch/riscv/Makefile.postlink   | 36 
  arch/riscv/kernel/vmlinux.lds.S|  6 
  arch/riscv/mm/Makefile |  4 +++
  arch/riscv/mm/init.c   | 53 +-
  arch/riscv/tools/relocs_check.sh   | 26 +++
  scripts/relocs_check.sh| 20 +++
  9 files changed, 162 insertions(+), 18 deletions(-)
  create mode 100644 arch/riscv/Makefile.postlink
  create mode 100755 arch/riscv/tools/relocs_check.sh
  create mode 100755 scripts/relocs_check.sh



[PATCH v3 3/4] powerpc/papr_scm: Add perf interface support

2021-06-17 Thread Kajol Jain
Performance monitoring support for papr-scm nvdimm devices
via the perf interface is added, which includes the addition of pmu
functions like add/del/read/event_init for the nvdimm_pmu structure.

A new parameter 'priv' is added to the pdev_archdata structure to save
the nvdimm_pmu device pointer, to handle the unregistering of the pmu
device.

The papr_scm_pmu_register function populates the nvdimm_pmu structure
with events, cpumask and attribute groups along with event handling
functions. Finally, the populated nvdimm_pmu structure is passed to
register the pmu device. Event handling functions internally use hcalls
to get events and counter data.
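
As a hedged sketch, the counter read path then amounts to the following
(helper names are from the diff below; the body is simplified, not the
exact driver code):

	static void papr_scm_pmu_read(struct perf_event *event)
	{
		struct nvdimm_pmu *nd_pmu = to_nvdimm_pmu(event->pmu);
		u64 prev, now;

		/* papr_scm_pmu_get_value() issues the hcall under the hood */
		papr_scm_pmu_get_value(event, nd_pmu->dev, &now);
		prev = local64_xchg(&event->hw.prev_count, now);
		local64_add(now - prev, &event->count);
	}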

Result on a power9 machine with 2 nvdimm devices:

Ex: List all events via perf list

command:# perf list nmem

  nmem0/cchrhcnt/[Kernel PMU event]
  nmem0/cchwhcnt/[Kernel PMU event]
  nmem0/critrscu/[Kernel PMU event]
  nmem0/ctlresct/[Kernel PMU event]
  nmem0/ctlrestm/[Kernel PMU event]
  nmem0/fastwcnt/[Kernel PMU event]
  nmem0/hostlcnt/[Kernel PMU event]
  nmem0/hostldur/[Kernel PMU event]
  nmem0/hostscnt/[Kernel PMU event]
  nmem0/hostsdur/[Kernel PMU event]
  nmem0/medrcnt/ [Kernel PMU event]
  nmem0/medrdur/ [Kernel PMU event]
  nmem0/medwcnt/ [Kernel PMU event]
  nmem0/medwdur/ [Kernel PMU event]
  nmem0/memlife/ [Kernel PMU event]
  nmem0/noopstat/[Kernel PMU event]
  nmem0/ponsecs/ [Kernel PMU event]
  nmem1/cchrhcnt/[Kernel PMU event]
  nmem1/cchwhcnt/[Kernel PMU event]
  nmem1/critrscu/[Kernel PMU event]
  ...
  nmem1/noopstat/[Kernel PMU event]
  nmem1/ponsecs/ [Kernel PMU event]

Tested-by: Nageswara R Sastry 
Signed-off-by: Kajol Jain 
---
 arch/powerpc/include/asm/device.h |   5 +
 arch/powerpc/platforms/pseries/papr_scm.c | 365 ++
 2 files changed, 370 insertions(+)

diff --git a/arch/powerpc/include/asm/device.h b/arch/powerpc/include/asm/device.h
index 219559d65864..47ed639f3b8f 100644
--- a/arch/powerpc/include/asm/device.h
+++ b/arch/powerpc/include/asm/device.h
@@ -48,6 +48,11 @@ struct dev_archdata {
 
 struct pdev_archdata {
u64 dma_mask;
+   /*
+* Pointer to nvdimm_pmu structure, to handle the unregistering
+* of pmu device
+*/
+   void *priv;
 };
 
 #endif /* _ASM_POWERPC_DEVICE_H */
diff --git a/arch/powerpc/platforms/pseries/papr_scm.c b/arch/powerpc/platforms/pseries/papr_scm.c
index ef26fe40efb0..92632b4a4a60 100644
--- a/arch/powerpc/platforms/pseries/papr_scm.c
+++ b/arch/powerpc/platforms/pseries/papr_scm.c
@@ -18,6 +18,8 @@
 #include 
 #include 
 #include 
+#include 
+#include 
 
 #define BIND_ANY_ADDR (~0ul)
 
@@ -67,6 +69,8 @@
 #define PAPR_SCM_PERF_STATS_EYECATCHER __stringify(SCMSTATS)
 #define PAPR_SCM_PERF_STATS_VERSION 0x1
 
+#define to_nvdimm_pmu(_pmu)container_of(_pmu, struct nvdimm_pmu, pmu)
+
 /* Struct holding a single performance metric */
 struct papr_scm_perf_stat {
u8 stat_id[8];
@@ -116,6 +120,12 @@ struct papr_scm_priv {
 
/* length of the stat buffer as expected by phyp */
size_t stat_buffer_len;
+
+/* array to have event_code and stat_id mappings */
+   char **nvdimm_events_map;
+
+   /* count of supported events */
+   u32 total_events;
 };
 
 static int papr_scm_pmem_flush(struct nd_region *nd_region,
@@ -329,6 +339,354 @@ static ssize_t drc_pmem_query_stats(struct papr_scm_priv *p,
return 0;
 }
 
+PMU_FORMAT_ATTR(event, "config:0-4");
+
+static struct attribute *nvdimm_pmu_format_attr[] = {
+   _attr_event.attr,
+   NULL,
+};
+
+static struct attribute_group nvdimm_pmu_format_group = {
+   .name = "format",
+   .attrs = nvdimm_pmu_format_attr,
+};
+
+static int papr_scm_pmu_get_value(struct perf_event *event, struct device *dev, u64 *count)
+{
+   struct papr_scm_perf_stat *stat;
+   struct papr_scm_perf_stats *stats;
+   struct papr_scm_priv *p = (struct papr_scm_priv *)dev->driver_data;
+   int rc, size;
+
+   /* Allocate request buffer enough to hold single performance stat */
+   size = sizeof(struct papr_scm_perf_stats) +
+   sizeof(struct papr_scm_perf_stat);
+
+   if (!p || !p->nvdimm_events_map)
+   return -EINVAL;
+
+   stats = kzalloc(size, 

[PATCH v3 0/4] Add perf interface to expose nvdimm

2021-06-17 Thread Kajol Jain
Patchset adds performance stats reporting support for nvdimm.
The added interface includes support for pmu register/unregister
functions. A structure called nvdimm_pmu is added to be used for
adding arch/platform specific data such as supported events, cpumask,
and pmu event functions like event_init/add/read/del.
Users can use the standard perf tool to access perf
events exposed via the pmu.

Added implementation to expose IBM pseries platform nmem*
device performance stats using this interface.

Result from a power9 pseries lpar with 2 nvdimm devices:
command:# perf list nmem
  nmem0/cchrhcnt/[Kernel PMU event]
  nmem0/cchwhcnt/[Kernel PMU event]
  nmem0/critrscu/[Kernel PMU event]
  nmem0/ctlresct/[Kernel PMU event]
  nmem0/ctlrestm/[Kernel PMU event]
  nmem0/fastwcnt/[Kernel PMU event]
  nmem0/hostlcnt/[Kernel PMU event]
  nmem0/hostldur/[Kernel PMU event]
  nmem0/hostscnt/[Kernel PMU event]
  nmem0/hostsdur/[Kernel PMU event]
  nmem0/medrcnt/ [Kernel PMU event]
  nmem0/medrdur/ [Kernel PMU event]
  nmem0/medwcnt/ [Kernel PMU event]
  nmem0/medwdur/ [Kernel PMU event]
  nmem0/memlife/ [Kernel PMU event]
  nmem0/noopstat/[Kernel PMU event]
  nmem0/ponsecs/ [Kernel PMU event]
  nmem1/cchrhcnt/[Kernel PMU event]
  nmem1/cchwhcnt/[Kernel PMU event]
  nmem1/critrscu/[Kernel PMU event]
  ...
  nmem1/noopstat/[Kernel PMU event]
  nmem1/ponsecs/ [Kernel PMU event]

Patch1:
Introduces the nvdimm_pmu structure
Patch2:
Adds common interface to add arch/platform specific data
includes supported events, pmu event functions. It also
adds code for cpu hotplug support.
Patch3:
Add code in arch/powerpc/platform/pseries/papr_scm.c to expose
nmem* pmu. It fills in the nvdimm_pmu structure with event attrs
cpumask andevent functions and then registers the pmu by adding
callbacks to register_nvdimm_pmu.
Patch4:
Sysfs documentation patch

Changelog
---
v2 -> v3
- Added Tested-by tag.

- Fix nvdimm mailing list in the ABI Documentation.

- Link to the patchset v2: https://lkml.org/lkml/2021/6/14/25

v1 -> v2
- Fix hotplug code by adding a pmu migration call
  in case the current designated cpu goes offline, as
  pointed out by Peter Zijlstra.

- Removed the return -1 part from the cpu hotplug offline
  function.

- Link to the patchset v1: https://lkml.org/lkml/2021/6/8/500
---
Kajol Jain (4):
  drivers/nvdimm: Add nvdimm pmu structure
  drivers/nvdimm: Add perf interface to expose nvdimm performance stats
  powerpc/papr_scm: Add perf interface support
  powerpc/papr_scm: Document papr_scm sysfs event format entries

 Documentation/ABI/testing/sysfs-bus-papr-pmem |  31 ++
 arch/powerpc/include/asm/device.h |   5 +
 arch/powerpc/platforms/pseries/papr_scm.c | 365 ++
 drivers/nvdimm/Makefile   |   1 +
 drivers/nvdimm/nd_perf.c  | 230 +++
 include/linux/nd.h|  46 +++
 6 files changed, 678 insertions(+)
 create mode 100644 drivers/nvdimm/nd_perf.c

-- 
2.27.0



[PATCH v3 4/4] powerpc/papr_scm: Document papr_scm sysfs event format entries

2021-06-17 Thread Kajol Jain
Details are added for the event, cpumask and format attributes
in the ABI documentation.

Tested-by: Nageswara R Sastry 
Signed-off-by: Kajol Jain 
---
 Documentation/ABI/testing/sysfs-bus-papr-pmem | 31 +++
 1 file changed, 31 insertions(+)

diff --git a/Documentation/ABI/testing/sysfs-bus-papr-pmem b/Documentation/ABI/testing/sysfs-bus-papr-pmem
index 92e2db0e2d3d..be91de341454 100644
--- a/Documentation/ABI/testing/sysfs-bus-papr-pmem
+++ b/Documentation/ABI/testing/sysfs-bus-papr-pmem
@@ -59,3 +59,34 @@ Description:
* "CchRHCnt" : Cache Read Hit Count
* "CchWHCnt" : Cache Write Hit Count
* "FastWCnt" : Fast Write Count
+
+What:  /sys/devices/nmemX/format
+Date:  June 2021
+Contact:   linuxppc-dev , nvd...@lists.linux.dev,
+Description:   (RO) Attribute group to describe the magic bits
+that go into perf_event_attr.config for a particular pmu.
+(See ABI/testing/sysfs-bus-event_source-devices-format).
+
+Each attribute under this group defines a bit range of the
+perf_event_attr.config. Supported attribute is listed
+below::
+
+   event  = "config:0-4"  - event ID
+
+   For example::
+   noopstat = "event=0x1"
+
+What:  /sys/devices/nmemX/events
+Date:  June 2021
+Contact:   linuxppc-dev , nvd...@lists.linux.dev,
+Description:   (RO) Attribute group to describe performance monitoring
+events specific to papr-scm. Each attribute in this group describes
+a single performance monitoring event supported by this nvdimm pmu.
+The name of the file is the name of the event.
+(See ABI/testing/sysfs-bus-event_source-devices-events).
+
+What:  /sys/devices/nmemX/cpumask
+Date:  June 2021
+Contact:   linuxppc-dev , nvd...@lists.linux.dev,
+Description:   (RO) This sysfs file exposes the cpumask which is designated to make
+HCALLs to retrieve nvdimm pmu event counter data.
-- 
2.27.0



[PATCH v3 2/4] drivers/nvdimm: Add perf interface to expose nvdimm performance stats

2021-06-17 Thread Kajol Jain
A common interface is added to get performance stats reporting
support for nvdimm devices. The added interface includes support for
pmu register/unregister functions, cpu hotplug, and pmu event
functions like event_init/add/read/del.
Users can use the standard perf tool to access perf
events exposed via the pmu.
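
For example, once a device pmu is registered, the usual perf workflow
applies (the event names are from the papr_scm patch in this series; the
exact flags are illustrative):

command:# perf stat -e nmem0/medrcnt/,nmem0/medwcnt/ -a sleep 1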

Tested-by: Nageswara R Sastry 
Signed-off-by: Kajol Jain 
---
 drivers/nvdimm/Makefile  |   1 +
 drivers/nvdimm/nd_perf.c | 230 +++
 include/linux/nd.h   |   3 +
 3 files changed, 234 insertions(+)
 create mode 100644 drivers/nvdimm/nd_perf.c

diff --git a/drivers/nvdimm/Makefile b/drivers/nvdimm/Makefile
index 29203f3d3069..25dba6095612 100644
--- a/drivers/nvdimm/Makefile
+++ b/drivers/nvdimm/Makefile
@@ -18,6 +18,7 @@ nd_e820-y := e820.o
 libnvdimm-y := core.o
 libnvdimm-y += bus.o
 libnvdimm-y += dimm_devs.o
+libnvdimm-y += nd_perf.o
 libnvdimm-y += dimm.o
 libnvdimm-y += region_devs.o
 libnvdimm-y += region.o
diff --git a/drivers/nvdimm/nd_perf.c b/drivers/nvdimm/nd_perf.c
new file mode 100644
index ..d4e9b9306043
--- /dev/null
+++ b/drivers/nvdimm/nd_perf.c
@@ -0,0 +1,230 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * nd_perf.c: NVDIMM Device Performance Monitoring Unit support
+ *
+ * Perf interface to expose nvdimm performance stats.
+ *
+ * Copyright (C) 2021 IBM Corporation
+ */
+
+#define pr_fmt(fmt) "nvdimm_pmu: " fmt
+
+#include 
+
+static ssize_t nvdimm_pmu_cpumask_show(struct device *dev,
+  struct device_attribute *attr, char *buf)
+{
+   struct pmu *pmu = dev_get_drvdata(dev);
+   struct nvdimm_pmu *nd_pmu;
+
+   nd_pmu = container_of(pmu, struct nvdimm_pmu, pmu);
+
+   return cpumap_print_to_pagebuf(true, buf, cpumask_of(nd_pmu->cpu));
+}
+
+static int nvdimm_pmu_cpu_offline(unsigned int cpu, struct hlist_node *node)
+{
+   struct nvdimm_pmu *nd_pmu;
+   u32 target;
+   int nodeid;
+   const struct cpumask *cpumask;
+
+   nd_pmu = hlist_entry_safe(node, struct nvdimm_pmu, node);
+
+   /* Clear it, in case the given cpu is set in nd_pmu->arch_cpumask */
+   cpumask_test_and_clear_cpu(cpu, &nd_pmu->arch_cpumask);
+
+   /*
+* If the given cpu is not the same as the current designated cpu
+* for counter access, just return.
+*/
+   if (cpu != nd_pmu->cpu)
+   return 0;
+
+   /* Check for any active cpu in nd_pmu->arch_cpumask */
+   target = cpumask_any(&nd_pmu->arch_cpumask);
+
+   /*
+* In case we don't have any active cpu in nd_pmu->arch_cpumask,
+* check the given cpu's numa node list.
+*/
+   if (target >= nr_cpu_ids) {
+   nodeid = cpu_to_node(cpu);
+   cpumask = cpumask_of_node(nodeid);
+   target = cpumask_any_but(cpumask, cpu);
+   }
+   nd_pmu->cpu = target;
+
+   /* Migrate nvdimm pmu events to the new target cpu if valid */
+   if (target >= 0 && target < nr_cpu_ids)
+   perf_pmu_migrate_context(&nd_pmu->pmu, cpu, target);
+
+   return 0;
+}
+
+static int nvdimm_pmu_cpu_online(unsigned int cpu, struct hlist_node *node)
+{
+   struct nvdimm_pmu *nd_pmu;
+
+   nd_pmu = hlist_entry_safe(node, struct nvdimm_pmu, node);
+
+   if (nd_pmu->cpu >= nr_cpu_ids)
+   nd_pmu->cpu = cpu;
+
+   return 0;
+}
+
+static int create_cpumask_attr_group(struct nvdimm_pmu *nd_pmu)
+{
+   struct perf_pmu_events_attr *attr;
+   struct attribute **attrs;
+   struct attribute_group *nvdimm_pmu_cpumask_group;
+
+   attr = kzalloc(sizeof(*attr), GFP_KERNEL);
+   if (!attr)
+   return -ENOMEM;
+
+   attrs = kzalloc(2 * sizeof(struct attribute *), GFP_KERNEL);
+   if (!attrs) {
+   kfree(attr);
+   return -ENOMEM;
+   }
+
+   /* Allocate memory for cpumask attribute group */
+   nvdimm_pmu_cpumask_group = kzalloc(sizeof(*nvdimm_pmu_cpumask_group), 
GFP_KERNEL);
+   if (!nvdimm_pmu_cpumask_group) {
+   kfree(attr);
+   kfree(attrs);
+   return -ENOMEM;
+   }
+
+   sysfs_attr_init(&attr->attr.attr);
+   attr->attr.attr.name = "cpumask";
+   attr->attr.attr.mode = 0444;
+   attr->attr.show = nvdimm_pmu_cpumask_show;
+   attrs[0] = &attr->attr.attr;
+   attrs[1] = NULL;
+
+   nvdimm_pmu_cpumask_group->attrs = attrs;
+   nd_pmu->attr_groups[NVDIMM_PMU_CPUMASK_ATTR] = nvdimm_pmu_cpumask_group;
+   return 0;
+}
+
+static int nvdimm_pmu_cpu_hotplug_init(struct nvdimm_pmu *nd_pmu)
+{
+   int nodeid, rc;
+   const struct cpumask *cpumask;
+
+   /*
+* In case cpu hotplug is not handled by arch specific code,
+* it can still provide the required cpumask which can be used
+* to get the designated cpu for counter access.
+* Check for any active cpu in nd_pmu->arch_cpumask.
+*/
+   if (!cpumask_empty(&nd_pmu->arch_cpumask)) {
+   

[PATCH v3 1/4] drivers/nvdimm: Add nvdimm pmu structure

2021-06-17 Thread Kajol Jain
A structure is added, called nvdimm_pmu, for performance
stats reporting support of nvdimm devices. It can be used to add
nvdimm pmu data such as supported events and pmu event functions
like event_init/add/read/del with cpu hotplug support.
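
A minimal sketch of how a platform driver is expected to populate it
(the my_* callback names are placeholders, not from this series):

	static struct nvdimm_pmu *nd_pmu;

	nd_pmu = kzalloc(sizeof(*nd_pmu), GFP_KERNEL);
	if (!nd_pmu)
		return -ENOMEM;
	nd_pmu->name = "nmem0";
	nd_pmu->event_init = my_event_init;
	nd_pmu->add = my_add;
	nd_pmu->del = my_del;
	nd_pmu->read = my_read;
	/* then hand nd_pmu to register_nvdimm_pmu() added in patch 2 */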

Tested-by: Nageswara R Sastry 
Signed-off-by: Kajol Jain 
---
 include/linux/nd.h | 43 +++
 1 file changed, 43 insertions(+)

diff --git a/include/linux/nd.h b/include/linux/nd.h
index ee9ad76afbba..712499cf7335 100644
--- a/include/linux/nd.h
+++ b/include/linux/nd.h
@@ -8,6 +8,8 @@
 #include 
 #include 
 #include 
+#include 
+#include 
 
 enum nvdimm_event {
NVDIMM_REVALIDATE_POISON,
@@ -23,6 +25,47 @@ enum nvdimm_claim_class {
NVDIMM_CCLASS_UNKNOWN,
 };
 
+/* Event attribute array index */
+#define NVDIMM_PMU_FORMAT_ATTR 0
+#define NVDIMM_PMU_EVENT_ATTR  1
+#define NVDIMM_PMU_CPUMASK_ATTR2
+#define NVDIMM_PMU_NULL_ATTR   3
+
+/**
+ * struct nvdimm_pmu - data structure for nvdimm perf driver
+ *
+ * @name: name of the nvdimm pmu device.
+ * @pmu: pmu data structure for nvdimm performance stats.
+ * @dev: nvdimm device pointer.
+ * @functions(event_init/add/del/read): platform specific pmu functions.
+ * @attr_groups: data structure for events, formats and cpumask
+ * @cpu: designated cpu for counter access.
+ * @node: node for cpu hotplug notifier link.
+ * @cpuhp_state: state for cpu hotplug notification.
+ * @arch_cpumask: cpumask to get designated cpu for counter access.
+ */
+struct nvdimm_pmu {
+   const char *name;
+   struct pmu pmu;
+   struct device *dev;
+   int (*event_init)(struct perf_event *event);
+   int  (*add)(struct perf_event *event, int flags);
+   void (*del)(struct perf_event *event, int flags);
+   void (*read)(struct perf_event *event);
+   /*
+* Attribute groups for the nvdimm pmu. Index 0 used for
+* format attribute, index 1 used for event attribute,
+* index 2 used for cpumask attribute and index 3 kept as NULL.
+*/
+   const struct attribute_group *attr_groups[4];
+   int cpu;
+   struct hlist_node node;
+   enum cpuhp_state cpuhp_state;
+
+   /* cpumask provided by arch/platform specific code */
+   struct cpumask arch_cpumask;
+};
+
 struct nd_device_driver {
struct device_driver drv;
unsigned long type;
-- 
2.27.0



Re: [RFC PATCH 8/8] powerpc/papr_scm: Use FORM2 associativity details

2021-06-17 Thread Aneesh Kumar K.V

On 6/17/21 4:41 PM, Aneesh Kumar K.V wrote:

Daniel Henrique Barboza  writes:


On 6/17/21 4:46 AM, David Gibson wrote:

On Tue, Jun 15, 2021 at 12:35:17PM +0530, Aneesh Kumar K.V wrote:

David Gibson  writes:


On Tue, Jun 15, 2021 at 11:27:50AM +0530, Aneesh Kumar K.V wrote:

David Gibson  writes:


On Mon, Jun 14, 2021 at 10:10:03PM +0530, Aneesh Kumar K.V wrote:

FORM2 introduces a concept of secondary domain which is identical to the
concept of FORM1 primary domain. Use the secondary domain as the numa node
when using a persistent memory device. For DAX kmem use the logical domain
id introduced in FORM2. This new numa node

Signed-off-by: Aneesh Kumar K.V 
---
   arch/powerpc/mm/numa.c| 28 +++
   arch/powerpc/platforms/pseries/papr_scm.c | 26 +
   arch/powerpc/platforms/pseries/pseries.h  |  1 +
   3 files changed, 45 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 86cd2af014f7..b9ac6d02e944 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -265,6 +265,34 @@ static int associativity_to_nid(const __be32 *associativity)
return nid;
   }
   
+int get_primary_and_secondary_domain(struct device_node *node, int *primary, int *secondary)

+{
+   int secondary_index;
+   const __be32 *associativity;
+
+   if (!numa_enabled) {
+   *primary = NUMA_NO_NODE;
+   *secondary = NUMA_NO_NODE;
+   return 0;
+   }
+
+   associativity = of_get_associativity(node);
+   if (!associativity)
+   return -ENODEV;
+
+   if (of_read_number(associativity, 1) >= primary_domain_index) {
+   *primary = of_read_number(&associativity[primary_domain_index], 1);
+   secondary_index = of_read_number(&distance_ref_points[1], 1);


Secondary ID is always the second reference point, but primary depends
on the length of resources?  That seems very weird.


primary_domain_index is distance_ref_point[0]. With Form2 we would find
both the primary and secondary domain IDs the same for all resources other
than persistent memory devices. The usage w.r.t. persistent memory is
explained in patch 7.


Right, I misunderstood



With Form2 the primary domainID and secondary domainID are used to identify the 
NUMA nodes
the kernel should use when using persistent memory devices.


This seems kind of bogus.  With Form1, the primary/secondary ID are a
sort of heirarchy of distance (things with same primary ID are very
close, things with same secondary are kinda-close, etc.).  With Form2,
it's referring to their effective node for different purposes.

Using the same terms for different meanings seems unnecessarily
confusing.


They are essentially domainIDs. The interpretation of them is different
between Form1 and Form2. Hence I kept referring to them as primary and
secondary domainID. Any suggestion on what to name them with Form2?


My point is that reusing associativity-reference-points for something
with completely unrelated semantics seems like a very poor choice.



I agree that this reuse can be confusing. I could argue that there is
precedent for that in PAPR - FORM0 puts a different spin on the same
property as well - but there is no need to keep following existing PAPR
practices in a new spec (and some might argue it's best not to).

As far as QEMU goes, renaming this property to "numa-associativity-mode"
(just an example) is a quick change to do since we separated FORM1 and FORM2
code over there.

Doing such a rename can also help with the issue of having to describe new
FORM2 semantics using "least significant boundary" or "primary domain" or
any FORM0|FORM1 related terminology.



It is not just changing the name; we will then have to explain the
meaning of ibm,associativity-reference-points with FORM2, right?

With FORM2 we want to represent the topology better

  

 --------------------------------------------
| domainID 20                                |
|                                            |
|   ---------------                          |
|  | NUMA node1    |                         |
|  |               |                         |
|  | ProcB -> MEMC |    --------------       |
|  |    |          |   | NUMA node40  |      |
|  |    ---------------->    PMEMD    |      |
|  |               |    --------------       |
|   ---------------                          |
 --------------------------------------------


ibm,associativity:
 { 20, 1, 40}  -> PMEMD
 { 20, 1, 1}  -> PROCB/MEMC

is the 

Re: [PATCH v3 1/5] powerpc/interrupt: Rename and lightly change syscall_exit_prepare_main()

2021-06-17 Thread Nicholas Piggin
Excerpts from Christophe Leroy's message of June 15, 2021 6:33 pm:
> Rename syscall_exit_prepare_main() into interrupt_exit_user_prepare_main()
> 
> Make it static as it is not used anywhere else.
> 
> Pass it the 'ret' so that it can 'or' it directly instead of
> oring twice, once inside the function and once outside.
> 
> And remove 'r3' parameter which is not used.
> 
> Also fix a typo where CONFIG_PPC_BOOK3S should be CONFIG_PPC_BOOK3S_64.
> 
> Signed-off-by: Christophe Leroy 
> Reviewed-by: Nicholas Piggin 
> ---
> This series applies on top of Nic's series speeding up interrupt return on 64s
> 
>  arch/powerpc/kernel/interrupt.c | 11 +--
>  1 file changed, 5 insertions(+), 6 deletions(-)
> 
> diff --git a/arch/powerpc/kernel/interrupt.c b/arch/powerpc/kernel/interrupt.c
> index 74c995a42399..ba2d602d2da6 100644
> --- a/arch/powerpc/kernel/interrupt.c
> +++ b/arch/powerpc/kernel/interrupt.c
> @@ -243,11 +243,10 @@ static notrace void booke_load_dbcr0(void)
>  #endif
>  }
>  
> -notrace unsigned long syscall_exit_prepare_main(unsigned long r3,
> - struct pt_regs *regs)
> +static notrace unsigned long
> +interrupt_exit_user_prepare_main(struct pt_regs *regs, unsigned long ret)

Hmm, I tried switching the order of the arguments thinking it would
match caller and return registers better, but it didn't seem to help the
generated code. Still, I think I will make that change to your patch if
you don't mind.

Thanks,
Nick


Re: [RFC PATCH 8/8] powerpc/papr_scm: Use FORM2 associativity details

2021-06-17 Thread Aneesh Kumar K.V
Daniel Henrique Barboza  writes:

> On 6/17/21 4:46 AM, David Gibson wrote:
>> On Tue, Jun 15, 2021 at 12:35:17PM +0530, Aneesh Kumar K.V wrote:
>>> David Gibson  writes:
>>>
 On Tue, Jun 15, 2021 at 11:27:50AM +0530, Aneesh Kumar K.V wrote:
> David Gibson  writes:
>
>> On Mon, Jun 14, 2021 at 10:10:03PM +0530, Aneesh Kumar K.V wrote:
>>> FORM2 introduces a concept of secondary domain which is identical to the
>>> concept of FORM1 primary domain. Use the secondary domain as the numa node
>>> when using a persistent memory device. For DAX kmem use the logical domain
>>> id introduced in FORM2. This new numa node
>>>
>>> Signed-off-by: Aneesh Kumar K.V 
>>> ---
>>>   arch/powerpc/mm/numa.c| 28 +++
>>>   arch/powerpc/platforms/pseries/papr_scm.c | 26 +
>>>   arch/powerpc/platforms/pseries/pseries.h  |  1 +
>>>   3 files changed, 45 insertions(+), 10 deletions(-)
>>>
>>> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
>>> index 86cd2af014f7..b9ac6d02e944 100644
>>> --- a/arch/powerpc/mm/numa.c
>>> +++ b/arch/powerpc/mm/numa.c
>>> @@ -265,6 +265,34 @@ static int associativity_to_nid(const __be32 
>>> *associativity)
>>> return nid;
>>>   }
>>>   
>>> +int get_primary_and_secondary_domain(struct device_node *node, int 
>>> *primary, int *secondary)
>>> +{
>>> +   int secondary_index;
>>> +   const __be32 *associativity;
>>> +
>>> +   if (!numa_enabled) {
>>> +   *primary = NUMA_NO_NODE;
>>> +   *secondary = NUMA_NO_NODE;
>>> +   return 0;
>>> +   }
>>> +
>>> +   associativity = of_get_associativity(node);
>>> +   if (!associativity)
>>> +   return -ENODEV;
>>> +
>>> +	if (of_read_number(associativity, 1) >= primary_domain_index) {
>>> +		*primary = of_read_number(&associativity[primary_domain_index], 1);
>>> +		secondary_index = of_read_number(&distance_ref_points[1], 1);
>>
>> Secondary ID is always the second reference point, but primary depends
>> on the length of resources?  That seems very weird.
>
> primary_domain_index is distance_ref_point[0]. With Form2 we would find
> both primary and secondary domain ID same for all resources other than
> persistent memory device. The usage w.r.t. persistent memory is
> explained in patch 7.

 Right, I misunderstood

>
> With Form2 the primary domainID and secondary domainID are used to 
> identify the NUMA nodes
> the kernel should use when using persistent memory devices.

 This seems kind of bogus.  With Form1, the primary/secondary ID are a
 sort of hierarchy of distance (things with same primary ID are very
 close, things with same secondary are kinda-close, etc.).  With Form2,
 it's referring to their effective node for different purposes.

 Using the same terms for different meanings seems unnecessarily
 confusing.
>>>
>>> They are essentially domainIDs. The interpretation of them is different
>>> between Form1 and Form2. Hence I kept referring to them as primary and
>>> secondary domainID. Any suggestion on what to name them with Form2?
>> 
>> My point is that reusing associativity-reference-points for something
>> with completely unrelated semantics seems like a very poor choice.
>
>
> I agree that this reuse can be confusing. I could argue that there is
> precedent for that in PAPR - FORM0 puts a different spin on the same
> property as well - but there is no need to keep following existing PAPR
> practices in new spec (and some might argue it's best not to).
>
> As far as QEMU goes, renaming this property to "numa-associativity-mode"
> (just an example) is a quick change to do since we separated FORM1 and FORM2
> code over there.
>
> Doing such a rename can also help with the issue of having to describe new
> FORM2 semantics using "least significant boundary" or "primary domain" or
> any FORM0|FORM1 related terminology.
>

It is not just changing the name; we will then have to explain the
meaning of ibm,associativity-reference-points with FORM2, right?

With FORM2 we want to represent the topology better

 

 --------------------------------------------
| domainID 20                                |
|                                            |
|   ---------------                          |
|  | NUMA node1    |                         |
|  |               |                         |
|  | ProcB -> MEMC |    --------------       |
|  |    |          |   | NUMA node40  |      |
|  |    ---------------->    PMEMD    |      |
|  |               |    --------------       |
|   ---------------                          |
 --------------------------------------------

Re: [PATCH v14 4/4] kasan: use MAX_PTRS_PER_* for early shadow tables

2021-06-17 Thread Marco Elver
On Thu, 17 Jun 2021 at 08:40, Daniel Axtens  wrote:
>
> powerpc has a variable number of PTRS_PER_*, set at runtime based
> on the MMU that the kernel is booted under.
>
> This means the PTRS_PER_* are no longer constants, and therefore
> breaks the build. Switch to using MAX_PTRS_PER_*, which are constant.
>
> Suggested-by: Christophe Leroy 
> Suggested-by: Balbir Singh 
> Reviewed-by: Christophe Leroy 
> Reviewed-by: Balbir Singh 
> Signed-off-by: Daniel Axtens 

Reviewed-by: Marco Elver 


> ---
>  include/linux/kasan.h | 6 +++---
>  mm/kasan/init.c   | 6 +++---
>  2 files changed, 6 insertions(+), 6 deletions(-)
>
> diff --git a/include/linux/kasan.h b/include/linux/kasan.h
> index 768d7d342757..5310e217bd74 100644
> --- a/include/linux/kasan.h
> +++ b/include/linux/kasan.h
> @@ -41,9 +41,9 @@ struct kunit_kasan_expectation {
>  #endif
>
>  extern unsigned char kasan_early_shadow_page[PAGE_SIZE];
> -extern pte_t kasan_early_shadow_pte[PTRS_PER_PTE + PTE_HWTABLE_PTRS];
> -extern pmd_t kasan_early_shadow_pmd[PTRS_PER_PMD];
> -extern pud_t kasan_early_shadow_pud[PTRS_PER_PUD];
> +extern pte_t kasan_early_shadow_pte[MAX_PTRS_PER_PTE + PTE_HWTABLE_PTRS];
> +extern pmd_t kasan_early_shadow_pmd[MAX_PTRS_PER_PMD];
> +extern pud_t kasan_early_shadow_pud[MAX_PTRS_PER_PUD];
>  extern p4d_t kasan_early_shadow_p4d[MAX_PTRS_PER_P4D];
>
>  int kasan_populate_early_shadow(const void *shadow_start,
> diff --git a/mm/kasan/init.c b/mm/kasan/init.c
> index 348f31d15a97..cc64ed6858c6 100644
> --- a/mm/kasan/init.c
> +++ b/mm/kasan/init.c
> @@ -41,7 +41,7 @@ static inline bool kasan_p4d_table(pgd_t pgd)
>  }
>  #endif
>  #if CONFIG_PGTABLE_LEVELS > 3
> -pud_t kasan_early_shadow_pud[PTRS_PER_PUD] __page_aligned_bss;
> +pud_t kasan_early_shadow_pud[MAX_PTRS_PER_PUD] __page_aligned_bss;
>  static inline bool kasan_pud_table(p4d_t p4d)
>  {
> return p4d_page(p4d) == 
> virt_to_page(lm_alias(kasan_early_shadow_pud));
> @@ -53,7 +53,7 @@ static inline bool kasan_pud_table(p4d_t p4d)
>  }
>  #endif
>  #if CONFIG_PGTABLE_LEVELS > 2
> -pmd_t kasan_early_shadow_pmd[PTRS_PER_PMD] __page_aligned_bss;
> +pmd_t kasan_early_shadow_pmd[MAX_PTRS_PER_PMD] __page_aligned_bss;
>  static inline bool kasan_pmd_table(pud_t pud)
>  {
> return pud_page(pud) == 
> virt_to_page(lm_alias(kasan_early_shadow_pmd));
> @@ -64,7 +64,7 @@ static inline bool kasan_pmd_table(pud_t pud)
> return false;
>  }
>  #endif
> -pte_t kasan_early_shadow_pte[PTRS_PER_PTE + PTE_HWTABLE_PTRS]
> +pte_t kasan_early_shadow_pte[MAX_PTRS_PER_PTE + PTE_HWTABLE_PTRS]
> __page_aligned_bss;
>
>  static inline bool kasan_pte_table(pmd_t pmd)
> --
> 2.30.2
>


Re: [PATCH v14 3/4] mm: define default MAX_PTRS_PER_* in include/pgtable.h

2021-06-17 Thread Marco Elver
On Thu, 17 Jun 2021 at 08:40, Daniel Axtens  wrote:
>
> Commit c65e774fb3f6 ("x86/mm: Make PGDIR_SHIFT and PTRS_PER_P4D variable")
> made PTRS_PER_P4D variable on x86 and introduced MAX_PTRS_PER_P4D as a
> constant for cases which need a compile-time constant (e.g. fixed-size
> arrays).
>
> powerpc likewise has boot-time selectable MMU features which can cause
> other mm "constants" to vary. For KASAN, we have some static
> PTE/PMD/PUD/P4D arrays so we need compile-time maximums for all these
> constants. Extend the MAX_PTRS_PER_ idiom, and place default definitions
> in include/pgtable.h. These define MAX_PTRS_PER_x to be PTRS_PER_x unless
> an architecture has defined MAX_PTRS_PER_x in its arch headers.
>
> Clean up pgtable-nop4d.h and s390's MAX_PTRS_PER_P4D definitions while
> we're at it: both can just pick up the default now.
>
> Signed-off-by: Daniel Axtens 

Reviewed-by: Marco Elver 


> ---
>
> s390 was compile tested only.
> ---
>  arch/s390/include/asm/pgtable.h |  2 --
>  include/asm-generic/pgtable-nop4d.h |  1 -
>  include/linux/pgtable.h | 22 ++
>  3 files changed, 22 insertions(+), 3 deletions(-)
>
> diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
> index 7c66ae5d7e32..cf05954ce013 100644
> --- a/arch/s390/include/asm/pgtable.h
> +++ b/arch/s390/include/asm/pgtable.h
> @@ -342,8 +342,6 @@ static inline int is_module_addr(void *addr)
>  #define PTRS_PER_P4D   _CRST_ENTRIES
>  #define PTRS_PER_PGD   _CRST_ENTRIES
>
> -#define MAX_PTRS_PER_P4D   PTRS_PER_P4D
> -
>  /*
>   * Segment table and region3 table entry encoding
>   * (R = read-only, I = invalid, y = young bit):
> diff --git a/include/asm-generic/pgtable-nop4d.h 
> b/include/asm-generic/pgtable-nop4d.h
> index ce2cbb3c380f..2f6b1befb129 100644
> --- a/include/asm-generic/pgtable-nop4d.h
> +++ b/include/asm-generic/pgtable-nop4d.h
> @@ -9,7 +9,6 @@
>  typedef struct { pgd_t pgd; } p4d_t;
>
>  #define P4D_SHIFT  PGDIR_SHIFT
> -#define MAX_PTRS_PER_P4D   1
>  #define PTRS_PER_P4D   1
>  #define P4D_SIZE   (1UL << P4D_SHIFT)
>  #define P4D_MASK   (~(P4D_SIZE-1))
> diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
> index 9e6f71265f72..69700e3e615f 100644
> --- a/include/linux/pgtable.h
> +++ b/include/linux/pgtable.h
> @@ -1625,4 +1625,26 @@ typedef unsigned int pgtbl_mod_mask;
>  #define pte_leaf_size(x) PAGE_SIZE
>  #endif
>
> +/*
> + * Some architectures have MMUs that are configurable or selectable at boot
> + * time. These lead to variable PTRS_PER_x. For statically allocated arrays 
> it
> + * helps to have a static maximum value.
> + */
> +
> +#ifndef MAX_PTRS_PER_PTE
> +#define MAX_PTRS_PER_PTE PTRS_PER_PTE
> +#endif
> +
> +#ifndef MAX_PTRS_PER_PMD
> +#define MAX_PTRS_PER_PMD PTRS_PER_PMD
> +#endif
> +
> +#ifndef MAX_PTRS_PER_PUD
> +#define MAX_PTRS_PER_PUD PTRS_PER_PUD
> +#endif
> +
> +#ifndef MAX_PTRS_PER_P4D
> +#define MAX_PTRS_PER_P4D PTRS_PER_P4D
> +#endif
> +
>  #endif /* _LINUX_PGTABLE_H */
> --
> 2.30.2
>


Re: [PATCH v14 1/4] kasan: allow an architecture to disable inline instrumentation

2021-06-17 Thread Marco Elver
On Thu, 17 Jun 2021 at 08:40, Daniel Axtens  wrote:
>
> For annoying architectural reasons, it's very difficult to support inline
> instrumentation on powerpc64.*
>
> Add a Kconfig flag to allow an arch to disable inline. (It's a bit
> annoying to be 'backwards', but I'm not aware of any way to have
> an arch force a symbol to be 'n', rather than 'y'.)
>
> We also disable stack instrumentation in this case as it does things that
> are functionally equivalent to inline instrumentation, namely adding
> code that touches the shadow directly without going through a C helper.
>
> * on ppc64 atm, the shadow lives in virtual memory and isn't accessible in
> real mode. However, before we turn on virtual memory, we parse the device
> tree to determine which platform and MMU we're running under. That calls
> generic DT code, which is instrumented. Inline instrumentation in DT would
> unconditionally attempt to touch the shadow region, which we won't have
> set up yet, and would crash. We can make outline mode wait for the arch to
> be ready, but we can't change what the compiler inserts for inline mode.
>
> Signed-off-by: Daniel Axtens 

Reviewed-by: Marco Elver 


> ---
>  lib/Kconfig.kasan | 14 ++
>  1 file changed, 14 insertions(+)
>
> diff --git a/lib/Kconfig.kasan b/lib/Kconfig.kasan
> index cffc2ebbf185..cb5e02d09e11 100644
> --- a/lib/Kconfig.kasan
> +++ b/lib/Kconfig.kasan
> @@ -12,6 +12,15 @@ config HAVE_ARCH_KASAN_HW_TAGS
>  config HAVE_ARCH_KASAN_VMALLOC
> bool
>
> +config ARCH_DISABLE_KASAN_INLINE
> +   bool
> +   help
> + Sometimes an architecture might not be able to support inline
> + instrumentation but might be able to support outline 
> instrumentation.
> + This option allows an architecture to prevent inline and stack
> + instrumentation from being enabled.
> +
> +
>  config CC_HAS_KASAN_GENERIC
> def_bool $(cc-option, -fsanitize=kernel-address)
>
> @@ -130,6 +139,7 @@ config KASAN_OUTLINE
>
>  config KASAN_INLINE
> bool "Inline instrumentation"
> +   depends on !ARCH_DISABLE_KASAN_INLINE
> help
>   Compiler directly inserts code checking shadow memory before
>   memory accesses. This is faster than outline (in some workloads
> @@ -141,6 +151,7 @@ endchoice
>  config KASAN_STACK
> bool "Enable stack instrumentation (unsafe)" if CC_IS_CLANG && 
> !COMPILE_TEST
> depends on KASAN_GENERIC || KASAN_SW_TAGS
> +   depends on !ARCH_DISABLE_KASAN_INLINE
> default y if CC_IS_GCC
> help
>   The LLVM stack address sanitizer has a known problem that
> @@ -154,6 +165,9 @@ config KASAN_STACK
>   but clang users can still enable it for builds without
>   CONFIG_COMPILE_TEST.  On gcc it is assumed to always be safe
>   to use and enabled by default.
> + If the architecture disables inline instrumentation, this is
> + also disabled as it adds inline-style instrumentation that
> + is run unconditionally.
>
>  config KASAN_SW_TAGS_IDENTIFY
> bool "Enable memory corruption identification"
> --
> 2.30.2
>


Re: [PATCH v14 2/4] kasan: allow architectures to provide an outline readiness check

2021-06-17 Thread Marco Elver
On Thu, 17 Jun 2021 at 08:40, Daniel Axtens  wrote:
>
> Allow architectures to define a kasan_arch_is_ready() hook that bails
> out of any function that's about to touch the shadow unless the arch
> says that it is ready for the memory to be accessed. This is fairly
> non-invasive and should have a negligible performance penalty.
>
> This will only work in outline mode, so an arch must specify
> ARCH_DISABLE_KASAN_INLINE if it requires this.
>
> Cc: Balbir Singh 
> Cc: Aneesh Kumar K.V 
> Suggested-by: Christophe Leroy 
> Signed-off-by: Daniel Axtens 

With Christophe's suggestion:

Reviewed-by: Marco Elver 


> --
>
> Both previous RFCs for ppc64 - by 2 different people - have
> needed this trick! See:
>  - https://lore.kernel.org/patchwork/patch/592820/ # ppc64 hash series
>  - https://patchwork.ozlabs.org/patch/795211/  # ppc radix series
>
> I haven't been able to exercise the arch hook error for !GENERIC as I
> don't have a particularly modern aarch64 toolchain or a lot of experience
> cross-compiling with clang. But it does fire for GENERIC + INLINE on x86.
> ---
>  mm/kasan/common.c  | 4 
>  mm/kasan/generic.c | 3 +++
>  mm/kasan/kasan.h   | 8 
>  mm/kasan/shadow.c  | 8 
>  4 files changed, 23 insertions(+)
>
> diff --git a/mm/kasan/common.c b/mm/kasan/common.c
> index 10177cc26d06..0ad615f3801d 100644
> --- a/mm/kasan/common.c
> +++ b/mm/kasan/common.c
> @@ -331,6 +331,10 @@ static inline bool kasan_slab_free(struct kmem_cache 
> *cache, void *object,
> u8 tag;
> void *tagged_object;
>
> +   /* Bail if the arch isn't ready */
> +   if (!kasan_arch_is_ready())
> +   return false;
> +
> tag = get_tag(object);
> tagged_object = object;
> object = kasan_reset_tag(object);
> diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c
> index 53cbf28859b5..c3f5ba7a294a 100644
> --- a/mm/kasan/generic.c
> +++ b/mm/kasan/generic.c
> @@ -163,6 +163,9 @@ static __always_inline bool check_region_inline(unsigned 
> long addr,
> size_t size, bool write,
> unsigned long ret_ip)
>  {
> +   if (!kasan_arch_is_ready())
> +   return true;
> +
> if (unlikely(size == 0))
> return true;
>
> diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
> index 8f450bc28045..b18abaf8c78e 100644
> --- a/mm/kasan/kasan.h
> +++ b/mm/kasan/kasan.h
> @@ -449,6 +449,14 @@ static inline void kasan_poison_last_granule(const void 
> *address, size_t size) {
>
>  #endif /* CONFIG_KASAN_GENERIC */
>
> +#ifndef kasan_arch_is_ready
> +static inline bool kasan_arch_is_ready(void)   { return true; }
> +#else
> +#if !defined(CONFIG_KASAN_GENERIC) || !defined(CONFIG_KASAN_OUTLINE)
> +#error kasan_arch_is_ready only works in KASAN generic outline mode!
> +#endif
> +#endif
> +
>  /*
>   * Exported functions for interfaces called from assembly or from generated
>   * code. Declarations here to avoid warning about missing declarations.
> diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c
> index 082ee5b6d9a1..3c7f7efe6f68 100644
> --- a/mm/kasan/shadow.c
> +++ b/mm/kasan/shadow.c
> @@ -73,6 +73,10 @@ void kasan_poison(const void *addr, size_t size, u8 value, 
> bool init)
>  {
> void *shadow_start, *shadow_end;
>
> +   /* Don't touch the shadow memory if arch isn't ready */
> +   if (!kasan_arch_is_ready())
> +   return;
> +
> /*
>  * Perform shadow offset calculation based on untagged address, as
>  * some of the callers (e.g. kasan_poison_object_data) pass tagged
> @@ -99,6 +103,10 @@ EXPORT_SYMBOL(kasan_poison);
>  #ifdef CONFIG_KASAN_GENERIC
>  void kasan_poison_last_granule(const void *addr, size_t size)
>  {
> +   /* Don't touch the shadow memory if arch isn't ready */
> +   if (!kasan_arch_is_ready())
> +   return;
> +
> if (size & KASAN_GRANULE_MASK) {
> u8 *shadow = (u8 *)kasan_mem_to_shadow(addr + size);
> *shadow = size & KASAN_GRANULE_MASK;
> --
> 2.30.2
>


Re: [RFC PATCH 8/8] powerpc/papr_scm: Use FORM2 associativity details

2021-06-17 Thread Aneesh Kumar K.V

On 6/17/21 1:16 PM, David Gibson wrote:

On Tue, Jun 15, 2021 at 12:35:17PM +0530, Aneesh Kumar K.V wrote:

David Gibson  writes:


On Tue, Jun 15, 2021 at 11:27:50AM +0530, Aneesh Kumar K.V wrote:

David Gibson  writes:


...


It's weird to me that you'd want to consider them in different nodes
for those different purposes.



 -----------------
| NUMA node0      |
| ProcA --> MEMA  |
|   |             |
|   |             |
|    --> PMEMB    |
|                 |
 -----------------

 -----------------
| NUMA node1      |
|                 |
| ProcB --> MEMC  |
|   |             |
|    --> PMEMD    |
|                 |
 -----------------
  


For a topology like the above, an application running on ProcA wants to find a
persistent memory mount local to its NUMA node. Hence when using it as a
pmem fsdax mount or devdax device we want PMEMB to have the associativity
of NUMA node0 and PMEMD to have the associativity of NUMA node1. But when
we want to use them as memory via the dax kmem driver, we want both PMEMB
and PMEMD to appear as memory-only NUMA nodes at a distance that is
derived from the latency of the media.


I'm still not understanding why the latency we care about is different
in the two cases.  Can you give an example of when this would result
in different actual node assignments for the two different cases?



In the above example, in order to allow the use of PMEMB and PMEMD as
memory-only NUMA nodes, we need the platform to represent each of them
with its own domainID. Let's assume that the platform assigned IDs 40
and 41; both PMEMB and PMEMD will then have associativity arrays like
the below:


{ 4, 6, 0}  -> PROCA/MEMA
{ 4, 6, 40} -> PMEMB
{ 4, 6, 41} -> PMEMD
{ 4, 6, 1}  -> PROCB/MEMC

When we want to use PMEMB and PMEMD as fsdax/devdax devices, we
essentially look for the first nearest online node, which means both
PMEMB and PMEMD will appear as devices attached to node0. That is not
ideal for many applications.


Using the secondary domainID index as explained here helps to associate
each PMEM device with the right group. On a non-virtualized or hard
partitioned config, such a device tree representation can be looked at as
a hint to identify which socket the actual device is connected to.
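
To make that selection concrete, here is a minimal sketch (illustrative
only, not the patch's code; pick_pmem_node() is a made-up helper, while
node_online() and numa_map_to_online_node() are existing kernel helpers):

#include <linux/numa.h>		/* NUMA_NO_NODE, numa_map_to_online_node() */
#include <linux/nodemask.h>	/* node_online() */
#include <linux/types.h>

static int pick_pmem_node(int primary, int secondary, bool use_as_kmem)
{
	/* dax kmem: use the device's own memory-only node (40/41 above). */
	if (use_as_kmem)
		return primary;

	/* fsdax/devdax: prefer the socket-local node ... */
	if (secondary != NUMA_NO_NODE && node_online(secondary))
		return secondary;

	/* ... otherwise fall back to the nearest online node. */
	return numa_map_to_online_node(secondary);
}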


-aneesh


Re: [RFC PATCH 8/8] powerpc/papr_scm: Use FORM2 associativity details

2021-06-17 Thread Daniel Henrique Barboza




On 6/17/21 4:46 AM, David Gibson wrote:

On Tue, Jun 15, 2021 at 12:35:17PM +0530, Aneesh Kumar K.V wrote:

David Gibson  writes:


On Tue, Jun 15, 2021 at 11:27:50AM +0530, Aneesh Kumar K.V wrote:

David Gibson  writes:


On Mon, Jun 14, 2021 at 10:10:03PM +0530, Aneesh Kumar K.V wrote:

FORM2 introduces a concept of secondary domain which is identical to the
concept of FORM1 primary domain. Use the secondary domain as the numa node
when using a persistent memory device. For DAX kmem use the logical domain
id introduced in FORM2. This new numa node

Signed-off-by: Aneesh Kumar K.V 
---
  arch/powerpc/mm/numa.c| 28 +++
  arch/powerpc/platforms/pseries/papr_scm.c | 26 +
  arch/powerpc/platforms/pseries/pseries.h  |  1 +
  3 files changed, 45 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 86cd2af014f7..b9ac6d02e944 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -265,6 +265,34 @@ static int associativity_to_nid(const __be32 
*associativity)
return nid;
  }
  
+int get_primary_and_secondary_domain(struct device_node *node, int *primary, int *secondary)
+{
+	int secondary_index;
+	const __be32 *associativity;
+
+	if (!numa_enabled) {
+		*primary = NUMA_NO_NODE;
+		*secondary = NUMA_NO_NODE;
+		return 0;
+	}
+
+	associativity = of_get_associativity(node);
+	if (!associativity)
+		return -ENODEV;
+
+	if (of_read_number(associativity, 1) >= primary_domain_index) {
+		*primary = of_read_number(&associativity[primary_domain_index], 1);
+		secondary_index = of_read_number(&distance_ref_points[1], 1);


Secondary ID is always the second reference point, but primary depends
on the length of resources?  That seems very weird.


primary_domain_index is distance_ref_point[0]. With Form2 we would find
both primary and secondary domain ID same for all resources other than
persistent memory device. The usage w.r.t. persistent memory is
explained in patch 7.


Right, I misunderstood



With Form2 the primary domainID and secondary domainID are used to identify the 
NUMA nodes
the kernel should use when using persistent memory devices.


This seems kind of bogus.  With Form1, the primary/secondary ID are a
sort of hierarchy of distance (things with same primary ID are very
close, things with same secondary are kinda-close, etc.).  With Form2,
it's referring to their effective node for different purposes.

Using the same terms for different meanings seems unnecessarily
confusing.


They are essentially domainIDs. The interpretation of them is different
between Form1 and Form2. Hence I kept referring to them as primary and
secondary domainID. Any suggestion on what to name them with Form2?


My point is that reusing associativity-reference-points for something
with completely unrelated semantics seems like a very poor choice.



I agree that this reuse can be confusing. I could argue that there is
precedent for that in PAPR - FORM0 puts a different spin on the same
property as well - but there is no need to keep following existing PAPR
practices in new spec (and some might argue it's best not to).

As far as QEMU goes, renaming this property to "numa-associativity-mode"
(just an example) is a quick change to do since we separated FORM1 and FORM2
code over there.

Doing such a rename can also help with the issue of having to describe new
FORM2 semantics using "least significant boundary" or "primary domain" or
any FORM0|FORM1 related terminology.


Thanks,


Daniel






Persistent memory devices can also be used as regular memory using the
DAX KMEM driver, and the primary domainID indicates the numa node number
the OS should use when using these devices as regular memory. The
secondary domainID is the numa node number that should be used when
using the device as persistent memory.


It's weird to me that you'd want to consider them in different nodes
for those different purposes.



 -----------------
| NUMA node0      |
| ProcA --> MEMA  |
|   |             |
|   |             |
|    --> PMEMB    |
|                 |
 -----------------

 -----------------
| NUMA node1      |
|                 |
| ProcB --> MEMC  |
|   |             |
|    --> PMEMD    |
|                 |
 -----------------
  


For a topology like the above, an application running on ProcA wants to find a
persistent memory mount local to its NUMA node. Hence when using it as
pmem fsdax mount or devdax 

Re: [RFC PATCH 7/8] powerpc/pseries: Add support for FORM2 associativity

2021-06-17 Thread Aneesh Kumar K.V

On 6/17/21 1:20 PM, David Gibson wrote:

On Tue, Jun 15, 2021 at 01:10:27PM +0530, Aneesh Kumar K.V wrote:

David Gibson  writes:






PAPR defines "most significant" as below

When the “ibm,architecture-vec-5” property byte 5 bit 0 has the value of one,
the “ibm,associativity-reference-points” property indicates boundaries between
associativity domains presented by the “ibm,associativity” property containing
“near” and “far” resources. The
first such boundary in the list represents the 1 based ordinal in the
associativity lists of the most significant boundary, with subsequent
entries indicating progressively less significant boundaries


No... that's not a definition.  Like your draft PAPR uses the term
while entirely failing to define it.  From what I can tell about how
it is used the "most significant" boundary corresponds to what Linux
simply thinks of as the node id.  But intuitively, I'd think of that
as the "least significant" boundary, since that's basically the
smallest granularity at which we care about NUMA distances.



I would interpret it as the boundary where we start defining NUMA
nodes.


That isn't any clearer to me.


How about calling it least significant boundary then?


Heck, no.  My whole point here is that the meaning is unclear: my
first guess at the meaning is different from whoever wrote that text.
We need to come up with a way of describing it that's clearer.


The “ibm,associativity-reference-points” property contains one or more lists
of numbers (domainID indexes), each of which represents the 1-based ordinal,
in the associativity lists, of the least significant boundary, with subsequent
entries indicating progressively more significant boundaries.

ex:
{ primary domainID index, secondary domainID index, tertiary domainID index.. }

The Linux kernel uses the domainID of the least significant boundary (aka the
primary domain) as the NUMA node id. The kernel computes the NUMA distance
between two domains by recursively checking whether they belong to the same
higher-level domains; for a mismatch at each higher level of the resource
group, the kernel doubles the NUMA distance between the two domains.
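
A minimal sketch of that doubling rule (illustrative only; form1_distance()
is a made-up name -- in the kernel this logic lives in __node_distance()):

#define LOCAL_DISTANCE	10

/* Distance doubles for every reference-point level at which the two
 * resources fall into different domains. */
static int form1_distance(const int *assoc_a, const int *assoc_b,
			  const int *ref_points, int nr_ref_points)
{
	int distance = LOCAL_DISTANCE;
	int i;

	for (i = 0; i < nr_ref_points; i++) {
		int index = ref_points[i];

		if (assoc_a[index] == assoc_b[index])
			break;		/* same domain at this level */
		distance *= 2;		/* mismatch: double the distance */
	}
	return distance;
}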





Any suggestion on how to reword the above section then? We could say
associativity-reference-points is a list of domainID indexes representing an
increasing hierarchy of resource groups. I am not sure that explains it
any better?




For example, with domainIDs 0, 4 and 5 we would need a 6x6 matrix if it were
indexed by raw domainID to represent the numa distances. Instead,
ibm,numa-lookup-index-table allows us to present the same in a 3x3 matrix:
distance[index0][index1] is the distance between NUMA nodes 0 and 4, and
distance[index0][index2] is the distance between NUMA nodes 0 and 5.
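
A made-up illustration of that compaction (the distance values here are
hypothetical, chosen only to show the shape of the tables):

/* domainIDs {0, 4, 5} compacted to matrix indices {0, 1, 2}. */
static const int lookup_index_table[3] = { 0, 4, 5 };

/* 3x3 instead of a matrix indexed by raw domainID. */
static const int numa_distance_table[3][3] = {
	{ 10, 40, 80 },		/* from domainID 0 (index 0) */
	{ 40, 10, 40 },		/* from domainID 4 (index 1) */
	{ 80, 40, 10 },		/* from domainID 5 (index 2) */
};

static int domain_to_index(int domain_id)
{
	int i;

	for (i = 0; i < 3; i++)
		if (lookup_index_table[i] == domain_id)
			return i;
	return -1;
}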


Right, I get the purpose of it, and I realized I misphrased my
question.  My point is that in a Form2 world, the *only* thing the
associativity array is used for is to deduce its position in
lookup-index-table.  Once you have have that for each resource, you
have everything you need, yes?



ibm,associativity is used to find the domainID/NUMA node id of the
resource.

ibm,lookup-index-table is used to compute the distance information between
NUMA nodes using ibm,numa-distance-table.


I get that you need to use lookup-index-table to work out how to
interpret numa-distance-table.  My point is that IIUC once you've done
the lookup in lookup-index-table once for each associativity array
value, the number you get out (which is just a compacted version of the
node id) should be all you need ever again.



That is correct. We will continue to use the index-to-nodeid map during
DLPAR, if such an operation adds a new numa node; update_numa_distance()
shows the details.


-aneesh


[PATCH 1/1] ASoC: fsl: remove unnecessary oom message

2021-06-17 Thread Zhen Lei
Fixes scripts/checkpatch.pl warning:
WARNING: Possible unnecessary 'out of memory' message

Removing it can help us save a bit of memory.

Signed-off-by: Zhen Lei 
---
 sound/soc/fsl/imx-audmix.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/sound/soc/fsl/imx-audmix.c b/sound/soc/fsl/imx-audmix.c
index cbdc0a2c09c5..a364e2415de0 100644
--- a/sound/soc/fsl/imx-audmix.c
+++ b/sound/soc/fsl/imx-audmix.c
@@ -209,10 +209,8 @@ static int imx_audmix_probe(struct platform_device *pdev)
 
/* for CPU/Codec/Platform x 2 */
dlc = devm_kcalloc(>dev, 6, sizeof(*dlc), GFP_KERNEL);
-   if (!dlc) {
-   dev_err(>dev, "failed to allocate dai_link\n");
+   if (!dlc)
return -ENOMEM;
-   }
 
ret = of_parse_phandle_with_args(audmix_np, "dais", NULL, i,
 );
-- 
2.25.1




[PATCH 1/1] ALSA: aoa: remove unnecessary oom message

2021-06-17 Thread Zhen Lei
Fixes scripts/checkpatch.pl warning:
WARNING: Possible unnecessary 'out of memory' message

Removing it can help us save a bit of memory.

Signed-off-by: Zhen Lei 
---
 sound/aoa/soundbus/i2sbus/pcm.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/sound/aoa/soundbus/i2sbus/pcm.c b/sound/aoa/soundbus/i2sbus/pcm.c
index 1c8e8131a716..a9e502a6cdeb 100644
--- a/sound/aoa/soundbus/i2sbus/pcm.c
+++ b/sound/aoa/soundbus/i2sbus/pcm.c
@@ -918,10 +918,8 @@ i2sbus_attach_codec(struct soundbus_dev *dev, struct 
snd_card *card,
}
 
cii = kzalloc(sizeof(struct codec_info_item), GFP_KERNEL);
-   if (!cii) {
-   printk(KERN_DEBUG "i2sbus: failed to allocate cii\n");
+   if (!cii)
return -ENOMEM;
-   }
 
/* use the private data to point to the codec info */
cii->sdev = soundbus_dev_get(dev);
-- 
2.25.1




Re: [RFC PATCH 8/8] powerpc/papr_scm: Use FORM2 associativity details

2021-06-17 Thread David Gibson
On Tue, Jun 15, 2021 at 12:35:17PM +0530, Aneesh Kumar K.V wrote:
> David Gibson  writes:
> 
> > On Tue, Jun 15, 2021 at 11:27:50AM +0530, Aneesh Kumar K.V wrote:
> >> David Gibson  writes:
> >> 
> >> > On Mon, Jun 14, 2021 at 10:10:03PM +0530, Aneesh Kumar K.V wrote:
> >> >> FORM2 introduces a concept of secondary domain which is identical to the
> >> >> concept of FORM1 primary domain. Use the secondary domain as the numa node
> >> >> when using a persistent memory device. For DAX kmem use the logical domain
> >> >> id introduced in FORM2. This new numa node
> >> >> 
> >> >> Signed-off-by: Aneesh Kumar K.V 
> >> >> ---
> >> >>  arch/powerpc/mm/numa.c| 28 +++
> >> >>  arch/powerpc/platforms/pseries/papr_scm.c | 26 +
> >> >>  arch/powerpc/platforms/pseries/pseries.h  |  1 +
> >> >>  3 files changed, 45 insertions(+), 10 deletions(-)
> >> >> 
> >> >> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
> >> >> index 86cd2af014f7..b9ac6d02e944 100644
> >> >> --- a/arch/powerpc/mm/numa.c
> >> >> +++ b/arch/powerpc/mm/numa.c
> >> >> @@ -265,6 +265,34 @@ static int associativity_to_nid(const __be32 
> >> >> *associativity)
> >> >> return nid;
> >> >>  }
> >> >>  
> >> >> +int get_primary_and_secondary_domain(struct device_node *node, int 
> >> >> *primary, int *secondary)
> >> >> +{
> >> >> +   int secondary_index;
> >> >> +   const __be32 *associativity;
> >> >> +
> >> >> +   if (!numa_enabled) {
> >> >> +   *primary = NUMA_NO_NODE;
> >> >> +   *secondary = NUMA_NO_NODE;
> >> >> +   return 0;
> >> >> +   }
> >> >> +
> >> >> +   associativity = of_get_associativity(node);
> >> >> +   if (!associativity)
> >> >> +   return -ENODEV;
> >> >> +
> >> >> +	if (of_read_number(associativity, 1) >= primary_domain_index) {
> >> >> +		*primary = of_read_number(&associativity[primary_domain_index], 1);
> >> >> +		secondary_index = of_read_number(&distance_ref_points[1], 1);
> >> >
> >> > Secondary ID is always the second reference point, but primary depends
> >> > on the length of resources?  That seems very weird.
> >> 
> >> primary_domain_index is distance_ref_point[0]. With Form2 we would find
> >> both primary and secondary domain ID same for all resources other than
> >> persistent memory device. The usage w.r.t. persistent memory is
> >> explained in patch 7.
> >
> > Right, I misunderstood
> >
> >> 
> >> With Form2 the primary domainID and secondary domainID are used to 
> >> identify the NUMA nodes
> >> the kernel should use when using persistent memory devices.
> >
> > This seems kind of bogus.  With Form1, the primary/secondary ID are a
> > sort of hierarchy of distance (things with same primary ID are very
> > close, things with same secondary are kinda-close, etc.).  With Form2,
> > it's referring to their effective node for different purposes.
> >
> > Using the same terms for different meanings seems unnecessarily
> > confusing.
> 
> They are essentially domainIDs. The interpretation of them is different
> between Form1 and Form2. Hence I kept referring to them as primary and
> secondary domainID. Any suggestion on what to name them with Form2?

My point is that reusing associativity-reference-points for something
with completely unrelated semantics seems like a very poor choice.

> >> Persistent memory devices can also be used as regular memory using the
> >> DAX KMEM driver, and the primary domainID indicates the numa node number
> >> the OS should use when using these devices as regular memory. The
> >> secondary domainID is the numa node number that should be used when
> >> using the device as persistent memory.
> >
> > It's weird to me that you'd want to consider them in different nodes
> > for those different purposes.
> 
> 
>  -----------------
> | NUMA node0      |
> | ProcA --> MEMA  |
> |   |             |
> |   |             |
> |    --> PMEMB    |
> |                 |
>  -----------------
> 
>  -----------------
> | NUMA node1      |
> |                 |
> | ProcB --> MEMC  |
> |   |             |
> |    --> PMEMD    |
> |                 |
>  -----------------
> 
> 
> For a topology like the above, an application running on ProcA wants to find a
> persistent memory mount local to its NUMA node. Hence when using it as
> pmem fsdax mount or devdax device we want PMEMB to have associativity
> of NUMA node0 and PMEMD to have associativity of NUMA node 1. But when
> we want to use it as memory using dax kmem driver, we 

Re: [RFC PATCH 7/8] powerpc/pseries: Add support for FORM2 associativity

2021-06-17 Thread David Gibson
On Tue, Jun 15, 2021 at 01:10:27PM +0530, Aneesh Kumar K.V wrote:
> David Gibson  writes:
> 
> > On Tue, Jun 15, 2021 at 10:58:42AM +0530, Aneesh Kumar K.V wrote:
> >> David Gibson  writes:
> >> 
> >> > On Mon, Jun 14, 2021 at 10:10:02PM +0530, Aneesh Kumar K.V wrote:
> >> >> Signed-off-by: Daniel Henrique Barboza 
> >> >> Signed-off-by: Aneesh Kumar K.V 
> >> >> ---
> >> >>  Documentation/powerpc/associativity.rst   | 139 
> >> >>  arch/powerpc/include/asm/firmware.h   |   3 +-
> >> >>  arch/powerpc/include/asm/prom.h   |   1 +
> >> >>  arch/powerpc/kernel/prom_init.c   |   3 +-
> >> >>  arch/powerpc/mm/numa.c| 149 +-
> >> >>  arch/powerpc/platforms/pseries/firmware.c |   1 +
> >> >>  6 files changed, 290 insertions(+), 6 deletions(-)
> >> >>  create mode 100644 Documentation/powerpc/associativity.rst
> >> >> 
> >> >> diff --git a/Documentation/powerpc/associativity.rst 
> >> >> b/Documentation/powerpc/associativity.rst
> >> >> new file mode 100644
> >> >> index ..58abedea81d7
> >> >> --- /dev/null
> >> >> +++ b/Documentation/powerpc/associativity.rst
> >> >> @@ -0,0 +1,139 @@
> >> >> +
> >> >> +NUMA resource associativity
> >> >> +===========================
> >> >> +
> >> >> +Associativity represents the groupings of the various platform 
> >> >> resources into
> >> >> +domains of substantially similar mean performance relative to 
> >> >> resources outside
> >> >> +of that domain. Resource subsets of a given domain that exhibit better
> >> >> +performance relative to each other than relative to other resource
> >> >> +subsets
> >> >> +are represented as being members of a sub-grouping domain. This 
> >> >> performance
> >> >> +characteristic is presented in terms of NUMA node distance within the 
> >> >> Linux kernel.
> >> >> +From the platform view, these groups are also referred to as domains.
> >> >> +
> >> >> +PAPR interface currently supports two different ways of communicating 
> >> >> these resource
> >> >
> >> > You describe form 2 below as well, which contradicts this.
> >> 
> >> Fixed as below.
> >> 
> >> PAPR interface currently supports different ways of communicating these 
> >> resource
> >> grouping details to the OS. These are referred to as Form 0, Form 1 and 
> >> Form2
> >> associativity grouping. Form 0 is the older format and is now considered 
> >> deprecated.
> >> 
> >> Hypervisor indicates the type/form of associativity used via 
> >> "ibm,architecture-vec-5" property.
> >> Bit 0 of byte 5 in the "ibm,architecture-vec-5" property indicates usage 
> >> of Form 0 or Form 1.
> >> A value of 1 indicates the usage of Form 1 associativity. For Form 2 
> >> associativity
> >> bit 2 of byte 5 in the "ibm,architecture-vec-5" property is used.
> >
> > LGTM.
> >
> >> >> +grouping details to the OS. These are referred to as Form 0 and Form 1 
> >> >> associativity grouping.
> >> >> +Form 0 is the older format and is now considered deprecated.
> >> >> +
> >> >> +Hypervisor indicates the type/form of associativity used via 
> >> >> "ibm,architecture-vec-5" property.
> >> >> +Bit 0 of byte 5 in the "ibm,architecture-vec-5" property indicates 
> >> >> usage of Form 0 or Form 1.
> >> >> +A value of 1 indicates the usage of Form 1 associativity.
> >> >> +
> >> >> +Form 0
> >> >> +-
> >> >> +Form 0 associativity supports only two NUMA distance (LOCAL and 
> >> >> REMOTE).
> >> >> +
> >> >> +Form 1
> >> >> +-
> >> >> +With Form 1 a combination of ibm,associativity-reference-points and 
> >> >> ibm,associativity
> >> >> +device tree properties are used to determine the NUMA distance between 
> >> >> resource groups/domains. 
> >> >> +
> >> >> +The “ibm,associativity” property contains one or more lists of numbers 
> >> >> (domainID)
> >> >> +representing the resource’s platform grouping domains.
> >> >> +
> >> >> +The “ibm,associativity-reference-points” property contains one or more 
> >> >> list of numbers
> >> >> +(domain index) that represents the 1 based ordinal in the 
> >> >> associativity lists of the most
> >> >> +significant boundary, with subsequent entries indicating progressively 
> >> >> less significant boundaries.
> >> >> +
> >> >> +Linux kernel uses the domain id of the most significant boundary (aka 
> >> >> primary domain)
> >> >
> >> > I thought we used the *least* significant boundary (the smallest
> >> > grouping, not the largest).  That is, the last index, not the first.
> >> >
> >> > Actually... come to think of it, I'm not even sure how to interpret
> >> > "most significant".  Does that mean a change in grouping at that "most
> >> > significant" level results in the largest performance difference?
> >> 
> >> PAPR defines "most significant" as below
> >> 
> >> When the “ibm,architecture-vec-5” property byte 5 bit 0 has the value of 
> >> one, the “ibm,associativity-reference-points” property indicates
> >> boundaries between associativity 
> >> 

[PATCH v15 4/4] kasan: use MAX_PTRS_PER_* for early shadow tables

2021-06-17 Thread Daniel Axtens
powerpc has a variable number of PTRS_PER_*, set at runtime based
on the MMU that the kernel is booted under.

This means the PTRS_PER_* are no longer constants, and therefore
breaks the build. Switch to using MAX_PTRS_PER_*, which are constant.

Suggested-by: Christophe Leroy 
Suggested-by: Balbir Singh 
Reviewed-by: Christophe Leroy 
Reviewed-by: Balbir Singh 
Reviewed-by: Marco Elver 
Signed-off-by: Daniel Axtens 
---
 include/linux/kasan.h | 6 +++---
 mm/kasan/init.c   | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index 768d7d342757..5310e217bd74 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -41,9 +41,9 @@ struct kunit_kasan_expectation {
 #endif
 
 extern unsigned char kasan_early_shadow_page[PAGE_SIZE];
-extern pte_t kasan_early_shadow_pte[PTRS_PER_PTE + PTE_HWTABLE_PTRS];
-extern pmd_t kasan_early_shadow_pmd[PTRS_PER_PMD];
-extern pud_t kasan_early_shadow_pud[PTRS_PER_PUD];
+extern pte_t kasan_early_shadow_pte[MAX_PTRS_PER_PTE + PTE_HWTABLE_PTRS];
+extern pmd_t kasan_early_shadow_pmd[MAX_PTRS_PER_PMD];
+extern pud_t kasan_early_shadow_pud[MAX_PTRS_PER_PUD];
 extern p4d_t kasan_early_shadow_p4d[MAX_PTRS_PER_P4D];
 
 int kasan_populate_early_shadow(const void *shadow_start,
diff --git a/mm/kasan/init.c b/mm/kasan/init.c
index 348f31d15a97..cc64ed6858c6 100644
--- a/mm/kasan/init.c
+++ b/mm/kasan/init.c
@@ -41,7 +41,7 @@ static inline bool kasan_p4d_table(pgd_t pgd)
 }
 #endif
 #if CONFIG_PGTABLE_LEVELS > 3
-pud_t kasan_early_shadow_pud[PTRS_PER_PUD] __page_aligned_bss;
+pud_t kasan_early_shadow_pud[MAX_PTRS_PER_PUD] __page_aligned_bss;
 static inline bool kasan_pud_table(p4d_t p4d)
 {
return p4d_page(p4d) == virt_to_page(lm_alias(kasan_early_shadow_pud));
@@ -53,7 +53,7 @@ static inline bool kasan_pud_table(p4d_t p4d)
 }
 #endif
 #if CONFIG_PGTABLE_LEVELS > 2
-pmd_t kasan_early_shadow_pmd[PTRS_PER_PMD] __page_aligned_bss;
+pmd_t kasan_early_shadow_pmd[MAX_PTRS_PER_PMD] __page_aligned_bss;
 static inline bool kasan_pmd_table(pud_t pud)
 {
return pud_page(pud) == virt_to_page(lm_alias(kasan_early_shadow_pmd));
@@ -64,7 +64,7 @@ static inline bool kasan_pmd_table(pud_t pud)
return false;
 }
 #endif
-pte_t kasan_early_shadow_pte[PTRS_PER_PTE + PTE_HWTABLE_PTRS]
+pte_t kasan_early_shadow_pte[MAX_PTRS_PER_PTE + PTE_HWTABLE_PTRS]
__page_aligned_bss;
 
 static inline bool kasan_pte_table(pmd_t pmd)
-- 
2.30.2



[PATCH v15 3/4] mm: define default MAX_PTRS_PER_* in include/pgtable.h

2021-06-17 Thread Daniel Axtens
Commit c65e774fb3f6 ("x86/mm: Make PGDIR_SHIFT and PTRS_PER_P4D variable")
made PTRS_PER_P4D variable on x86 and introduced MAX_PTRS_PER_P4D as a
constant for cases which need a compile-time constant (e.g. fixed-size
arrays).

powerpc likewise has boot-time selectable MMU features which can cause
other mm "constants" to vary. For KASAN, we have some static
PTE/PMD/PUD/P4D arrays so we need compile-time maximums for all these
constants. Extend the MAX_PTRS_PER_ idiom, and place default definitions
in include/pgtable.h. These define MAX_PTRS_PER_x to be PTRS_PER_x unless
an architecture has defined MAX_PTRS_PER_x in its arch headers.

Clean up pgtable-nop4d.h and s390's MAX_PTRS_PER_P4D definitions while
we're at it: both can just pick up the default now.
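
For illustration, an arch header on such a platform might carry something
like the following (hypothetical names and values, not powerpc's actual
definitions):

/* PTRS_PER_PTE varies with the MMU selected at boot ... */
#define PTRS_PER_PTE		(1 << mmu_pte_index_size)
/* ... so provide a compile-time maximum for static arrays. */
#define MAX_PTRS_PER_PTE	(1 << 9)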

Reviewed-by: Christophe Leroy 
Reviewed-by: Marco Elver 
Signed-off-by: Daniel Axtens 

---

s390 was compile tested only.
---
 arch/s390/include/asm/pgtable.h |  2 --
 include/asm-generic/pgtable-nop4d.h |  1 -
 include/linux/pgtable.h | 22 ++
 3 files changed, 22 insertions(+), 3 deletions(-)

diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index 7c66ae5d7e32..cf05954ce013 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -342,8 +342,6 @@ static inline int is_module_addr(void *addr)
 #define PTRS_PER_P4D   _CRST_ENTRIES
 #define PTRS_PER_PGD   _CRST_ENTRIES
 
-#define MAX_PTRS_PER_P4D   PTRS_PER_P4D
-
 /*
  * Segment table and region3 table entry encoding
  * (R = read-only, I = invalid, y = young bit):
diff --git a/include/asm-generic/pgtable-nop4d.h 
b/include/asm-generic/pgtable-nop4d.h
index ce2cbb3c380f..2f6b1befb129 100644
--- a/include/asm-generic/pgtable-nop4d.h
+++ b/include/asm-generic/pgtable-nop4d.h
@@ -9,7 +9,6 @@
 typedef struct { pgd_t pgd; } p4d_t;
 
 #define P4D_SHIFT  PGDIR_SHIFT
-#define MAX_PTRS_PER_P4D   1
 #define PTRS_PER_P4D   1
 #define P4D_SIZE   (1UL << P4D_SHIFT)
 #define P4D_MASK   (~(P4D_SIZE-1))
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index 9e6f71265f72..69700e3e615f 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -1625,4 +1625,26 @@ typedef unsigned int pgtbl_mod_mask;
 #define pte_leaf_size(x) PAGE_SIZE
 #endif
 
+/*
+ * Some architectures have MMUs that are configurable or selectable at boot
+ * time. These lead to variable PTRS_PER_x. For statically allocated arrays it
+ * helps to have a static maximum value.
+ */
+
+#ifndef MAX_PTRS_PER_PTE
+#define MAX_PTRS_PER_PTE PTRS_PER_PTE
+#endif
+
+#ifndef MAX_PTRS_PER_PMD
+#define MAX_PTRS_PER_PMD PTRS_PER_PMD
+#endif
+
+#ifndef MAX_PTRS_PER_PUD
+#define MAX_PTRS_PER_PUD PTRS_PER_PUD
+#endif
+
+#ifndef MAX_PTRS_PER_P4D
+#define MAX_PTRS_PER_P4D PTRS_PER_P4D
+#endif
+
 #endif /* _LINUX_PGTABLE_H */
-- 
2.30.2



[PATCH v15 2/4] kasan: allow architectures to provide an outline readiness check

2021-06-17 Thread Daniel Axtens
Allow architectures to define a kasan_arch_is_ready() hook that bails
out of any function that's about to touch the shadow unless the arch
says that it is ready for the memory to be accessed. This is fairly
non-invasive and should have a negligible performance penalty.

This will only work in outline mode, so an arch must specify
ARCH_DISABLE_KASAN_INLINE if it requires this.
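
For reference, an arch-side definition could be as simple as a static key
flipped once the shadow is mapped -- a hedged sketch with a made-up key
name; the real ppc64 hook is part of the separate powerpc series:

/* Hypothetical arch header (e.g. asm/kasan.h). */
#include <linux/jump_label.h>
#include <linux/types.h>

DECLARE_STATIC_KEY_FALSE(arch_kasan_ready_key);	/* made-up key name */

static __always_inline bool kasan_arch_is_ready(void)
{
	return static_branch_likely(&arch_kasan_ready_key);
}
#define kasan_arch_is_ready kasan_arch_is_ready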

Cc: Balbir Singh 
Cc: Aneesh Kumar K.V 
Suggested-by: Christophe Leroy 
Reviewed-by: Marco Elver 
Signed-off-by: Daniel Axtens 

--

Both previous RFCs for ppc64 - by 2 different people - have
needed this trick! See:
 - https://lore.kernel.org/patchwork/patch/592820/ # ppc64 hash series
 - https://patchwork.ozlabs.org/patch/795211/  # ppc radix series

Build tested on arm64 with SW_TAGS and x86 with INLINE: the error fires
if I add a kasan_arch_is_ready define.
---
 mm/kasan/common.c  | 4 
 mm/kasan/generic.c | 3 +++
 mm/kasan/kasan.h   | 6 ++
 mm/kasan/shadow.c  | 8 
 4 files changed, 21 insertions(+)

diff --git a/mm/kasan/common.c b/mm/kasan/common.c
index 10177cc26d06..0ad615f3801d 100644
--- a/mm/kasan/common.c
+++ b/mm/kasan/common.c
@@ -331,6 +331,10 @@ static inline bool kasan_slab_free(struct kmem_cache 
*cache, void *object,
u8 tag;
void *tagged_object;
 
+   /* Bail if the arch isn't ready */
+   if (!kasan_arch_is_ready())
+   return false;
+
tag = get_tag(object);
tagged_object = object;
object = kasan_reset_tag(object);
diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c
index 53cbf28859b5..c3f5ba7a294a 100644
--- a/mm/kasan/generic.c
+++ b/mm/kasan/generic.c
@@ -163,6 +163,9 @@ static __always_inline bool check_region_inline(unsigned 
long addr,
size_t size, bool write,
unsigned long ret_ip)
 {
+   if (!kasan_arch_is_ready())
+   return true;
+
if (unlikely(size == 0))
return true;
 
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
index 8f450bc28045..4dbc8def64f4 100644
--- a/mm/kasan/kasan.h
+++ b/mm/kasan/kasan.h
@@ -449,6 +449,12 @@ static inline void kasan_poison_last_granule(const void 
*address, size_t size) {
 
 #endif /* CONFIG_KASAN_GENERIC */
 
+#ifndef kasan_arch_is_ready
+static inline bool kasan_arch_is_ready(void)   { return true; }
+#elif !defined(CONFIG_KASAN_GENERIC) || !defined(CONFIG_KASAN_OUTLINE)
+#error kasan_arch_is_ready only works in KASAN generic outline mode!
+#endif
+
 /*
  * Exported functions for interfaces called from assembly or from generated
  * code. Declarations here to avoid warning about missing declarations.
diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c
index 082ee5b6d9a1..3c7f7efe6f68 100644
--- a/mm/kasan/shadow.c
+++ b/mm/kasan/shadow.c
@@ -73,6 +73,10 @@ void kasan_poison(const void *addr, size_t size, u8 value, 
bool init)
 {
void *shadow_start, *shadow_end;
 
+   /* Don't touch the shadow memory if arch isn't ready */
+   if (!kasan_arch_is_ready())
+   return;
+
/*
 * Perform shadow offset calculation based on untagged address, as
 * some of the callers (e.g. kasan_poison_object_data) pass tagged
@@ -99,6 +103,10 @@ EXPORT_SYMBOL(kasan_poison);
 #ifdef CONFIG_KASAN_GENERIC
 void kasan_poison_last_granule(const void *addr, size_t size)
 {
+   /* Don't touch the shadow memory if arch isn't ready */
+   if (!kasan_arch_is_ready())
+   return;
+
if (size & KASAN_GRANULE_MASK) {
u8 *shadow = (u8 *)kasan_mem_to_shadow(addr + size);
*shadow = size & KASAN_GRANULE_MASK;
-- 
2.30.2



[PATCH v15 1/4] kasan: allow an architecture to disable inline instrumentation

2021-06-17 Thread Daniel Axtens
For annoying architectural reasons, it's very difficult to support inline
instrumentation on powerpc64.*

Add a Kconfig flag to allow an arch to disable inline. (It's a bit
annoying to be 'backwards', but I'm not aware of any way to have
an arch force a symbol to be 'n', rather than 'y'.)
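
In practice the reversal just means the arch selects the symbol, e.g.
(hypothetical Kconfig fragment, not from this series):

config MY_ARCH
	select ARCH_DISABLE_KASAN_INLINE if MY_ARCH_VARIABLE_MMU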

We also disable stack instrumentation in this case as it does things that
are functionally equivalent to inline instrumentation, namely adding
code that touches the shadow directly without going through a C helper.

* on ppc64 atm, the shadow lives in virtual memory and isn't accessible in
real mode. However, before we turn on virtual memory, we parse the device
tree to determine which platform and MMU we're running under. That calls
generic DT code, which is instrumented. Inline instrumentation in DT would
unconditionally attempt to touch the shadow region, which we won't have
set up yet, and would crash. We can make outline mode wait for the arch to
be ready, but we can't change what the compiler inserts for inline mode.
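
Roughly, the difference looks like this (hand-sketched, not actual
compiler output):

/*
 * Outline mode: the compiler emits a call the runtime can gate:
 *
 *	__asan_load8(addr);	// can bail early via kasan_arch_is_ready()
 *
 * Inline mode: the check is baked into the instruction stream:
 *
 *	s8 *shadow = kasan_mem_to_shadow(addr);
 *	if (unlikely(*shadow))	// touches the shadow unconditionally
 *		__asan_report_load8_noabort(addr);
 */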

Reviewed-by: Marco Elver 
Signed-off-by: Daniel Axtens 
---
 lib/Kconfig.kasan | 14 ++
 1 file changed, 14 insertions(+)

diff --git a/lib/Kconfig.kasan b/lib/Kconfig.kasan
index cffc2ebbf185..cb5e02d09e11 100644
--- a/lib/Kconfig.kasan
+++ b/lib/Kconfig.kasan
@@ -12,6 +12,15 @@ config HAVE_ARCH_KASAN_HW_TAGS
 config HAVE_ARCH_KASAN_VMALLOC
bool
 
+config ARCH_DISABLE_KASAN_INLINE
+   bool
+   help
+ Sometimes an architecture might not be able to support inline
+ instrumentation but might be able to support outline instrumentation.
+ This option allows an architecture to prevent inline and stack
+ instrumentation from being enabled.
+
+
 config CC_HAS_KASAN_GENERIC
def_bool $(cc-option, -fsanitize=kernel-address)
 
@@ -130,6 +139,7 @@ config KASAN_OUTLINE
 
 config KASAN_INLINE
bool "Inline instrumentation"
+   depends on !ARCH_DISABLE_KASAN_INLINE
help
  Compiler directly inserts code checking shadow memory before
  memory accesses. This is faster than outline (in some workloads
@@ -141,6 +151,7 @@ endchoice
 config KASAN_STACK
bool "Enable stack instrumentation (unsafe)" if CC_IS_CLANG && 
!COMPILE_TEST
depends on KASAN_GENERIC || KASAN_SW_TAGS
+   depends on !ARCH_DISABLE_KASAN_INLINE
default y if CC_IS_GCC
help
	  The LLVM stack address sanitizer has a known problem that
@@ -154,6 +165,9 @@ config KASAN_STACK
  but clang users can still enable it for builds without
  CONFIG_COMPILE_TEST.  On gcc it is assumed to always be safe
  to use and enabled by default.
+ If the architecture disables inline instrumentation, this is
+ also disabled as it adds inline-style instrumentation that
+ is run unconditionally.
 
 config KASAN_SW_TAGS_IDENTIFY
bool "Enable memory corruption identification"
-- 
2.30.2



[PATCH v15 0/4] KASAN core changes for ppc64 radix KASAN

2021-06-17 Thread Daniel Axtens
Building on the work of Christophe, Aneesh and Balbir, I've ported
KASAN to 64-bit Book3S kernels running on the Radix MMU. I've been
trying this for a while, but we keep having collisions between the
kasan code in the mm tree and the code I want to put into the ppc
tree.

This series just contains the kasan core changes that we need. These
can go in via the mm tree. I will then propose the powerpc changes for
a later cycle. (The most recent RFC for the powerpc changes is in the
v12 series at
https://lore.kernel.org/linux-mm/20210615014705.2234866-1-...@axtens.net/
)

v15 applies to next-20210611. There should be no noticeable changes to
other platforms.

Changes since v14: Included a bunch of Reviewed-by:s, thanks
Christophe and Marco. Cleaned up the build time error #ifdefs, thanks
Christophe.

Changes since v13: move the MAX_PTR_PER_* definitions out of kasan and
into pgtable.h. Add a build time error to hopefully prevent any
confusion about when the new hook is applicable. Thanks Marco and
Christophe.

Changes since v12: respond to Marco's review comments - clean up the
help for ARCH_DISABLE_KASAN_INLINE, and add an arch readiness check to
the new granule poisoning function. Thanks Marco.

Daniel Axtens (4):
  kasan: allow an architecture to disable inline instrumentation
  kasan: allow architectures to provide an outline readiness check
  mm: define default MAX_PTRS_PER_* in include/pgtable.h
  kasan: use MAX_PTRS_PER_* for early shadow tables

 arch/s390/include/asm/pgtable.h |  2 --
 include/asm-generic/pgtable-nop4d.h |  1 -
 include/linux/kasan.h   |  6 +++---
 include/linux/pgtable.h | 22 ++
 lib/Kconfig.kasan   | 14 ++
 mm/kasan/common.c   |  4 
 mm/kasan/generic.c  |  3 +++
 mm/kasan/init.c |  6 +++---
 mm/kasan/kasan.h|  6 ++
 mm/kasan/shadow.c   |  8 
 10 files changed, 63 insertions(+), 9 deletions(-)

-- 
2.30.2



Re: [PATCH] selftests/powerpc: Add a test of sigreturn vs VDSO

2021-06-17 Thread Christophe Leroy




On 26/03/2020 at 13:06, Michael Ellerman wrote:

On Wed, 2020-03-04 at 11:04:02 UTC, Michael Ellerman wrote:

There are two different paths through the sigreturn code, depending on
whether the VDSO is mapped or not. We recently discovered a bug in the
unmapped case, because it's not commonly used these days.

So add a test that sends itself a signal, then moves the VDSO, takes
another signal and finally unmaps the VDSO before sending itself
another signal. That tests the standard signal path, the code that
handles the VDSO being moved, and also the signal path in the case
where the VDSO is unmapped.
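
A hedged sketch of that flow (not the actual selftest source; the
one-page VDSO size is a simplifying assumption):

#define _GNU_SOURCE
#include <signal.h>
#include <sys/auxv.h>
#include <sys/mman.h>
#include <unistd.h>

static void handler(int sig) { (void)sig; }	/* returning exercises sigreturn */

int main(void)
{
	void *vdso = (void *)getauxval(AT_SYSINFO_EHDR);
	size_t size = getpagesize();		/* assumption: one page */

	signal(SIGUSR1, handler);
	raise(SIGUSR1);				/* 1: normal signal path */

	/* Reserve a spot and move the VDSO there. */
	void *spot = mmap(NULL, size, PROT_NONE,
			  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	vdso = mremap(vdso, size, size, MREMAP_MAYMOVE | MREMAP_FIXED, spot);
	raise(SIGUSR1);				/* 2: VDSO moved */

	munmap(vdso, size);
	raise(SIGUSR1);				/* 3: VDSO unmapped */
	return 0;
}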

Signed-off-by: Michael Ellerman 


Applied to powerpc next.

https://git.kernel.org/powerpc/c/a0968a025c04702427a4aee2c618f451a5098cd8

cheers



Doesn't work anymore since the split of VDSO and VVAR.

Christophe


Re: [PATCH 04/11] crypto: marvell: cesa: change FPGA indirect article to an

2021-06-17 Thread Herbert Xu
On Tue, Jun 08, 2021 at 02:23:43PM -0700, t...@redhat.com wrote:
> From: Tom Rix 
> 
> Change use of 'a fpga' to 'an fpga'
> 
> Signed-off-by: Tom Rix 
> ---
>  drivers/crypto/marvell/cesa/cesa.h | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)

Patch applied.  Thanks.
-- 
Email: Herbert Xu 
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


[PATCH v3 2/8] powerpc/pseries: rename distance_ref_points_depth to max_associativity_domain_index

2021-06-17 Thread Aneesh Kumar K.V
No functional change in this patch

Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/mm/numa.c | 20 ++--
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 8365b298ec48..132813dd1a6c 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -56,7 +56,7 @@ static int n_mem_addr_cells, n_mem_size_cells;
 static int form1_affinity;
 
 #define MAX_DISTANCE_REF_POINTS 4
-static int distance_ref_points_depth;
+static int max_associativity_domain_index;
 static const __be32 *distance_ref_points;
 static int distance_lookup_table[MAX_NUMNODES][MAX_DISTANCE_REF_POINTS];
 
@@ -169,7 +169,7 @@ int cpu_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc)
 
int i, index;
 
-   for (i = 0; i < distance_ref_points_depth; i++) {
+   for (i = 0; i < max_associativity_domain_index; i++) {
index = be32_to_cpu(distance_ref_points[i]);
if (cpu1_assoc[index] == cpu2_assoc[index])
break;
@@ -193,7 +193,7 @@ int __node_distance(int a, int b)
if (!form1_affinity)
return ((a == b) ? LOCAL_DISTANCE : REMOTE_DISTANCE);
 
-   for (i = 0; i < distance_ref_points_depth; i++) {
+   for (i = 0; i < max_associativity_domain_index; i++) {
if (distance_lookup_table[a][i] == distance_lookup_table[b][i])
break;
 
@@ -213,7 +213,7 @@ static void initialize_distance_lookup_table(int nid,
if (!form1_affinity)
return;
 
-   for (i = 0; i < distance_ref_points_depth; i++) {
+   for (i = 0; i < max_associativity_domain_index; i++) {
const __be32 *entry;
 
entry = &associativity[be32_to_cpu(distance_ref_points[i]) - 1];
@@ -240,7 +240,7 @@ static int associativity_to_nid(const __be32 *associativity)
nid = NUMA_NO_NODE;
 
if (nid > 0 &&
-   of_read_number(associativity, 1) >= distance_ref_points_depth) {
+   of_read_number(associativity, 1) >= 
max_associativity_domain_index) {
/*
 * Skip the length field and send start of associativity array
 */
@@ -310,14 +310,14 @@ static int __init find_primary_domain_index(void)
 */
distance_ref_points = of_get_property(root,
"ibm,associativity-reference-points",
-   &distance_ref_points_depth);
+   &max_associativity_domain_index);
 
if (!distance_ref_points) {
dbg("NUMA: ibm,associativity-reference-points not found.\n");
goto err;
}
 
-   distance_ref_points_depth /= sizeof(int);
+   max_associativity_domain_index /= sizeof(int);
 
if (firmware_has_feature(FW_FEATURE_OPAL) ||
firmware_has_feature(FW_FEATURE_TYPE1_AFFINITY)) {
@@ -328,7 +328,7 @@ static int __init find_primary_domain_index(void)
if (form1_affinity) {
index = of_read_number(distance_ref_points, 1);
} else {
-   if (distance_ref_points_depth < 2) {
+   if (max_associativity_domain_index < 2) {
printk(KERN_WARNING "NUMA: "
"short ibm,associativity-reference-points\n");
goto err;
@@ -341,10 +341,10 @@ static int __init find_primary_domain_index(void)
 * Warn and cap if the hardware supports more than
 * MAX_DISTANCE_REF_POINTS domains.
 */
-   if (distance_ref_points_depth > MAX_DISTANCE_REF_POINTS) {
+   if (max_associativity_domain_index > MAX_DISTANCE_REF_POINTS) {
printk(KERN_WARNING "NUMA: distance array capped at "
"%d entries\n", MAX_DISTANCE_REF_POINTS);
-   distance_ref_points_depth = MAX_DISTANCE_REF_POINTS;
+   max_associativity_domain_index = MAX_DISTANCE_REF_POINTS;
}
 
of_node_put(root);
-- 
2.31.1



[PATCH v3 8/8] powerpc/papr_scm: Use FORM2 associativity details

2021-06-17 Thread Aneesh Kumar K.V
FORM2 introduces a concept of secondary domain which is identical to the
concept of FORM1 primary domain. Use the secondary domain as the numa node
when using persistent memory devices. With DAX kmem the kernel can use the
primary domainID introduced in Form2. More details can be found in the
patch "powerpc/pseries: Add support for FORM2 associativity"

Cc: nvd...@lists.linux.dev
Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/mm/numa.c| 28 +++
 arch/powerpc/platforms/pseries/papr_scm.c | 26 +
 arch/powerpc/platforms/pseries/pseries.h  |  1 +
 3 files changed, 45 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 5a7d94960fb7..cd3ae7ff77ac 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -265,6 +265,34 @@ static int associativity_to_nid(const __be32 
*associativity)
return nid;
 }
 
+int get_primary_and_secondary_domain(struct device_node *node, int *primary, 
int *secondary)
+{
+   int secondary_index;
+   const __be32 *associativity;
+
+   if (!numa_enabled) {
+   *primary = NUMA_NO_NODE;
+   *secondary = NUMA_NO_NODE;
+   return 0;
+   }
+
+   associativity = of_get_associativity(node);
+   if (!associativity)
+   return -ENODEV;
+
+   if (of_read_number(associativity, 1) >= primary_domain_index) {
+   *primary = of_read_number(&associativity[primary_domain_index], 1);
+   secondary_index = of_read_number(&distance_ref_points[1], 1);
+   *secondary = of_read_number(&associativity[secondary_index], 1);
+   }
+   if (*primary == 0xffff || *primary >= nr_node_ids)
+   *primary = NUMA_NO_NODE;
+
+   if (*secondary == 0xffff || *secondary >= nr_node_ids)
+   *secondary = NUMA_NO_NODE;
+   return 0;
+}
+
 /* Returns the nid associated with the given device tree node,
  * or -1 if not found.
  */
diff --git a/arch/powerpc/platforms/pseries/papr_scm.c 
b/arch/powerpc/platforms/pseries/papr_scm.c
index ef26fe40efb0..9bf2f1f3ddc5 100644
--- a/arch/powerpc/platforms/pseries/papr_scm.c
+++ b/arch/powerpc/platforms/pseries/papr_scm.c
@@ -18,6 +18,7 @@
 #include 
 #include 
 #include 
+#include "pseries.h"
 
 #define BIND_ANY_ADDR (~0ul)
 
@@ -88,6 +89,8 @@ struct papr_scm_perf_stats {
 struct papr_scm_priv {
struct platform_device *pdev;
struct device_node *dn;
+   int numa_node;
+   int target_node;
uint32_t drc_index;
uint64_t blocks;
uint64_t block_size;
@@ -923,7 +926,6 @@ static int papr_scm_nvdimm_init(struct papr_scm_priv *p)
struct nd_mapping_desc mapping;
struct nd_region_desc ndr_desc;
unsigned long dimm_flags;
-   int target_nid, online_nid;
ssize_t stat_size;
 
p->bus_desc.ndctl = papr_scm_ndctl;
@@ -974,10 +976,8 @@ static int papr_scm_nvdimm_init(struct papr_scm_priv *p)
mapping.size = p->blocks * p->block_size; // XXX: potential overflow?
 
memset(&ndr_desc, 0, sizeof(ndr_desc));
-   target_nid = dev_to_node(&p->pdev->dev);
-   online_nid = numa_map_to_online_node(target_nid);
-   ndr_desc.numa_node = online_nid;
-   ndr_desc.target_node = target_nid;
+   ndr_desc.numa_node = p->numa_node;
+   ndr_desc.target_node = p->target_node;
ndr_desc.res = &p->res;
ndr_desc.of_node = p->dn;
ndr_desc.provider_data = p;
@@ -1001,9 +1001,6 @@ static int papr_scm_nvdimm_init(struct papr_scm_priv *p)
ndr_desc.res, p->dn);
goto err;
}
-   if (target_nid != online_nid)
-   dev_info(dev, "Region registered with target node %d and online 
node %d",
-target_nid, online_nid);
 
mutex_lock(&papr_ndr_lock);
list_add_tail(&p->region_list, &papr_nd_regions);
@@ -1096,7 +1093,7 @@ static int papr_scm_probe(struct platform_device *pdev)
struct papr_scm_priv *p;
const char *uuid_str;
u64 uuid[2];
-   int rc;
+   int rc, numa_node;
 
/* check we have all the required DT properties */
if (of_property_read_u32(dn, "ibm,my-drc-index", &drc_index)) {
@@ -1119,11 +1116,20 @@ static int papr_scm_probe(struct platform_device *pdev)
return -ENODEV;
}
 
-
p = kzalloc(sizeof(*p), GFP_KERNEL);
if (!p)
return -ENOMEM;
 
+   if (get_primary_and_secondary_domain(dn, &p->target_node, &numa_node)) {
+   dev_err(&pdev->dev, "%pOF: missing NUMA attributes!\n", dn);
+   rc = -ENODEV;
+   goto err;
+   }
+   p->numa_node = numa_map_to_online_node(numa_node);
+   if (numa_node != p->numa_node)
+   dev_info(&pdev->dev, "Region registered with online node %d and 
device tree node %d",
+p->numa_node, numa_node);
+
/* Initialize the dimm mutex */
mutex_init(&p->health_mutex);
 
diff --git 

[PATCH v3 7/8] powerpc/pseries: Add support for FORM2 associativity

2021-06-17 Thread Aneesh Kumar K.V
PAPR interface currently supports two different ways of communicating resource
grouping details to the OS. These are referred to as Form 0 and Form 1
associativity grouping. Form 0 is the older format and is now considered
deprecated. This patch adds another resource grouping named FORM2.

Signed-off-by: Daniel Henrique Barboza 
Signed-off-by: Aneesh Kumar K.V 
---
 Documentation/powerpc/associativity.rst   | 177 ++
 arch/powerpc/include/asm/firmware.h   |   3 +-
 arch/powerpc/include/asm/prom.h   |   1 +
 arch/powerpc/kernel/prom_init.c   |   3 +-
 arch/powerpc/mm/numa.c| 149 +-
 arch/powerpc/platforms/pseries/firmware.c |   1 +
 6 files changed, 328 insertions(+), 6 deletions(-)
 create mode 100644 Documentation/powerpc/associativity.rst

diff --git a/Documentation/powerpc/associativity.rst 
b/Documentation/powerpc/associativity.rst
new file mode 100644
index ..5fa9352dfc05
--- /dev/null
+++ b/Documentation/powerpc/associativity.rst
@@ -0,0 +1,177 @@
+
+NUMA resource associativity
+===========================
+
+Associativity represents the groupings of the various platform resources into
+domains of substantially similar mean performance relative to resources outside
+of that domain. Resource subsets of a given domain that exhibit better
+performance relative to each other than relative to other resource subsets
+are represented as being members of a sub-grouping domain. This performance
+characteristic is presented in terms of NUMA node distance within the Linux 
kernel.
+From the platform view, these groups are also referred to as domains.
+
+PAPR interface currently supports different ways of communicating these 
resource
+grouping details to the OS. These are referred to as Form 0, Form 1 and Form2
+associativity grouping. Form 0 is the older format and is now considered 
deprecated.
+
+Hypervisor indicates the type/form of associativity used via the
"ibm,architecture-vec-5" property.
+Bit 0 of byte 5 in the "ibm,architecture-vec-5" property indicates usage of 
Form 0 or Form 1.
+A value of 1 indicates the usage of Form 1 associativity. For Form 2 
associativity
+bit 2 of byte 5 in the "ibm,architecture-vec-5" property is used.
+
+Form 0
+------
+Form 0 associativity supports only two NUMA distances (LOCAL and REMOTE).
+
+Form 1
+------
+With Form 1 a combination of ibm,associativity-reference-points and 
ibm,associativity
+device tree properties are used to determine the NUMA distance between 
resource groups/domains.
+
+The “ibm,associativity” property contains one or more lists of numbers 
(domainID)
+representing the resource’s platform grouping domains.
+
+The “ibm,associativity-reference-points” property contains one or more lists of 
numbers
+(domainID index) that represents the 1 based ordinal in the associativity 
lists of the
+least significant boundary, with subsequent entries indicating progressively 
higher
+significant boundaries.
+
+ex:
+{ primary domainID index, secondary domainID index, tertiary domainID index.. }
+
+Linux kernel uses the domainID of the least significant boundary (aka primary 
domain)
+as the NUMA node id. Linux kernel computes NUMA distance between two domains by
+recursively comparing if they belong to the same higher-level domains. For 
mismatch
+at every higher level of the resource group, the kernel doubles the NUMA 
distance between
+the comparing domains.
+
+Form 2
+------
+Form 2 associativity format adds separate device tree properties representing 
NUMA node distance
+thereby making the node distance computation flexible. Form 2 also allows 
flexible primary
+domain numbering. With numa distance computation now detached from the index 
value of
+"ibm,associativity" property, Form 2 allows a large number of primary domain 
ids at the
+same domainID index representing resource groups of different 
performance/latency characteristics.
+
+Hypervisor indicates the usage of FORM2 associativity using bit 2 of byte 5 in 
the
+"ibm,architecture-vec-5" property.
+
+"ibm,numa-lookup-index-table" property contains one or more list numbers 
representing
+the domainIDs present in the system. The offset of the domainID in this 
property is considered
+the domainID index.
+
+prop-encoded-array: The number N of the domainIDs encoded as with encode-int, 
followed by
+N domainIDs encoded as with encode-int
+
+For ex:
+ibm,numa-lookup-index-table =  {4, 0, 8, 250, 252}, domainID index for 
domainID 8 is 1.
+
+"ibm,numa-distance-table" property contains one or more list of numbers 
representing the NUMA
+distance between resource groups/domains present in the system.
+
+prop-encoded-array: The number N of the distance values encoded as with 
encode-int, followed by
+N distance values encoded as with encode-bytes. The max distance value we 
could encode is 255.
+
+For ex:
+ibm,numa-lookup-index-table =  {3, 0, 8, 40}
+ibm,numa-distance-table =  {9, 10, 20, 80, 
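
To make the two schemes concrete, here is a toy sketch. All values are
hypothetical (only the first Form 2 row echoes the example above), and it
assumes "ibm,numa-distance-table" is a row-major N x N matrix:

#include <stdio.h>

#define LOCAL_DISTANCE 10

/* Form 1, mirroring __node_distance(): walk the reference points and
 * double the distance at every level where the two domainID lists still
 * differ. */
static int form1_distance(const int *a, const int *b, int depth)
{
	int i, distance = LOCAL_DISTANCE;

	for (i = 0; i < depth; i++) {
		if (a[i] == b[i])
			break;
		distance *= 2;
	}
	return distance;
}

/* Form 2: a direct table lookup, no doubling rule at all. */
static int form2_distance(const unsigned char *dist, int n, int ia, int ib)
{
	return dist[ia * n + ib];
}

int main(void)
{
	/* Form 1: differ at the two finest levels, match at the coarsest,
	 * so 10 * 2 * 2 = 40. */
	int a[] = { 7, 6, 4 }, b[] = { 12, 9, 4 };

	/* Form 2: with ibm,numa-lookup-index-table = {3, 0, 8, 40} the
	 * domainIDs 0, 8 and 40 get indices 0, 1 and 2; distance(8, 40)
	 * is row 1, column 2. */
	unsigned char dist[] = { 10,  20,  80,
				 20,  10, 160,
				 80, 160,  10 };

	printf("form1: %d\n", form1_distance(a, b, 3));		/* 40 */
	printf("form2: %d\n", form2_distance(dist, 3, 1, 2));	/* 160 */
	return 0;
}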

[PATCH v3 6/8] powerpc/pseries: Add a helper for form1 cpu distance

2021-06-17 Thread Aneesh Kumar K.V
This helper is only used with the dispatch trace log collection.
A later patch will add Form2 affinity support and this change helps
in keeping that simpler. Also add a comment explaining that we don't
expect the code to be called with FORM0.

Reviewed-by: David Gibson 
Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/mm/numa.c | 10 +-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index c481f08d565b..d32729f235b8 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -166,7 +166,7 @@ static void unmap_cpu_from_node(unsigned long cpu)
 }
 #endif /* CONFIG_HOTPLUG_CPU || CONFIG_PPC_SPLPAR */
 
-int cpu_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc)
+static int __cpu_form1_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc)
 {
int dist = 0;
 
@@ -182,6 +182,14 @@ int cpu_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc)
return dist;
 }
 
+int cpu_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc)
+{
+   /* We should not get called with FORM0 */
+   VM_WARN_ON(affinity_form == FORM0_AFFINITY);
+
+   return __cpu_form1_distance(cpu1_assoc, cpu2_assoc);
+}
+
 /* must hold reference to node during call */
 static const __be32 *of_get_associativity(struct device_node *dev)
 {
-- 
2.31.1



[PATCH v3 5/8] powerpc/pseries: Consolidate NUMA distance update during boot

2021-06-17 Thread Aneesh Kumar K.V
Instead of updating the NUMA distance every time we look up a node id
from the associativity property, add helpers that can be used
during boot and that do this only once. Also remove the distance
update from the node id lookup helpers.

Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/mm/numa.c | 135 +++--
 1 file changed, 88 insertions(+), 47 deletions(-)

diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 645a95e3a7ea..c481f08d565b 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -208,22 +208,6 @@ int __node_distance(int a, int b)
 }
 EXPORT_SYMBOL(__node_distance);
 
-static void initialize_distance_lookup_table(int nid,
-   const __be32 *associativity)
-{
-   int i;
-
-   if (affinity_form != FORM1_AFFINITY)
-   return;
-
-   for (i = 0; i < max_associativity_domain_index; i++) {
-   const __be32 *entry;
-
-   entry = &associativity[be32_to_cpu(distance_ref_points[i]) - 1];
-   distance_lookup_table[nid][i] = of_read_number(entry, 1);
-   }
-}
-
 /*
  * Returns nid in the range [0..nr_node_ids], or -1 if no useful NUMA
  * info is found.
@@ -241,15 +225,6 @@ static int associativity_to_nid(const __be32 
*associativity)
/* POWER4 LPAR uses 0xffff as invalid node */
if (nid == 0xffff || nid >= nr_node_ids)
nid = NUMA_NO_NODE;
-
-   if (nid > 0 &&
-   of_read_number(associativity, 1) >= 
max_associativity_domain_index) {
-   /*
-* Skip the length field and send start of associativity array
-*/
-   initialize_distance_lookup_table(nid, associativity + 1);
-   }
-
 out:
return nid;
 }
@@ -291,10 +266,13 @@ static void __initialize_form1_numa_distance(const __be32 
*associativity)
 {
int i, nid;
 
+   if (affinity_form != FORM1_AFFINITY)
+   return;
+
if (of_read_number(associativity, 1) >= primary_domain_index) {
nid = of_read_number(&associativity[primary_domain_index], 1);
 
-   for (i = 0; i < max_domain_index; i++) {
+   for (i = 0; i < max_associativity_domain_index; i++) {
const __be32 *entry;
 
entry = &associativity[be32_to_cpu(distance_ref_points[i])];
@@ -474,6 +452,48 @@ static int of_get_assoc_arrays(struct assoc_arrays *aa)
return 0;
 }
 
+static int get_nid_and_numa_distance(struct drmem_lmb *lmb)
+{
+   struct assoc_arrays aa = { .arrays = NULL };
+   int default_nid = NUMA_NO_NODE;
+   int nid = default_nid;
+   int rc, index;
+
+   if ((primary_domain_index < 0) || !numa_enabled)
+   return default_nid;
+
+   rc = of_get_assoc_arrays(&aa);
+   if (rc)
+   return default_nid;
+
+   if (primary_domain_index <= aa.array_sz &&
+   !(lmb->flags & DRCONF_MEM_AI_INVALID) && lmb->aa_index < 
aa.n_arrays) {
+   index = lmb->aa_index * aa.array_sz + primary_domain_index - 1;
+   nid = of_read_number(&aa.arrays[index], 1);
+
+   if (nid == 0xffff || nid >= nr_node_ids)
+   nid = default_nid;
+   if (nid > 0 && affinity_form == FORM1_AFFINITY) {
+   int i;
+   const __be32 *associativity;
+
+   index = lmb->aa_index * aa.array_sz;
+   associativity = &aa.arrays[index];
+   /*
+* lookup array associativity entries have different 
format
+* There is no length of the array as the first element.
+*/
+   for (i = 0; i < max_associativity_domain_index; i++) {
+   const __be32 *entry;
+
+   entry = &associativity[be32_to_cpu(distance_ref_points[i]) - 1];
+   distance_lookup_table[nid][i] = 
of_read_number(entry, 1);
+   }
+   }
+   }
+   return nid;
+}
+
 /*
  * This is like of_node_to_nid_single() for memory represented in the
  * ibm,dynamic-reconfiguration-memory node.
@@ -499,21 +519,14 @@ int of_drconf_to_nid_single(struct drmem_lmb *lmb)
 
if (nid == 0xffff || nid >= nr_node_ids)
nid = default_nid;
-
-   if (nid > 0) {
-   index = lmb->aa_index * aa.array_sz;
-   initialize_distance_lookup_table(nid,
-   &aa.arrays[index]);
-   }
}
-
return nid;
 }
 
 #ifdef CONFIG_PPC_SPLPAR
-static int vphn_get_nid(long lcpu)
+
+static int __vphn_get_associativity(long lcpu, __be32 *associativity)
 {
-   __be32 associativity[VPHN_ASSOC_BUFSIZE] = {0};
long rc, hwid;
 
/*
@@ -533,10 +546,22 @@ static int vphn_get_nid(long lcpu)
 
rc = hcall_vphn(hwid, VPHN_FLAG_VCPU, associativity);
 

[PATCH v3 4/8] powerpc/pseries: Consolidate DLPAR NUMA distance update

2021-06-17 Thread Aneesh Kumar K.V
The associativity details of the newly added resources are collected from
the hypervisor via the "ibm,configure-connector" rtas call. Update the numa
distance details of the newly added numa node after the above call. In a
later patch we will remove updating the NUMA distance when we are looking
for the node id from the associativity array.

Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/mm/numa.c| 41 +++
 arch/powerpc/platforms/pseries/hotplug-cpu.c  |  2 +
 .../platforms/pseries/hotplug-memory.c|  2 +
 arch/powerpc/platforms/pseries/pseries.h  |  1 +
 4 files changed, 46 insertions(+)

diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 0ec16999beef..645a95e3a7ea 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -287,6 +287,47 @@ int of_node_to_nid(struct device_node *device)
 }
 EXPORT_SYMBOL(of_node_to_nid);
 
+static void __initialize_form1_numa_distance(const __be32 *associativity)
+{
+   int i, nid;
+
+   if (of_read_number(associativity, 1) >= primary_domain_index) {
+   nid = of_read_number(&associativity[primary_domain_index], 1);
+
+   for (i = 0; i < max_domain_index; i++) {
+   const __be32 *entry;
+
+   entry = &associativity[be32_to_cpu(distance_ref_points[i])];
+   distance_lookup_table[nid][i] = of_read_number(entry, 
1);
+   }
+   }
+}
+
+static void initialize_form1_numa_distance(struct device_node *node)
+{
+   const __be32 *associativity;
+
+   associativity = of_get_associativity(node);
+   if (!associativity)
+   return;
+
+   __initialize_form1_numa_distance(associativity);
+   return;
+}
+
+/*
+ * Used to update distance information w.r.t newly added node.
+ */
+void update_numa_distance(struct device_node *node)
+{
+   if (affinity_form == FORM0_AFFINITY)
+   return;
+   else if (affinity_form == FORM1_AFFINITY) {
+   initialize_form1_numa_distance(node);
+   return;
+   }
+}
+
 static int __init find_primary_domain_index(void)
 {
int index;
diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c 
b/arch/powerpc/platforms/pseries/hotplug-cpu.c
index 7e970f81d8ff..778b6ab35f0d 100644
--- a/arch/powerpc/platforms/pseries/hotplug-cpu.c
+++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c
@@ -498,6 +498,8 @@ static ssize_t dlpar_cpu_add(u32 drc_index)
return saved_rc;
}
 
+   update_numa_distance(dn);
+
rc = dlpar_online_cpu(dn);
if (rc) {
saved_rc = rc;
diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c 
b/arch/powerpc/platforms/pseries/hotplug-memory.c
index 8377f1f7c78e..0e602c3b01ea 100644
--- a/arch/powerpc/platforms/pseries/hotplug-memory.c
+++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
@@ -180,6 +180,8 @@ static int update_lmb_associativity_index(struct drmem_lmb 
*lmb)
return -ENODEV;
}
 
+   update_numa_distance(lmb_node);
+
dr_node = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
if (!dr_node) {
dlpar_free_cc_nodes(lmb_node);
diff --git a/arch/powerpc/platforms/pseries/pseries.h 
b/arch/powerpc/platforms/pseries/pseries.h
index 1f051a786fb3..663a0859cf13 100644
--- a/arch/powerpc/platforms/pseries/pseries.h
+++ b/arch/powerpc/platforms/pseries/pseries.h
@@ -113,4 +113,5 @@ extern u32 pseries_security_flavor;
 void pseries_setup_security_mitigations(void);
 void pseries_lpar_read_hblkrm_characteristics(void);
 
+void update_numa_distance(struct device_node *node);
 #endif /* _PSERIES_PSERIES_H */
-- 
2.31.1



[PATCH v3 3/8] powerpc/pseries: Rename TYPE1_AFFINITY to FORM1_AFFINITY

2021-06-17 Thread Aneesh Kumar K.V
Also make related code cleanup that will allow adding FORM2_AFFINITY in
later patches. No functional change in this patch.

Reviewed-by: David Gibson 
Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/include/asm/firmware.h   |  4 +--
 arch/powerpc/include/asm/prom.h   |  2 +-
 arch/powerpc/kernel/prom_init.c   |  2 +-
 arch/powerpc/mm/numa.c| 35 ++-
 arch/powerpc/platforms/pseries/firmware.c |  2 +-
 5 files changed, 26 insertions(+), 19 deletions(-)

diff --git a/arch/powerpc/include/asm/firmware.h 
b/arch/powerpc/include/asm/firmware.h
index 7604673787d6..60b631161360 100644
--- a/arch/powerpc/include/asm/firmware.h
+++ b/arch/powerpc/include/asm/firmware.h
@@ -44,7 +44,7 @@
 #define FW_FEATURE_OPAL        ASM_CONST(0x1000)
 #define FW_FEATURE_SET_MODE    ASM_CONST(0x4000)
 #define FW_FEATURE_BEST_ENERGY ASM_CONST(0x8000)
-#define FW_FEATURE_TYPE1_AFFINITY ASM_CONST(0x0001)
+#define FW_FEATURE_FORM1_AFFINITY ASM_CONST(0x0001)
 #define FW_FEATURE_PRRN        ASM_CONST(0x0002)
 #define FW_FEATURE_DRMEM_V2    ASM_CONST(0x0004)
 #define FW_FEATURE_DRC_INFO    ASM_CONST(0x0008)
@@ -69,7 +69,7 @@ enum {
FW_FEATURE_SPLPAR | FW_FEATURE_LPAR |
FW_FEATURE_CMO | FW_FEATURE_VPHN | FW_FEATURE_XCMO |
FW_FEATURE_SET_MODE | FW_FEATURE_BEST_ENERGY |
-   FW_FEATURE_TYPE1_AFFINITY | FW_FEATURE_PRRN |
+   FW_FEATURE_FORM1_AFFINITY | FW_FEATURE_PRRN |
FW_FEATURE_HPT_RESIZE | FW_FEATURE_DRMEM_V2 |
FW_FEATURE_DRC_INFO | FW_FEATURE_BLOCK_REMOVE |
FW_FEATURE_PAPR_SCM | FW_FEATURE_ULTRAVISOR |
diff --git a/arch/powerpc/include/asm/prom.h b/arch/powerpc/include/asm/prom.h
index 324a13351749..df9fec9d232c 100644
--- a/arch/powerpc/include/asm/prom.h
+++ b/arch/powerpc/include/asm/prom.h
@@ -147,7 +147,7 @@ extern int of_read_drc_info_cell(struct property **prop,
 #define OV5_MSI                0x0201  /* PCIe/MSI support */
 #define OV5_CMO                0x0480  /* Cooperative Memory 
Overcommitment */
 #define OV5_XCMO   0x0440  /* Page Coalescing */
-#define OV5_TYPE1_AFFINITY 0x0580  /* Type 1 NUMA affinity */
+#define OV5_FORM1_AFFINITY 0x0580  /* FORM1 NUMA affinity */
 #define OV5_PRRN   0x0540  /* Platform Resource Reassignment */
 #define OV5_HP_EVT 0x0604  /* Hot Plug Event support */
 #define OV5_RESIZE_HPT 0x0601  /* Hash Page Table resizing */
diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c
index 41ed7e33d897..64b9593038a7 100644
--- a/arch/powerpc/kernel/prom_init.c
+++ b/arch/powerpc/kernel/prom_init.c
@@ -1070,7 +1070,7 @@ static const struct ibm_arch_vec 
ibm_architecture_vec_template __initconst = {
 #else
0,
 #endif
-   .associativity = OV5_FEAT(OV5_TYPE1_AFFINITY) | 
OV5_FEAT(OV5_PRRN),
+   .associativity = OV5_FEAT(OV5_FORM1_AFFINITY) | 
OV5_FEAT(OV5_PRRN),
.bin_opts = OV5_FEAT(OV5_RESIZE_HPT) | OV5_FEAT(OV5_HP_EVT),
.micro_checkpoint = 0,
.reserved0 = 0,
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 132813dd1a6c..0ec16999beef 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -53,7 +53,10 @@ EXPORT_SYMBOL(node_data);
 
 static int primary_domain_index;
 static int n_mem_addr_cells, n_mem_size_cells;
-static int form1_affinity;
+
+#define FORM0_AFFINITY 0
+#define FORM1_AFFINITY 1
+static int affinity_form;
 
 #define MAX_DISTANCE_REF_POINTS 4
 static int max_associativity_domain_index;
@@ -190,7 +193,7 @@ int __node_distance(int a, int b)
int i;
int distance = LOCAL_DISTANCE;
 
-   if (!form1_affinity)
+   if (affinity_form == FORM0_AFFINITY)
return ((a == b) ? LOCAL_DISTANCE : REMOTE_DISTANCE);
 
for (i = 0; i < max_associativity_domain_index; i++) {
@@ -210,7 +213,7 @@ static void initialize_distance_lookup_table(int nid,
 {
int i;
 
-   if (!form1_affinity)
+   if (affinity_form != FORM1_AFFINITY)
return;
 
for (i = 0; i < max_associativity_domain_index; i++) {
@@ -289,6 +292,17 @@ static int __init find_primary_domain_index(void)
int index;
struct device_node *root;
 
+   /*
+* Check for which form of affinity.
+*/
+   if (firmware_has_feature(FW_FEATURE_OPAL)) {
+   affinity_form = FORM1_AFFINITY;
+   } else if (firmware_has_feature(FW_FEATURE_FORM1_AFFINITY)) {
+   dbg("Using form 1 affinity\n");
+   affinity_form = FORM1_AFFINITY;
+   } else
+   affinity_form = FORM0_AFFINITY;
+
if (firmware_has_feature(FW_FEATURE_OPAL))
root = of_find_node_by_path("/ibm,opal");
else
@@ -318,23 
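
A side note on how the OV5_* constants above are consumed: the value packs
the option-vector-5 byte index in the high byte and the bit mask in the low
byte, so 0x0580 means byte 5, mask 0x80, i.e. bit 0. A sketch (the FORM2
value is an assumption derived from the documented "bit 2 of byte 5", not
taken from this series):

#define OV5_FORM1_AFFINITY	0x0580	/* byte 5, mask 0x80 -> bit 0 */
#define OV5_FORM2_AFFINITY	0x0520	/* byte 5, mask 0x20 -> bit 2 (assumed) */

#define OV5_INDX(x)	((x) >> 8)	/* which byte of option vector 5 */
#define OV5_FEAT(x)	((x) & 0xff)	/* bit mask within that byte */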

[PATCH v3 1/8] powerpc/pseries: rename min_common_depth to primary_domain_index

2021-06-17 Thread Aneesh Kumar K.V
No functional change in this patch.

Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/mm/numa.c | 38 +++---
 1 file changed, 19 insertions(+), 19 deletions(-)

diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index f2bf98bdcea2..8365b298ec48 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -51,7 +51,7 @@ EXPORT_SYMBOL(numa_cpu_lookup_table);
 EXPORT_SYMBOL(node_to_cpumask_map);
 EXPORT_SYMBOL(node_data);
 
-static int min_common_depth;
+static int primary_domain_index;
 static int n_mem_addr_cells, n_mem_size_cells;
 static int form1_affinity;
 
@@ -232,8 +232,8 @@ static int associativity_to_nid(const __be32 *associativity)
if (!numa_enabled)
goto out;
 
-   if (of_read_number(associativity, 1) >= min_common_depth)
-   nid = of_read_number(&associativity[min_common_depth], 1);
+   if (of_read_number(associativity, 1) >= primary_domain_index)
+   nid = of_read_number(&associativity[primary_domain_index], 1);
 
/* POWER4 LPAR uses 0xffff as invalid node */
if (nid == 0xffff || nid >= nr_node_ids)
@@ -284,9 +284,9 @@ int of_node_to_nid(struct device_node *device)
 }
 EXPORT_SYMBOL(of_node_to_nid);
 
-static int __init find_min_common_depth(void)
+static int __init find_primary_domain_index(void)
 {
-   int depth;
+   int index;
struct device_node *root;
 
if (firmware_has_feature(FW_FEATURE_OPAL))
@@ -326,7 +326,7 @@ static int __init find_min_common_depth(void)
}
 
if (form1_affinity) {
-   depth = of_read_number(distance_ref_points, 1);
+   index = of_read_number(distance_ref_points, 1);
} else {
if (distance_ref_points_depth < 2) {
printk(KERN_WARNING "NUMA: "
@@ -334,7 +334,7 @@ static int __init find_min_common_depth(void)
goto err;
}
 
-   depth = of_read_number(&distance_ref_points[1], 1);
+   index = of_read_number(&distance_ref_points[1], 1);
}
 
/*
@@ -348,7 +348,7 @@ static int __init find_min_common_depth(void)
}
 
of_node_put(root);
-   return depth;
+   return index;
 
 err:
of_node_put(root);
@@ -437,16 +437,16 @@ int of_drconf_to_nid_single(struct drmem_lmb *lmb)
int nid = default_nid;
int rc, index;
 
-   if ((min_common_depth < 0) || !numa_enabled)
+   if ((primary_domain_index < 0) || !numa_enabled)
return default_nid;
 
rc = of_get_assoc_arrays(&aa);
if (rc)
return default_nid;
 
-   if (min_common_depth <= aa.array_sz &&
+   if (primary_domain_index <= aa.array_sz &&
!(lmb->flags & DRCONF_MEM_AI_INVALID) && lmb->aa_index < 
aa.n_arrays) {
-   index = lmb->aa_index * aa.array_sz + min_common_depth - 1;
+   index = lmb->aa_index * aa.array_sz + primary_domain_index - 1;
nid = of_read_number(&aa.arrays[index], 1);
 
if (nid == 0xffff || nid >= nr_node_ids)
@@ -708,18 +708,18 @@ static int __init parse_numa_properties(void)
return -1;
}
 
-   min_common_depth = find_min_common_depth();
+   primary_domain_index = find_primary_domain_index();
 
-   if (min_common_depth < 0) {
+   if (primary_domain_index < 0) {
/*
-* if we fail to parse min_common_depth from device tree
+* if we fail to parse primary_domain_index from device tree
 * mark the numa disabled, boot with numa disabled.
 */
numa_enabled = false;
-   return min_common_depth;
+   return primary_domain_index;
}
 
-   dbg("NUMA associativity depth for CPU/Memory: %d\n", min_common_depth);
+   dbg("NUMA associativity depth for CPU/Memory: %d\n", 
primary_domain_index);
 
/*
 * Even though we connect cpus to numa domains later in SMP
@@ -919,14 +919,14 @@ static void __init find_possible_nodes(void)
goto out;
}
 
-   max_nodes = of_read_number(&domains[min_common_depth], 1);
+   max_nodes = of_read_number(&domains[primary_domain_index], 1);
for (i = 0; i < max_nodes; i++) {
if (!node_possible(i))
node_set(i, node_possible_map);
}
 
prop_length /= sizeof(int);
-   if (prop_length > min_common_depth + 2)
+   if (prop_length > primary_domain_index + 2)
coregroup_enabled = 1;
 
 out:
@@ -1259,7 +1259,7 @@ int cpu_to_coregroup_id(int cpu)
goto out;
 
index = of_read_number(associativity, 1);
-   if (index > min_common_depth + 1)
+   if (index > primary_domain_index + 1)
return of_read_number(&associativity[index - 1], 1);
 
 out:
-- 
2.31.1



[PATCH v3 0/8] Add support for FORM2 associativity

2021-06-17 Thread Aneesh Kumar K.V
Form2 associativity adds a much more flexible NUMA topology layout
than what is provided by Form1. This also allows PAPR SCM device
to use better associativity when using the device as DAX KMEM
device. More details can be found in patch 7.

$ ndctl list -N -v
[
  {
"dev":"namespace0.0",
"mode":"devdax",
"map":"dev",
"size":1071644672,
"uuid":"37dea198-ddb5-4e42-915a-99a915e24188",
"raw_uuid":"148deeaa-4a2f-41d1-8d74-fd9a942d26ba",
"daxregion":{
  "id":0,
  "size":1071644672,
  "devices":[
{
  "chardev":"dax0.0",
  "size":1071644672,
  "target_node":4,
  "mode":"devdax"
}
  ]
},
"align":2097152,
"numa_node":1
  }
]

$ numactl -H
...
node distances:
node   0   1   2   3 
  0:  10  11  22  33 
  1:  44  10  55  66 
  2:  77  88  10  99 
  3:  101  121  132  10 
$

After DAX KMEM
# numactl -H
available: 5 nodes (0-4)
...
node distances:
node   0   1   2   3   4 
  0:  10  11  22  33  255 
  1:  44  10  55  66  255 
  2:  77  88  10  99  255 
  3:  101  121  132  10  255 
  4:  255  255  255  255  10 
# 

The above output is with a Qemu command line

-numa node,nodeid=4 \
-numa dist,src=0,dst=1,val=11 -numa dist,src=0,dst=2,val=22 -numa 
dist,src=0,dst=3,val=33 -numa dist,src=0,dst=4,val=255 \
-numa dist,src=1,dst=0,val=44 -numa dist,src=1,dst=2,val=55 -numa 
dist,src=1,dst=3,val=66 -numa dist,src=1,dst=4,val=255 \
-numa dist,src=2,dst=0,val=77 -numa dist,src=2,dst=1,val=88 -numa 
dist,src=2,dst=3,val=99 -numa dist,src=2,dst=4,val=255 \
-numa dist,src=3,dst=0,val=101 -numa dist,src=3,dst=1,val=121 -numa 
dist,src=3,dst=2,val=132 -numa dist,src=3,dst=4,val=255 \
-numa dist,src=4,dst=0,val=255 -numa dist,src=4,dst=1,val=255 -numa 
dist,src=4,dst=2,val=255 -numa dist,src=4,dst=3,val=255 \
-object 
memory-backend-file,id=memnvdimm1,prealloc=yes,mem-path=$PMEM_DISK,share=yes,size=${PMEM_SIZE}
  \
-device 
nvdimm,label-size=128K,memdev=memnvdimm1,id=nvdimm1,slot=4,uuid=72511b67-0b3b-42fd-8d1d-5be3cae8bcaa,node=4,device-node=1

Qemu changes can be found at 
https://lore.kernel.org/qemu-devel/20210616011944.2996399-1-danielhb...@gmail.com/

Changes from v2:
* Add nvdimm list to Cc:
* update PATCH 8 commit message.

Changes from v1:
* Update FORM2 documentation.
* rename max_domain_index to max_associativity_domain_index

Aneesh Kumar K.V (8):
  powerpc/pseries: rename min_common_depth to primary_domain_index
  powerpc/pseries: rename distance_ref_points_depth to
max_associativity_domain_index
  powerpc/pseries: Rename TYPE1_AFFINITY to FORM1_AFFINITY
  powerpc/pseries: Consolidate DLPAR NUMA distance update
  powerpc/pseries: Consolidate NUMA distance update during boot
  powerpc/pseries: Add a helper for form1 cpu distance
  powerpc/pseries: Add support for FORM2 associativity
  powerpc/papr_scm: Use FORM2 associativity details

 Documentation/powerpc/associativity.rst   | 177 +++
 arch/powerpc/include/asm/firmware.h   |   7 +-
 arch/powerpc/include/asm/prom.h   |   3 +-
 arch/powerpc/kernel/prom_init.c   |   3 +-
 arch/powerpc/mm/numa.c| 436 ++
 arch/powerpc/platforms/pseries/firmware.c |   3 +-
 arch/powerpc/platforms/pseries/hotplug-cpu.c  |   2 +
 .../platforms/pseries/hotplug-memory.c|   2 +
 arch/powerpc/platforms/pseries/papr_scm.c |  26 +-
 arch/powerpc/platforms/pseries/pseries.h  |   2 +
 10 files changed, 560 insertions(+), 101 deletions(-)
 create mode 100644 Documentation/powerpc/associativity.rst

-- 
2.31.1



[PATCH] Fix for "powerpc/64: use interrupt restart table to speed up return from interrupt" SOFTE loading

2021-06-17 Thread Nicholas Piggin
The commit named above loads SOFTE(r1) with lbz, while the existing code
stores to it with std. This causes interrupt flag corruption on big endian
(as Michael pointed out to me, lbz happens to grab the correct byte on LE
kernels, which explains why I didn't run into it).
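
A userspace illustration of the hazard (a sketch only; flag merely stands
in for the 8-byte SOFTE(r1) stack slot):

#include <stdio.h>

int main(void)
{
	unsigned long flag = 1;	/* stored as 8 bytes, as std does */
	unsigned char byte0 = *(unsigned char *)&flag;	/* what lbz reads */

	/*
	 * Little endian: the least significant byte comes first, byte0 == 1.
	 * Big endian: the most significant byte comes first, byte0 == 0,
	 * so a 1-byte load silently drops the flag. An 8-byte load (ld)
	 * is correct on both.
	 */
	printf("1-byte load sees: %u\n", byte0);
	return 0;
}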

Signed-off-by: Nicholas Piggin 
---
64e still seems to have an issue I'm chasing, and I still need to send a
fix for the cpu hotplug lock warning.

 arch/powerpc/kernel/interrupt_64.S | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/interrupt_64.S 
b/arch/powerpc/kernel/interrupt_64.S
index 76b827ae849a..4bf859e7dc25 100644
--- a/arch/powerpc/kernel/interrupt_64.S
+++ b/arch/powerpc/kernel/interrupt_64.S
@@ -618,7 +618,7 @@ RESTART_TABLE(.Linterrupt_return_\srr\()_user_rst_start, 
.Linterrupt_return_\srr
 
std r1,PACA_EXIT_SAVE_R1(r13) /* save r1 for restart */
 .Linterrupt_return_\srr\()_kernel_rst_start:
-   lbz r11,SOFTE(r1)
+   ld  r11,SOFTE(r1)
cmpwi   r11,IRQS_ENABLED
stb r11,PACAIRQSOFTMASK(r13)
bne 1f
-- 
2.23.0



[PATCH v2 8/8] powerpc/papr_scm: Use FORM2 associativity details

2021-06-17 Thread Aneesh Kumar K.V
FORM2 introduces a concept of secondary domain which is identical to the
concept of FORM1 primary domain. Use the secondary domain as the numa node
when using persistent memory devices. For DAX kmem, use the logical domain
id introduced in FORM2 as the new numa node.

Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/mm/numa.c| 28 +++
 arch/powerpc/platforms/pseries/papr_scm.c | 26 +
 arch/powerpc/platforms/pseries/pseries.h  |  1 +
 3 files changed, 45 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 5a7d94960fb7..cd3ae7ff77ac 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -265,6 +265,34 @@ static int associativity_to_nid(const __be32 
*associativity)
return nid;
 }
 
+int get_primary_and_secondary_domain(struct device_node *node, int *primary, 
int *secondary)
+{
+   int secondary_index;
+   const __be32 *associativity;
+
+   if (!numa_enabled) {
+   *primary = NUMA_NO_NODE;
+   *secondary = NUMA_NO_NODE;
+   return 0;
+   }
+
+   associativity = of_get_associativity(node);
+   if (!associativity)
+   return -ENODEV;
+
+   if (of_read_number(associativity, 1) >= primary_domain_index) {
+   *primary = of_read_number(&associativity[primary_domain_index], 1);
+   secondary_index = of_read_number(&distance_ref_points[1], 1);
+   *secondary = of_read_number(&associativity[secondary_index], 1);
+   }
+   if (*primary == 0xffff || *primary >= nr_node_ids)
+   *primary = NUMA_NO_NODE;
+
+   if (*secondary == 0xffff || *secondary >= nr_node_ids)
+   *secondary = NUMA_NO_NODE;
+   return 0;
+}
+
 /* Returns the nid associated with the given device tree node,
  * or -1 if not found.
  */
diff --git a/arch/powerpc/platforms/pseries/papr_scm.c 
b/arch/powerpc/platforms/pseries/papr_scm.c
index ef26fe40efb0..9bf2f1f3ddc5 100644
--- a/arch/powerpc/platforms/pseries/papr_scm.c
+++ b/arch/powerpc/platforms/pseries/papr_scm.c
@@ -18,6 +18,7 @@
 #include 
 #include 
 #include 
+#include "pseries.h"
 
 #define BIND_ANY_ADDR (~0ul)
 
@@ -88,6 +89,8 @@ struct papr_scm_perf_stats {
 struct papr_scm_priv {
struct platform_device *pdev;
struct device_node *dn;
+   int numa_node;
+   int target_node;
uint32_t drc_index;
uint64_t blocks;
uint64_t block_size;
@@ -923,7 +926,6 @@ static int papr_scm_nvdimm_init(struct papr_scm_priv *p)
struct nd_mapping_desc mapping;
struct nd_region_desc ndr_desc;
unsigned long dimm_flags;
-   int target_nid, online_nid;
ssize_t stat_size;
 
p->bus_desc.ndctl = papr_scm_ndctl;
@@ -974,10 +976,8 @@ static int papr_scm_nvdimm_init(struct papr_scm_priv *p)
mapping.size = p->blocks * p->block_size; // XXX: potential overflow?
 
memset(&ndr_desc, 0, sizeof(ndr_desc));
-   target_nid = dev_to_node(&p->pdev->dev);
-   online_nid = numa_map_to_online_node(target_nid);
-   ndr_desc.numa_node = online_nid;
-   ndr_desc.target_node = target_nid;
+   ndr_desc.numa_node = p->numa_node;
+   ndr_desc.target_node = p->target_node;
ndr_desc.res = &p->res;
ndr_desc.of_node = p->dn;
ndr_desc.provider_data = p;
@@ -1001,9 +1001,6 @@ static int papr_scm_nvdimm_init(struct papr_scm_priv *p)
ndr_desc.res, p->dn);
goto err;
}
-   if (target_nid != online_nid)
-   dev_info(dev, "Region registered with target node %d and online 
node %d",
-target_nid, online_nid);
 
mutex_lock(&papr_ndr_lock);
list_add_tail(&p->region_list, &papr_nd_regions);
@@ -1096,7 +1093,7 @@ static int papr_scm_probe(struct platform_device *pdev)
struct papr_scm_priv *p;
const char *uuid_str;
u64 uuid[2];
-   int rc;
+   int rc, numa_node;
 
/* check we have all the required DT properties */
if (of_property_read_u32(dn, "ibm,my-drc-index", &drc_index)) {
@@ -1119,11 +1116,20 @@ static int papr_scm_probe(struct platform_device *pdev)
return -ENODEV;
}
 
-
p = kzalloc(sizeof(*p), GFP_KERNEL);
if (!p)
return -ENOMEM;
 
+   if (get_primary_and_secondary_domain(dn, &p->target_node, &numa_node)) {
+   dev_err(&pdev->dev, "%pOF: missing NUMA attributes!\n", dn);
+   rc = -ENODEV;
+   goto err;
+   }
+   p->numa_node = numa_map_to_online_node(numa_node);
+   if (numa_node != p->numa_node)
+   dev_info(&pdev->dev, "Region registered with online node %d and 
device tree node %d",
+p->numa_node, numa_node);
+
/* Initialize the dimm mutex */
mutex_init(&p->health_mutex);
 
diff --git a/arch/powerpc/platforms/pseries/pseries.h 
b/arch/powerpc/platforms/pseries/pseries.h
index 663a0859cf13..9c2a1fc9ded1 

[PATCH v2 7/8] powerpc/pseries: Add support for FORM2 associativity

2021-06-17 Thread Aneesh Kumar K.V
PAPR interface currently supports two different ways of communicating resource
grouping details to the OS. These are referred to as Form 0 and Form 1
associativity grouping. Form 0 is the older format and is now considered
deprecated. This patch adds another resource grouping named FORM2.

Signed-off-by: Daniel Henrique Barboza 
Signed-off-by: Aneesh Kumar K.V 
---
 Documentation/powerpc/associativity.rst   | 177 ++
 arch/powerpc/include/asm/firmware.h   |   3 +-
 arch/powerpc/include/asm/prom.h   |   1 +
 arch/powerpc/kernel/prom_init.c   |   3 +-
 arch/powerpc/mm/numa.c| 149 +-
 arch/powerpc/platforms/pseries/firmware.c |   1 +
 6 files changed, 328 insertions(+), 6 deletions(-)
 create mode 100644 Documentation/powerpc/associativity.rst

diff --git a/Documentation/powerpc/associativity.rst 
b/Documentation/powerpc/associativity.rst
new file mode 100644
index ..5fa9352dfc05
--- /dev/null
+++ b/Documentation/powerpc/associativity.rst
@@ -0,0 +1,177 @@
+
+NUMA resource associativity
+===========================
+
+Associativity represents the groupings of the various platform resources into
+domains of substantially similar mean performance relative to resources outside
+of that domain. Resource subsets of a given domain that exhibit better
+performance relative to each other than relative to other resource subsets
+are represented as being members of a sub-grouping domain. This performance
+characteristic is presented in terms of NUMA node distance within the Linux 
kernel.
+From the platform view, these groups are also referred to as domains.
+
+PAPR interface currently supports different ways of communicating these 
resource
+grouping details to the OS. These are referred to as Form 0, Form 1 and Form2
+associativity grouping. Form 0 is the older format and is now considered 
deprecated.
+
+Hypervisor indicates the type/form of associativity used via the
"ibm,architecture-vec-5" property.
+Bit 0 of byte 5 in the "ibm,architecture-vec-5" property indicates usage of 
Form 0 or Form 1.
+A value of 1 indicates the usage of Form 1 associativity. For Form 2 
associativity
+bit 2 of byte 5 in the "ibm,architecture-vec-5" property is used.
+
+Form 0
+------
+Form 0 associativity supports only two NUMA distances (LOCAL and REMOTE).
+
+Form 1
+------
+With Form 1 a combination of ibm,associativity-reference-points and 
ibm,associativity
+device tree properties are used to determine the NUMA distance between 
resource groups/domains.
+
+The “ibm,associativity” property contains one or more lists of numbers 
(domainID)
+representing the resource’s platform grouping domains.
+
+The “ibm,associativity-reference-points” property contains one or more 
lists of numbers
+(domainID index) that represents the 1 based ordinal in the associativity 
lists of the
+least significant boundary, with subsequent entries indicating progressively 
higher
+significant boundaries.
+
+ex:
+{ primary domainID index, secondary domainID index, tertiary domainID index.. }
+
+Linux kernel uses the domainID of the least significant boundary (aka primary 
domain)
+as the NUMA node id. Linux kernel computes NUMA distance between two domains by
+recursively comparing if they belong to the same higher-level domains. For 
mismatch
+at every higher level of the resource group, the kernel doubles the NUMA 
distance between
+the comparing domains.
+
+Form 2
+------
+Form 2 associativity format adds separate device tree properties representing 
NUMA node distance
+thereby making the node distance computation flexible. Form 2 also allows 
flexible primary
+domain numbering. With numa distance computation now detached from the index 
value of
+"ibm,associativity" property, Form 2 allows a large number of primary domain 
ids at the
+same domainID index representing resource groups of different 
performance/latency characteristics.
+
+Hypervisor indicates the usage of FORM2 associativity using bit 2 of byte 5 in 
the
+"ibm,architecture-vec-5" property.
+
+"ibm,numa-lookup-index-table" property contains one or more list numbers 
representing
+the domainIDs present in the system. The offset of the domainID in this 
property is considered
+the domainID index.
+
+prop-encoded-array: The number N of the domainIDs encoded as with encode-int, 
followed by
+N domainIDs encoded as with encode-int
+
+For ex:
+ibm,numa-lookup-index-table =  {4, 0, 8, 250, 252}, domainID index for 
domainID 8 is 1.
+
+"ibm,numa-distance-table" property contains one or more list of numbers 
representing the NUMA
+distance between resource groups/domains present in the system.
+
+prop-encoded-array: The number N of the distance values encoded as with 
encode-int, followed by
+N distance values encoded as with encode-bytes. The max distance value we 
could encode is 255.
+
+For ex:
+ibm,numa-lookup-index-table =  {3, 0, 8, 40}
+ibm,numa-distance-table =  {9, 10, 

[PATCH v2 6/8] powerpc/pseries: Add a helper for form1 cpu distance

2021-06-17 Thread Aneesh Kumar K.V
This helper is only used with the dispatch trace log collection.
A later patch will add Form2 affinity support and this change helps
in keeping that simpler. Also add a comment explaining that we don't
expect the code to be called with FORM0.

Reviewed-by: David Gibson 
Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/mm/numa.c | 10 +-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index c481f08d565b..d32729f235b8 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -166,7 +166,7 @@ static void unmap_cpu_from_node(unsigned long cpu)
 }
 #endif /* CONFIG_HOTPLUG_CPU || CONFIG_PPC_SPLPAR */
 
-int cpu_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc)
+static int __cpu_form1_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc)
 {
int dist = 0;
 
@@ -182,6 +182,14 @@ int cpu_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc)
return dist;
 }
 
+int cpu_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc)
+{
+   /* We should not get called with FORM0 */
+   VM_WARN_ON(affinity_form == FORM0_AFFINITY);
+
+   return __cpu_form1_distance(cpu1_assoc, cpu2_assoc);
+}
+
 /* must hold reference to node during call */
 static const __be32 *of_get_associativity(struct device_node *dev)
 {
-- 
2.31.1



[PATCH v2 5/8] powerpc/pseries: Consolidate NUMA distance update during boot

2021-06-17 Thread Aneesh Kumar K.V
Instead of updating the NUMA distance every time we look up a node id
from the associativity property, add helpers that can be used
during boot and that do this only once. Also remove the distance
update from the node id lookup helpers.

Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/mm/numa.c | 135 +++--
 1 file changed, 88 insertions(+), 47 deletions(-)

diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 645a95e3a7ea..c481f08d565b 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -208,22 +208,6 @@ int __node_distance(int a, int b)
 }
 EXPORT_SYMBOL(__node_distance);
 
-static void initialize_distance_lookup_table(int nid,
-   const __be32 *associativity)
-{
-   int i;
-
-   if (affinity_form != FORM1_AFFINITY)
-   return;
-
-   for (i = 0; i < max_associativity_domain_index; i++) {
-   const __be32 *entry;
-
-   entry = &associativity[be32_to_cpu(distance_ref_points[i]) - 1];
-   distance_lookup_table[nid][i] = of_read_number(entry, 1);
-   }
-}
-
 /*
  * Returns nid in the range [0..nr_node_ids], or -1 if no useful NUMA
  * info is found.
@@ -241,15 +225,6 @@ static int associativity_to_nid(const __be32 
*associativity)
/* POWER4 LPAR uses 0xffff as invalid node */
if (nid == 0xffff || nid >= nr_node_ids)
nid = NUMA_NO_NODE;
-
-   if (nid > 0 &&
-   of_read_number(associativity, 1) >= 
max_associativity_domain_index) {
-   /*
-* Skip the length field and send start of associativity array
-*/
-   initialize_distance_lookup_table(nid, associativity + 1);
-   }
-
 out:
return nid;
 }
@@ -291,10 +266,13 @@ static void __initialize_form1_numa_distance(const __be32 
*associativity)
 {
int i, nid;
 
+   if (affinity_form != FORM1_AFFINITY)
+   return;
+
if (of_read_number(associativity, 1) >= primary_domain_index) {
nid = of_read_number(&associativity[primary_domain_index], 1);
 
-   for (i = 0; i < max_domain_index; i++) {
+   for (i = 0; i < max_associativity_domain_index; i++) {
const __be32 *entry;
 
entry = &associativity[be32_to_cpu(distance_ref_points[i])];
@@ -474,6 +452,48 @@ static int of_get_assoc_arrays(struct assoc_arrays *aa)
return 0;
 }
 
+static int get_nid_and_numa_distance(struct drmem_lmb *lmb)
+{
+   struct assoc_arrays aa = { .arrays = NULL };
+   int default_nid = NUMA_NO_NODE;
+   int nid = default_nid;
+   int rc, index;
+
+   if ((primary_domain_index < 0) || !numa_enabled)
+   return default_nid;
+
+   rc = of_get_assoc_arrays(&aa);
+   if (rc)
+   return default_nid;
+
+   if (primary_domain_index <= aa.array_sz &&
+   !(lmb->flags & DRCONF_MEM_AI_INVALID) && lmb->aa_index < 
aa.n_arrays) {
+   index = lmb->aa_index * aa.array_sz + primary_domain_index - 1;
+   nid = of_read_number(&aa.arrays[index], 1);
+
+   if (nid == 0xffff || nid >= nr_node_ids)
+   nid = default_nid;
+   if (nid > 0 && affinity_form == FORM1_AFFINITY) {
+   int i;
+   const __be32 *associativity;
+
+   index = lmb->aa_index * aa.array_sz;
+   associativity = &aa.arrays[index];
+   /*
+* lookup array associativity entries have different 
format
+* There is no length of the array as the first element.
+*/
+   for (i = 0; i < max_associativity_domain_index; i++) {
+   const __be32 *entry;
+
+   entry = &associativity[be32_to_cpu(distance_ref_points[i]) - 1];
+   distance_lookup_table[nid][i] = 
of_read_number(entry, 1);
+   }
+   }
+   }
+   return nid;
+}
+
 /*
  * This is like of_node_to_nid_single() for memory represented in the
  * ibm,dynamic-reconfiguration-memory node.
@@ -499,21 +519,14 @@ int of_drconf_to_nid_single(struct drmem_lmb *lmb)
 
if (nid == 0xffff || nid >= nr_node_ids)
nid = default_nid;
-
-   if (nid > 0) {
-   index = lmb->aa_index * aa.array_sz;
-   initialize_distance_lookup_table(nid,
-   &aa.arrays[index]);
-   }
}
-
return nid;
 }
 
 #ifdef CONFIG_PPC_SPLPAR
-static int vphn_get_nid(long lcpu)
+
+static int __vphn_get_associativity(long lcpu, __be32 *associativity)
 {
-   __be32 associativity[VPHN_ASSOC_BUFSIZE] = {0};
long rc, hwid;
 
/*
@@ -533,10 +546,22 @@ static int vphn_get_nid(long lcpu)
 
rc = hcall_vphn(hwid, VPHN_FLAG_VCPU, associativity);
 

[PATCH v2 4/8] powerpc/pseries: Consolidate DLPAR NUMA distance update

2021-06-17 Thread Aneesh Kumar K.V
The associativity details of the newly added resources are collected from
the hypervisor via the "ibm,configure-connector" rtas call. Update the numa
distance details of the newly added numa node after the above call. In a
later patch we will remove updating the NUMA distance when we are looking
for the node id from the associativity array.

Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/mm/numa.c| 41 +++
 arch/powerpc/platforms/pseries/hotplug-cpu.c  |  2 +
 .../platforms/pseries/hotplug-memory.c|  2 +
 arch/powerpc/platforms/pseries/pseries.h  |  1 +
 4 files changed, 46 insertions(+)

diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 0ec16999beef..645a95e3a7ea 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -287,6 +287,47 @@ int of_node_to_nid(struct device_node *device)
 }
 EXPORT_SYMBOL(of_node_to_nid);
 
+static void __initialize_form1_numa_distance(const __be32 *associativity)
+{
+   int i, nid;
+
+   if (of_read_number(associativity, 1) >= primary_domain_index) {
+   nid = of_read_number(&associativity[primary_domain_index], 1);
+
+   for (i = 0; i < max_domain_index; i++) {
+   const __be32 *entry;
+
+   entry = &associativity[be32_to_cpu(distance_ref_points[i])];
+   distance_lookup_table[nid][i] = of_read_number(entry, 
1);
+   }
+   }
+}
+
+static void initialize_form1_numa_distance(struct device_node *node)
+{
+   const __be32 *associativity;
+
+   associativity = of_get_associativity(node);
+   if (!associativity)
+   return;
+
+   __initialize_form1_numa_distance(associativity);
+   return;
+}
+
+/*
+ * Used to update distance information w.r.t newly added node.
+ */
+void update_numa_distance(struct device_node *node)
+{
+   if (affinity_form == FORM0_AFFINITY)
+   return;
+   else if (affinity_form == FORM1_AFFINITY) {
+   initialize_form1_numa_distance(node);
+   return;
+   }
+}
+
 static int __init find_primary_domain_index(void)
 {
int index;
diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c 
b/arch/powerpc/platforms/pseries/hotplug-cpu.c
index 7e970f81d8ff..778b6ab35f0d 100644
--- a/arch/powerpc/platforms/pseries/hotplug-cpu.c
+++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c
@@ -498,6 +498,8 @@ static ssize_t dlpar_cpu_add(u32 drc_index)
return saved_rc;
}
 
+   update_numa_distance(dn);
+
rc = dlpar_online_cpu(dn);
if (rc) {
saved_rc = rc;
diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c 
b/arch/powerpc/platforms/pseries/hotplug-memory.c
index 8377f1f7c78e..0e602c3b01ea 100644
--- a/arch/powerpc/platforms/pseries/hotplug-memory.c
+++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
@@ -180,6 +180,8 @@ static int update_lmb_associativity_index(struct drmem_lmb 
*lmb)
return -ENODEV;
}
 
+   update_numa_distance(lmb_node);
+
dr_node = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
if (!dr_node) {
dlpar_free_cc_nodes(lmb_node);
diff --git a/arch/powerpc/platforms/pseries/pseries.h 
b/arch/powerpc/platforms/pseries/pseries.h
index 1f051a786fb3..663a0859cf13 100644
--- a/arch/powerpc/platforms/pseries/pseries.h
+++ b/arch/powerpc/platforms/pseries/pseries.h
@@ -113,4 +113,5 @@ extern u32 pseries_security_flavor;
 void pseries_setup_security_mitigations(void);
 void pseries_lpar_read_hblkrm_characteristics(void);
 
+void update_numa_distance(struct device_node *node);
 #endif /* _PSERIES_PSERIES_H */
-- 
2.31.1



[PATCH v2 3/8] powerpc/pseries: Rename TYPE1_AFFINITY to FORM1_AFFINITY

2021-06-17 Thread Aneesh Kumar K.V
Also make related code cleanup that will allow adding FORM2_AFFINITY in
later patches. No functional change in this patch.

Reviewed-by: David Gibson 
Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/include/asm/firmware.h   |  4 +--
 arch/powerpc/include/asm/prom.h   |  2 +-
 arch/powerpc/kernel/prom_init.c   |  2 +-
 arch/powerpc/mm/numa.c| 35 ++-
 arch/powerpc/platforms/pseries/firmware.c |  2 +-
 5 files changed, 26 insertions(+), 19 deletions(-)

diff --git a/arch/powerpc/include/asm/firmware.h 
b/arch/powerpc/include/asm/firmware.h
index 7604673787d6..60b631161360 100644
--- a/arch/powerpc/include/asm/firmware.h
+++ b/arch/powerpc/include/asm/firmware.h
@@ -44,7 +44,7 @@
 #define FW_FEATURE_OPAL        ASM_CONST(0x1000)
 #define FW_FEATURE_SET_MODE    ASM_CONST(0x4000)
 #define FW_FEATURE_BEST_ENERGY ASM_CONST(0x8000)
-#define FW_FEATURE_TYPE1_AFFINITY ASM_CONST(0x0001)
+#define FW_FEATURE_FORM1_AFFINITY ASM_CONST(0x0001)
 #define FW_FEATURE_PRRN        ASM_CONST(0x0002)
 #define FW_FEATURE_DRMEM_V2    ASM_CONST(0x0004)
 #define FW_FEATURE_DRC_INFO    ASM_CONST(0x0008)
@@ -69,7 +69,7 @@ enum {
FW_FEATURE_SPLPAR | FW_FEATURE_LPAR |
FW_FEATURE_CMO | FW_FEATURE_VPHN | FW_FEATURE_XCMO |
FW_FEATURE_SET_MODE | FW_FEATURE_BEST_ENERGY |
-   FW_FEATURE_TYPE1_AFFINITY | FW_FEATURE_PRRN |
+   FW_FEATURE_FORM1_AFFINITY | FW_FEATURE_PRRN |
FW_FEATURE_HPT_RESIZE | FW_FEATURE_DRMEM_V2 |
FW_FEATURE_DRC_INFO | FW_FEATURE_BLOCK_REMOVE |
FW_FEATURE_PAPR_SCM | FW_FEATURE_ULTRAVISOR |
diff --git a/arch/powerpc/include/asm/prom.h b/arch/powerpc/include/asm/prom.h
index 324a13351749..df9fec9d232c 100644
--- a/arch/powerpc/include/asm/prom.h
+++ b/arch/powerpc/include/asm/prom.h
@@ -147,7 +147,7 @@ extern int of_read_drc_info_cell(struct property **prop,
 #define OV5_MSI                0x0201  /* PCIe/MSI support */
 #define OV5_CMO                0x0480  /* Cooperative Memory 
Overcommitment */
 #define OV5_XCMO   0x0440  /* Page Coalescing */
-#define OV5_TYPE1_AFFINITY 0x0580  /* Type 1 NUMA affinity */
+#define OV5_FORM1_AFFINITY 0x0580  /* FORM1 NUMA affinity */
 #define OV5_PRRN   0x0540  /* Platform Resource Reassignment */
 #define OV5_HP_EVT 0x0604  /* Hot Plug Event support */
 #define OV5_RESIZE_HPT 0x0601  /* Hash Page Table resizing */
diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c
index 41ed7e33d897..64b9593038a7 100644
--- a/arch/powerpc/kernel/prom_init.c
+++ b/arch/powerpc/kernel/prom_init.c
@@ -1070,7 +1070,7 @@ static const struct ibm_arch_vec 
ibm_architecture_vec_template __initconst = {
 #else
0,
 #endif
-   .associativity = OV5_FEAT(OV5_TYPE1_AFFINITY) | 
OV5_FEAT(OV5_PRRN),
+   .associativity = OV5_FEAT(OV5_FORM1_AFFINITY) | 
OV5_FEAT(OV5_PRRN),
.bin_opts = OV5_FEAT(OV5_RESIZE_HPT) | OV5_FEAT(OV5_HP_EVT),
.micro_checkpoint = 0,
.reserved0 = 0,
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 132813dd1a6c..0ec16999beef 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -53,7 +53,10 @@ EXPORT_SYMBOL(node_data);
 
 static int primary_domain_index;
 static int n_mem_addr_cells, n_mem_size_cells;
-static int form1_affinity;
+
+#define FORM0_AFFINITY 0
+#define FORM1_AFFINITY 1
+static int affinity_form;
 
 #define MAX_DISTANCE_REF_POINTS 4
 static int max_associativity_domain_index;
@@ -190,7 +193,7 @@ int __node_distance(int a, int b)
int i;
int distance = LOCAL_DISTANCE;
 
-   if (!form1_affinity)
+   if (affinity_form == FORM0_AFFINITY)
return ((a == b) ? LOCAL_DISTANCE : REMOTE_DISTANCE);
 
for (i = 0; i < max_associativity_domain_index; i++) {
@@ -210,7 +213,7 @@ static void initialize_distance_lookup_table(int nid,
 {
int i;
 
-   if (!form1_affinity)
+   if (affinity_form != FORM1_AFFINITY)
return;
 
for (i = 0; i < max_associativity_domain_index; i++) {
@@ -289,6 +292,17 @@ static int __init find_primary_domain_index(void)
int index;
struct device_node *root;
 
+   /*
+* Check for which form of affinity.
+*/
+   if (firmware_has_feature(FW_FEATURE_OPAL)) {
+   affinity_form = FORM1_AFFINITY;
+   } else if (firmware_has_feature(FW_FEATURE_FORM1_AFFINITY)) {
+   dbg("Using form 1 affinity\n");
+   affinity_form = FORM1_AFFINITY;
+   } else
+   affinity_form = FORM0_AFFINITY;
+
if (firmware_has_feature(FW_FEATURE_OPAL))
root = of_find_node_by_path("/ibm,opal");
else
@@ -318,23 

[PATCH v2 2/8] powerpc/pseries: rename distance_ref_points_depth to max_associativity_domain_index

2021-06-17 Thread Aneesh Kumar K.V
No functional change in this patch.

Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/mm/numa.c | 20 ++--
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 8365b298ec48..132813dd1a6c 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -56,7 +56,7 @@ static int n_mem_addr_cells, n_mem_size_cells;
 static int form1_affinity;
 
 #define MAX_DISTANCE_REF_POINTS 4
-static int distance_ref_points_depth;
+static int max_associativity_domain_index;
 static const __be32 *distance_ref_points;
 static int distance_lookup_table[MAX_NUMNODES][MAX_DISTANCE_REF_POINTS];
 
@@ -169,7 +169,7 @@ int cpu_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc)
 
int i, index;
 
-   for (i = 0; i < distance_ref_points_depth; i++) {
+   for (i = 0; i < max_associativity_domain_index; i++) {
index = be32_to_cpu(distance_ref_points[i]);
if (cpu1_assoc[index] == cpu2_assoc[index])
break;
@@ -193,7 +193,7 @@ int __node_distance(int a, int b)
if (!form1_affinity)
return ((a == b) ? LOCAL_DISTANCE : REMOTE_DISTANCE);
 
-   for (i = 0; i < distance_ref_points_depth; i++) {
+   for (i = 0; i < max_associativity_domain_index; i++) {
if (distance_lookup_table[a][i] == distance_lookup_table[b][i])
break;
 
@@ -213,7 +213,7 @@ static void initialize_distance_lookup_table(int nid,
if (!form1_affinity)
return;
 
-   for (i = 0; i < distance_ref_points_depth; i++) {
+   for (i = 0; i < max_associativity_domain_index; i++) {
const __be32 *entry;
 
		entry = &associativity[be32_to_cpu(distance_ref_points[i]) - 1];
@@ -240,7 +240,7 @@ static int associativity_to_nid(const __be32 *associativity)
nid = NUMA_NO_NODE;
 
if (nid > 0 &&
-   of_read_number(associativity, 1) >= distance_ref_points_depth) {
+	    of_read_number(associativity, 1) >= max_associativity_domain_index) {
/*
 * Skip the length field and send start of associativity array
 */
@@ -310,14 +310,14 @@ static int __init find_primary_domain_index(void)
 */
distance_ref_points = of_get_property(root,
"ibm,associativity-reference-points",
-			&distance_ref_points_depth);
+			&max_associativity_domain_index);
 
if (!distance_ref_points) {
dbg("NUMA: ibm,associativity-reference-points not found.\n");
goto err;
}
 
-   distance_ref_points_depth /= sizeof(int);
+   max_associativity_domain_index /= sizeof(int);
 
if (firmware_has_feature(FW_FEATURE_OPAL) ||
firmware_has_feature(FW_FEATURE_TYPE1_AFFINITY)) {
@@ -328,7 +328,7 @@ static int __init find_primary_domain_index(void)
if (form1_affinity) {
index = of_read_number(distance_ref_points, 1);
} else {
-   if (distance_ref_points_depth < 2) {
+   if (max_associativity_domain_index < 2) {
printk(KERN_WARNING "NUMA: "
"short ibm,associativity-reference-points\n");
goto err;
@@ -341,10 +341,10 @@ static int __init find_primary_domain_index(void)
 * Warn and cap if the hardware supports more than
 * MAX_DISTANCE_REF_POINTS domains.
 */
-   if (distance_ref_points_depth > MAX_DISTANCE_REF_POINTS) {
+   if (max_associativity_domain_index > MAX_DISTANCE_REF_POINTS) {
printk(KERN_WARNING "NUMA: distance array capped at "
"%d entries\n", MAX_DISTANCE_REF_POINTS);
-   distance_ref_points_depth = MAX_DISTANCE_REF_POINTS;
+   max_associativity_domain_index = MAX_DISTANCE_REF_POINTS;
}
 
of_node_put(root);
-- 
2.31.1



[PATCH v2 1/8] powerpc/pseries: rename min_common_depth to primary_domain_index

2021-06-17 Thread Aneesh Kumar K.V
No functional change in this patch.

Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/mm/numa.c | 38 +++---
 1 file changed, 19 insertions(+), 19 deletions(-)

diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index f2bf98bdcea2..8365b298ec48 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -51,7 +51,7 @@ EXPORT_SYMBOL(numa_cpu_lookup_table);
 EXPORT_SYMBOL(node_to_cpumask_map);
 EXPORT_SYMBOL(node_data);
 
-static int min_common_depth;
+static int primary_domain_index;
 static int n_mem_addr_cells, n_mem_size_cells;
 static int form1_affinity;
 
@@ -232,8 +232,8 @@ static int associativity_to_nid(const __be32 *associativity)
if (!numa_enabled)
goto out;
 
-	if (of_read_number(associativity, 1) >= min_common_depth)
-		nid = of_read_number(&associativity[min_common_depth], 1);
+	if (of_read_number(associativity, 1) >= primary_domain_index)
+		nid = of_read_number(&associativity[primary_domain_index], 1);
 
	/* POWER4 LPAR uses 0xffff as invalid node */
	if (nid == 0xffff || nid >= nr_node_ids)
@@ -284,9 +284,9 @@ int of_node_to_nid(struct device_node *device)
 }
 EXPORT_SYMBOL(of_node_to_nid);
 
-static int __init find_min_common_depth(void)
+static int __init find_primary_domain_index(void)
 {
-   int depth;
+   int index;
struct device_node *root;
 
if (firmware_has_feature(FW_FEATURE_OPAL))
@@ -326,7 +326,7 @@ static int __init find_min_common_depth(void)
}
 
if (form1_affinity) {
-   depth = of_read_number(distance_ref_points, 1);
+   index = of_read_number(distance_ref_points, 1);
} else {
if (distance_ref_points_depth < 2) {
printk(KERN_WARNING "NUMA: "
@@ -334,7 +334,7 @@ static int __init find_min_common_depth(void)
goto err;
}
 
-		depth = of_read_number(&distance_ref_points[1], 1);
+		index = of_read_number(&distance_ref_points[1], 1);
}
 
/*
@@ -348,7 +348,7 @@ static int __init find_min_common_depth(void)
}
 
of_node_put(root);
-   return depth;
+   return index;
 
 err:
of_node_put(root);
@@ -437,16 +437,16 @@ int of_drconf_to_nid_single(struct drmem_lmb *lmb)
int nid = default_nid;
int rc, index;
 
-   if ((min_common_depth < 0) || !numa_enabled)
+   if ((primary_domain_index < 0) || !numa_enabled)
return default_nid;
 
rc = of_get_assoc_arrays();
if (rc)
return default_nid;
 
-   if (min_common_depth <= aa.array_sz &&
+   if (primary_domain_index <= aa.array_sz &&
	    !(lmb->flags & DRCONF_MEM_AI_INVALID) && lmb->aa_index < aa.n_arrays) {
-		index = lmb->aa_index * aa.array_sz + min_common_depth - 1;
+		index = lmb->aa_index * aa.array_sz + primary_domain_index - 1;
		nid = of_read_number(&aa.arrays[index], 1);

		if (nid == 0xffff || nid >= nr_node_ids)
@@ -708,18 +708,18 @@ static int __init parse_numa_properties(void)
return -1;
}
 
-   min_common_depth = find_min_common_depth();
+   primary_domain_index = find_primary_domain_index();
 
-   if (min_common_depth < 0) {
+   if (primary_domain_index < 0) {
/*
-* if we fail to parse min_common_depth from device tree
+* if we fail to parse primary_domain_index from device tree
 * mark the numa disabled, boot with numa disabled.
 */
numa_enabled = false;
-   return min_common_depth;
+   return primary_domain_index;
}
 
-   dbg("NUMA associativity depth for CPU/Memory: %d\n", min_common_depth);
+   dbg("NUMA associativity depth for CPU/Memory: %d\n", 
primary_domain_index);
 
/*
 * Even though we connect cpus to numa domains later in SMP
@@ -919,14 +919,14 @@ static void __init find_possible_nodes(void)
goto out;
}
 
-	max_nodes = of_read_number(&domains[min_common_depth], 1);
+	max_nodes = of_read_number(&domains[primary_domain_index], 1);
for (i = 0; i < max_nodes; i++) {
if (!node_possible(i))
node_set(i, node_possible_map);
}
 
prop_length /= sizeof(int);
-   if (prop_length > min_common_depth + 2)
+   if (prop_length > primary_domain_index + 2)
coregroup_enabled = 1;
 
 out:
@@ -1259,7 +1259,7 @@ int cpu_to_coregroup_id(int cpu)
goto out;
 
index = of_read_number(associativity, 1);
-   if (index > min_common_depth + 1)
+   if (index > primary_domain_index + 1)
		return of_read_number(&associativity[index - 1], 1);
 
 out:
-- 
2.31.1



[PATCH v2 0/8] Add support for FORM2 associativity

2021-06-17 Thread Aneesh Kumar K.V
Form2 associativity allows a much more flexible NUMA topology layout
than what Form1 provides. It also lets a PAPR SCM device report better
associativity when the device is used as a DAX KMEM device. More
details can be found in patch 7.

$ ndctl list -N -v
[
  {
"dev":"namespace0.0",
"mode":"devdax",
"map":"dev",
"size":1071644672,
"uuid":"37dea198-ddb5-4e42-915a-99a915e24188",
"raw_uuid":"148deeaa-4a2f-41d1-8d74-fd9a942d26ba",
"daxregion":{
  "id":0,
  "size":1071644672,
  "devices":[
{
  "chardev":"dax0.0",
  "size":1071644672,
  "target_node":4,
  "mode":"devdax"
}
  ]
},
"align":2097152,
"numa_node":1
  }
]

$ numactl -H
...
node distances:
node   0   1   2   3 
  0:  10  11  22  33 
  1:  44  10  55  66 
  2:  77  88  10  99 
  3:  101  121  132  10 
$

After DAX KMEM
# numactl -H
available: 5 nodes (0-4)
...
node distances:
node   0   1   2   3   4 
  0:  10  11  22  33  255 
  1:  44  10  55  66  255 
  2:  77  88  10  99  255 
  3:  101  121  132  10  255 
  4:  255  255  255  255  10 
# 

The above output is with a Qemu command line

-numa node,nodeid=4 \
-numa dist,src=0,dst=1,val=11 -numa dist,src=0,dst=2,val=22 -numa 
dist,src=0,dst=3,val=33 -numa dist,src=0,dst=4,val=255 \
-numa dist,src=1,dst=0,val=44 -numa dist,src=1,dst=2,val=55 -numa 
dist,src=1,dst=3,val=66 -numa dist,src=1,dst=4,val=255 \
-numa dist,src=2,dst=0,val=77 -numa dist,src=2,dst=1,val=88 -numa 
dist,src=2,dst=3,val=99 -numa dist,src=2,dst=4,val=255 \
-numa dist,src=3,dst=0,val=101 -numa dist,src=3,dst=1,val=121 -numa 
dist,src=3,dst=2,val=132 -numa dist,src=3,dst=4,val=255 \
-numa dist,src=4,dst=0,val=255 -numa dist,src=4,dst=1,val=255 -numa 
dist,src=4,dst=2,val=255 -numa dist,src=4,dst=3,val=255 \
-object 
memory-backend-file,id=memnvdimm1,prealloc=yes,mem-path=$PMEM_DISK,share=yes,size=${PMEM_SIZE}
  \
-device 
nvdimm,label-size=128K,memdev=memnvdimm1,id=nvdimm1,slot=4,uuid=72511b67-0b3b-42fd-8d1d-5be3cae8bcaa,node=4,device-node=1

Qemu changes can be found at 
https://lore.kernel.org/qemu-devel/20210616011944.2996399-1-danielhb...@gmail.com/

Changes from v1:
* Update FORM2 documentation.
* rename max_domain_index to max_associativity_domain_index

Aneesh Kumar K.V (8):
  powerpc/pseries: rename min_common_depth to primary_domain_index
  powerpc/pseries: rename distance_ref_points_depth to
max_associativity_domain_index
  powerpc/pseries: Rename TYPE1_AFFINITY to FORM1_AFFINITY
  powerpc/pseries: Consolidate DLPAR NUMA distance update
  powerpc/pseries: Consolidate NUMA distance update during boot
  powerpc/pseries: Add a helper for form1 cpu distance
  powerpc/pseries: Add support for FORM2 associativity
  powerpc/papr_scm: Use FORM2 associativity details

 Documentation/powerpc/associativity.rst   | 177 +++
 arch/powerpc/include/asm/firmware.h   |   7 +-
 arch/powerpc/include/asm/prom.h   |   3 +-
 arch/powerpc/kernel/prom_init.c   |   3 +-
 arch/powerpc/mm/numa.c| 436 ++
 arch/powerpc/platforms/pseries/firmware.c |   3 +-
 arch/powerpc/platforms/pseries/hotplug-cpu.c  |   2 +
 .../platforms/pseries/hotplug-memory.c|   2 +
 arch/powerpc/platforms/pseries/papr_scm.c |  26 +-
 arch/powerpc/platforms/pseries/pseries.h  |   2 +
 10 files changed, 560 insertions(+), 101 deletions(-)
 create mode 100644 Documentation/powerpc/associativity.rst

-- 
2.31.1



Re: [PATCH v14 3/4] mm: define default MAX_PTRS_PER_* in include/pgtable.h

2021-06-17 Thread Christophe Leroy




Le 17/06/2021 à 08:39, Daniel Axtens a écrit :

Commit c65e774fb3f6 ("x86/mm: Make PGDIR_SHIFT and PTRS_PER_P4D variable")
made PTRS_PER_P4D variable on x86 and introduced MAX_PTRS_PER_P4D as a
constant for cases which need a compile-time constant (e.g. fixed-size
arrays).

powerpc likewise has boot-time selectable MMU features which can cause
other mm "constants" to vary. For KASAN, we have some static
PTE/PMD/PUD/P4D arrays so we need compile-time maximums for all these
constants. Extend the MAX_PTRS_PER_ idiom, and place default definitions
in include/pgtable.h. These define MAX_PTRS_PER_x to be PTRS_PER_x unless
an architecture has defined MAX_PTRS_PER_x in its arch headers.

Clean up pgtable-nop4d.h and s390's MAX_PTRS_PER_P4D definitions while
we're at it: both can just pick up the default now.

Signed-off-by: Daniel Axtens 


Reviewed-by: Christophe Leroy 



---

s390 was compile tested only.
---
  arch/s390/include/asm/pgtable.h |  2 --
  include/asm-generic/pgtable-nop4d.h |  1 -
  include/linux/pgtable.h | 22 ++
  3 files changed, 22 insertions(+), 3 deletions(-)

diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index 7c66ae5d7e32..cf05954ce013 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -342,8 +342,6 @@ static inline int is_module_addr(void *addr)
  #define PTRS_PER_P4D  _CRST_ENTRIES
  #define PTRS_PER_PGD  _CRST_ENTRIES
  
-#define MAX_PTRS_PER_P4D	PTRS_PER_P4D
-
  /*
   * Segment table and region3 table entry encoding
   * (R = read-only, I = invalid, y = young bit):
diff --git a/include/asm-generic/pgtable-nop4d.h 
b/include/asm-generic/pgtable-nop4d.h
index ce2cbb3c380f..2f6b1befb129 100644
--- a/include/asm-generic/pgtable-nop4d.h
+++ b/include/asm-generic/pgtable-nop4d.h
@@ -9,7 +9,6 @@
  typedef struct { pgd_t pgd; } p4d_t;
  
  #define P4D_SHIFT		PGDIR_SHIFT
-#define MAX_PTRS_PER_P4D   1
  #define PTRS_PER_P4D  1
  #define P4D_SIZE  (1UL << P4D_SHIFT)
  #define P4D_MASK  (~(P4D_SIZE-1))
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index 9e6f71265f72..69700e3e615f 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -1625,4 +1625,26 @@ typedef unsigned int pgtbl_mod_mask;
  #define pte_leaf_size(x) PAGE_SIZE
  #endif
  
+/*
+ * Some architectures have MMUs that are configurable or selectable at boot
+ * time. These lead to variable PTRS_PER_x. For statically allocated arrays it
+ * helps to have a static maximum value.
+ */
+
+#ifndef MAX_PTRS_PER_PTE
+#define MAX_PTRS_PER_PTE PTRS_PER_PTE
+#endif
+
+#ifndef MAX_PTRS_PER_PMD
+#define MAX_PTRS_PER_PMD PTRS_PER_PMD
+#endif
+
+#ifndef MAX_PTRS_PER_PUD
+#define MAX_PTRS_PER_PUD PTRS_PER_PUD
+#endif
+
+#ifndef MAX_PTRS_PER_P4D
+#define MAX_PTRS_PER_P4D PTRS_PER_P4D
+#endif
+
  #endif /* _LINUX_PGTABLE_H */



Re: [PATCH v14 2/4] kasan: allow architectures to provide an outline readiness check

2021-06-17 Thread Christophe Leroy




Le 17/06/2021 à 08:39, Daniel Axtens a écrit :

Allow architectures to define a kasan_arch_is_ready() hook that bails
out of any function that's about to touch the shadow unless the arch
says that it is ready for the memory to be accessed. This is fairly
uninvasive and should have a negligible performance penalty.

This will only work in outline mode, so an arch must specify
ARCH_DISABLE_KASAN_INLINE if it requires this.

Cc: Balbir Singh 
Cc: Aneesh Kumar K.V 
Suggested-by: Christophe Leroy 
Signed-off-by: Daniel Axtens 

--

Both previous RFCs for ppc64 - by 2 different people - have
needed this trick! See:
  - https://lore.kernel.org/patchwork/patch/592820/ # ppc64 hash series
  - https://patchwork.ozlabs.org/patch/795211/  # ppc radix series

I haven't been able to exercise the arch hook error for !GENERIC as I
don't have a particularly modern aarch64 toolchain or a lot of experience
cross-compiling with clang. But it does fire for GENERIC + INLINE on x86.
---
  mm/kasan/common.c  | 4 
  mm/kasan/generic.c | 3 +++
  mm/kasan/kasan.h   | 8 
  mm/kasan/shadow.c  | 8 
  4 files changed, 23 insertions(+)

diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
index 8f450bc28045..b18abaf8c78e 100644
--- a/mm/kasan/kasan.h
+++ b/mm/kasan/kasan.h
@@ -449,6 +449,14 @@ static inline void kasan_poison_last_granule(const void 
*address, size_t size) {
  
  #endif /* CONFIG_KASAN_GENERIC */
  
+#ifndef kasan_arch_is_ready
+static inline bool kasan_arch_is_ready(void)   { return true; }
+#else
+#if !defined(CONFIG_KASAN_GENERIC) || !defined(CONFIG_KASAN_OUTLINE)
+#error kasan_arch_is_ready only works in KASAN generic outline mode!
+#endif
+#endif


Would be cleaner and more readable as

+#ifndef kasan_arch_is_ready
+static inline bool kasan_arch_is_ready(void)   { return true; }
+#elif !defined(CONFIG_KASAN_GENERIC) || !defined(CONFIG_KASAN_OUTLINE)
+#error kasan_arch_is_ready only works in KASAN generic outline mode!
+#endif


+
  /*
   * Exported functions for interfaces called from assembly or from generated
   * code. Declarations here to avoid warning about missing declarations.
diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c
index 082ee5b6d9a1..3c7f7efe6f68 100644
--- a/mm/kasan/shadow.c
+++ b/mm/kasan/shadow.c
@@ -73,6 +73,10 @@ void kasan_poison(const void *addr, size_t size, u8 value, 
bool init)
  {
void *shadow_start, *shadow_end;
  
+	/* Don't touch the shadow memory if arch isn't ready */

+   if (!kasan_arch_is_ready())
+   return;
+
/*
 * Perform shadow offset calculation based on untagged address, as
 * some of the callers (e.g. kasan_poison_object_data) pass tagged
@@ -99,6 +103,10 @@ EXPORT_SYMBOL(kasan_poison);
  #ifdef CONFIG_KASAN_GENERIC
  void kasan_poison_last_granule(const void *addr, size_t size)
  {
+   /* Don't touch the shadow memory if arch isn't ready */
+   if (!kasan_arch_is_ready())
+   return;
+
if (size & KASAN_GRANULE_MASK) {
u8 *shadow = (u8 *)kasan_mem_to_shadow(addr + size);
*shadow = size & KASAN_GRANULE_MASK;



Re: [PATCH v14 2/4] kasan: allow architectures to provide an outline readiness check

2021-06-17 Thread Christophe Leroy




Le 17/06/2021 à 08:39, Daniel Axtens a écrit :

Allow architectures to define a kasan_arch_is_ready() hook that bails
out of any function that's about to touch the shadow unless the arch
says that it is ready for the memory to be accessed. This is fairly
uninvasive and should have a negligible performance penalty.

This will only work in outline mode, so an arch must specify
ARCH_DISABLE_KASAN_INLINE if it requires this.

Cc: Balbir Singh 
Cc: Aneesh Kumar K.V 
Suggested-by: Christophe Leroy 
Signed-off-by: Daniel Axtens 

--

Both previous RFCs for ppc64 - by 2 different people - have
needed this trick! See:
  - https://lore.kernel.org/patchwork/patch/592820/ # ppc64 hash series
  - https://patchwork.ozlabs.org/patch/795211/  # ppc radix series

I haven't been able to exercise the arch hook error for !GENERIC as I
don't have a particularly modern aarch64 toolchain or a lot of experience
cross-compiling with clang. But it does fire for GENERIC + INLINE on x86.


Modern toolchains are available here 
https://mirrors.edge.kernel.org/pub/tools/crosstool/



Re: Oops (NULL pointer) with 'perf record' of selftest 'null_syscall'

2021-06-17 Thread Athira Rajeev
On 16-Jun-2021, at 11:56 AM, Christophe Leroy wrote:
Le 16/06/2021 à 05:40, Athira Rajeev a écrit :
On 16-Jun-2021, at 8:53 AM, Madhavan Srinivasan wrote:
On 6/15/21 8:35 PM, Christophe Leroy wrote:

For your information, I'm getting the following Oops. Detected with
5.13-rc6, it also oopses on 5.12 and 5.11. Runs ok on 5.10. I'm
starting bisecting now.

Thanks for reporting, got the issue. What has happened in this case is
that, pmu device is not registered and trying to access the instruction
point which will land in perf_instruction_pointer(). And recently I
have added a workaround patch for power10 DD1 which has caused this
breakage. My bad. We are working on a fix patch for the same and will
post it out. Sorry again.

Hi Christophe,
Can you please try with below patch in your environment and test if it
works for you.

From 55d3afc9369dfbe28a7152c8e9f856c11c7fe43d Mon Sep 17 00:00:00 2001
From: Athira Rajeev
Date: Tue, 15 Jun 2021 22:28:11 -0400
Subject: [PATCH] powerpc/perf: Fix crash with 'perf_instruction_pointer'
 when pmu is not set

On systems without any specific PMU driver support registered, running
perf record causes oops:

[   38.841073] NIP [c013af54] perf_instruction_pointer+0x24/0x100
[   38.841079] LR [c03c7358] perf_prepare_sample+0x4e8/0x820
[   38.841085] --- interrupt: 300
[   38.841088] [c0001cf03440] [c03c6ef8] perf_prepare_sample+0x88/0x820 (unreliable)
[   38.841096] [c0001cf034a0] [c03c76d0] perf_event_output_forward+0x40/0xc0
[   38.841104] [c0001cf03520] [c03b45e8] __perf_event_overflow+0x88/0x1b0
[   38.841112] [c0001cf03570] [c03b480c] perf_swevent_hrtimer+0xfc/0x1a0
[   38.841119] [c0001cf03740] [c02399cc] __hrtimer_run_queues+0x17c/0x380
[   38.841127] [c0001cf037c0] [c023a5f8] hrtimer_interrupt+0x128/0x2f0
[   38.841135] [c0001cf03870] [c002962c] timer_interrupt+0x13c/0x370
[   38.841143] [c0001cf038d0] [c0009ba4] decrementer_common_virt+0x1a4/0x1b0
[   38.841151] --- interrupt: 900 at copypage_power7+0xd4/0x1c0

During perf record session, perf_instruction_pointer() is called to
capture the sample ip. This function in core-book3s accesses
ppmu->flags. If a platform specific PMU driver is not registered, ppmu
is set to NULL and accessing its members results in a crash. Fix this
crash by checking if ppmu is set.

Signed-off-by: Athira Rajeev
Reported-by: Christophe Leroy
Fixes: 2ca13a4cc56c ("powerpc/perf: Use regs->nip when SIAR is zero")
Cc: sta...@vger.kernel.org
Tested-by: Christophe Leroy

Hi Christophe,

Thanks for testing with the change. I have a newer version where I have
added braces around the check. Can you please check once and can I add
your tested-by for the below patch.

From 621cd0449c8503a016c0b1ae63639061aa5134a8 Mon Sep 17 00:00:00 2001
From: Athira Rajeev
Date: Tue, 15 Jun 2021 22:28:11 -0400
Subject: [PATCH] powerpc/perf: Fix crash with 'perf_instruction_pointer'
 when pmu is not set

On systems without any specific PMU driver support registered, running
perf record causes Oops.

The relevant portion from call trace:

BUG: Kernel NULL pointer dereference on read at 0x0040
Faulting instruction address: 0xc0021f0c
Oops: Kernel access of bad area, sig: 11 [#1]
BE PAGE_SIZE=4K PREEMPT CMPCPROSAF3000 DIE NOTIFICATION
CPU: 0 PID: 442 Comm: null_syscall Not tainted 5.13.0-rc6-s3k-dev-01645-g7649ee3d2957 #5164
NIP:  c0021f0c LR: c00e8ad8 CTR: c00d8a5c
NIP [c0021f0c] perf_instruction_pointer+0x10/0x60
LR [c00e8ad8] perf_prepare_sample+0x344/0x674
Call Trace:
[e6775880] [c00e8810] perf_prepare_sample+0x7c/0x674 (unreliable)
[e67758c0] [c00e8e44] perf_event_output_forward+0x3c/0x94
[e6775910] [c00dea8c] __perf_event_overflow+0x74/0x14c
[e6775930] [c00dec5c] perf_swevent_hrtimer+0xf8/0x170
[e6775a40] [c008c8d0] __hrtimer_run_queues.constprop.0+0x160/0x318
[e6775a90] [c008d94c] hrtimer_interrupt+0x148/0x3b0
[e6775ae0] [c000c0c0] timer_interrupt+0xc4/0x22c
[e6775b10] [c00046f0] Decrementer_virt+0xb8/0xbc

During perf record session, perf_instruction_pointer() is called to
capture the sample ip. This function in core-book3s accesses
ppmu->flags. If a platform specific PMU driver is not registered, ppmu
is set to NULL and accessing its members results in a crash. Fix this
crash by checking if ppmu is set.

Fixes: 2ca13a4cc56c ("powerpc/perf: Use regs->nip when SIAR is zero")
Signed-off-by: Athira Rajeev
Reported-by: Christophe Leroy
---
 arch/powerpc/perf/core-book3s.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c
index 16d4d1b..5162241 100644
--- a/arch/powerpc/perf/core-book3s.c
+++ b/arch/powerpc/perf/core-book3s.c
@@ -2254,7 +2254,7 @@ unsigned long perf_instruction_pointer(struct pt_regs *regs)
 	bool use_siar = regs_use_siar(regs);
 	unsigned long siar = mfspr(SPRN_SIAR);
 
-	if (ppmu->flags & PPMU_P10_DD1) {
+	if (ppmu && (ppmu->flags & PPMU_P10_DD1)) {
 		if (siar)
 			return siar;
 		else
-- 
1.8.3.1

Thanks
Athira
---
 arch/powerpc/perf/core-book3s.c | 2 +-
 1

Re: [PATCH v12 00/12] Restricted DMA

2021-06-17 Thread Claire Chang
v13: https://lore.kernel.org/patchwork/cover/1448001/


[PATCH v14 4/4] kasan: use MAX_PTRS_PER_* for early shadow tables

2021-06-17 Thread Daniel Axtens
powerpc has a variable number of PTRS_PER_*, set at runtime based
on the MMU that the kernel is booted under.

This means the PTRS_PER_* values are no longer compile-time constants,
which breaks static array declarations such as KASAN's early shadow
tables. Switch to using MAX_PTRS_PER_*, which are constant.

Suggested-by: Christophe Leroy 
Suggested-by: Balbir Singh 
Reviewed-by: Christophe Leroy 
Reviewed-by: Balbir Singh 
Signed-off-by: Daniel Axtens 
---
 include/linux/kasan.h | 6 +++---
 mm/kasan/init.c   | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index 768d7d342757..5310e217bd74 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -41,9 +41,9 @@ struct kunit_kasan_expectation {
 #endif
 
 extern unsigned char kasan_early_shadow_page[PAGE_SIZE];
-extern pte_t kasan_early_shadow_pte[PTRS_PER_PTE + PTE_HWTABLE_PTRS];
-extern pmd_t kasan_early_shadow_pmd[PTRS_PER_PMD];
-extern pud_t kasan_early_shadow_pud[PTRS_PER_PUD];
+extern pte_t kasan_early_shadow_pte[MAX_PTRS_PER_PTE + PTE_HWTABLE_PTRS];
+extern pmd_t kasan_early_shadow_pmd[MAX_PTRS_PER_PMD];
+extern pud_t kasan_early_shadow_pud[MAX_PTRS_PER_PUD];
 extern p4d_t kasan_early_shadow_p4d[MAX_PTRS_PER_P4D];
 
 int kasan_populate_early_shadow(const void *shadow_start,
diff --git a/mm/kasan/init.c b/mm/kasan/init.c
index 348f31d15a97..cc64ed6858c6 100644
--- a/mm/kasan/init.c
+++ b/mm/kasan/init.c
@@ -41,7 +41,7 @@ static inline bool kasan_p4d_table(pgd_t pgd)
 }
 #endif
 #if CONFIG_PGTABLE_LEVELS > 3
-pud_t kasan_early_shadow_pud[PTRS_PER_PUD] __page_aligned_bss;
+pud_t kasan_early_shadow_pud[MAX_PTRS_PER_PUD] __page_aligned_bss;
 static inline bool kasan_pud_table(p4d_t p4d)
 {
return p4d_page(p4d) == virt_to_page(lm_alias(kasan_early_shadow_pud));
@@ -53,7 +53,7 @@ static inline bool kasan_pud_table(p4d_t p4d)
 }
 #endif
 #if CONFIG_PGTABLE_LEVELS > 2
-pmd_t kasan_early_shadow_pmd[PTRS_PER_PMD] __page_aligned_bss;
+pmd_t kasan_early_shadow_pmd[MAX_PTRS_PER_PMD] __page_aligned_bss;
 static inline bool kasan_pmd_table(pud_t pud)
 {
return pud_page(pud) == virt_to_page(lm_alias(kasan_early_shadow_pmd));
@@ -64,7 +64,7 @@ static inline bool kasan_pmd_table(pud_t pud)
return false;
 }
 #endif
-pte_t kasan_early_shadow_pte[PTRS_PER_PTE + PTE_HWTABLE_PTRS]
+pte_t kasan_early_shadow_pte[MAX_PTRS_PER_PTE + PTE_HWTABLE_PTRS]
__page_aligned_bss;
 
 static inline bool kasan_pte_table(pmd_t pmd)
-- 
2.30.2



[PATCH v14 3/4] mm: define default MAX_PTRS_PER_* in include/pgtable.h

2021-06-17 Thread Daniel Axtens
Commit c65e774fb3f6 ("x86/mm: Make PGDIR_SHIFT and PTRS_PER_P4D variable")
made PTRS_PER_P4D variable on x86 and introduced MAX_PTRS_PER_P4D as a
constant for cases which need a compile-time constant (e.g. fixed-size
arrays).

powerpc likewise has boot-time selectable MMU features which can cause
other mm "constants" to vary. For KASAN, we have some static
PTE/PMD/PUD/P4D arrays so we need compile-time maximums for all these
constants. Extend the MAX_PTRS_PER_ idiom, and place default definitions
in include/pgtable.h. These define MAX_PTRS_PER_x to be PTRS_PER_x unless
an architecture has defined MAX_PTRS_PER_x in its arch headers.

Clean up pgtable-nop4d.h and s390's MAX_PTRS_PER_P4D definitions while
we're at it: both can just pick up the default now.
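
As an illustration of the other half of this idiom, a hypothetical
arch-side sketch (not taken from this series; the geometry values are
made up): an architecture with boot-selectable MMUs publishes its own
compile-time maximum in an arch header, and the generic default above
then steps aside.

	/* arch header sketch: two MMUs with different page-table geometry */
	#define H_PTRS_PER_PTE	(1 << 8)	/* hash MMU (example value) */
	#define R_PTRS_PER_PTE	(1 << 9)	/* radix MMU (example value) */

	/* the largest value either MMU can need, usable for static arrays */
	#define MAX_PTRS_PER_PTE \
		((H_PTRS_PER_PTE > R_PTRS_PER_PTE) ? H_PTRS_PER_PTE : R_PTRS_PER_PTE)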

Signed-off-by: Daniel Axtens 

---

s390 was compile tested only.
---
 arch/s390/include/asm/pgtable.h |  2 --
 include/asm-generic/pgtable-nop4d.h |  1 -
 include/linux/pgtable.h | 22 ++
 3 files changed, 22 insertions(+), 3 deletions(-)

diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index 7c66ae5d7e32..cf05954ce013 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -342,8 +342,6 @@ static inline int is_module_addr(void *addr)
 #define PTRS_PER_P4D   _CRST_ENTRIES
 #define PTRS_PER_PGD   _CRST_ENTRIES
 
-#define MAX_PTRS_PER_P4D   PTRS_PER_P4D
-
 /*
  * Segment table and region3 table entry encoding
  * (R = read-only, I = invalid, y = young bit):
diff --git a/include/asm-generic/pgtable-nop4d.h 
b/include/asm-generic/pgtable-nop4d.h
index ce2cbb3c380f..2f6b1befb129 100644
--- a/include/asm-generic/pgtable-nop4d.h
+++ b/include/asm-generic/pgtable-nop4d.h
@@ -9,7 +9,6 @@
 typedef struct { pgd_t pgd; } p4d_t;
 
 #define P4D_SHIFT  PGDIR_SHIFT
-#define MAX_PTRS_PER_P4D   1
 #define PTRS_PER_P4D   1
 #define P4D_SIZE   (1UL << P4D_SHIFT)
 #define P4D_MASK   (~(P4D_SIZE-1))
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index 9e6f71265f72..69700e3e615f 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -1625,4 +1625,26 @@ typedef unsigned int pgtbl_mod_mask;
 #define pte_leaf_size(x) PAGE_SIZE
 #endif
 
+/*
+ * Some architectures have MMUs that are configurable or selectable at boot
+ * time. These lead to variable PTRS_PER_x. For statically allocated arrays it
+ * helps to have a static maximum value.
+ */
+
+#ifndef MAX_PTRS_PER_PTE
+#define MAX_PTRS_PER_PTE PTRS_PER_PTE
+#endif
+
+#ifndef MAX_PTRS_PER_PMD
+#define MAX_PTRS_PER_PMD PTRS_PER_PMD
+#endif
+
+#ifndef MAX_PTRS_PER_PUD
+#define MAX_PTRS_PER_PUD PTRS_PER_PUD
+#endif
+
+#ifndef MAX_PTRS_PER_P4D
+#define MAX_PTRS_PER_P4D PTRS_PER_P4D
+#endif
+
 #endif /* _LINUX_PGTABLE_H */
-- 
2.30.2



[PATCH v14 2/4] kasan: allow architectures to provide an outline readiness check

2021-06-17 Thread Daniel Axtens
Allow architectures to define a kasan_arch_is_ready() hook that bails
out of any function that's about to touch the shadow unless the arch
says that it is ready for the memory to be accessed. This is fairly
uninvasive and should have a negligible performance penalty.

This will only work in outline mode, so an arch must specify
ARCH_DISABLE_KASAN_INLINE if it requires this.
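
For reference, a sketch of what the arch side could look like (this is
hypothetical, not part of this patch; the flag name is made up and
stands for whatever state the arch sets once its shadow is mapped):

	/* arch/xxx/include/asm/kasan.h (sketch) */
	extern bool kasan_shadow_initialized;	/* assumed arch-owned flag */

	static inline bool kasan_arch_is_ready(void)
	{
		return kasan_shadow_initialized;
	}
	/* defining the macro makes the generic default in kasan.h step aside */
	#define kasan_arch_is_ready kasan_arch_is_ready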

Cc: Balbir Singh 
Cc: Aneesh Kumar K.V 
Suggested-by: Christophe Leroy 
Signed-off-by: Daniel Axtens 

--

Both previous RFCs for ppc64 - by 2 different people - have
needed this trick! See:
 - https://lore.kernel.org/patchwork/patch/592820/ # ppc64 hash series
 - https://patchwork.ozlabs.org/patch/795211/  # ppc radix series

I haven't been able to exercise the arch hook error for !GENERIC as I
don't have a particularly modern aarch64 toolchain or a lot of experience
cross-compiling with clang. But it does fire for GENERIC + INLINE on x86.
---
 mm/kasan/common.c  | 4 
 mm/kasan/generic.c | 3 +++
 mm/kasan/kasan.h   | 8 
 mm/kasan/shadow.c  | 8 
 4 files changed, 23 insertions(+)

diff --git a/mm/kasan/common.c b/mm/kasan/common.c
index 10177cc26d06..0ad615f3801d 100644
--- a/mm/kasan/common.c
+++ b/mm/kasan/common.c
@@ -331,6 +331,10 @@ static inline bool kasan_slab_free(struct kmem_cache 
*cache, void *object,
u8 tag;
void *tagged_object;
 
+   /* Bail if the arch isn't ready */
+   if (!kasan_arch_is_ready())
+   return false;
+
tag = get_tag(object);
tagged_object = object;
object = kasan_reset_tag(object);
diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c
index 53cbf28859b5..c3f5ba7a294a 100644
--- a/mm/kasan/generic.c
+++ b/mm/kasan/generic.c
@@ -163,6 +163,9 @@ static __always_inline bool check_region_inline(unsigned 
long addr,
size_t size, bool write,
unsigned long ret_ip)
 {
+   if (!kasan_arch_is_ready())
+   return true;
+
if (unlikely(size == 0))
return true;
 
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
index 8f450bc28045..b18abaf8c78e 100644
--- a/mm/kasan/kasan.h
+++ b/mm/kasan/kasan.h
@@ -449,6 +449,14 @@ static inline void kasan_poison_last_granule(const void 
*address, size_t size) {
 
 #endif /* CONFIG_KASAN_GENERIC */
 
+#ifndef kasan_arch_is_ready
+static inline bool kasan_arch_is_ready(void)   { return true; }
+#else
+#if !defined(CONFIG_KASAN_GENERIC) || !defined(CONFIG_KASAN_OUTLINE)
+#error kasan_arch_is_ready only works in KASAN generic outline mode!
+#endif
+#endif
+
 /*
  * Exported functions for interfaces called from assembly or from generated
  * code. Declarations here to avoid warning about missing declarations.
diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c
index 082ee5b6d9a1..3c7f7efe6f68 100644
--- a/mm/kasan/shadow.c
+++ b/mm/kasan/shadow.c
@@ -73,6 +73,10 @@ void kasan_poison(const void *addr, size_t size, u8 value, 
bool init)
 {
void *shadow_start, *shadow_end;
 
+   /* Don't touch the shadow memory if arch isn't ready */
+   if (!kasan_arch_is_ready())
+   return;
+
/*
 * Perform shadow offset calculation based on untagged address, as
 * some of the callers (e.g. kasan_poison_object_data) pass tagged
@@ -99,6 +103,10 @@ EXPORT_SYMBOL(kasan_poison);
 #ifdef CONFIG_KASAN_GENERIC
 void kasan_poison_last_granule(const void *addr, size_t size)
 {
+   /* Don't touch the shadow memory if arch isn't ready */
+   if (!kasan_arch_is_ready())
+   return;
+
if (size & KASAN_GRANULE_MASK) {
u8 *shadow = (u8 *)kasan_mem_to_shadow(addr + size);
*shadow = size & KASAN_GRANULE_MASK;
-- 
2.30.2



[PATCH v14 1/4] kasan: allow an architecture to disable inline instrumentation

2021-06-17 Thread Daniel Axtens
For annoying architectural reasons, it's very difficult to support inline
instrumentation on powerpc64.*

Add a Kconfig flag to allow an arch to disable inline. (It's a bit
annoying to be 'backwards', but I'm not aware of any way to have
an arch force a symbol to be 'n', rather than 'y'.)

We also disable stack instrumentation in this case as it does things that
are functionally equivalent to inline instrumentation, namely adding
code that touches the shadow directly without going through a C helper.

* on ppc64 atm, the shadow lives in virtual memory and isn't accessible in
real mode. However, before we turn on virtual memory, we parse the device
tree to determine which platform and MMU we're running under. That calls
generic DT code, which is instrumented. Inline instrumentation in DT would
unconditionally attempt to touch the shadow region, which we won't have
set up yet, and would crash. We can make outline mode wait for the arch to
be ready, but we can't change what the compiler inserts for inline mode.
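
For illustration, the arch-side opt-out is then a one-line Kconfig
select; this hunk is hypothetical (the guarding condition is made up
for the example), not part of this patch:

	# arch/powerpc/Kconfig (sketch)
	config PPC
		select ARCH_DISABLE_KASAN_INLINE	if PPC_RADIX_MMU && KASAN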

Signed-off-by: Daniel Axtens 
---
 lib/Kconfig.kasan | 14 ++
 1 file changed, 14 insertions(+)

diff --git a/lib/Kconfig.kasan b/lib/Kconfig.kasan
index cffc2ebbf185..cb5e02d09e11 100644
--- a/lib/Kconfig.kasan
+++ b/lib/Kconfig.kasan
@@ -12,6 +12,15 @@ config HAVE_ARCH_KASAN_HW_TAGS
 config HAVE_ARCH_KASAN_VMALLOC
bool
 
+config ARCH_DISABLE_KASAN_INLINE
+   bool
+   help
+ Sometimes an architecture might not be able to support inline
+ instrumentation but might be able to support outline instrumentation.
+ This option allows an architecture to prevent inline and stack
+ instrumentation from being enabled.
+
+
 config CC_HAS_KASAN_GENERIC
def_bool $(cc-option, -fsanitize=kernel-address)
 
@@ -130,6 +139,7 @@ config KASAN_OUTLINE
 
 config KASAN_INLINE
bool "Inline instrumentation"
+   depends on !ARCH_DISABLE_KASAN_INLINE
help
  Compiler directly inserts code checking shadow memory before
  memory accesses. This is faster than outline (in some workloads
@@ -141,6 +151,7 @@ endchoice
 config KASAN_STACK
bool "Enable stack instrumentation (unsafe)" if CC_IS_CLANG && 
!COMPILE_TEST
depends on KASAN_GENERIC || KASAN_SW_TAGS
+   depends on !ARCH_DISABLE_KASAN_INLINE
default y if CC_IS_GCC
help
  The LLVM stack address sanitizer has a know problem that
@@ -154,6 +165,9 @@ config KASAN_STACK
  but clang users can still enable it for builds without
  CONFIG_COMPILE_TEST.  On gcc it is assumed to always be safe
  to use and enabled by default.
+ If the architecture disables inline instrumentation, this is
+ also disabled as it adds inline-style instrumentation that
+ is run unconditionally.
 
 config KASAN_SW_TAGS_IDENTIFY
bool "Enable memory corruption identification"
-- 
2.30.2



[PATCH v14 0/4] KASAN core changes for ppc64 radix KASAN

2021-06-17 Thread Daniel Axtens
Building on the work of Christophe, Aneesh and Balbir, I've ported
KASAN to 64-bit Book3S kernels running on the Radix MMU. I've been
trying this for a while, but we keep having collisions between the
kasan code in the mm tree and the code I want to put in to the ppc
tree.

This series just contains the kasan core changes that we need. These
can go in via the mm tree. I will then propose the powerpc changes for
a later cycle. (The most recent RFC for the powerpc changes is in the
v12 series at
https://lore.kernel.org/linux-mm/20210615014705.2234866-1-...@axtens.net/
)

v14 applies to next-20210611. There should be no noticeable changes to
other platforms.

Changes since v13: move the MAX_PTRS_PER_* definitions out of kasan and
into pgtable.h. Add a build-time error to hopefully prevent any
confusion about when the new hook is applicable. Thanks Marco and
Christophe.

Changes since v12: respond to Marco's review comments - clean up the
help for ARCH_DISABLE_KASAN_INLINE, and add an arch readiness check to
the new granule poisoning function. Thanks Marco.

Daniel Axtens (4):
  kasan: allow an architecture to disable inline instrumentation
  kasan: allow architectures to provide an outline readiness check
  mm: define default MAX_PTRS_PER_* in include/pgtable.h
  kasan: use MAX_PTRS_PER_* for early shadow tables


Re: [PATCH v2 0/4] Add perf interface to expose nvdimm

2021-06-17 Thread kajoljain



On 6/16/21 4:25 PM, Nageswara Sastry wrote:
> 
> 
>> On 14-Jun-2021, at 10:53 AM, Kajol Jain  wrote:
>>
>> Patchset adds performance stats reporting support for nvdimm.
>> Added interface includes support for pmu register/unregister
>> functions. A structure is added called nvdimm_pmu to be used for
>> adding arch/platform specific data such as supported events, cpumask
>> pmu event functions like event_init/add/read/del.
>> User could use the standard perf tool to access perf
>> events exposed via pmu.
>>
>> Added implementation to expose IBM pseries platform nmem*
>> device performance stats using this interface.
>> ...
>>
>> Patch1:
>>Introduces the nvdimm_pmu structure
>> Patch2:
>>  Adds common interface to add arch/platform specific data
>>  includes supported events, pmu event functions. It also
>>  adds code for cpu hotplug support.
>> Patch3:
>>Add code in arch/powerpc/platform/pseries/papr_scm.c to expose
>>nmem* pmu. It fills in the nvdimm_pmu structure with event attrs
>>cpumask andevent functions and then registers the pmu by adding
>>callbacks to register_nvdimm_pmu.
>> Patch4:
>>Sysfs documentation patch
> 
> Tested with the following scenarios:
> 1. Check dmesg for nmem PMU registered messages.
> 2. Listed nmem events using 'perf list and perf list nmem'
> 3. Ran 'perf stat' with single event, grouping events, events from same pmu,
>different pmu and invalid events
> 4. Read from sysfs files, Writing in to sysfs files
> 5. While running nmem events with perf stat, offline cpu from the 
> nmem?/cpumask
> 
> While running the above functionality worked as expected, no error messages 
> seen
> in dmesg.
> 
> Tested-by: Nageswara R Sastry 

Hi Nageswara,
 Thanks for testing the patch-set.
There is a nit change which need to be done in patch 4(Documentation patch).
We need to update nvdimm mailing list from linux-nvd...@lists.01.org to
nvd...@lists.linux.dev.
I will make this change and send a new patch-set with your tested-by tag.

Thanks,
Kajol Jain

> 
>>
>> Changelog
>> ---
>> PATCH v1 -> PATCH v2
>> - Fix hotplug code by adding pmu migration call
>>  incase current designated cpu got offline. As
>>  pointed by Peter Zijlstra.
>>
>> - Removed the retun -1 part from cpu hotplug offline
>>  function.
>>
>> - Link to the previous patchset : https://lkml.org/lkml/2021/6/8/500
>> ---
>> Kajol Jain (4):
>>  drivers/nvdimm: Add nvdimm pmu structure
>>  drivers/nvdimm: Add perf interface to expose nvdimm performance stats
>>  powerpc/papr_scm: Add perf interface support
>>  powerpc/papr_scm: Document papr_scm sysfs event format entries
>>
>> Documentation/ABI/testing/sysfs-bus-papr-pmem |  31 ++
>> arch/powerpc/include/asm/device.h |   5 +
>> arch/powerpc/platforms/pseries/papr_scm.c | 365 ++
>> drivers/nvdimm/Makefile   |   1 +
>> drivers/nvdimm/nd_perf.c  | 230 +++
>> include/linux/nd.h|  46 +++
>> 6 files changed, 678 insertions(+)
>> create mode 100644 drivers/nvdimm/nd_perf.c
>>
> Thanks and Regards,
> R.Nageswara Sastry
> 
>>
> 


Re: [PATCH v13 00/12] Restricted DMA

2021-06-17 Thread Claire Chang
v13: https://lore.kernel.org/patchwork/cover/1448001/


[PATCH v13 12/12] of: Add plumbing for restricted DMA pool

2021-06-17 Thread Claire Chang
If a device is not behind an IOMMU, look up its device node and set up
the restricted DMA when a restricted-dma-pool reserved-memory region is
present.

Signed-off-by: Claire Chang 
Tested-by: Stefano Stabellini 
Tested-by: Will Deacon 
---
 drivers/of/address.c| 33 +
 drivers/of/device.c |  3 +++
 drivers/of/of_private.h |  6 ++
 3 files changed, 42 insertions(+)

diff --git a/drivers/of/address.c b/drivers/of/address.c
index 73ddf2540f3f..cdf700fba5c4 100644
--- a/drivers/of/address.c
+++ b/drivers/of/address.c
@@ -8,6 +8,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -1022,6 +1023,38 @@ int of_dma_get_range(struct device_node *np, const 
struct bus_dma_region **map)
of_node_put(node);
return ret;
 }
+
+int of_dma_set_restricted_buffer(struct device *dev, struct device_node *np)
+{
+   struct device_node *node, *of_node = dev->of_node;
+   int count, i;
+
+   count = of_property_count_elems_of_size(of_node, "memory-region",
+   sizeof(u32));
+   /*
+* If dev->of_node doesn't exist or doesn't contain memory-region, try
+* the OF node having DMA configuration.
+*/
+   if (count <= 0) {
+   of_node = np;
+   count = of_property_count_elems_of_size(
+   of_node, "memory-region", sizeof(u32));
+   }
+
+   for (i = 0; i < count; i++) {
+   node = of_parse_phandle(of_node, "memory-region", i);
+   /*
+* There might be multiple memory regions, but only one
+* restricted-dma-pool region is allowed.
+*/
+   if (of_device_is_compatible(node, "restricted-dma-pool") &&
+   of_device_is_available(node))
+   return of_reserved_mem_device_init_by_idx(dev, of_node,
+ i);
+   }
+
+   return 0;
+}
 #endif /* CONFIG_HAS_DMA */
 
 /**
diff --git a/drivers/of/device.c b/drivers/of/device.c
index 6cb86de404f1..e68316836a7a 100644
--- a/drivers/of/device.c
+++ b/drivers/of/device.c
@@ -165,6 +165,9 @@ int of_dma_configure_id(struct device *dev, struct 
device_node *np,
 
arch_setup_dma_ops(dev, dma_start, size, iommu, coherent);
 
+   if (!iommu)
+   return of_dma_set_restricted_buffer(dev, np);
+
return 0;
 }
 EXPORT_SYMBOL_GPL(of_dma_configure_id);
diff --git a/drivers/of/of_private.h b/drivers/of/of_private.h
index d9e6a324de0a..25cebbed5f02 100644
--- a/drivers/of/of_private.h
+++ b/drivers/of/of_private.h
@@ -161,12 +161,18 @@ struct bus_dma_region;
 #if defined(CONFIG_OF_ADDRESS) && defined(CONFIG_HAS_DMA)
 int of_dma_get_range(struct device_node *np,
const struct bus_dma_region **map);
+int of_dma_set_restricted_buffer(struct device *dev, struct device_node *np);
 #else
 static inline int of_dma_get_range(struct device_node *np,
const struct bus_dma_region **map)
 {
return -ENODEV;
 }
+static inline int of_dma_set_restricted_buffer(struct device *dev,
+  struct device_node *np)
+{
+   return -ENODEV;
+}
 #endif
 
 #endif /* _LINUX_OF_PRIVATE_H */
-- 
2.32.0.288.g62a8d224e6-goog



[PATCH v13 11/12] dt-bindings: of: Add restricted DMA pool

2021-06-17 Thread Claire Chang
Introduce a new compatible string, restricted-dma-pool, for restricted
DMA. The address and length of the restricted DMA memory region can be
specified via a restricted-dma-pool child node of the reserved-memory
node.

Signed-off-by: Claire Chang 
Tested-by: Stefano Stabellini 
Tested-by: Will Deacon 
---
 .../reserved-memory/reserved-memory.txt   | 36 +--
 1 file changed, 33 insertions(+), 3 deletions(-)

diff --git 
a/Documentation/devicetree/bindings/reserved-memory/reserved-memory.txt 
b/Documentation/devicetree/bindings/reserved-memory/reserved-memory.txt
index e8d3096d922c..39b5f4c5a511 100644
--- a/Documentation/devicetree/bindings/reserved-memory/reserved-memory.txt
+++ b/Documentation/devicetree/bindings/reserved-memory/reserved-memory.txt
@@ -51,6 +51,23 @@ compatible (optional) - standard definition
   used as a shared pool of DMA buffers for a set of devices. It can
   be used by an operating system to instantiate the necessary pool
   management subsystem if necessary.
+- restricted-dma-pool: This indicates a region of memory meant to be
+  used as a pool of restricted DMA buffers for a set of devices. The
+  memory region would be the only region accessible to those devices.
+  When using this, the no-map and reusable properties must not be set,
+  so the operating system can create a virtual mapping that will be 
used
+  for synchronization. The main purpose for restricted DMA is to
+  mitigate the lack of DMA access control on systems without an IOMMU,
+  which could result in the DMA accessing the system memory at
+  unexpected times and/or unexpected addresses, possibly leading to 
data
+  leakage or corruption. The feature on its own provides a basic level
+  of protection against the DMA overwriting buffer contents at
+  unexpected times. However, to protect against general data leakage 
and
+  system memory corruption, the system needs to provide way to lock 
down
+  the memory access, e.g., MPU. Note that since coherent allocation
+  needs remapping, one must set up another device coherent pool by
+  shared-dma-pool and use dma_alloc_from_dev_coherent instead for 
atomic
+  coherent allocation.
 - vendor specific string in the form ,[-]
 no-map (optional) - empty property
 - Indicates the operating system must not create a virtual mapping
@@ -85,10 +102,11 @@ memory-region-names (optional) - a list of names, one for 
each corresponding
 
 Example
 ---
-This example defines 3 contiguous regions are defined for Linux kernel:
+This example defines 4 contiguous regions for Linux kernel:
 one default of all device drivers (named linux,cma@72000000 and 64MiB in size),
-one dedicated to the framebuffer device (named framebuffer@78000000, 8MiB), and
-one for multimedia processing (named multimedia-memory@77000000, 64MiB).
+one dedicated to the framebuffer device (named framebuffer@78000000, 8MiB),
+one for multimedia processing (named multimedia-memory@77000000, 64MiB), and
+one for restricted dma pool (named restricted_dma_reserved@0x50000000, 64MiB).
 
 / {
#address-cells = <1>;
@@ -120,6 +138,11 @@ one for multimedia processing (named 
multimedia-memory@7700, 64MiB).
compatible = "acme,multimedia-memory";
		reg = <0x77000000 0x4000000>;
};
+
+   restricted_dma_reserved: restricted_dma_reserved {
+   compatible = "restricted-dma-pool";
+		reg = <0x50000000 0x4000000>;
+   };
};
 
/* ... */
@@ -138,4 +161,11 @@ one for multimedia processing (named 
multimedia-memory@7700, 64MiB).
		memory-region = <&framebuffer_reserved>;
/* ... */
};
+
+   pcie_device: pcie_device@0,0 {
+		reg = <0x83010000 0x0 0x00000000 0x0 0x00100000
+		       0x83010000 0x0 0x00100000 0x0 0x00100000>;
+		memory-region = <&restricted_dma_reserved>;
+   /* ... */
+   };
 };
-- 
2.32.0.288.g62a8d224e6-goog



[PATCH v13 10/12] swiotlb: Add restricted DMA pool initialization

2021-06-17 Thread Claire Chang
Add the initialization function to create restricted DMA pools from
matching reserved-memory nodes.

Regardless of swiotlb setting, the restricted DMA pool is preferred if
available.

The restricted DMA pools provide a basic level of protection against the
DMA overwriting buffer contents at unexpected times. However, to protect
against general data leakage and system memory corruption, the system
needs to provide a way to lock down the memory access, e.g., MPU.

Signed-off-by: Claire Chang 
Reviewed-by: Christoph Hellwig 
Tested-by: Stefano Stabellini 
Tested-by: Will Deacon 
---
 include/linux/swiotlb.h |  3 +-
 kernel/dma/Kconfig  | 14 
 kernel/dma/swiotlb.c| 76 +
 3 files changed, 92 insertions(+), 1 deletion(-)

diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
index a73fad460162..175b6c113ed8 100644
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -73,7 +73,8 @@ extern enum swiotlb_force swiotlb_force;
  * range check to see if the memory was in fact allocated by this
  * API.
  * @nslabs:The number of IO TLB blocks (in groups of 64) between @start and
- * @end. This is command line adjustable via setup_io_tlb_npages.
+ * @end. For default swiotlb, this is command line adjustable via
+ * setup_io_tlb_npages.
  * @used:  The number of used IO TLB block.
  * @list:  The free list describing the number of free entries available
  * from each index.
diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig
index 77b405508743..3e961dc39634 100644
--- a/kernel/dma/Kconfig
+++ b/kernel/dma/Kconfig
@@ -80,6 +80,20 @@ config SWIOTLB
bool
select NEED_DMA_MAP_STATE
 
+config DMA_RESTRICTED_POOL
+   bool "DMA Restricted Pool"
+   depends on OF && OF_RESERVED_MEM
+   select SWIOTLB
+   help
+ This enables support for restricted DMA pools which provide a level of
+ DMA memory protection on systems with limited hardware protection
+ capabilities, such as those lacking an IOMMU.
+
+	  For more information see
+	  <Documentation/devicetree/bindings/reserved-memory/reserved-memory.txt>
+	  and <kernel/dma/swiotlb.c>.
+	  If unsure, say "n".
+
 #
 # Should be selected if we can mmap non-coherent mappings to userspace.
 # The only thing that is really required is a way to set an uncached bit
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 6499cfbfe95f..d4099f03b2f0 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -39,6 +39,13 @@
 #ifdef CONFIG_DEBUG_FS
 #include 
 #endif
+#ifdef CONFIG_DMA_RESTRICTED_POOL
+#include 
+#include 
+#include 
+#include 
+#include 
+#endif
 
 #include 
 #include 
@@ -736,4 +743,73 @@ bool swiotlb_free(struct device *dev, struct page *page, 
size_t size)
return true;
 }
 
+static int rmem_swiotlb_device_init(struct reserved_mem *rmem,
+   struct device *dev)
+{
+   struct io_tlb_mem *mem = rmem->priv;
+   unsigned long nslabs = rmem->size >> IO_TLB_SHIFT;
+
+   /*
+* Since multiple devices can share the same pool, the private data,
+* io_tlb_mem struct, will be initialized by the first device attached
+* to it.
+*/
+   if (!mem) {
+   mem = kzalloc(struct_size(mem, slots, nslabs), GFP_KERNEL);
+   if (!mem)
+   return -ENOMEM;
+
+   swiotlb_init_io_tlb_mem(mem, rmem->base, nslabs, false);
+   mem->force_bounce = true;
+   mem->for_alloc = true;
+   set_memory_decrypted((unsigned long)phys_to_virt(rmem->base),
+rmem->size >> PAGE_SHIFT);
+
+   rmem->priv = mem;
+
+   if (IS_ENABLED(CONFIG_DEBUG_FS)) {
+   mem->debugfs =
+   debugfs_create_dir(rmem->name, debugfs_dir);
+   swiotlb_create_debugfs_files(mem);
+   }
+   }
+
+   dev->dma_io_tlb_mem = mem;
+
+   return 0;
+}
+
+static void rmem_swiotlb_device_release(struct reserved_mem *rmem,
+   struct device *dev)
+{
+   dev->dma_io_tlb_mem = io_tlb_default_mem;
+}
+
+static const struct reserved_mem_ops rmem_swiotlb_ops = {
+   .device_init = rmem_swiotlb_device_init,
+   .device_release = rmem_swiotlb_device_release,
+};
+
+static int __init rmem_swiotlb_setup(struct reserved_mem *rmem)
+{
+   unsigned long node = rmem->fdt_node;
+
+   if (of_get_flat_dt_prop(node, "reusable", NULL) ||
+   of_get_flat_dt_prop(node, "linux,cma-default", NULL) ||
+   of_get_flat_dt_prop(node, "linux,dma-default", NULL) ||
+   of_get_flat_dt_prop(node, "no-map", NULL))
+   return -EINVAL;
+
	if (PageHighMem(pfn_to_page(PHYS_PFN(rmem->base)))) {
+   pr_err("Restricted DMA pool must be accessible within the 
linear mapping.");
+   return 

[PATCH v13 09/12] swiotlb: Add restricted DMA alloc/free support

2021-06-17 Thread Claire Chang
Add the functions swiotlb_{alloc,free} and is_swiotlb_for_alloc to
support memory allocation from the restricted DMA pool.

The restricted DMA pool is preferred if available.

Note that since coherent allocation needs remapping, one must set up
another device coherent pool by shared-dma-pool and use
dma_alloc_from_dev_coherent instead for atomic coherent allocation.
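
From a driver's point of view nothing new needs to be called; a rough
usage sketch (the device is assumed to be attached to a
restricted-dma-pool region, and the size is arbitrary):

	#include <linux/dma-mapping.h>
	#include <linux/sizes.h>

	static int example_alloc(struct device *dev)
	{
		dma_addr_t dma;
		struct page *page;

		/* dma-direct routes this through __dma_direct_alloc_pages(),
		 * which now prefers swiotlb_alloc() when
		 * is_swiotlb_for_alloc(dev) is true. */
		page = dma_alloc_pages(dev, SZ_64K, &dma, DMA_BIDIRECTIONAL,
				       GFP_KERNEL);
		if (!page)
			return -ENOMEM;

		/* ... DMA to/from page; the device only ever sees the pool ... */

		dma_free_pages(dev, SZ_64K, page, dma, DMA_BIDIRECTIONAL);
		return 0;
	}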

Signed-off-by: Claire Chang 
Reviewed-by: Christoph Hellwig 
Tested-by: Stefano Stabellini 
Tested-by: Will Deacon 
---
 include/linux/swiotlb.h | 26 ++
 kernel/dma/direct.c | 49 +++--
 kernel/dma/swiotlb.c| 38 ++--
 3 files changed, 99 insertions(+), 14 deletions(-)

diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
index 8d8855c77d9a..a73fad460162 100644
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -85,6 +85,7 @@ extern enum swiotlb_force swiotlb_force;
  * @debugfs:   The dentry to debugfs.
  * @late_alloc:%true if allocated using the page allocator
  * @force_bounce: %true if swiotlb bouncing is forced
+ * @for_alloc:  %true if the pool is used for memory allocation
  */
 struct io_tlb_mem {
phys_addr_t start;
@@ -96,6 +97,7 @@ struct io_tlb_mem {
struct dentry *debugfs;
bool late_alloc;
bool force_bounce;
+   bool for_alloc;
struct io_tlb_slot {
phys_addr_t orig_addr;
size_t alloc_size;
@@ -156,4 +158,28 @@ static inline void swiotlb_adjust_size(unsigned long size)
 extern void swiotlb_print_info(void);
 extern void swiotlb_set_max_segment(unsigned int);
 
+#ifdef CONFIG_DMA_RESTRICTED_POOL
+struct page *swiotlb_alloc(struct device *dev, size_t size);
+bool swiotlb_free(struct device *dev, struct page *page, size_t size);
+
+static inline bool is_swiotlb_for_alloc(struct device *dev)
+{
+   return dev->dma_io_tlb_mem->for_alloc;
+}
+#else
+static inline struct page *swiotlb_alloc(struct device *dev, size_t size)
+{
+   return NULL;
+}
+static inline bool swiotlb_free(struct device *dev, struct page *page,
+   size_t size)
+{
+   return false;
+}
+static inline bool is_swiotlb_for_alloc(struct device *dev)
+{
+   return false;
+}
+#endif /* CONFIG_DMA_RESTRICTED_POOL */
+
 #endif /* __LINUX_SWIOTLB_H */
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index a92465b4eb12..2de33e5d302b 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -75,6 +75,15 @@ static bool dma_coherent_ok(struct device *dev, phys_addr_t 
phys, size_t size)
min_not_zero(dev->coherent_dma_mask, dev->bus_dma_limit);
 }
 
+static void __dma_direct_free_pages(struct device *dev, struct page *page,
+   size_t size)
+{
+   if (IS_ENABLED(CONFIG_DMA_RESTRICTED_POOL) &&
+   swiotlb_free(dev, page, size))
+   return;
+   dma_free_contiguous(dev, page, size);
+}
+
 static struct page *__dma_direct_alloc_pages(struct device *dev, size_t size,
gfp_t gfp)
 {
@@ -86,6 +95,16 @@ static struct page *__dma_direct_alloc_pages(struct device 
*dev, size_t size,
 
gfp |= dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask,
					   &phys_limit);
+   if (IS_ENABLED(CONFIG_DMA_RESTRICTED_POOL) &&
+   is_swiotlb_for_alloc(dev)) {
+   page = swiotlb_alloc(dev, size);
+   if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) {
+   __dma_direct_free_pages(dev, page, size);
+   return NULL;
+   }
+   return page;
+   }
+
page = dma_alloc_contiguous(dev, size, gfp);
if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) {
dma_free_contiguous(dev, page, size);
@@ -142,7 +161,7 @@ void *dma_direct_alloc(struct device *dev, size_t size,
gfp |= __GFP_NOWARN;
 
if ((attrs & DMA_ATTR_NO_KERNEL_MAPPING) &&
-   !force_dma_unencrypted(dev)) {
+   !force_dma_unencrypted(dev) && !is_swiotlb_for_alloc(dev)) {
page = __dma_direct_alloc_pages(dev, size, gfp & ~__GFP_ZERO);
if (!page)
return NULL;
@@ -155,18 +174,23 @@ void *dma_direct_alloc(struct device *dev, size_t size,
}
 
if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED) &&
-   !IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) &&
-   !dev_is_dma_coherent(dev))
+   !IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && !dev_is_dma_coherent(dev) &&
+   !is_swiotlb_for_alloc(dev))
return arch_dma_alloc(dev, size, dma_handle, gfp, attrs);
 
/*
 * Remapping or decrypting memory may block. If either is required and
 * we can't block, allocate the memory from the atomic pools.
+* If restricted DMA (i.e., is_swiotlb_for_alloc) is required, one must
+* 

[PATCH v13 08/12] swiotlb: Refactor swiotlb_tbl_unmap_single

2021-06-17 Thread Claire Chang
Add a new function, swiotlb_release_slots, to make the code reusable for
supporting different bounce buffer pools.

Signed-off-by: Claire Chang 
Reviewed-by: Christoph Hellwig 
Tested-by: Stefano Stabellini 
Tested-by: Will Deacon 
---
 kernel/dma/swiotlb.c | 35 ---
 1 file changed, 20 insertions(+), 15 deletions(-)

diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 89049d021d0d..ff09341bb9f5 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -556,27 +556,15 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, 
phys_addr_t orig_addr,
return tlb_addr;
 }
 
-/*
- * tlb_addr is the physical address of the bounce buffer to unmap.
- */
-void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr,
- size_t mapping_size, enum dma_data_direction dir,
- unsigned long attrs)
+static void swiotlb_release_slots(struct device *dev, phys_addr_t tlb_addr)
 {
-   struct io_tlb_mem *mem = hwdev->dma_io_tlb_mem;
+   struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
unsigned long flags;
-   unsigned int offset = swiotlb_align_offset(hwdev, tlb_addr);
+   unsigned int offset = swiotlb_align_offset(dev, tlb_addr);
int index = (tlb_addr - offset - mem->start) >> IO_TLB_SHIFT;
int nslots = nr_slots(mem->slots[index].alloc_size + offset);
int count, i;
 
-   /*
-* First, sync the memory before unmapping the entry
-*/
-   if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
-   (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL))
-   swiotlb_bounce(hwdev, tlb_addr, mapping_size, DMA_FROM_DEVICE);
-
/*
 * Return the buffer to the free list by setting the corresponding
 * entries to indicate the number of contiguous entries available.
@@ -611,6 +599,23 @@ void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr,
spin_unlock_irqrestore(&mem->lock, flags);
 }
 
+/*
+ * tlb_addr is the physical address of the bounce buffer to unmap.
+ */
+void swiotlb_tbl_unmap_single(struct device *dev, phys_addr_t tlb_addr,
+ size_t mapping_size, enum dma_data_direction dir,
+ unsigned long attrs)
+{
+   /*
+* First, sync the memory before unmapping the entry
+*/
+   if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
+   (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL))
+   swiotlb_bounce(dev, tlb_addr, mapping_size, DMA_FROM_DEVICE);
+
+   swiotlb_release_slots(dev, tlb_addr);
+}
+
 void swiotlb_sync_single_for_device(struct device *dev, phys_addr_t tlb_addr,
size_t size, enum dma_data_direction dir)
 {
-- 
2.32.0.288.g62a8d224e6-goog



[PATCH v13 07/12] swiotlb: Move alloc_size to swiotlb_find_slots

2021-06-17 Thread Claire Chang
Rename find_slots to swiotlb_find_slots and move the maintenance of
alloc_size to it for better code reusability later.
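
With the bookkeeping inside swiotlb_find_slots, a caller other than
swiotlb_tbl_map_single gets correct per-slot sizes for free. A
simplified sketch of the swiotlb_alloc helper added later in this
series:

  struct page *swiotlb_alloc(struct device *dev, size_t size)
  {
          struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
          phys_addr_t tlb_addr;
          int index;

          /* swiotlb_find_slots() now records alloc_size in each slot. */
          index = swiotlb_find_slots(dev, 0, size);
          if (index == -1)
                  return NULL;

          tlb_addr = slot_addr(mem->start, index);
          return pfn_to_page(PFN_DOWN(tlb_addr));
  }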

Signed-off-by: Claire Chang 
Reviewed-by: Christoph Hellwig 
Tested-by: Stefano Stabellini 
Tested-by: Will Deacon 
---
 kernel/dma/swiotlb.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 13891d5de8c9..89049d021d0d 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -432,8 +432,8 @@ static unsigned int wrap_index(struct io_tlb_mem *mem, unsigned int index)
  * Find a suitable number of IO TLB entries size that will fit this request and
  * allocate a buffer from that IO TLB pool.
  */
-static int find_slots(struct device *dev, phys_addr_t orig_addr,
-   size_t alloc_size)
+static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr,
+ size_t alloc_size)
 {
struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
unsigned long boundary_mask = dma_get_seg_boundary(dev);
@@ -488,8 +488,11 @@ static int find_slots(struct device *dev, phys_addr_t orig_addr,
return -1;
 
 found:
-   for (i = index; i < index + nslots; i++)
+   for (i = index; i < index + nslots; i++) {
mem->slots[i].list = 0;
+   mem->slots[i].alloc_size =
+   alloc_size - ((i - index) << IO_TLB_SHIFT);
+   }
for (i = index - 1;
 io_tlb_offset(i) != IO_TLB_SEGSIZE - 1 &&
 mem->slots[i].list; i--)
@@ -530,7 +533,7 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
return (phys_addr_t)DMA_MAPPING_ERROR;
}
 
-   index = find_slots(dev, orig_addr, alloc_size + offset);
+   index = swiotlb_find_slots(dev, orig_addr, alloc_size + offset);
if (index == -1) {
if (!(attrs & DMA_ATTR_NO_WARN))
dev_warn_ratelimited(dev,
@@ -544,11 +547,8 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
 * This is needed when we sync the memory.  Then we sync the buffer if
 * needed.
 */
-   for (i = 0; i < nr_slots(alloc_size + offset); i++) {
+   for (i = 0; i < nr_slots(alloc_size + offset); i++)
mem->slots[index + i].orig_addr = slot_addr(orig_addr, i);
-   mem->slots[index + i].alloc_size =
-   alloc_size - (i << IO_TLB_SHIFT);
-   }
tlb_addr = slot_addr(mem->start, index) + offset;
if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
(dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL))
-- 
2.32.0.288.g62a8d224e6-goog



[PATCH v13 06/12] swiotlb: Use is_swiotlb_force_bounce for swiotlb data bouncing

2021-06-17 Thread Claire Chang
Propagate the swiotlb_force setting into io_tlb_default_mem->force_bounce and
use it to determine whether to bounce the data or not. This will be
useful later to allow for different pools.
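
Keeping the flag in struct io_tlb_mem lets the bounce decision differ
per pool instead of being a single global switch. A restricted pool
added later in the series simply sets it when the pool is created
(hypothetical sketch):

  static void restricted_pool_setup(struct io_tlb_mem *mem)
  {
          /* Every mapping through a restricted pool must bounce. */
          mem->force_bounce = true;
  }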

Signed-off-by: Claire Chang 
Reviewed-by: Christoph Hellwig 
Tested-by: Stefano Stabellini 
Tested-by: Will Deacon 
---
 drivers/xen/swiotlb-xen.c |  2 +-
 include/linux/swiotlb.h   | 11 +++++++++++
 kernel/dma/direct.c   |  2 +-
 kernel/dma/direct.h   |  2 +-
 kernel/dma/swiotlb.c  |  4 ++++
 5 files changed, 18 insertions(+), 3 deletions(-)

diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c
index 0c6ed09f8513..4730a146fa35 100644
--- a/drivers/xen/swiotlb-xen.c
+++ b/drivers/xen/swiotlb-xen.c
@@ -369,7 +369,7 @@ static dma_addr_t xen_swiotlb_map_page(struct device *dev, struct page *page,
if (dma_capable(dev, dev_addr, size, true) &&
!range_straddles_page_boundary(phys, size) &&
!xen_arch_need_swiotlb(dev, phys, dev_addr) &&
-   swiotlb_force != SWIOTLB_FORCE)
+   !is_swiotlb_force_bounce(dev))
goto done;
 
/*
diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
index dd1c30a83058..8d8855c77d9a 100644
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -84,6 +84,7 @@ extern enum swiotlb_force swiotlb_force;
  * unmap calls.
  * @debugfs:   The dentry to debugfs.
  * @late_alloc:%true if allocated using the page allocator
+ * @force_bounce: %true if swiotlb bouncing is forced
  */
 struct io_tlb_mem {
phys_addr_t start;
@@ -94,6 +95,7 @@ struct io_tlb_mem {
spinlock_t lock;
struct dentry *debugfs;
bool late_alloc;
+   bool force_bounce;
struct io_tlb_slot {
phys_addr_t orig_addr;
size_t alloc_size;
@@ -109,6 +111,11 @@ static inline bool is_swiotlb_buffer(struct device *dev, phys_addr_t paddr)
return mem && paddr >= mem->start && paddr < mem->end;
 }
 
+static inline bool is_swiotlb_force_bounce(struct device *dev)
+{
+   return dev->dma_io_tlb_mem->force_bounce;
+}
+
 void __init swiotlb_exit(void);
 unsigned int swiotlb_max_segment(void);
 size_t swiotlb_max_mapping_size(struct device *dev);
@@ -120,6 +127,10 @@ static inline bool is_swiotlb_buffer(struct device *dev, phys_addr_t paddr)
 {
return false;
 }
+static inline bool is_swiotlb_force_bounce(struct device *dev)
+{
+   return false;
+}
 static inline void swiotlb_exit(void)
 {
 }
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 7a88c34d0867..a92465b4eb12 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -496,7 +496,7 @@ size_t dma_direct_max_mapping_size(struct device *dev)
 {
/* If SWIOTLB is active, use its maximum mapping size */
if (is_swiotlb_active(dev) &&
-   (dma_addressing_limited(dev) || swiotlb_force == SWIOTLB_FORCE))
+   (dma_addressing_limited(dev) || is_swiotlb_force_bounce(dev)))
return swiotlb_max_mapping_size(dev);
return SIZE_MAX;
 }
diff --git a/kernel/dma/direct.h b/kernel/dma/direct.h
index 13e9e7158d94..4632b0f4f72e 100644
--- a/kernel/dma/direct.h
+++ b/kernel/dma/direct.h
@@ -87,7 +87,7 @@ static inline dma_addr_t dma_direct_map_page(struct device *dev,
phys_addr_t phys = page_to_phys(page) + offset;
dma_addr_t dma_addr = phys_to_dma(dev, phys);
 
-   if (unlikely(swiotlb_force == SWIOTLB_FORCE))
+   if (is_swiotlb_force_bounce(dev))
return swiotlb_map(dev, phys, size, dir, attrs);
 
if (unlikely(!dma_capable(dev, dma_addr, size, true))) {
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 409694d7a8ad..13891d5de8c9 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -179,6 +179,10 @@ static void swiotlb_init_io_tlb_mem(struct io_tlb_mem *mem, phys_addr_t start,
mem->end = mem->start + bytes;
mem->index = 0;
mem->late_alloc = late_alloc;
+
+   if (swiotlb_force == SWIOTLB_FORCE)
+   mem->force_bounce = true;
+
spin_lock_init(&mem->lock);
for (i = 0; i < mem->nslabs; i++) {
mem->slots[i].list = IO_TLB_SEGSIZE - io_tlb_offset(i);
-- 
2.32.0.288.g62a8d224e6-goog



[PATCH v13 05/12] swiotlb: Update is_swiotlb_active to add a struct device argument

2021-06-17 Thread Claire Chang
Update is_swiotlb_active to add a struct device argument. This will be
useful later to allow for different pools.
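
The query thereby becomes a per-device question; a device pointed at
its own restricted pool will report active even when no global swiotlb
exists. An illustrative helper (dev_needs_bounce_buffering is a
hypothetical name, not from this series):

  static bool dev_needs_bounce_buffering(struct device *dev)
  {
          /* "Active" now means this device has a swiotlb pool. */
          return is_swiotlb_active(dev) && dma_addressing_limited(dev);
  }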

Signed-off-by: Claire Chang 
Reviewed-by: Christoph Hellwig 
Tested-by: Stefano Stabellini 
Tested-by: Will Deacon 
---
 drivers/gpu/drm/i915/gem/i915_gem_internal.c | 2 +-
 drivers/gpu/drm/nouveau/nouveau_ttm.c| 2 +-
 drivers/pci/xen-pcifront.c   | 2 +-
 include/linux/swiotlb.h  | 4 ++--
 kernel/dma/direct.c  | 2 +-
 kernel/dma/swiotlb.c | 4 ++--
 6 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_internal.c b/drivers/gpu/drm/i915/gem/i915_gem_internal.c
index a9d65fc8aa0e..4b7afa0fc85d 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_internal.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_internal.c
@@ -42,7 +42,7 @@ static int i915_gem_object_get_pages_internal(struct drm_i915_gem_object *obj)
 
max_order = MAX_ORDER;
 #ifdef CONFIG_SWIOTLB
-   if (is_swiotlb_active()) {
+   if (is_swiotlb_active(obj->base.dev->dev)) {
unsigned int max_segment;
 
max_segment = swiotlb_max_segment();
diff --git a/drivers/gpu/drm/nouveau/nouveau_ttm.c b/drivers/gpu/drm/nouveau/nouveau_ttm.c
index 9662522aa066..be15bfd9e0ee 100644
--- a/drivers/gpu/drm/nouveau/nouveau_ttm.c
+++ b/drivers/gpu/drm/nouveau/nouveau_ttm.c
@@ -321,7 +321,7 @@ nouveau_ttm_init(struct nouveau_drm *drm)
}
 
 #if IS_ENABLED(CONFIG_SWIOTLB) && IS_ENABLED(CONFIG_X86)
-   need_swiotlb = is_swiotlb_active();
+   need_swiotlb = is_swiotlb_active(dev->dev);
 #endif
 
ret = ttm_bo_device_init(&drm->ttm.bdev, &nouveau_bo_driver,
diff --git a/drivers/pci/xen-pcifront.c b/drivers/pci/xen-pcifront.c
index b7a8f3a1921f..0d56985bfe81 100644
--- a/drivers/pci/xen-pcifront.c
+++ b/drivers/pci/xen-pcifront.c
@@ -693,7 +693,7 @@ static int pcifront_connect_and_init_dma(struct pcifront_device *pdev)
 
spin_unlock(&pcifront_dev_lock);
 
-   if (!err && !is_swiotlb_active()) {
+   if (!err && !is_swiotlb_active(&pdev->xdev->dev)) {
err = pci_xen_swiotlb_init_late();
if (err)
dev_err(&pdev->xdev->dev, "Could not setup SWIOTLB!\n");
diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
index d1f3d95881cd..dd1c30a83058 100644
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -112,7 +112,7 @@ static inline bool is_swiotlb_buffer(struct device *dev, phys_addr_t paddr)
 void __init swiotlb_exit(void);
 unsigned int swiotlb_max_segment(void);
 size_t swiotlb_max_mapping_size(struct device *dev);
-bool is_swiotlb_active(void);
+bool is_swiotlb_active(struct device *dev);
 void __init swiotlb_adjust_size(unsigned long size);
 #else
 #define swiotlb_force SWIOTLB_NO_FORCE
@@ -132,7 +132,7 @@ static inline size_t swiotlb_max_mapping_size(struct device *dev)
return SIZE_MAX;
 }
 
-static inline bool is_swiotlb_active(void)
+static inline bool is_swiotlb_active(struct device *dev)
 {
return false;
 }
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 84c9feb5474a..7a88c34d0867 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -495,7 +495,7 @@ int dma_direct_supported(struct device *dev, u64 mask)
 size_t dma_direct_max_mapping_size(struct device *dev)
 {
/* If SWIOTLB is active, use its maximum mapping size */
-   if (is_swiotlb_active() &&
+   if (is_swiotlb_active(dev) &&
(dma_addressing_limited(dev) || swiotlb_force == SWIOTLB_FORCE))
return swiotlb_max_mapping_size(dev);
return SIZE_MAX;
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index de79e9437030..409694d7a8ad 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -664,9 +664,9 @@ size_t swiotlb_max_mapping_size(struct device *dev)
return ((size_t)IO_TLB_SIZE) * IO_TLB_SEGSIZE;
 }
 
-bool is_swiotlb_active(void)
+bool is_swiotlb_active(struct device *dev)
 {
-   return io_tlb_default_mem != NULL;
+   return dev->dma_io_tlb_mem != NULL;
 }
 EXPORT_SYMBOL_GPL(is_swiotlb_active);
 
-- 
2.32.0.288.g62a8d224e6-goog



[PATCH v13 04/12] swiotlb: Update is_swiotlb_buffer to add a struct device argument

2021-06-17 Thread Claire Chang
Update is_swiotlb_buffer to add a struct device argument. This will be
useful later to allow for different pools.
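
All call sites follow the same pattern: pass the device so the address
is tested against that device's pool. An illustrative unmap path
(my_driver_unmap is a hypothetical example, not from this series):

  static void my_driver_unmap(struct device *dev, dma_addr_t dma_addr,
                              size_t size, enum dma_data_direction dir,
                              unsigned long attrs)
  {
          phys_addr_t phys = dma_to_phys(dev, dma_addr);

          /* Was this address bounced through @dev's swiotlb pool? */
          if (unlikely(is_swiotlb_buffer(dev, phys)))
                  swiotlb_tbl_unmap_single(dev, phys, size, dir, attrs);
  }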

Signed-off-by: Claire Chang 
Reviewed-by: Christoph Hellwig 
Tested-by: Stefano Stabellini 
Tested-by: Will Deacon 
---
 drivers/iommu/dma-iommu.c | 12 ++++++------
 drivers/xen/swiotlb-xen.c |  2 +-
 include/linux/swiotlb.h   |  7 ++++---
 kernel/dma/direct.c   |  6 +++---
 kernel/dma/direct.h   |  6 +++---
 5 files changed, 17 insertions(+), 16 deletions(-)

diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index 3087d9fa6065..10997ef541f8 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -507,7 +507,7 @@ static void __iommu_dma_unmap_swiotlb(struct device *dev, dma_addr_t dma_addr,
 
__iommu_dma_unmap(dev, dma_addr, size);
 
-   if (unlikely(is_swiotlb_buffer(phys)))
+   if (unlikely(is_swiotlb_buffer(dev, phys)))
swiotlb_tbl_unmap_single(dev, phys, size, dir, attrs);
 }
 
@@ -578,7 +578,7 @@ static dma_addr_t __iommu_dma_map_swiotlb(struct device *dev, phys_addr_t phys,
}
 
iova = __iommu_dma_map(dev, phys, aligned_size, prot, dma_mask);
-   if (iova == DMA_MAPPING_ERROR && is_swiotlb_buffer(phys))
+   if (iova == DMA_MAPPING_ERROR && is_swiotlb_buffer(dev, phys))
swiotlb_tbl_unmap_single(dev, phys, org_size, dir, attrs);
return iova;
 }
@@ -749,7 +749,7 @@ static void iommu_dma_sync_single_for_cpu(struct device *dev,
if (!dev_is_dma_coherent(dev))
arch_sync_dma_for_cpu(phys, size, dir);
 
-   if (is_swiotlb_buffer(phys))
+   if (is_swiotlb_buffer(dev, phys))
swiotlb_sync_single_for_cpu(dev, phys, size, dir);
 }
 
@@ -762,7 +762,7 @@ static void iommu_dma_sync_single_for_device(struct device *dev,
return;
 
phys = iommu_iova_to_phys(iommu_get_dma_domain(dev), dma_handle);
-   if (is_swiotlb_buffer(phys))
+   if (is_swiotlb_buffer(dev, phys))
swiotlb_sync_single_for_device(dev, phys, size, dir);
 
if (!dev_is_dma_coherent(dev))
@@ -783,7 +783,7 @@ static void iommu_dma_sync_sg_for_cpu(struct device *dev,
if (!dev_is_dma_coherent(dev))
arch_sync_dma_for_cpu(sg_phys(sg), sg->length, dir);
 
-   if (is_swiotlb_buffer(sg_phys(sg)))
+   if (is_swiotlb_buffer(dev, sg_phys(sg)))
swiotlb_sync_single_for_cpu(dev, sg_phys(sg),
sg->length, dir);
}
@@ -800,7 +800,7 @@ static void iommu_dma_sync_sg_for_device(struct device *dev,
return;
 
for_each_sg(sgl, sg, nelems, i) {
-   if (is_swiotlb_buffer(sg_phys(sg)))
+   if (is_swiotlb_buffer(dev, sg_phys(sg)))
swiotlb_sync_single_for_device(dev, sg_phys(sg),
   sg->length, dir);
 
diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c
index 4c89afc0df62..0c6ed09f8513 100644
--- a/drivers/xen/swiotlb-xen.c
+++ b/drivers/xen/swiotlb-xen.c
@@ -100,7 +100,7 @@ static int is_xen_swiotlb_buffer(struct device *dev, dma_addr_t dma_addr)
 * in our domain. Therefore _only_ check address within our domain.
 */
if (pfn_valid(PFN_DOWN(paddr)))
-   return is_swiotlb_buffer(paddr);
+   return is_swiotlb_buffer(dev, paddr);
return 0;
 }
 
diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
index 216854a5e513..d1f3d95881cd 100644
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -2,6 +2,7 @@
 #ifndef __LINUX_SWIOTLB_H
 #define __LINUX_SWIOTLB_H
 
+#include <linux/device.h>
 #include <linux/dma-direction.h>
 #include <linux/init.h>
 #include <linux/types.h>
@@ -101,9 +102,9 @@ struct io_tlb_mem {
 };
 extern struct io_tlb_mem *io_tlb_default_mem;
 
-static inline bool is_swiotlb_buffer(phys_addr_t paddr)
+static inline bool is_swiotlb_buffer(struct device *dev, phys_addr_t paddr)
 {
-   struct io_tlb_mem *mem = io_tlb_default_mem;
+   struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
 
return mem && paddr >= mem->start && paddr < mem->end;
 }
@@ -115,7 +116,7 @@ bool is_swiotlb_active(void);
 void __init swiotlb_adjust_size(unsigned long size);
 #else
 #define swiotlb_force SWIOTLB_NO_FORCE
-static inline bool is_swiotlb_buffer(phys_addr_t paddr)
+static inline bool is_swiotlb_buffer(struct device *dev, phys_addr_t paddr)
 {
return false;
 }
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index f737e3347059..84c9feb5474a 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -343,7 +343,7 @@ void dma_direct_sync_sg_for_device(struct device *dev,
for_each_sg(sgl, sg, nents, i) {
phys_addr_t paddr = dma_to_phys(dev, sg_dma_address(sg));
 
-   if (unlikely(is_swiotlb_buffer(paddr)))
+   if (unlikely(is_swiotlb_buffer(dev, paddr)))

[PATCH v13 03/12] swiotlb: Set dev->dma_io_tlb_mem to the swiotlb pool used

2021-06-17 Thread Claire Chang
Always have the pointer to the swiotlb pool used in struct device. This
could help simplify the code for other pools.
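
Since every lookup now goes through dev->dma_io_tlb_mem, switching a
device to another pool reduces to one pointer assignment (hypothetical
sketch; the restricted-pool patches later in the series do this from a
reserved-memory hook):

  static void use_swiotlb_pool(struct device *dev, struct io_tlb_mem *pool)
  {
          /* All swiotlb paths for this device will now use @pool. */
          dev->dma_io_tlb_mem = pool;
  }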

Signed-off-by: Claire Chang 
Reviewed-by: Christoph Hellwig 
Tested-by: Stefano Stabellini 
Tested-by: Will Deacon 
---
 drivers/base/core.c    | 4 ++++
 include/linux/device.h | 4 ++++
 kernel/dma/swiotlb.c   | 8 ++++----
 3 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/drivers/base/core.c b/drivers/base/core.c
index f29839382f81..cb3123e3954d 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -27,6 +27,7 @@
 #include <linux/netdevice.h>
 #include <linux/sched/signal.h>
 #include <linux/sched/mm.h>
+#include <linux/swiotlb.h>
 #include <linux/sysfs.h>
 #include <linux/dma-map-ops.h> /* for dma_default_coherent */
 
@@ -2736,6 +2737,9 @@ void device_initialize(struct device *dev)
 #if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) || \
     defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \
     defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL)
dev->dma_coherent = dma_default_coherent;
 #endif
+#ifdef CONFIG_SWIOTLB
+   dev->dma_io_tlb_mem = io_tlb_default_mem;
+#endif
 }
 EXPORT_SYMBOL_GPL(device_initialize);
 
diff --git a/include/linux/device.h b/include/linux/device.h
index ba660731bd25..240d652a0696 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -416,6 +416,7 @@ struct dev_links_info {
  * @dma_pools: Dma pools (if dma'ble device).
  * @dma_mem:   Internal for coherent mem override.
  * @cma_area:  Contiguous memory area for dma allocations
+ * @dma_io_tlb_mem: Pointer to the swiotlb pool used.  Not for driver use.
  * @archdata:  For arch-specific additions.
  * @of_node:   Associated device tree node.
  * @fwnode:Associated device node supplied by platform firmware.
@@ -518,6 +519,9 @@ struct device {
 #ifdef CONFIG_DMA_CMA
struct cma *cma_area;   /* contiguous memory area for dma
   allocations */
+#endif
+#ifdef CONFIG_SWIOTLB
+   struct io_tlb_mem *dma_io_tlb_mem;
 #endif
/* arch specific additions */
struct dev_archdata archdata;
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 2dba659a1e73..de79e9437030 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -340,7 +340,7 @@ void __init swiotlb_exit(void)
 static void swiotlb_bounce(struct device *dev, phys_addr_t tlb_addr, size_t size,
   enum dma_data_direction dir)
 {
-   struct io_tlb_mem *mem = io_tlb_default_mem;
+   struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
int index = (tlb_addr - mem->start) >> IO_TLB_SHIFT;
unsigned int offset = (tlb_addr - mem->start) & (IO_TLB_SIZE - 1);
phys_addr_t orig_addr = mem->slots[index].orig_addr;
@@ -431,7 +431,7 @@ static unsigned int wrap_index(struct io_tlb_mem *mem, unsigned int index)
 static int find_slots(struct device *dev, phys_addr_t orig_addr,
size_t alloc_size)
 {
-   struct io_tlb_mem *mem = io_tlb_default_mem;
+   struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
unsigned long boundary_mask = dma_get_seg_boundary(dev);
dma_addr_t tbl_dma_addr =
phys_to_dma_unencrypted(dev, mem->start) & boundary_mask;
@@ -508,7 +508,7 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
size_t mapping_size, size_t alloc_size,
enum dma_data_direction dir, unsigned long attrs)
 {
-   struct io_tlb_mem *mem = io_tlb_default_mem;
+   struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
unsigned int offset = swiotlb_align_offset(dev, orig_addr);
unsigned int i;
int index;
@@ -559,7 +559,7 @@ void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr,
  size_t mapping_size, enum dma_data_direction dir,
  unsigned long attrs)
 {
-   struct io_tlb_mem *mem = io_tlb_default_mem;
+   struct io_tlb_mem *mem = hwdev->dma_io_tlb_mem;
unsigned long flags;
unsigned int offset = swiotlb_align_offset(hwdev, tlb_addr);
int index = (tlb_addr - offset - mem->start) >> IO_TLB_SHIFT;
-- 
2.32.0.288.g62a8d224e6-goog



[PATCH v13 02/12] swiotlb: Refactor swiotlb_create_debugfs

2021-06-17 Thread Claire Chang
Split the debugfs creation to make the code reusable for supporting
different bounce buffer pools.
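
With the file creation factored out, a later patch can give each
restricted pool its own subdirectory under the shared "swiotlb"
directory and reuse the helper (hypothetical sketch):

  static void restricted_pool_debugfs_init(struct io_tlb_mem *mem,
                                           const char *name)
  {
          /* e.g. /sys/kernel/debug/swiotlb/<name>/io_tlb_nslabs */
          mem->debugfs = debugfs_create_dir(name, debugfs_dir);
          swiotlb_create_debugfs_files(mem);
  }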

Signed-off-by: Claire Chang 
Reviewed-by: Christoph Hellwig 
Tested-by: Stefano Stabellini 
Tested-by: Will Deacon 
---
 kernel/dma/swiotlb.c | 21 ++++++++++++++-------
 1 file changed, 14 insertions(+), 7 deletions(-)

diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 47bb2a766798..2dba659a1e73 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -671,19 +671,26 @@ bool is_swiotlb_active(void)
 EXPORT_SYMBOL_GPL(is_swiotlb_active);
 
 #ifdef CONFIG_DEBUG_FS
+static struct dentry *debugfs_dir;
 
-static int __init swiotlb_create_debugfs(void)
+static void swiotlb_create_debugfs_files(struct io_tlb_mem *mem)
 {
-   struct io_tlb_mem *mem = io_tlb_default_mem;
-
-   if (!mem)
-   return 0;
-   mem->debugfs = debugfs_create_dir("swiotlb", NULL);
debugfs_create_ulong("io_tlb_nslabs", 0400, mem->debugfs, >nslabs);
debugfs_create_ulong("io_tlb_used", 0400, mem->debugfs, >used);
+}
+
+static int __init swiotlb_create_default_debugfs(void)
+{
+   struct io_tlb_mem *mem = io_tlb_default_mem;
+
+   debugfs_dir = debugfs_create_dir("swiotlb", NULL);
+   if (mem) {
+   mem->debugfs = debugfs_dir;
+   swiotlb_create_debugfs_files(mem);
+   }
return 0;
 }
 
-late_initcall(swiotlb_create_debugfs);
+late_initcall(swiotlb_create_default_debugfs);
 
 #endif
-- 
2.32.0.288.g62a8d224e6-goog



[PATCH v13 01/12] swiotlb: Refactor swiotlb init functions

2021-06-17 Thread Claire Chang
Add a new function, swiotlb_init_io_tlb_mem, for the io_tlb_mem struct
initialization to make the code reusable.
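
The payoff comes when a pool must be initialized from a different
backing region. For example, a restricted pool carved out of a
reserved-memory node can reuse the helper (simplified sketch based on
the later patches in this series; allocation and error handling
omitted):

  static int restricted_pool_init(struct reserved_mem *rmem,
                                  struct io_tlb_mem *mem)
  {
          unsigned long nslabs = rmem->size >> IO_TLB_SHIFT;

          /* Restricted pools are not mapped encrypted. */
          set_memory_decrypted((unsigned long)phys_to_virt(rmem->base),
                               rmem->size >> PAGE_SHIFT);
          swiotlb_init_io_tlb_mem(mem, rmem->base, nslabs, false);
          return 0;
  }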

Signed-off-by: Claire Chang 
Reviewed-by: Christoph Hellwig 
Tested-by: Stefano Stabellini 
Tested-by: Will Deacon 
---
 kernel/dma/swiotlb.c | 50 +++++++++++++++++++++++++-------------------------
 1 file changed, 25 insertions(+), 25 deletions(-)

diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 52e2ac526757..47bb2a766798 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -168,9 +168,28 @@ void __init swiotlb_update_mem_attributes(void)
memset(vaddr, 0, bytes);
 }
 
-int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose)
+static void swiotlb_init_io_tlb_mem(struct io_tlb_mem *mem, phys_addr_t start,
+   unsigned long nslabs, bool late_alloc)
 {
+   void *vaddr = phys_to_virt(start);
unsigned long bytes = nslabs << IO_TLB_SHIFT, i;
+
+   mem->nslabs = nslabs;
+   mem->start = start;
+   mem->end = mem->start + bytes;
+   mem->index = 0;
+   mem->late_alloc = late_alloc;
+   spin_lock_init(&mem->lock);
+   for (i = 0; i < mem->nslabs; i++) {
+   mem->slots[i].list = IO_TLB_SEGSIZE - io_tlb_offset(i);
+   mem->slots[i].orig_addr = INVALID_PHYS_ADDR;
+   mem->slots[i].alloc_size = 0;
+   }
+   memset(vaddr, 0, bytes);
+}
+
+int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose)
+{
struct io_tlb_mem *mem;
size_t alloc_size;
 
@@ -186,16 +205,8 @@ int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose)
if (!mem)
panic("%s: Failed to allocate %zu bytes align=0x%lx\n",
  __func__, alloc_size, PAGE_SIZE);
-   mem->nslabs = nslabs;
-   mem->start = __pa(tlb);
-   mem->end = mem->start + bytes;
-   mem->index = 0;
-   spin_lock_init(&mem->lock);
-   for (i = 0; i < mem->nslabs; i++) {
-   mem->slots[i].list = IO_TLB_SEGSIZE - io_tlb_offset(i);
-   mem->slots[i].orig_addr = INVALID_PHYS_ADDR;
-   mem->slots[i].alloc_size = 0;
-   }
+
+   swiotlb_init_io_tlb_mem(mem, __pa(tlb), nslabs, false);
 
io_tlb_default_mem = mem;
if (verbose)
@@ -282,8 +293,8 @@ swiotlb_late_init_with_default_size(size_t default_size)
 int
 swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs)
 {
-   unsigned long bytes = nslabs << IO_TLB_SHIFT, i;
struct io_tlb_mem *mem;
+   unsigned long bytes = nslabs << IO_TLB_SHIFT;
 
if (swiotlb_force == SWIOTLB_NO_FORCE)
return 0;
@@ -297,20 +308,9 @@ swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs)
if (!mem)
return -ENOMEM;
 
-   mem->nslabs = nslabs;
-   mem->start = virt_to_phys(tlb);
-   mem->end = mem->start + bytes;
-   mem->index = 0;
-   mem->late_alloc = 1;
-   spin_lock_init(&mem->lock);
-   for (i = 0; i < mem->nslabs; i++) {
-   mem->slots[i].list = IO_TLB_SEGSIZE - io_tlb_offset(i);
-   mem->slots[i].orig_addr = INVALID_PHYS_ADDR;
-   mem->slots[i].alloc_size = 0;
-   }
-
+   memset(mem, 0, sizeof(*mem));
+   swiotlb_init_io_tlb_mem(mem, virt_to_phys(tlb), nslabs, true);
set_memory_decrypted((unsigned long)tlb, bytes >> PAGE_SHIFT);
-   memset(tlb, 0, bytes);
 
io_tlb_default_mem = mem;
swiotlb_print_info();
-- 
2.32.0.288.g62a8d224e6-goog


