date:20200716

Re: [PATCH v2 2/4] powerpc/sstep: support emulation for vsx vector paired storage access instructions

2020-07-16 Thread Ravi Bangoria


Hi Bala,


@@ -709,6 +722,8 @@ void emulate_vsx_load(struct instruction_op *op, union 
vsx_reg *reg,
reg->d[0] = reg->d[1] = 0;
  
  	switch (op->element_size) {

+   case 32:
+   /* [p]lxvp[x] or [p]stxvp[x] */


This function does not emulate stvxp 


case 16:
/* whole vector; lxv[x] or lxvl[l] */
if (size == 0)
@@ -717,7 +732,7 @@ void emulate_vsx_load(struct instruction_op *op, union 
vsx_reg *reg,
if (IS_LE && (op->vsx_flags & VSX_LDLEFT))
rev = !rev;
if (rev)
-   do_byte_reverse(reg, 16);
+   do_byte_reverse(reg, size);
break;
case 8:
/* scalar loads, lxvd2x, lxvdsx */
@@ -793,6 +808,22 @@ void emulate_vsx_store(struct instruction_op *op, const 
union vsx_reg *reg,
size = GETSIZE(op->type);
  
  	switch (op->element_size) {

+   case 32:
+   /* [p]lxvp[x] or [p]stxvp[x] */


This function does not emulate lxvp 


+   if (size == 0)
+   break;
+   if (IS_LE && (op->vsx_flags & VSX_LDLEFT))
+   rev = !rev;


Why this if condition  ?


+   if (rev) {
+   /* reverse 32 bytes */
+   buf.d[0] = byterev_8(reg->d[3]);
+   buf.d[1] = byterev_8(reg->d[2]);
+   buf.d[2] = byterev_8(reg->d[1]);
+   buf.d[3] = byterev_8(reg->d[0]);
+   reg = &buf;
+   }
+   memcpy(mem, reg, size);
+   break;
case 16:
/* stxv, stxvx, stxvl, stxvll */
if (size == 0)
@@ -861,28 +892,33 @@ static nokprobe_inline int do_vsx_load(struct 
instruction_op *op,
   bool cross_endian)
  {
int reg = op->reg;
-   u8 mem[16];
+   int i, nr_vsx_regs;
+   u8 mem[32];
union vsx_reg buf;
int size = GETSIZE(op->type);
  
  	if (!address_ok(regs, ea, size) || copy_mem_in(mem, ea, size, regs))

return -EFAULT;
  
+	nr_vsx_regs = size / sizeof(__vector128);

emulate_vsx_load(op, &buf, mem, cross_endian);
preempt_disable();
if (reg < 32) {
/* FP regs + extensions */
if (regs->msr & MSR_FP) {
-   load_vsrn(reg, &buf);
+   for (i = 0; i < nr_vsx_regs; i++)
+   load_vsrn(reg + i, &buf.v[i]);
} else {
current->thread.fp_state.fpr[reg][0] = buf.d[0];
current->thread.fp_state.fpr[reg][1] = buf.d[1];


Should we change else part as well?


}
} else {
if (regs->msr & MSR_VEC)
-   load_vsrn(reg, &buf);
+   for (i = 0; i < nr_vsx_regs; i++)
+   load_vsrn(reg + i, &buf.v[i]);
+


Unnecessary line.


else
-   current->thread.vr_state.vr[reg - 32] = buf.v;
+   current->thread.vr_state.vr[reg - 32] = buf.v[0];


Same here. else part, should we add:

if (vsx 32 byte)
current->thread.vr_state.vr[reg - 32 + 1] = buf.v[1];


}
preempt_enable();
return 0;
@@ -893,27 +929,31 @@ static nokprobe_inline int do_vsx_store(struct 
instruction_op *op,
bool cross_endian)
  {
int reg = op->reg;
-   u8 mem[16];
+   int i, nr_vsx_regs;
+   u8 mem[32];
union vsx_reg buf;
int size = GETSIZE(op->type);
  
  	if (!address_ok(regs, ea, size))

return -EFAULT;
  
+	nr_vsx_regs = size / sizeof(__vector128);

preempt_disable();
if (reg < 32) {
/* FP regs + extensions */
if (regs->msr & MSR_FP) {
-   store_vsrn(reg, &buf);
+   for (i = 0; i < nr_vsx_regs; i++)
+   store_vsrn(reg + i, &buf.v[i]);
} else {
buf.d[0] = current->thread.fp_state.fpr[reg][0];
buf.d[1] = current->thread.fp_state.fpr[reg][1];
}
} else {
if (regs->msr & MSR_VEC)
-   store_vsrn(reg, &buf);
+   for (i = 0; i < nr_vsx_regs; i++)
+   store_vsrn(reg + i, &buf.v[i]);
else
-   buf.v = current->thread.vr_state.vr[reg - 32];
+   buf.v[0] = current->thread.vr_state.vr[reg - 32];
}
preempt_enable();
emulate_vsx_store(op, &buf, mem, cross_endian);



Ravi

[PATCH v4 0/7] Remove default DMA window before creating DDW

2020-07-16 Thread Leonardo Bras

There are some devices in which a hypervisor may only allow 1 DMA window
to exist at a time, and in those cases, a DDW is never created to them,
since the default DMA window keeps using this resource.

LoPAR recommends this procedure:
1. Remove the default DMA window,
2. Query for which configs the DDW can be created,
3. Create a DDW.

Patch #1:
Create defines for outputs of ibm,ddw-applicable, so it's easier to
identify them.

Patch #2:
- After LoPAR level 2.8, there is an extension that can make
ibm,query-pe-dma-windows to have 6 outputs instead of 5. This changes the
order of the outputs, and that can cause some trouble.
- query_ddw() was updated to check how many outputs the
ibm,query-pe-dma-windows is supposed to have, update the rtas_call() and
deal correctly with the outputs in both cases.
- This patch looks somehow unrelated to the series, but it can avoid future
problems on DDW creation.

Patch #3 moves the window-removing code from remove_ddw() to
remove_dma_window(), creating a way to delete any DMA window, so it can be
used to delete the default DMA window.

Patch #4 makes use of the remove_dma_window() from patch #3 to remove the
default DMA window before query_ddw(). It also implements a new rtas call
to recover the default DMA window, in case anything fails after it was
removed, and a DDW couldn't be created.

Patch #5 moves the part of iommu_table_free() that does struct iommu_table
cleaning into iommu_table_clean, so we can invoke it separately in
patch #6.

Patch #6:
Instead of destroying the created DDW if it doesn't map the whole
partition, make use of it instead of the default DMA window as it improves
performance. Also, update the iommu_table and re-generate the pools.

Patch #7:
Does some renaming of 'direct window' to 'dma window', given the DDW
created can now be also used in indirect mapping if direct mapping is not
available.

All patches were tested into an LPAR with an Ethernet VF:
4005:01:00.0 Ethernet controller: Mellanox Technologies MT27700 Family
[ConnectX-4 Virtual Function]

Patch #6 It was tested with a 64GB DDW which did not map the whole
partition (128G). Performance improvement noticed by using the DDW instead
of the default DMA window:

64 thread write throughput: +203.0%
64 thread read throughput: +17.5%
1 thread write throughput: +20.5%
1 thread read throughput: +3.43%
Average write latency: -23.0%
Average read latency: -2.26%

---
Changes since v3:
- Introduces new patch #5, to prepare for an important change in #6
- struct iommu_table was not being updated, so include a way to do this
in patch #6.
- Improved patch #4 based in a suggestion from Alexey, to make code
more easily understandable
- v3 link:
http://patchwork.ozlabs.org/project/linuxppc-dev/list/?series=187348&state=%2A&archive=both

Changes since v2:
- Change the way ibm,ddw-extensions is accessed, using a proper function
instead of doing this inline everytime it's used.
- Remove previous patch #6, as it doesn't look like it would be useful.
- Add new patch, for changing names from direct* to dma*, as indirect
mapping can be used from now on.
- Fix some typos, corrects some define usage.
- v2 link:
http://patchwork.ozlabs.org/project/linuxppc-dev/list/?series=185433&state=%2A&archive=both

Changes since v1:
- Add defines for ibm,ddw-applicable and ibm,ddw-extensions outputs
- Merge aux function query_ddw_out_sz() into query_ddw()
- Merge reset_dma_window() patch (prev. #2) into remove default DMA
window patch (#4).
- Keep device_node *np name instead of using pdn in remove_*()
- Rename 'device_node *pdn' into 'parent' in new functions
- Rename dfl_win to default_win
- Only remove the default DMA window if there is no window available
in first query.
- Check if default DMA window can be restored before removing it.
- Fix 'unitialized use' (found by travis mpe:ci-test)
- New patches #5 and #6
- v1 link:
http://patchwork.ozlabs.org/project/linuxppc-dev/list/?series=184420&state=%2A&archive=both

Special thanks for Alexey Kardashevskiy, Brian King and
Oliver O'Halloran for the feedback provided!

Leonardo Bras (7):
powerpc/pseries/iommu: Create defines for operations in
ibm,ddw-applicable
powerpc/pseries/iommu: Update call to ibm,query-pe-dma-windows
powerpc/pseries/iommu: Move window-removing part of remove_ddw into
remove_dma_window
powerpc/pseries/iommu: Remove default DMA window before creating DDW
powerpc/iommu: Move iommu_table cleaning routine to iommu_table_clean
powerpc/pseries/iommu: Make use of DDW even if it does not map the
partition
powerpc/pseries/iommu: Rename "direct window" to "dma window"

arch/powerpc/include/asm/iommu.h | 3 +
arch/powerpc/kernel/iommu.c| 45 ++-
arch/powerpc/platforms/pseries/iommu.c | 380 ++---
3 files changed, 313 insertions(+), 115 deletions(-)

--
2.25.4

[PATCH v4 1/7] powerpc/pseries/iommu: Create defines for operations in ibm, ddw-applicable

2020-07-16 Thread Leonardo Bras

Create defines to help handling ibm,ddw-applicable values, avoiding
confusion about the index of given operations.

Signed-off-by: Leonardo Bras 
---
 arch/powerpc/platforms/pseries/iommu.c | 43 --
 1 file changed, 26 insertions(+), 17 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/iommu.c 
b/arch/powerpc/platforms/pseries/iommu.c
index 6d47b4a3ce39..ac0d6376bdad 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -39,6 +39,14 @@
 
 #include "pseries.h"
 
+enum {
+   DDW_QUERY_PE_DMA_WIN  = 0,
+   DDW_CREATE_PE_DMA_WIN = 1,
+   DDW_REMOVE_PE_DMA_WIN = 2,
+
+   DDW_APPLICABLE_SIZE
+};
+
 static struct iommu_table_group *iommu_pseries_alloc_group(int node)
 {
struct iommu_table_group *table_group;
@@ -771,12 +779,12 @@ static void remove_ddw(struct device_node *np, bool 
remove_prop)
 {
struct dynamic_dma_window_prop *dwp;
struct property *win64;
-   u32 ddw_avail[3];
+   u32 ddw_avail[DDW_APPLICABLE_SIZE];
u64 liobn;
int ret = 0;
 
ret = of_property_read_u32_array(np, "ibm,ddw-applicable",
-&ddw_avail[0], 3);
+&ddw_avail[0], DDW_APPLICABLE_SIZE);
 
win64 = of_find_property(np, DIRECT64_PROPNAME, NULL);
if (!win64)
@@ -798,15 +806,15 @@ static void remove_ddw(struct device_node *np, bool 
remove_prop)
pr_debug("%pOF successfully cleared tces in window.\n",
 np);
 
-   ret = rtas_call(ddw_avail[2], 1, 1, NULL, liobn);
+   ret = rtas_call(ddw_avail[DDW_REMOVE_PE_DMA_WIN], 1, 1, NULL, liobn);
if (ret)
pr_warn("%pOF: failed to remove direct window: rtas returned "
"%d to ibm,remove-pe-dma-window(%x) %llx\n",
-   np, ret, ddw_avail[2], liobn);
+   np, ret, ddw_avail[DDW_REMOVE_PE_DMA_WIN], liobn);
else
pr_debug("%pOF: successfully removed direct window: rtas 
returned "
"%d to ibm,remove-pe-dma-window(%x) %llx\n",
-   np, ret, ddw_avail[2], liobn);
+   np, ret, ddw_avail[DDW_REMOVE_PE_DMA_WIN], liobn);
 
 delprop:
if (remove_prop)
@@ -889,11 +897,11 @@ static int query_ddw(struct pci_dev *dev, const u32 
*ddw_avail,
buid = pdn->phb->buid;
cfg_addr = ((pdn->busno << 16) | (pdn->devfn << 8));
 
-   ret = rtas_call(ddw_avail[0], 3, 5, (u32 *)query,
- cfg_addr, BUID_HI(buid), BUID_LO(buid));
+   ret = rtas_call(ddw_avail[DDW_QUERY_PE_DMA_WIN], 3, 5, (u32 *)query,
+   cfg_addr, BUID_HI(buid), BUID_LO(buid));
dev_info(&dev->dev, "ibm,query-pe-dma-windows(%x) %x %x %x"
-   " returned %d\n", ddw_avail[0], cfg_addr, BUID_HI(buid),
-   BUID_LO(buid), ret);
+   " returned %d\n", ddw_avail[DDW_QUERY_PE_DMA_WIN], cfg_addr,
+BUID_HI(buid), BUID_LO(buid), ret);
return ret;
 }
 
@@ -920,15 +928,16 @@ static int create_ddw(struct pci_dev *dev, const u32 
*ddw_avail,
 
do {
/* extra outputs are LIOBN and dma-addr (hi, lo) */
-   ret = rtas_call(ddw_avail[1], 5, 4, (u32 *)create,
-   cfg_addr, BUID_HI(buid), BUID_LO(buid),
-   page_shift, window_shift);
+   ret = rtas_call(ddw_avail[DDW_CREATE_PE_DMA_WIN], 5, 4,
+   (u32 *)create, cfg_addr, BUID_HI(buid),
+   BUID_LO(buid), page_shift, window_shift);
} while (rtas_busy_delay(ret));
dev_info(&dev->dev,
"ibm,create-pe-dma-window(%x) %x %x %x %x %x returned %d "
-   "(liobn = 0x%x starting addr = %x %x)\n", ddw_avail[1],
-cfg_addr, BUID_HI(buid), BUID_LO(buid), page_shift,
-window_shift, ret, create->liobn, create->addr_hi, 
create->addr_lo);
+   "(liobn = 0x%x starting addr = %x %x)\n",
+ddw_avail[DDW_CREATE_PE_DMA_WIN], cfg_addr, BUID_HI(buid),
+BUID_LO(buid), page_shift, window_shift, ret, create->liobn,
+create->addr_hi, create->addr_lo);
 
return ret;
 }
@@ -996,7 +1005,7 @@ static u64 enable_ddw(struct pci_dev *dev, struct 
device_node *pdn)
int page_shift;
u64 dma_addr, max_addr;
struct device_node *dn;
-   u32 ddw_avail[3];
+   u32 ddw_avail[DDW_APPLICABLE_SIZE];
struct direct_window *window;
struct property *win64;
struct dynamic_dma_window_prop *ddwprop;
@@ -1029,7 +1038,7 @@ static u64 enable_ddw(struct pci_dev *dev, struct 
device_node *pdn)
 * the property is actually in the parent, not the PE
 */
ret = of_property_read_u32_array(pdn, "ibm,ddw-applicable",
-

[PATCH v4 2/7] powerpc/pseries/iommu: Update call to ibm, query-pe-dma-windows

2020-07-16 Thread Leonardo Bras

>From LoPAR level 2.8, "ibm,ddw-extensions" index 3 can make the number of
outputs from "ibm,query-pe-dma-windows" go from 5 to 6.

This change of output size is meant to expand the address size of
largest_available_block PE TCE from 32-bit to 64-bit, which ends up
shifting page_size and migration_capable.

This ends up requiring the update of
ddw_query_response->largest_available_block from u32 to u64, and manually
assigning the values from the buffer into this struct, according to
output size.

Also, a routine was created for helping reading the ddw extensions as
suggested by LoPAR: First reading the size of the extension array from
index 0, checking if the property exists, and then returning it's value.

Signed-off-by: Leonardo Bras 
---
 arch/powerpc/platforms/pseries/iommu.c | 91 +++---
 1 file changed, 81 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/iommu.c 
b/arch/powerpc/platforms/pseries/iommu.c
index ac0d6376bdad..1a933c4e8bba 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -47,6 +47,12 @@ enum {
DDW_APPLICABLE_SIZE
 };
 
+enum {
+   DDW_EXT_SIZE = 0,
+   DDW_EXT_RESET_DMA_WIN = 1,
+   DDW_EXT_QUERY_OUT_SIZE = 2
+};
+
 static struct iommu_table_group *iommu_pseries_alloc_group(int node)
 {
struct iommu_table_group *table_group;
@@ -342,7 +348,7 @@ struct direct_window {
 /* Dynamic DMA Window support */
 struct ddw_query_response {
u32 windows_available;
-   u32 largest_available_block;
+   u64 largest_available_block;
u32 page_size;
u32 migration_capable;
 };
@@ -877,14 +883,62 @@ static int find_existing_ddw_windows(void)
 }
 machine_arch_initcall(pseries, find_existing_ddw_windows);
 
+/**
+ * ddw_read_ext - Get the value of an DDW extension
+ * @np:device node from which the extension value is to be 
read.
+ * @extnum:index number of the extension.
+ * @value: pointer to return value, modified when extension is available.
+ *
+ * Checks if "ibm,ddw-extensions" exists for this node, and get the value
+ * on index 'extnum'.
+ * It can be used only to check if a property exists, passing value == NULL.
+ *
+ * Returns:
+ * 0 if extension successfully read
+ * -EINVAL if the "ibm,ddw-extensions" does not exist,
+ * -ENODATA if "ibm,ddw-extensions" does not have a value, and
+ * -EOVERFLOW if "ibm,ddw-extensions" does not contain this extension.
+ */
+static inline int ddw_read_ext(const struct device_node *np, int extnum,
+  u32 *value)
+{
+   static const char propname[] = "ibm,ddw-extensions";
+   u32 count;
+   int ret;
+
+   ret = of_property_read_u32_index(np, propname, DDW_EXT_SIZE, &count);
+   if (ret)
+   return ret;
+
+   if (count < extnum)
+   return -EOVERFLOW;
+
+   if (!value)
+   value = &count;
+
+   return of_property_read_u32_index(np, propname, extnum, value);
+}
+
 static int query_ddw(struct pci_dev *dev, const u32 *ddw_avail,
-   struct ddw_query_response *query)
+struct ddw_query_response *query,
+struct device_node *parent)
 {
struct device_node *dn;
struct pci_dn *pdn;
-   u32 cfg_addr;
+   u32 cfg_addr, ext_query, query_out[5];
u64 buid;
-   int ret;
+   int ret, out_sz;
+
+   /*
+* From LoPAR level 2.8, "ibm,ddw-extensions" index 3 can rule how many
+* output parameters ibm,query-pe-dma-windows will have, ranging from
+* 5 to 6.
+*/
+   ret = ddw_read_ext(parent, DDW_EXT_QUERY_OUT_SIZE, &ext_query);
+   if (!ret && ext_query == 1)
+   out_sz = 6;
+   else
+   out_sz = 5;
 
/*
 * Get the config address and phb buid of the PE window.
@@ -897,11 +951,28 @@ static int query_ddw(struct pci_dev *dev, const u32 
*ddw_avail,
buid = pdn->phb->buid;
cfg_addr = ((pdn->busno << 16) | (pdn->devfn << 8));
 
-   ret = rtas_call(ddw_avail[DDW_QUERY_PE_DMA_WIN], 3, 5, (u32 *)query,
+   ret = rtas_call(ddw_avail[DDW_QUERY_PE_DMA_WIN], 3, out_sz, query_out,
cfg_addr, BUID_HI(buid), BUID_LO(buid));
-   dev_info(&dev->dev, "ibm,query-pe-dma-windows(%x) %x %x %x"
-   " returned %d\n", ddw_avail[DDW_QUERY_PE_DMA_WIN], cfg_addr,
-BUID_HI(buid), BUID_LO(buid), ret);
+   dev_info(&dev->dev, "ibm,query-pe-dma-windows(%x) %x %x %x returned 
%d\n",
+ddw_avail[DDW_QUERY_PE_DMA_WIN], cfg_addr, BUID_HI(buid),
+BUID_LO(buid), ret);
+
+   switch (out_sz) {
+   case 5:
+   query->windows_available = query_out[0];
+   query->largest_available_block = query_out[1];
+   query->page_size = query_out[2];
+   query->migration_capable = query_out[3];
+

[PATCH v4 3/7] powerpc/pseries/iommu: Move window-removing part of remove_ddw into remove_dma_window

2020-07-16 Thread Leonardo Bras

Move the window-removing part of remove_ddw into a new function
(remove_dma_window), so it can be used to remove other DMA windows.

It's useful for removing DMA windows that don't create DIRECT64_PROPNAME
property, like the default DMA window from the device, which uses
"ibm,dma-window".

Signed-off-by: Leonardo Bras 
---
 arch/powerpc/platforms/pseries/iommu.c | 45 +++---
 1 file changed, 27 insertions(+), 18 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/iommu.c 
b/arch/powerpc/platforms/pseries/iommu.c
index 1a933c4e8bba..4e33147825cc 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -781,25 +781,14 @@ static int __init disable_ddw_setup(char *str)
 
 early_param("disable_ddw", disable_ddw_setup);
 
-static void remove_ddw(struct device_node *np, bool remove_prop)
+static void remove_dma_window(struct device_node *np, u32 *ddw_avail,
+ struct property *win)
 {
struct dynamic_dma_window_prop *dwp;
-   struct property *win64;
-   u32 ddw_avail[DDW_APPLICABLE_SIZE];
u64 liobn;
-   int ret = 0;
-
-   ret = of_property_read_u32_array(np, "ibm,ddw-applicable",
-&ddw_avail[0], DDW_APPLICABLE_SIZE);
-
-   win64 = of_find_property(np, DIRECT64_PROPNAME, NULL);
-   if (!win64)
-   return;
-
-   if (ret || win64->length < sizeof(*dwp))
-   goto delprop;
+   int ret;
 
-   dwp = win64->value;
+   dwp = win->value;
liobn = (u64)be32_to_cpu(dwp->liobn);
 
/* clear the whole window, note the arg is in kernel pages */
@@ -821,10 +810,30 @@ static void remove_ddw(struct device_node *np, bool 
remove_prop)
pr_debug("%pOF: successfully removed direct window: rtas 
returned "
"%d to ibm,remove-pe-dma-window(%x) %llx\n",
np, ret, ddw_avail[DDW_REMOVE_PE_DMA_WIN], liobn);
+}
+
+static void remove_ddw(struct device_node *np, bool remove_prop)
+{
+   struct property *win;
+   u32 ddw_avail[DDW_APPLICABLE_SIZE];
+   int ret = 0;
+
+   ret = of_property_read_u32_array(np, "ibm,ddw-applicable",
+&ddw_avail[0], DDW_APPLICABLE_SIZE);
+   if (ret)
+   return;
+
+   win = of_find_property(np, DIRECT64_PROPNAME, NULL);
+   if (!win)
+   return;
+
+   if (win->length >= sizeof(struct dynamic_dma_window_prop))
+   remove_dma_window(np, ddw_avail, win);
+
+   if (!remove_prop)
+   return;
 
-delprop:
-   if (remove_prop)
-   ret = of_remove_property(np, win64);
+   ret = of_remove_property(np, win);
if (ret)
pr_warn("%pOF: failed to remove direct window property: %d\n",
np, ret);
-- 
2.25.4

[PATCH v4 4/7] powerpc/pseries/iommu: Remove default DMA window before creating DDW

2020-07-16 Thread Leonardo Bras

On LoPAR "DMA Window Manipulation Calls", it's recommended to remove the
default DMA window for the device, before attempting to configure a DDW,
in order to make the maximum resources available for the next DDW to be
created.

This is a requirement for using DDW on devices in which hypervisor
allows only one DMA window.

If setting up a new DDW fails anywhere after the removal of this
default DMA window, it's needed to restore the default DMA window.
For this, an implementation of ibm,reset-pe-dma-windows rtas call is
needed:

Platforms supporting the DDW option starting with LoPAR level 2.7 implement
ibm,ddw-extensions. The first extension available (index 2) carries the
token for ibm,reset-pe-dma-windows rtas call, which is used to restore
the default DMA window for a device, if it has been deleted.

It does so by resetting the TCE table allocation for the PE to it's
boot time value, available in "ibm,dma-window" device tree node.

Signed-off-by: Leonardo Bras 
---
 arch/powerpc/platforms/pseries/iommu.c | 73 +++---
 1 file changed, 66 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/iommu.c 
b/arch/powerpc/platforms/pseries/iommu.c
index 4e33147825cc..fc8d0555e2e9 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -1066,6 +1066,38 @@ static phys_addr_t ddw_memory_hotplug_max(void)
return max_addr;
 }
 
+/*
+ * Platforms supporting the DDW option starting with LoPAR level 2.7 implement
+ * ibm,ddw-extensions, which carries the rtas token for
+ * ibm,reset-pe-dma-windows.
+ * That rtas-call can be used to restore the default DMA window for the device.
+ */
+static void reset_dma_window(struct pci_dev *dev, struct device_node *par_dn)
+{
+   int ret;
+   u32 cfg_addr, reset_dma_win;
+   u64 buid;
+   struct device_node *dn;
+   struct pci_dn *pdn;
+
+   ret = ddw_read_ext(par_dn, DDW_EXT_RESET_DMA_WIN, &reset_dma_win);
+   if (ret)
+   return;
+
+   dn = pci_device_to_OF_node(dev);
+   pdn = PCI_DN(dn);
+   buid = pdn->phb->buid;
+   cfg_addr = ((pdn->busno << 16) | (pdn->devfn << 8));
+
+   ret = rtas_call(reset_dma_win, 3, 1, NULL, cfg_addr, BUID_HI(buid),
+   BUID_LO(buid));
+   if (ret)
+   dev_info(&dev->dev,
+"ibm,reset-pe-dma-windows(%x) %x %x %x returned %d ",
+reset_dma_win, cfg_addr, BUID_HI(buid), BUID_LO(buid),
+ret);
+}
+
 /*
  * If the PE supports dynamic dma windows, and there is space for a table
  * that can map all pages in a linear offset, then setup such a table,
@@ -1090,6 +1122,7 @@ static u64 enable_ddw(struct pci_dev *dev, struct 
device_node *pdn)
struct property *win64;
struct dynamic_dma_window_prop *ddwprop;
struct failed_ddw_pdn *fpdn;
+   bool default_win_removed = false;
 
mutex_lock(&direct_window_init_mutex);
 
@@ -1133,14 +1166,38 @@ static u64 enable_ddw(struct pci_dev *dev, struct 
device_node *pdn)
if (ret != 0)
goto out_failed;
 
+   /*
+* If there is no window available, remove the default DMA window,
+* if it's present. This will make all the resources available to the
+* new DDW window.
+* If anything fails after this, we need to restore it, so also check
+* for extensions presence.
+*/
if (query.windows_available == 0) {
-   /*
-* no additional windows are available for this device.
-* We might be able to reallocate the existing window,
-* trading in for a larger page size.
-*/
-   dev_dbg(&dev->dev, "no free dynamic windows");
-   goto out_failed;
+   struct property *default_win;
+   int reset_win_ext;
+
+   default_win = of_find_property(pdn, "ibm,dma-window", NULL);
+   if (!default_win)
+   goto out_failed;
+
+   reset_win_ext = ddw_read_ext(pdn, DDW_EXT_RESET_DMA_WIN, NULL);
+   if (reset_win_ext)
+   goto out_failed;
+
+   remove_dma_window(pdn, ddw_avail, default_win);
+   default_win_removed = true;
+
+   /* Query again, to check if the window is available */
+   ret = query_ddw(dev, ddw_avail, &query, pdn);
+   if (ret != 0)
+   goto out_failed;
+
+   if (query.windows_available == 0) {
+   /* no windows are available for this device. */
+   dev_dbg(&dev->dev, "no free dynamic windows");
+   goto out_failed;
+   }
}
if (query.page_size & 4) {
page_shift = 24; /* 16MB */
@@ -1231,6 +1288,8 @@ static u64 enable_ddw(struct pci_dev *dev, struct 
device_node *pdn)
kfr

[PATCH v4 5/7] powerpc/iommu: Move iommu_table cleaning routine to iommu_table_clean

2020-07-16 Thread Leonardo Bras

Move the part of iommu_table_free() that does struct iommu_table cleaning
into iommu_table_clean, so we can invoke it separately.

This new function is useful for cleaning struct iommu_table before
initializing it again with a new DMA window, without having it freed and
allocated again.

Signed-off-by: Leonardo Bras 
---
 arch/powerpc/kernel/iommu.c | 30 ++
 1 file changed, 18 insertions(+), 12 deletions(-)

diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index 9704f3f76e63..c3242253a4e7 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -735,21 +735,10 @@ struct iommu_table *iommu_init_table(struct iommu_table 
*tbl, int nid,
return tbl;
 }
 
-static void iommu_table_free(struct kref *kref)
+static void iommu_table_clean(struct iommu_table *tbl)
 {
unsigned long bitmap_sz;
unsigned int order;
-   struct iommu_table *tbl;
-
-   tbl = container_of(kref, struct iommu_table, it_kref);
-
-   if (tbl->it_ops->free)
-   tbl->it_ops->free(tbl);
-
-   if (!tbl->it_map) {
-   kfree(tbl);
-   return;
-   }
 
iommu_table_release_pages(tbl);
 
@@ -763,6 +752,23 @@ static void iommu_table_free(struct kref *kref)
/* free bitmap */
order = get_order(bitmap_sz);
free_pages((unsigned long) tbl->it_map, order);
+}
+
+static void iommu_table_free(struct kref *kref)
+{
+   struct iommu_table *tbl;
+
+   tbl = container_of(kref, struct iommu_table, it_kref);
+
+   if (tbl->it_ops->free)
+   tbl->it_ops->free(tbl);
+
+   if (!tbl->it_map) {
+   kfree(tbl);
+   return;
+   }
+
+   iommu_table_clean(tbl);
 
/* free table */
kfree(tbl);
-- 
2.25.4

[PATCH v4 6/7] powerpc/pseries/iommu: Make use of DDW even if it does not map the partition

2020-07-16 Thread Leonardo Bras

As of today, if the biggest DDW that can be created can't map the whole
partition, it's creation is skipped and the default DMA window
"ibm,dma-window" is used instead.

Usually this DDW is bigger than the default DMA window, and it performs
better, so it would be nice to use it instead.

The DDW created will be used for direct mapping by default.
If it's not available, indirect mapping will be used instead.

In this case, it's necessary to update the iommu_table so iommu_alloc()
can use the DDW created. For this, iommu_table_update() is called after a
enable_ddw() when direct DMA is not available.

As there will never have both direct and indirect mappings at the same
time, the same property name can be used for the created DDW.

So renaming
define DIRECT64_PROPNAME "linux,direct64-ddr-window-info"
to
define DMA64_PROPNAME "linux,dma64-ddr-window-info"
looks the right thing to do.

Signed-off-by: Leonardo Bras 
---
 arch/powerpc/include/asm/iommu.h   |  3 ++
 arch/powerpc/kernel/iommu.c| 15 +
 arch/powerpc/platforms/pseries/iommu.c | 46 +++---
 3 files changed, 52 insertions(+), 12 deletions(-)

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index 5032f1593299..dc4480a9d60d 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -154,6 +154,9 @@ extern int iommu_tce_table_put(struct iommu_table *tbl);
  */
 extern struct iommu_table *iommu_init_table(struct iommu_table *tbl,
int nid, unsigned long res_start, unsigned long res_end);
+void iommu_table_update(struct iommu_table *tbl, int nid, unsigned long liobn,
+   unsigned long win_addr, unsigned long page_shift,
+   unsigned long window_shift);
 
 #define IOMMU_TABLE_GROUP_MAX_TABLES   2
 
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index c3242253a4e7..cb0cb572eb0a 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -774,6 +774,21 @@ static void iommu_table_free(struct kref *kref)
kfree(tbl);
 }
 
+void iommu_table_update(struct iommu_table *tbl, int nid, unsigned long liobn,
+   unsigned long win_addr, unsigned long page_shift,
+   unsigned long window_shift)
+{
+   iommu_table_clean(tbl);
+
+   /* Update tlb with values from ddw */
+   tbl->it_index = liobn;
+   tbl->it_offset = win_addr >> page_shift;
+   tbl->it_page_shift = page_shift;
+   tbl->it_size = 1 << (window_shift - page_shift);
+
+   iommu_init_table(tbl, nid, 0, 0);
+}
+
 struct iommu_table *iommu_tce_table_get(struct iommu_table *tbl)
 {
if (kref_get_unless_zero(&tbl->it_kref))
diff --git a/arch/powerpc/platforms/pseries/iommu.c 
b/arch/powerpc/platforms/pseries/iommu.c
index fc8d0555e2e9..6e1c9d1599d1 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -364,7 +364,7 @@ static LIST_HEAD(direct_window_list);
 static DEFINE_SPINLOCK(direct_window_list_lock);
 /* protects initializing window twice for same device */
 static DEFINE_MUTEX(direct_window_init_mutex);
-#define DIRECT64_PROPNAME "linux,direct64-ddr-window-info"
+#define DMA64_PROPNAME "linux,dma64-ddr-window-info"
 
 static int tce_clearrange_multi_pSeriesLP(unsigned long start_pfn,
unsigned long num_pfn, const void *arg)
@@ -823,7 +823,7 @@ static void remove_ddw(struct device_node *np, bool 
remove_prop)
if (ret)
return;
 
-   win = of_find_property(np, DIRECT64_PROPNAME, NULL);
+   win = of_find_property(np, DMA64_PROPNAME, NULL);
if (!win)
return;
 
@@ -869,8 +869,8 @@ static int find_existing_ddw_windows(void)
if (!firmware_has_feature(FW_FEATURE_LPAR))
return 0;
 
-   for_each_node_with_property(pdn, DIRECT64_PROPNAME) {
-   direct64 = of_get_property(pdn, DIRECT64_PROPNAME, &len);
+   for_each_node_with_property(pdn, DMA64_PROPNAME) {
+   direct64 = of_get_property(pdn, DMA64_PROPNAME, &len);
if (!direct64)
continue;
 
@@ -1210,23 +1210,26 @@ static u64 enable_ddw(struct pci_dev *dev, struct 
device_node *pdn)
  query.page_size);
goto out_failed;
}
+
/* verify the window * number of ptes will map the partition */
-   /* check largest block * page size > max memory hotplug addr */
max_addr = ddw_memory_hotplug_max();
if (query.largest_available_block < (max_addr >> page_shift)) {
-   dev_dbg(&dev->dev, "can't map partition max 0x%llx with %llu "
- "%llu-sized pages\n", max_addr,  
query.largest_available_block,
- 1ULL << page_shift);
-   goto out_failed;
+   dev_dbg(&dev->dev, "can't map partition max 0x%llx with %llu 
%llu-sized page

[PATCH v4 7/7] powerpc/pseries/iommu: Rename "direct window" to "dma window"

2020-07-16 Thread Leonardo Bras

A previous change introduced the usage of DDW as a bigger indirect DMA
mapping when the DDW available size does not map the whole partition.

As most of the code that manipulates direct mappings was reused for
indirect mappings, it's necessary to rename all names and debug/info
messages to reflect that it can be used for both kinds of mapping.

Also, defines DEFAULT_DMA_WIN as "ibm,dma-window" to document that
it's the name of the default DMA window.

Those changes are not supposed to change how the code works in any
way, just adjust naming.

Signed-off-by: Leonardo Bras 
---
 arch/powerpc/platforms/pseries/iommu.c | 100 +
 1 file changed, 52 insertions(+), 48 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/iommu.c 
b/arch/powerpc/platforms/pseries/iommu.c
index 6e1c9d1599d1..5ca952d966a4 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -339,7 +339,7 @@ struct dynamic_dma_window_prop {
__be32  window_shift;   /* ilog2(tce_window_size) */
 };
 
-struct direct_window {
+struct dma_win {
struct device_node *device;
const struct dynamic_dma_window_prop *prop;
struct list_head list;
@@ -359,12 +359,13 @@ struct ddw_create_response {
u32 addr_lo;
 };
 
-static LIST_HEAD(direct_window_list);
+static LIST_HEAD(dma_win_list);
 /* prevents races between memory on/offline and window creation */
-static DEFINE_SPINLOCK(direct_window_list_lock);
+static DEFINE_SPINLOCK(dma_win_list_lock);
 /* protects initializing window twice for same device */
-static DEFINE_MUTEX(direct_window_init_mutex);
+static DEFINE_MUTEX(dma_win_init_mutex);
 #define DMA64_PROPNAME "linux,dma64-ddr-window-info"
+#define DEFAULT_DMA_WIN "ibm,dma-window"
 
 static int tce_clearrange_multi_pSeriesLP(unsigned long start_pfn,
unsigned long num_pfn, const void *arg)
@@ -697,15 +698,18 @@ static void pci_dma_bus_setup_pSeriesLP(struct pci_bus 
*bus)
pr_debug("pci_dma_bus_setup_pSeriesLP: setting up bus %pOF\n",
 dn);
 
-   /* Find nearest ibm,dma-window, walking up the device tree */
+   /*
+* Find nearest ibm,dma-window (default DMA window), walking up the
+* device tree
+*/
for (pdn = dn; pdn != NULL; pdn = pdn->parent) {
-   dma_window = of_get_property(pdn, "ibm,dma-window", NULL);
+   dma_window = of_get_property(pdn, DEFAULT_DMA_WIN, NULL);
if (dma_window != NULL)
break;
}
 
if (dma_window == NULL) {
-   pr_debug("  no ibm,dma-window property !\n");
+   pr_debug("  no %s property !\n", DEFAULT_DMA_WIN);
return;
}
 
@@ -803,11 +807,11 @@ static void remove_dma_window(struct device_node *np, u32 
*ddw_avail,
 
ret = rtas_call(ddw_avail[DDW_REMOVE_PE_DMA_WIN], 1, 1, NULL, liobn);
if (ret)
-   pr_warn("%pOF: failed to remove direct window: rtas returned "
+   pr_warn("%pOF: failed to remove dma window: rtas returned "
"%d to ibm,remove-pe-dma-window(%x) %llx\n",
np, ret, ddw_avail[DDW_REMOVE_PE_DMA_WIN], liobn);
else
-   pr_debug("%pOF: successfully removed direct window: rtas 
returned "
+   pr_debug("%pOF: successfully removed dma window: rtas returned "
"%d to ibm,remove-pe-dma-window(%x) %llx\n",
np, ret, ddw_avail[DDW_REMOVE_PE_DMA_WIN], liobn);
 }
@@ -835,26 +839,26 @@ static void remove_ddw(struct device_node *np, bool 
remove_prop)
 
ret = of_remove_property(np, win);
if (ret)
-   pr_warn("%pOF: failed to remove direct window property: %d\n",
+   pr_warn("%pOF: failed to remove dma window property: %d\n",
np, ret);
 }
 
 static u64 find_existing_ddw(struct device_node *pdn)
 {
-   struct direct_window *window;
-   const struct dynamic_dma_window_prop *direct64;
+   struct dma_win *window;
+   const struct dynamic_dma_window_prop *dma64;
u64 dma_addr = 0;
 
-   spin_lock(&direct_window_list_lock);
+   spin_lock(&dma_win_list_lock);
/* check if we already created a window and dupe that config if so */
-   list_for_each_entry(window, &direct_window_list, list) {
+   list_for_each_entry(window, &dma_win_list, list) {
if (window->device == pdn) {
-   direct64 = window->prop;
-   dma_addr = be64_to_cpu(direct64->dma_base);
+   dma64 = window->prop;
+   dma_addr = be64_to_cpu(dma64->dma_base);
break;
}
}
-   spin_unlock(&direct_window_list_lock);
+   spin_unlock(&dma_win_list_lock);
 
return dma_addr;
 }
@@ -863,15 +867,15 @@ static int find_existing_ddw_windows(voi

[powerpc:merge] BUILD SUCCESS 58a4eb09c4aebaaffa8b4517c71543a41539c096

2020-07-16 Thread kernel test robot

tree/branch: https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git  
merge
branch HEAD: 58a4eb09c4aebaaffa8b4517c71543a41539c096  Automatic merge of 
'master', 'next' and 'fixes' (2020-07-15 23:12)

elapsed time: 1031m

configs tested: 94
configs skipped: 4

The following configs have been built successfully.
More configs may be tested in the coming days.

arm defconfig
arm  allyesconfig
arm  allmodconfig
arm   allnoconfig
arm64allyesconfig
arm64   defconfig
arm64allmodconfig
arm64 allnoconfig
arc  axs101_defconfig
c6xevmc6457_defconfig
sh kfr2r09-romimage_defconfig
powerpcgamecube_defconfig
arm  lpd270_defconfig
mips  malta_defconfig
riscv   defconfig
c6xevmc6474_defconfig
armclps711x_defconfig
arm   corgi_defconfig
riscvallyesconfig
arm orion5x_defconfig
arm  moxart_defconfig
powerpcamigaone_defconfig
m68k apollo_defconfig
shedosk7705_defconfig
i386  allnoconfig
i386 allyesconfig
i386defconfig
i386  debian-10.3
ia64 allmodconfig
ia64defconfig
ia64  allnoconfig
ia64 allyesconfig
m68k allmodconfig
m68k  allnoconfig
m68k   sun3_defconfig
m68kdefconfig
m68k allyesconfig
nios2   defconfig
nios2allyesconfig
openriscdefconfig
c6x  allyesconfig
c6x   allnoconfig
openrisc allyesconfig
nds32   defconfig
nds32 allnoconfig
csky allyesconfig
cskydefconfig
alpha   defconfig
alphaallyesconfig
xtensa   allyesconfig
h8300allyesconfig
h8300allmodconfig
xtensa  defconfig
arc defconfig
arc  allyesconfig
sh   allmodconfig
shallnoconfig
microblazeallnoconfig
mips allyesconfig
mips  allnoconfig
mips allmodconfig
pariscallnoconfig
parisc  defconfig
parisc   allyesconfig
parisc   allmodconfig
powerpc  allyesconfig
powerpc  rhel-kconfig
powerpc  allmodconfig
powerpc   allnoconfig
powerpc defconfig
i386 randconfig-a016-20200715
i386 randconfig-a011-20200715
i386 randconfig-a015-20200715
i386 randconfig-a012-20200715
i386 randconfig-a013-20200715
i386 randconfig-a014-20200715
riscv allnoconfig
riscvallmodconfig
s390 allyesconfig
s390  allnoconfig
s390 allmodconfig
s390defconfig
sparcallyesconfig
sparc   defconfig
sparc64 defconfig
sparc64   allnoconfig
sparc64  allyesconfig
sparc64  allmodconfig
x86_64rhel-7.6-kselftests
x86_64   rhel-8.3
x86_64  kexec
x86_64   rhel
x86_64lkp
x86_64  fedora-25

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-...@lists.01.org

RE: [RFC PATCH 00/35] Move all PCIBIOS* definitions into arch/x86

2020-07-16 Thread David Laight

From: Benjamin Herrenschmidt
> Sent: 15 July 2020 23:49
> On Wed, 2020-07-15 at 17:12 -0500, Bjorn Helgaas wrote:
> > > I've 'played' with PCIe error handling - without much success.
> > > What might be useful is for a driver that has just read ~0u to
> > > be able to ask 'has there been an error signalled for this device?'.
> >
> > In many cases a driver will know that ~0 is not a valid value for the
> > register it's reading.  But if ~0 *could* be valid, an interface like
> > you suggest could be useful.  I don't think we have anything like that
> > today, but maybe we could.  It would certainly be nice if the PCI core
> > noticed, logged, and cleared errors.  We have some of that for AER,
> > but that's an optional feature, and support for the error bits in the
> > garden-variety PCI_STATUS register is pretty haphazard.  As you note
> > below, this sort of SERR/PERR reporting is frequently hard-wired in
> > ways that takes it out of our purview.
> 
> We do have pci_channel_state (via pci_channel_offline()) which covers
> the cases where the underlying error handling (such as EEH or unplug)
> results in the device being offlined though this tend to be
> asynchronous so it might take a few ~0's before you get it.

On one of my systems I don't think the error TLP from the target
made its way past the first bridge - I could see the error in it's
status registers.
But I couldn't find any of the AER status registers in the root bridge.
So I think you'd need a software poll of the bridge registers to
find out (and clear) the error.

The NMI on the dell system (which is supposed to meet some special
NEBS? server requirements) is just stupid.
Too late to be synchronous and impossible for the OS to handle.

David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, 
UK
Registration No: 1397386 (Wales)

Re: [PATCH v3] powerpc/pseries: detect secure and trusted boot state of the system.

2020-07-16 Thread Michal Suchánek

On Wed, Jul 15, 2020 at 07:52:01AM -0400, Nayna Jain wrote:
> The device-tree property to check secure and trusted boot state is
> different for guests(pseries) compared to baremetal(powernv).
> 
> This patch updates the existing is_ppc_secureboot_enabled() and
> is_ppc_trustedboot_enabled() functions to add support for pseries.
> 
> The secureboot and trustedboot state are exposed via device-tree property:
> /proc/device-tree/ibm,secure-boot and /proc/device-tree/ibm,trusted-boot
> 
> The values of ibm,secure-boot under pseries are interpreted as:
  ^^^
> 
> 0 - Disabled
> 1 - Enabled in Log-only mode. This patch interprets this value as
> disabled, since audit mode is currently not supported for Linux.
> 2 - Enabled and enforced.
> 3-9 - Enabled and enforcing; requirements are at the discretion of the
> operating system.
> 
> The values of ibm,trusted-boot under pseries are interpreted as:
   ^^^
These two should be different I suppose?

Thanks

Michal
> 0 - Disabled
> 1 - Enabled
> 
> Signed-off-by: Nayna Jain 
> Reviewed-by: Daniel Axtens 
> ---
> v3:
> * fixed double check. Thanks Daniel for noticing it.
> * updated patch description.
> 
> v2:
> * included Michael Ellerman's feedback.
> * added Daniel Axtens's Reviewed-by.
> 
>  arch/powerpc/kernel/secure_boot.c | 19 +--
>  1 file changed, 17 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/powerpc/kernel/secure_boot.c 
> b/arch/powerpc/kernel/secure_boot.c
> index 4b982324d368..118bcb5f79c4 100644
> --- a/arch/powerpc/kernel/secure_boot.c
> +++ b/arch/powerpc/kernel/secure_boot.c
> @@ -6,6 +6,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  
>  static struct device_node *get_ppc_fw_sb_node(void)
>  {
> @@ -23,12 +24,19 @@ bool is_ppc_secureboot_enabled(void)
>  {
>   struct device_node *node;
>   bool enabled = false;
> + u32 secureboot;
>  
>   node = get_ppc_fw_sb_node();
>   enabled = of_property_read_bool(node, "os-secureboot-enforcing");
> -
>   of_node_put(node);
>  
> + if (enabled)
> + goto out;
> +
> + if (!of_property_read_u32(of_root, "ibm,secure-boot", &secureboot))
> + enabled = (secureboot > 1);
> +
> +out:
>   pr_info("Secure boot mode %s\n", enabled ? "enabled" : "disabled");
>  
>   return enabled;
> @@ -38,12 +46,19 @@ bool is_ppc_trustedboot_enabled(void)
>  {
>   struct device_node *node;
>   bool enabled = false;
> + u32 trustedboot;
>  
>   node = get_ppc_fw_sb_node();
>   enabled = of_property_read_bool(node, "trusted-enabled");
> -
>   of_node_put(node);
>  
> + if (enabled)
> + goto out;
> +
> + if (!of_property_read_u32(of_root, "ibm,trusted-boot", &trustedboot))
> + enabled = (trustedboot > 0);
> +
> +out:
>   pr_info("Trusted boot mode %s\n", enabled ? "enabled" : "disabled");
>  
>   return enabled;
> -- 
> 2.26.2
>

RE: [RFC PATCH 00/35] Move all PCIBIOS* definitions into arch/x86

2020-07-16 Thread David Laight

From: Bjorn Helgaas
> Sent: 15 July 2020 23:02
> 
> On Wed, Jul 15, 2020 at 02:24:21PM +, David Laight wrote:
> > From: Arnd Bergmann
> > > Sent: 15 July 2020 07:47
> > > On Wed, Jul 15, 2020 at 1:46 AM Bjorn Helgaas  wrote:
> > >
> > >  So something like:
> > > >
> > > >   void pci_read_config_word(struct pci_dev *dev, int where, u16 *val)
> > > >
> > > > and where we used to return anything non-zero, we just set *val = ~0
> > > > instead?  I think we do that already in most, maybe all, cases.
> > >
> > > Right, this is what I had in mind. If we start by removing the handling
> > > of the return code in all files that clearly don't need it, looking at
> > > whatever remains will give a much better idea of what a good interface
> > > should be.
> >
> > It would be best to get rid of that nasty 'u16 *' parameter.
> 
> Do you mean nasty because it's basically a return value, but not
> returned as the *function's* return value?  I agree that if we were
> starting from scratch it would nicer to have:
> 
>   u16 pci_read_config_word(struct pci_dev *dev, int where)
> 
> but I don't think it's worth changing the thousands of callers just
> for that.

It'll shrink the kernel text size somewhat.
It could also be 'fixed' with a static inline.

Actually you don't even want the result to be u16.
Even though the domain of the value is 0..65535 keeping
the type as int (or unsigned int) will save the compiler
having to generate lots of masking instructions.

Code performance here will be overwhelmed by the time taken
for the config space access.
But more generally all local variables should really be
the size of cpu registers.

On x86-64 you need to use 'unsigned int' for anything used
as array subscripts to avoid the 'sign extend' instructions.
In some code paths it may matter...

David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, 
UK
Registration No: 1397386 (Wales)

[RFC PATCH] powerpc/pseries/svm: capture instruction faulting on MMIO access, in sprg0 register

2020-07-16 Thread Ram Pai

An instruction accessing a mmio address, generates a HDSI fault.  This fault is
appropriately handled by the Hypervisor.  However in the case of secureVMs, the
fault is delivered to the ultravisor.

Unfortunately the Ultravisor has no correct-way to fetch the faulting
instruction. The PEF architecture does not allow Ultravisor to enable MMU
translation. Walking the two level page table to read the instruction can race
with other vcpus modifying the SVM's process scoped page table.

This problem can be correctly solved with some help from the kernel.

Capture the faulting instruction in SPRG0 register, before executing the
faulting instruction. This enables the ultravisor to easily procure the
faulting instruction and emulate it.

Signed-off-by: Ram Pai 
---
 arch/powerpc/include/asm/io.h | 85 ++-
 1 file changed, 75 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/include/asm/io.h b/arch/powerpc/include/asm/io.h
index 635969b..7ef663d 100644
--- a/arch/powerpc/include/asm/io.h
+++ b/arch/powerpc/include/asm/io.h
@@ -35,6 +35,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #define SIO_CONFIG_RA  0x398
 #define SIO_CONFIG_RD  0x399
@@ -105,34 +106,98 @@
 static inline u##size name(const volatile u##size __iomem *addr)   \
 {  \
u##size ret;\
-   __asm__ __volatile__("sync;"#insn" %0,%y1;twi 0,%0,0;isync" \
-   : "=r" (ret) : "Z" (*addr) : "memory"); \
+   if (is_secure_guest()) {\
+   __asm__ __volatile__("mfsprg0 %3;"  \
+   "lnia %2;"  \
+   "ld %2,12(%2);" \
+   "mtsprg0 %2;"   \
+   "sync;" \
+   #insn" %0,%y1;" \
+   "twi 0,%0,0;"   \
+   "isync;"\
+   "mtsprg0 %3"\
+   : "=r" (ret)\
+   : "Z" (*addr), "r" (0), "r" (0) \
+   : "memory");\
+   } else {\
+   __asm__ __volatile__("sync;"\
+   #insn" %0,%y1;" \
+   "twi 0,%0,0;"   \
+   "isync" \
+   : "=r" (ret) : "Z" (*addr) : "memory"); \
+   }   \
return ret; \
 }
 
 #define DEF_MMIO_OUT_X(name, size, insn)   \
 static inline void name(volatile u##size __iomem *addr, u##size val)   \
 {  \
-   __asm__ __volatile__("sync;"#insn" %1,%y0"  \
-   : "=Z" (*addr) : "r" (val) : "memory"); \
-   mmiowb_set_pending();   \
+   if (is_secure_guest()) {\
+   __asm__ __volatile__("mfsprg0 %3;"  \
+   "lnia %2;"  \
+   "ld %2,12(%2);" \
+   "mtsprg0 %2;"   \
+   "sync;" \
+   #insn" %1,%y0;" \
+   "mtsprg0 %3"\
+   : "=Z" (*addr)  \
+   : "r" (val), "r" (0), "r" (0)   \
+   : "memory");\
+   } else {\
+   __asm__ __volatile__("sync;"\
+   #insn" %1,%y0"  \
+   : "=Z" (*addr) : "r" (val) : "memory"); \
+   mmiowb_set_pending();   \
+   }   \
 }
 
 #define DEF_MMIO_IN_D(name, size, insn)\
 static inline u##size name(const volatile u##size __iomem *addr)

Re: [RFC PATCH 4/7] x86: use exit_lazy_tlb rather than membarrier_mm_sync_core_before_usermode

2020-07-16 Thread Peter Zijlstra

On Wed, Jul 15, 2020 at 10:18:20PM -0700, Andy Lutomirski wrote:
> > On Jul 15, 2020, at 9:15 PM, Nicholas Piggin  wrote:

> > CPU0 CPU1
> > 1. user stuff
> > a. membarrier()  2. enter kernel
> > b. read rq->curr 3. rq->curr switched to kthread
> > c. is kthread, skip IPI  4. switch_to kthread
> > d. return to user5. rq->curr switched to user thread
> > 6. switch_to user thread
> > 7. exit kernel
> > 8. more user stuff

> I find it hard to believe that this is x86 only. Why would thread
> switch imply core sync on any architecture?  Is x86 unique in having a
> stupid expensive core sync that is heavier than smp_mb()?

smp_mb() is nowhere near the most expensive barrier we have in Linux,
mb() might qualify, since that has some completion requirements since it
needs to serialize against external actors.

On x86_64 things are rather murky, we have:

LOCK prefix -- which implies smp_mb() before and after RmW
LFENCE -- which used to be rmb like, until Spectre, and now it
  is ISYNC like. Since ISYNC ensures an empty pipeline,
  it also implies all loads are retired (and therefore
  complete) it implies rmb.
MFENCE -- which is a memop completion barrier like, it makes
  sure all previously issued memops are complete.

if you read that carefully, you'll note you'll have to use LFENCE +
MFENCE to order against non-memops instructions.

But none of them imply dumping the instruction decoder caches, that only
happens on core serializing instructions like CR3 writes, IRET, CPUID
and a few others, I think we recently got a SERIALIZE instruction to add
to this list.

On ARM64 there's something a whole different set of barriers, and again
smp_mb() isn't nowhere near the top of the list. They have roughly 3
classes:

ISB -- instruction sync barrier
DMB(x) -- memory ordering in domain x
DSB(x) -- memory completion in domain x

And they have at least 3 domains (IIRC), system, outer, inner.

The ARM64 __switch_to() includes a dsb(sy), just like PowerPC used to
have a SYNC, but since PowerPC is rare for only having one rediculously
heavy serializing instruction, we got to re-use the smp_mb() early in
__schedule() instead, but ARM64 can't do that.

So rather than say that x86 is special here, I'd say that PowerPC is
special here.

> But I’m wondering if all this deferred sync stuff is wrong. In the
> brave new world of io_uring and such, perhaps kernel access matter
> too.  Heck, even:

IIRC the membarrier SYNC_CORE use-case is about user-space
self-modifying code.

Userspace re-uses a text address and needs to SYNC_CORE before it can be
sure the old text is forgotten. Nothing the kernel does matters there.

I suppose the manpage could be more clear there.

Re: [PATCH] pseries: Fix 64 bit logical memory block panic

2020-07-16 Thread Paul Mackerras

On Wed, Jul 15, 2020 at 06:12:25PM +0530, Aneesh Kumar K.V wrote:
> Anton Blanchard  writes:
> 
> > Booting with a 4GB LMB size causes us to panic:
> >
> >   qemu-system-ppc64: OS terminated: OS panic:
> >   Memory block size not suitable: 0x0
> >
> > Fix pseries_memory_block_size() to handle 64 bit LMBs.
> >
> > Cc: sta...@vger.kernel.org
> > Signed-off-by: Anton Blanchard 
> > ---
> >  arch/powerpc/platforms/pseries/hotplug-memory.c | 2 +-
> >  1 file changed, 1 insertion(+), 1 deletion(-)
> >
> > diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c 
> > b/arch/powerpc/platforms/pseries/hotplug-memory.c
> > index 5ace2f9a277e..6574ac33e887 100644
> > --- a/arch/powerpc/platforms/pseries/hotplug-memory.c
> > +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
> > @@ -27,7 +27,7 @@ static bool rtas_hp_event;
> >  unsigned long pseries_memory_block_size(void)
> >  {
> > struct device_node *np;
> > -   unsigned int memblock_size = MIN_MEMORY_BLOCK_SIZE;
> > +   uint64_t memblock_size = MIN_MEMORY_BLOCK_SIZE;
> > struct resource r;
> >  
> > np = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
> 
> We need similar changes at more places?
> 
> modified   arch/powerpc/include/asm/book3s/64/mmu.h
> @@ -85,7 +85,7 @@ extern unsigned int mmu_base_pid;
>  /*
>   * memory block size used with radix translation.
>   */
> -extern unsigned int __ro_after_init radix_mem_block_size;
> +extern unsigned long __ro_after_init radix_mem_block_size;
>  
>  #define PRTB_SIZE_SHIFT  (mmu_pid_bits + 4)
>  #define PRTB_ENTRIES (1ul << mmu_pid_bits)
> modified   arch/powerpc/include/asm/drmem.h
> @@ -21,7 +21,7 @@ struct drmem_lmb {
>  struct drmem_lmb_info {
>   struct drmem_lmb*lmbs;
>   int n_lmbs;
> - u32 lmb_size;
> + u64 lmb_size;
>  };
>  
>  extern struct drmem_lmb_info *drmem_info;
> modified   arch/powerpc/mm/book3s64/radix_pgtable.c
> @@ -34,7 +34,7 @@
>  
>  unsigned int mmu_pid_bits;
>  unsigned int mmu_base_pid;
> -unsigned int radix_mem_block_size __ro_after_init;
> +unsigned long radix_mem_block_size __ro_after_init;

These changes look fine.

>  static __ref void *early_alloc_pgtable(unsigned long size, int nid,
>   unsigned long region_start, unsigned long region_end)
> modified   arch/powerpc/mm/drmem.c
> @@ -268,14 +268,15 @@ static void __init __walk_drmem_v2_lmbs(const __be32 
> *prop, const __be32 *usm,
>  void __init walk_drmem_lmbs_early(unsigned long node,
>   void (*func)(struct drmem_lmb *, const __be32 **))
>  {
> + const __be64 *lmb_prop;
>   const __be32 *prop, *usm;
>   int len;
>  
> - prop = of_get_flat_dt_prop(node, "ibm,lmb-size", &len);
> - if (!prop || len < dt_root_size_cells * sizeof(__be32))
> + lmb_prop = of_get_flat_dt_prop(node, "ibm,lmb-size", &len);
> + if (!lmb_prop || len < sizeof(__be64))
>   return;
>  
> - drmem_info->lmb_size = dt_mem_next_cell(dt_root_size_cells, &prop);
> + drmem_info->lmb_size = be64_to_cpup(lmb_prop);

This particular change shouldn't be necessary.  We already have
dt_mem_next_cell() returning u64, and it knows how to combine two
cells to give a u64 (for dt_root_size_cells == 2).

>   usm = of_get_flat_dt_prop(node, "linux,drconf-usable-memory", &len);
>  
> @@ -296,19 +297,19 @@ void __init walk_drmem_lmbs_early(unsigned long node,
>  
>  static int __init init_drmem_lmb_size(struct device_node *dn)
>  {
> - const __be32 *prop;
> + const __be64 *prop;
>   int len;
>  
>   if (drmem_info->lmb_size)
>   return 0;
>  
>   prop = of_get_property(dn, "ibm,lmb-size", &len);
> - if (!prop || len < dt_root_size_cells * sizeof(__be32)) {
> + if (!prop || len < sizeof(__be64)) {
>   pr_info("Could not determine LMB size\n");
>   return -1;
>   }
>  
> - drmem_info->lmb_size = dt_mem_next_cell(dt_root_size_cells, &prop);
> + drmem_info->lmb_size = be64_to_cpup(prop);

Same comment here.

Paul.

Re: [RFC PATCH 4/7] x86: use exit_lazy_tlb rather than membarrier_mm_sync_core_before_usermode

2020-07-16 Thread Nicholas Piggin

Excerpts from Peter Zijlstra's message of July 16, 2020 6:50 pm:
> On Wed, Jul 15, 2020 at 10:18:20PM -0700, Andy Lutomirski wrote:
>> > On Jul 15, 2020, at 9:15 PM, Nicholas Piggin  wrote:
> 
>> > CPU0 CPU1
>> > 1. user stuff
>> > a. membarrier()  2. enter kernel
>> > b. read rq->curr 3. rq->curr switched to kthread
>> > c. is kthread, skip IPI  4. switch_to kthread
>> > d. return to user5. rq->curr switched to user thread
>> > 6. switch_to user thread
>> > 7. exit kernel
>> > 8. more user stuff
> 
>> I find it hard to believe that this is x86 only. Why would thread
>> switch imply core sync on any architecture?  Is x86 unique in having a
>> stupid expensive core sync that is heavier than smp_mb()?
> 
> smp_mb() is nowhere near the most expensive barrier we have in Linux,
> mb() might qualify, since that has some completion requirements since it
> needs to serialize against external actors.
> 
> On x86_64 things are rather murky, we have:
> 
>   LOCK prefix -- which implies smp_mb() before and after RmW
>   LFENCE -- which used to be rmb like, until Spectre, and now it
> is ISYNC like. Since ISYNC ensures an empty pipeline,
> it also implies all loads are retired (and therefore
> complete) it implies rmb.
>   MFENCE -- which is a memop completion barrier like, it makes
> sure all previously issued memops are complete.
> 
> if you read that carefully, you'll note you'll have to use LFENCE +
> MFENCE to order against non-memops instructions.
> 
> But none of them imply dumping the instruction decoder caches, that only
> happens on core serializing instructions like CR3 writes, IRET, CPUID
> and a few others, I think we recently got a SERIALIZE instruction to add
> to this list.
> 
> 
> On ARM64 there's something a whole different set of barriers, and again
> smp_mb() isn't nowhere near the top of the list. They have roughly 3
> classes:
> 
>   ISB -- instruction sync barrier
>   DMB(x) -- memory ordering in domain x
>   DSB(x) -- memory completion in domain x
> 
> And they have at least 3 domains (IIRC), system, outer, inner.
> 
> The ARM64 __switch_to() includes a dsb(sy), just like PowerPC used to
> have a SYNC, but since PowerPC is rare for only having one rediculously
> heavy serializing instruction, we got to re-use the smp_mb() early in
> __schedule() instead, but ARM64 can't do that.
> 
> 
> So rather than say that x86 is special here, I'd say that PowerPC is
> special here.

PowerPC is "special", I'll agree with you there :)

It does have a SYNC (HWSYNC) instruction that is mb(). It does not
serialize the core.

ISYNC is a nop. ICBI ; ISYNC does serialize the core.

Difference between them is probably much the same as difference between
MFENCE and CPUID on x86 CPUs. Serializing the core is almost always 
pretty expensive. HWSYNC/MFENCE can be expensive if you have a lot of
or difficult (not exclusive in cache) outstanding with critical reads
after the barrier, but it can also be somewhat cheap if there are few
writes, and executed past, it only needs to hold up subsequent reads.

That said... implementation details. powerpc CPUs have traditionally
had fairly costly HWSYNC.


>> But I’m wondering if all this deferred sync stuff is wrong. In the
>> brave new world of io_uring and such, perhaps kernel access matter
>> too.  Heck, even:
> 
> IIRC the membarrier SYNC_CORE use-case is about user-space
> self-modifying code.
> 
> Userspace re-uses a text address and needs to SYNC_CORE before it can be
> sure the old text is forgotten. Nothing the kernel does matters there.
> 
> I suppose the manpage could be more clear there.

True, but memory ordering of kernel stores from kernel threads for
regular mem barrier is the concern here.

Does io_uring update completion queue from kernel thread or interrupt,
for example? If it does, then membarrier will not order such stores
with user memory accesses.

Thanks,
Nick

[PATCH -next] powerpc: Convert to DEFINE_SHOW_ATTRIBUTE

2020-07-16 Thread Qinglang Miao

From: Chen Huang 

Use DEFINE_SHOW_ATTRIBUTE macro to simplify the code.

Signed-off-by: Chen Huang 
---
 arch/powerpc/kvm/book3s_xive_native.c  | 12 +---
 arch/powerpc/mm/ptdump/bats.c  | 24 +++-
 arch/powerpc/mm/ptdump/hashpagetable.c | 12 +---
 arch/powerpc/mm/ptdump/ptdump.c| 13 +
 arch/powerpc/mm/ptdump/segment_regs.c  | 12 +---
 5 files changed, 11 insertions(+), 62 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_xive_native.c 
b/arch/powerpc/kvm/book3s_xive_native.c
index 02e3cbbea..d0c2db0e0 100644
--- a/arch/powerpc/kvm/book3s_xive_native.c
+++ b/arch/powerpc/kvm/book3s_xive_native.c
@@ -1227,17 +1227,7 @@ static int xive_native_debug_show(struct seq_file *m, 
void *private)
return 0;
 }
 
-static int xive_native_debug_open(struct inode *inode, struct file *file)
-{
-   return single_open(file, xive_native_debug_show, inode->i_private);
-}
-
-static const struct file_operations xive_native_debug_fops = {
-   .open = xive_native_debug_open,
-   .read_iter = seq_read_iter,
-   .llseek = seq_lseek,
-   .release = single_release,
-};
+DEFINE_SHOW_ATTRIBUTE(xive_native_debug);
 
 static void xive_native_debugfs_init(struct kvmppc_xive *xive)
 {
diff --git a/arch/powerpc/mm/ptdump/bats.c b/arch/powerpc/mm/ptdump/bats.c
index 7afcdac48..93771af72 100644
--- a/arch/powerpc/mm/ptdump/bats.c
+++ b/arch/powerpc/mm/ptdump/bats.c
@@ -56,7 +56,7 @@ static void bat_show_601(struct seq_file *m, int idx, u32 
lower, u32 upper)
 
 #define BAT_SHOW_601(_m, _n, _l, _u) bat_show_601(_m, _n, mfspr(_l), mfspr(_u))
 
-static int bats_show_601(struct seq_file *m, void *v)
+static int bats_601_show(struct seq_file *m, void *v)
 {
seq_puts(m, "---[ Block Address Translation ]---\n");
 
@@ -113,7 +113,7 @@ static void bat_show_603(struct seq_file *m, int idx, u32 
lower, u32 upper, bool
 
 #define BAT_SHOW_603(_m, _n, _l, _u, _d) bat_show_603(_m, _n, mfspr(_l), 
mfspr(_u), _d)
 
-static int bats_show_603(struct seq_file *m, void *v)
+static int bats_603_show(struct seq_file *m, void *v)
 {
seq_puts(m, "---[ Instruction Block Address Translation ]---\n");
 
@@ -144,25 +144,15 @@ static int bats_show_603(struct seq_file *m, void *v)
return 0;
 }
 
-static int bats_open(struct inode *inode, struct file *file)
-{
-   if (IS_ENABLED(CONFIG_PPC_BOOK3S_601))
-   return single_open(file, bats_show_601, NULL);
-
-   return single_open(file, bats_show_603, NULL);
-}
-
-static const struct file_operations bats_fops = {
-   .open   = bats_open,
-   .read_iter  = seq_read_iter,
-   .llseek = seq_lseek,
-   .release= single_release,
-};
+DEFINE_SHOW_ATTRIBUTE(bats_601);
+DEFINE_SHOW_ATTRIBUTE(bats_603);
 
 static int __init bats_init(void)
 {
debugfs_create_file("block_address_translation", 0400,
-   powerpc_debugfs_root, NULL, &bats_fops);
+   powerpc_debugfs_root, NULL,
+   IS_ENABLED(CONFIG_PPC_BOOK3S_601) ?
+   &bats_601_fops : &bats_603_fops);
return 0;
 }
 device_initcall(bats_init);
diff --git a/arch/powerpc/mm/ptdump/hashpagetable.c 
b/arch/powerpc/mm/ptdump/hashpagetable.c
index 457fcee7e..c7f824d29 100644
--- a/arch/powerpc/mm/ptdump/hashpagetable.c
+++ b/arch/powerpc/mm/ptdump/hashpagetable.c
@@ -526,17 +526,7 @@ static int ptdump_show(struct seq_file *m, void *v)
return 0;
 }
 
-static int ptdump_open(struct inode *inode, struct file *file)
-{
-   return single_open(file, ptdump_show, NULL);
-}
-
-static const struct file_operations ptdump_fops = {
-   .open   = ptdump_open,
-   .read_iter  = seq_read_iter,
-   .llseek = seq_lseek,
-   .release= single_release,
-};
+DEFINE_SHOW_ATTRIBUTE(ptdump);
 
 static int ptdump_init(void)
 {
diff --git a/arch/powerpc/mm/ptdump/ptdump.c b/arch/powerpc/mm/ptdump/ptdump.c
index db17e84b5..58b062f1b 100644
--- a/arch/powerpc/mm/ptdump/ptdump.c
+++ b/arch/powerpc/mm/ptdump/ptdump.c
@@ -398,18 +398,7 @@ static int ptdump_show(struct seq_file *m, void *v)
return 0;
 }
 
-
-static int ptdump_open(struct inode *inode, struct file *file)
-{
-   return single_open(file, ptdump_show, NULL);
-}
-
-static const struct file_operations ptdump_fops = {
-   .open   = ptdump_open,
-   .read_iter  = seq_read_iter,
-   .llseek = seq_lseek,
-   .release= single_release,
-};
+DEFINE_SHOW_ATTRIBUTE(ptdump);
 
 static void build_pgtable_complete_mask(void)
 {
diff --git a/arch/powerpc/mm/ptdump/segment_regs.c 
b/arch/powerpc/mm/ptdump/segment_regs.c
index 8b15bad5a..9e870d44c 100644
--- a/arch/powerpc/mm/ptdump/segment_regs.c
+++ b/arch/powerpc/mm/ptdump/segment_regs.c
@@ -41,17 +41,7 @@ static int sr_show(struct seq_file *m, void *v)
return 0;
 }
 
-static int sr_open(stru

Re: [RFC PATCH 4/7] x86: use exit_lazy_tlb rather than membarrier_mm_sync_core_before_usermode

2020-07-16 Thread peterz

On Thu, Jul 16, 2020 at 08:03:36PM +1000, Nicholas Piggin wrote:
> Excerpts from Peter Zijlstra's message of July 16, 2020 6:50 pm:
> > On Wed, Jul 15, 2020 at 10:18:20PM -0700, Andy Lutomirski wrote:
> >> > On Jul 15, 2020, at 9:15 PM, Nicholas Piggin  wrote:

> >> But I’m wondering if all this deferred sync stuff is wrong. In the
> >> brave new world of io_uring and such, perhaps kernel access matter
> >> too.  Heck, even:
> > 
> > IIRC the membarrier SYNC_CORE use-case is about user-space
> > self-modifying code.
> > 
> > Userspace re-uses a text address and needs to SYNC_CORE before it can be
> > sure the old text is forgotten. Nothing the kernel does matters there.
> > 
> > I suppose the manpage could be more clear there.
> 
> True, but memory ordering of kernel stores from kernel threads for
> regular mem barrier is the concern here.
> 
> Does io_uring update completion queue from kernel thread or interrupt,
> for example? If it does, then membarrier will not order such stores
> with user memory accesses.

So we're talking about regular membarrier() then? Not the SYNC_CORE
variant per-se.

Even there, I'll argue we don't care, but perhaps Mathieu has a
different opinion. All we care about is that all other threads (or CPUs
for GLOBAL) observe an smp_mb() before it returns.

Any serialization against whatever those other threads/CPUs are running
at the instant of the syscall is external to the syscall, we make no
gauarantees about that. That is, we can fundamentally not say what
another CPU is executing concurrently. Nor should we want to.

So if you feel that your membarrier() ought to serialize against remote
execution, you need to arrange a quiecent state on the remote side
yourself.

Now, normally membarrier() is used to implement userspace RCU like
things, and there all that matters is that the remote CPUs observe the
beginngin of the new grace-period, ie counter flip, and we observe their
read-side critical sections, or smething like that, it's been a while
since I looked at all that.

It's always been the case that concurrent syscalls could change user
memory, io_uring doesn't change that, it just makes it even less well
defined when that would happen. If you want to serialize against that,
you need to arrange that externally.

[PATCH 0/5] Improvements to pkey tests

2020-07-16 Thread Sandipan Das

Based on recent bugs found in the pkey infrastructure, this
improves the test for execute-disabled pkeys and adds a new
test for detecting inconsistencies with the pkey reported by
the signal information upon getting a fault.

Sandipan Das (5):
  selftests/powerpc: Move pkey helpers to headers
  selftests/powerpc: Add pkey helpers for rights
  selftests/powerpc: Harden test for execute-disabled pkeys
  selftests/powerpc: Add helper to exit on failure
  selftests/powerpc: Add test for pkey siginfo verification

 .../testing/selftests/powerpc/include/pkeys.h | 136 +++
 .../testing/selftests/powerpc/include/utils.h |  13 +
 tools/testing/selftests/powerpc/mm/.gitignore |   1 +
 tools/testing/selftests/powerpc/mm/Makefile   |   5 +-
 .../selftests/powerpc/mm/pkey_exec_prot.c | 210 +++
 .../selftests/powerpc/mm/pkey_siginfo.c   | 332 ++
 6 files changed, 544 insertions(+), 153 deletions(-)
 create mode 100644 tools/testing/selftests/powerpc/include/pkeys.h
 create mode 100644 tools/testing/selftests/powerpc/mm/pkey_siginfo.c

-- 
2.25.1

[PATCH 1/5] selftests/powerpc: Move pkey helpers to headers

2020-07-16 Thread Sandipan Das

This moves all the pkey-related helpers to a new header
file and also a helper to print error messages in signal
handlers to the existing utils header file.

Signed-off-by: Sandipan Das 
---
 .../testing/selftests/powerpc/include/pkeys.h | 108 ++
 .../testing/selftests/powerpc/include/utils.h |   4 +
 .../selftests/powerpc/mm/pkey_exec_prot.c | 100 +---
 3 files changed, 114 insertions(+), 98 deletions(-)
 create mode 100644 tools/testing/selftests/powerpc/include/pkeys.h

diff --git a/tools/testing/selftests/powerpc/include/pkeys.h 
b/tools/testing/selftests/powerpc/include/pkeys.h
new file mode 100644
index 0..9b53a97e664ea
--- /dev/null
+++ b/tools/testing/selftests/powerpc/include/pkeys.h
@@ -0,0 +1,108 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright 2020, Sandipan Das, IBM Corp.
+ */
+
+#ifndef _SELFTESTS_POWERPC_PKEYS_H
+#define _SELFTESTS_POWERPC_PKEYS_H
+
+#include 
+
+#include "reg.h"
+#include "utils.h"
+
+/*
+ * Older versions of libc use the Intel-specific access rights.
+ * Hence, override the definitions as they might be incorrect.
+ */
+#undef PKEY_DISABLE_ACCESS
+#define PKEY_DISABLE_ACCESS0x3
+
+#undef PKEY_DISABLE_WRITE
+#define PKEY_DISABLE_WRITE 0x2
+
+#undef PKEY_DISABLE_EXECUTE
+#define PKEY_DISABLE_EXECUTE   0x4
+
+/* Older versions of libc do not not define this */
+#ifndef SEGV_PKUERR
+#define SEGV_PKUERR4
+#endif
+
+#define SI_PKEY_OFFSET 0x20
+
+#define SYS_pkey_mprotect  386
+#define SYS_pkey_alloc 384
+#define SYS_pkey_free  385
+
+#define PKEY_BITS_PER_PKEY 2
+#define NR_PKEYS   32
+#define PKEY_BITS_MASK ((1UL << PKEY_BITS_PER_PKEY) - 1)
+
+inline unsigned long pkeyreg_get(void)
+{
+   return mfspr(SPRN_AMR);
+}
+
+inline void pkeyreg_set(unsigned long amr)
+{
+   set_amr(amr);
+}
+
+void pkey_set_rights(int pkey, unsigned long rights)
+{
+   unsigned long amr, shift;
+
+   shift = (NR_PKEYS - pkey - 1) * PKEY_BITS_PER_PKEY;
+   amr = pkeyreg_get();
+   amr &= ~(PKEY_BITS_MASK << shift);
+   amr |= (rights & PKEY_BITS_MASK) << shift;
+   pkeyreg_set(amr);
+}
+
+int sys_pkey_mprotect(void *addr, size_t len, int prot, int pkey)
+{
+   return syscall(SYS_pkey_mprotect, addr, len, prot, pkey);
+}
+
+int sys_pkey_alloc(unsigned long flags, unsigned long rights)
+{
+   return syscall(SYS_pkey_alloc, flags, rights);
+}
+
+int sys_pkey_free(int pkey)
+{
+   return syscall(SYS_pkey_free, pkey);
+}
+
+int pkeys_unsupported(void)
+{
+   bool hash_mmu = false;
+   int pkey;
+
+   /* Protection keys are currently supported on Hash MMU only */
+   FAIL_IF(using_hash_mmu(&hash_mmu));
+   SKIP_IF(!hash_mmu);
+
+   /* Check if the system call is supported */
+   pkey = sys_pkey_alloc(0, 0);
+   SKIP_IF(pkey < 0);
+   sys_pkey_free(pkey);
+
+   return 0;
+}
+
+int siginfo_pkey(siginfo_t *si)
+{
+   /*
+* In older versions of libc, siginfo_t does not have si_pkey as
+* a member.
+*/
+#ifdef si_pkey
+   return si->si_pkey;
+#else
+   return *((int *)(((char *) si) + SI_PKEY_OFFSET));
+#endif
+}
+
+#endif /* _SELFTESTS_POWERPC_PKEYS_H */
diff --git a/tools/testing/selftests/powerpc/include/utils.h 
b/tools/testing/selftests/powerpc/include/utils.h
index 9dbe607cc5ec3..7f259f36e23bc 100644
--- a/tools/testing/selftests/powerpc/include/utils.h
+++ b/tools/testing/selftests/powerpc/include/utils.h
@@ -97,6 +97,10 @@ do { 
\
 #define _str(s) #s
 #define str(s) _str(s)
 
+#define sigsafe_err(msg)   ({ \
+   ssize_t nbytes __attribute__((unused)); \
+   nbytes = write(STDERR_FILENO, msg, strlen(msg)); })
+
 /* POWER9 feature */
 #ifndef PPC_FEATURE2_ARCH_3_00
 #define PPC_FEATURE2_ARCH_3_00 0x0080
diff --git a/tools/testing/selftests/powerpc/mm/pkey_exec_prot.c 
b/tools/testing/selftests/powerpc/mm/pkey_exec_prot.c
index 7c7c93425c5e9..1253ad6afba24 100644
--- a/tools/testing/selftests/powerpc/mm/pkey_exec_prot.c
+++ b/tools/testing/selftests/powerpc/mm/pkey_exec_prot.c
@@ -14,83 +14,13 @@
 #include 
 
 #include 
-#include 
 
-#include "reg.h"
-#include "utils.h"
-
-/*
- * Older versions of libc use the Intel-specific access rights.
- * Hence, override the definitions as they might be incorrect.
- */
-#undef PKEY_DISABLE_ACCESS
-#define PKEY_DISABLE_ACCESS0x3
-
-#undef PKEY_DISABLE_WRITE
-#define PKEY_DISABLE_WRITE 0x2
-
-#undef PKEY_DISABLE_EXECUTE
-#define PKEY_DISABLE_EXECUTE   0x4
-
-/* Older versions of libc do not not define this */
-#ifndef SEGV_PKUERR
-#define SEGV_PKUERR4
-#endif
-
-#define SI_PKEY_OFFSET 0x20
-
-#define SYS_pkey_mprotect  386
-#define SYS_pkey_alloc 384
-#define SYS_pkey_free  385
-
-#define PKEY_BITS_PER_PKEY 2
-#define NR_PKEYS   32
-#define PKEY_BITS_MASK ((1UL << PKEY_BITS_PER_PKEY) - 1)
+#include "pke

[PATCH 2/5] selftests/powerpc: Add pkey helpers for rights

2020-07-16 Thread Sandipan Das

This adds some new pkey-related helper to print
access rights of a pkey in the "rwx" format and
to generate different valid combinations of pkey
rights starting from a given combination.

Signed-off-by: Sandipan Das 
---
 .../testing/selftests/powerpc/include/pkeys.h | 28 +++
 .../selftests/powerpc/mm/pkey_exec_prot.c | 36 ++-
 2 files changed, 48 insertions(+), 16 deletions(-)

diff --git a/tools/testing/selftests/powerpc/include/pkeys.h 
b/tools/testing/selftests/powerpc/include/pkeys.h
index 9b53a97e664ea..6ba95039a0343 100644
--- a/tools/testing/selftests/powerpc/include/pkeys.h
+++ b/tools/testing/selftests/powerpc/include/pkeys.h
@@ -105,4 +105,32 @@ int siginfo_pkey(siginfo_t *si)
 #endif
 }
 
+#define pkey_rights(r) ({  \
+   static char buf[4] = "rwx"; \
+   unsigned int amr_bits;  \
+   if ((r) & PKEY_DISABLE_EXECUTE) \
+   buf[2] = '-';   \
+   amr_bits = (r) & PKEY_BITS_MASK;\
+   if (amr_bits & PKEY_DISABLE_WRITE)  \
+   buf[1] = '-';   \
+   if (amr_bits & PKEY_DISABLE_ACCESS & ~PKEY_DISABLE_WRITE)   \
+   buf[0] = '-';   \
+   buf;\
+})
+
+unsigned long next_pkey_rights(unsigned long rights)
+{
+   if (rights == PKEY_DISABLE_ACCESS)
+   return PKEY_DISABLE_EXECUTE;
+   else if (rights == (PKEY_DISABLE_ACCESS | PKEY_DISABLE_EXECUTE))
+   return 0;
+
+   if ((rights & PKEY_BITS_MASK) == 0)
+   rights |= PKEY_DISABLE_WRITE;
+   else if ((rights & PKEY_BITS_MASK) == PKEY_DISABLE_WRITE)
+   rights |= PKEY_DISABLE_ACCESS;
+
+   return rights;
+}
+
 #endif /* _SELFTESTS_POWERPC_PKEYS_H */
diff --git a/tools/testing/selftests/powerpc/mm/pkey_exec_prot.c 
b/tools/testing/selftests/powerpc/mm/pkey_exec_prot.c
index 1253ad6afba24..18ebfe6bae1c9 100644
--- a/tools/testing/selftests/powerpc/mm/pkey_exec_prot.c
+++ b/tools/testing/selftests/powerpc/mm/pkey_exec_prot.c
@@ -102,6 +102,7 @@ static void segv_handler(int signum, siginfo_t *sinfo, void 
*ctx)
 static int test(void)
 {
struct sigaction segv_act, trap_act;
+   unsigned long rights;
int pkey, ret, i;
 
ret = pkeys_unsupported();
@@ -150,7 +151,8 @@ static int test(void)
insns[numinsns - 1] = PPC_INST_BLR;
 
/* Allocate a pkey that restricts execution */
-   pkey = sys_pkey_alloc(0, PKEY_DISABLE_EXECUTE);
+   rights = PKEY_DISABLE_EXECUTE;
+   pkey = sys_pkey_alloc(0, rights);
FAIL_IF(pkey < 0);
 
/*
@@ -175,8 +177,8 @@ static int test(void)
 */
remaining_faults = 0;
FAIL_IF(sys_pkey_mprotect(insns, pgsize, PROT_EXEC, pkey) != 0);
-   printf("read from %p, pkey is execute-disabled, access-enabled\n",
-  (void *) fault_addr);
+   printf("read from %p, pkey permissions are %s\n", fault_addr,
+  pkey_rights(rights));
i = *fault_addr;
FAIL_IF(remaining_faults != 0);
 
@@ -192,12 +194,13 @@ static int test(void)
 */
remaining_faults = 1;
FAIL_IF(sys_pkey_mprotect(insns, pgsize, PROT_EXEC, pkey) != 0);
-   printf("write to %p, pkey is execute-disabled, access-enabled\n",
-  (void *) fault_addr);
+   printf("write to %p, pkey permissions are %s\n", fault_addr,
+  pkey_rights(rights));
*fault_addr = PPC_INST_TRAP;
FAIL_IF(remaining_faults != 0 || fault_code != SEGV_ACCERR);
 
/* The following three cases will generate SEGV_PKUERR */
+   rights |= PKEY_DISABLE_ACCESS;
fault_type = PKEY_DISABLE_ACCESS;
fault_pkey = pkey;
 
@@ -211,9 +214,9 @@ static int test(void)
 */
remaining_faults = 1;
FAIL_IF(sys_pkey_mprotect(insns, pgsize, PROT_EXEC, pkey) != 0);
-   printf("read from %p, pkey is execute-disabled, access-disabled\n",
-  (void *) fault_addr);
-   pkey_set_rights(pkey, PKEY_DISABLE_ACCESS);
+   pkey_set_rights(pkey, rights);
+   printf("read from %p, pkey permissions are %s\n", fault_addr,
+  pkey_rights(rights));
i = *fault_addr;
FAIL_IF(remaining_faults != 0 || fault_code != SEGV_PKUERR);
 
@@ -228,9 +231,9 @@ static int test(void)
 */
remaining_faults = 2;
FAIL_IF(sys_pkey_mprotect(insns, pgsize, PROT_EXEC, pkey) != 0);
-   printf("write to %p, pkey is execute-disabled, access-disabled\n",
-  (void *) fault_addr);
-   pkey_set_rights(pkey, PKEY_DISABLE_ACCESS);
+   pkey_set_rights(pkey, rights);
+   printf("write to %p, pkey permiss

[PATCH 3/5] selftests/powerpc: Harden test for execute-disabled pkeys

2020-07-16 Thread Sandipan Das

Commit 192b6a7805989 ("powerpc/book3s64/pkeys: Fix
pkey_access_permitted() for execute disable pkey") fixed a
bug that caused repetitive faults for pkeys with no execute
rights alongside some combination of read and write rights.

This removes the last two cases of the test, which check
the behaviour of pkeys with read, write but no execute
rights and all the rights, in favour of checking all the
possible combinations of read, write and execute rights
to be able to detect bugs like the one mentioned above.

Signed-off-by: Sandipan Das 
---
 .../selftests/powerpc/mm/pkey_exec_prot.c | 84 +--
 1 file changed, 41 insertions(+), 43 deletions(-)

diff --git a/tools/testing/selftests/powerpc/mm/pkey_exec_prot.c 
b/tools/testing/selftests/powerpc/mm/pkey_exec_prot.c
index 18ebfe6bae1c9..9e5c7f3f498a7 100644
--- a/tools/testing/selftests/powerpc/mm/pkey_exec_prot.c
+++ b/tools/testing/selftests/powerpc/mm/pkey_exec_prot.c
@@ -237,55 +237,53 @@ static int test(void)
*fault_addr = PPC_INST_NOP;
FAIL_IF(remaining_faults != 0 || fault_code != SEGV_ACCERR);
 
-   /*
-* Jump to the executable region when AMR bits are set i.e.
-* the pkey permits neither read nor write access.
-*
-* This should generate a pkey fault based on IAMR bits which
-* are set to not permit execution. AMR bits should not affect
-* execution.
-*
-* This also checks if the overwrite of the first instruction
-* word from a trap to a no-op succeeded.
-*/
-   fault_addr = insns;
-   fault_type = PKEY_DISABLE_EXECUTE;
-   fault_pkey = pkey;
-   remaining_faults = 1;
-   FAIL_IF(sys_pkey_mprotect(insns, pgsize, PROT_EXEC, pkey) != 0);
-   pkey_set_rights(pkey, PKEY_DISABLE_ACCESS);
-   printf("execute at %p, pkey permissions are %s\n", fault_addr,
-  pkey_rights(rights));
-   asm volatile("mtctr %0; bctrl" : : "r"(insns));
-   FAIL_IF(remaining_faults != 0 || fault_code != SEGV_PKUERR);
-
-   /*
-* Free the current pkey and allocate a new one that is
-* fully permissive.
-*/
+   /* Free the current pkey */
sys_pkey_free(pkey);
+
rights = 0;
-   pkey = sys_pkey_alloc(0, rights);
+   do {
+   /*
+* Allocate pkeys with all valid combinations of read,
+* write and execute restrictions.
+*/
+   pkey = sys_pkey_alloc(0, rights);
+   FAIL_IF(pkey < 0);
+
+   /*
+* Jump to the executable region. AMR bits may or may not
+* be set but they should not affect execution.
+*
+* This should generate pkey faults based on IAMR bits which
+* may be set to restrict execution.
+*
+* The first iteration also checks if the overwrite of the
+* first instruction word from a trap to a no-op succeeded.
+*/
+   fault_pkey = pkey;
+   fault_type = -1;
+   remaining_faults = 0;
+   if (rights & PKEY_DISABLE_EXECUTE) {
+   fault_type = PKEY_DISABLE_EXECUTE;
+   remaining_faults = 1;
+   }
 
-   /*
-* Jump to the executable region when AMR bits are not set
-* i.e. the pkey permits read and write access.
-*
-* This should not generate any faults as the IAMR bits are
-* also not set and hence will the pkey will not restrict
-* execution.
-*/
-   fault_pkey = pkey;
-   remaining_faults = 0;
-   FAIL_IF(sys_pkey_mprotect(insns, pgsize, PROT_EXEC, pkey) != 0);
-   printf("execute at %p, pkey permissions are %s\n", fault_addr,
-  pkey_rights(rights));
-   asm volatile("mtctr %0; bctrl" : : "r"(insns));
-   FAIL_IF(remaining_faults != 0);
+   FAIL_IF(sys_pkey_mprotect(insns, pgsize, PROT_EXEC, pkey) != 0);
+   printf("execute at %p, pkey permissions are %s\n", fault_addr,
+  pkey_rights(rights));
+   asm volatile("mtctr %0; bctrl" : : "r"(insns));
+   FAIL_IF(remaining_faults != 0);
+   if (rights & PKEY_DISABLE_EXECUTE)
+   FAIL_IF(fault_code != SEGV_PKUERR);
+
+   /* Free the current pkey */
+   sys_pkey_free(pkey);
+
+   /* Find next valid combination of pkey rights */
+   rights = next_pkey_rights(rights);
+   } while (rights);
 
/* Cleanup */
munmap((void *) insns, pgsize);
-   sys_pkey_free(pkey);
 
return 0;
 }
-- 
2.25.1

[PATCH 4/5] selftests/powerpc: Add helper to exit on failure

2020-07-16 Thread Sandipan Das

This adds a helper similar to FAIL_IF() which lets a
program exit with code 1 (to indicate failure) when
the given condition is true.

Signed-off-by: Sandipan Das 
---
 tools/testing/selftests/powerpc/include/utils.h | 9 +
 1 file changed, 9 insertions(+)

diff --git a/tools/testing/selftests/powerpc/include/utils.h 
b/tools/testing/selftests/powerpc/include/utils.h
index 7f259f36e23bc..69d16875802da 100644
--- a/tools/testing/selftests/powerpc/include/utils.h
+++ b/tools/testing/selftests/powerpc/include/utils.h
@@ -72,6 +72,15 @@ do { 
\
}   \
 } while (0)
 
+#define FAIL_IF_EXIT(x)\
+do {   \
+   if ((x)) {  \
+   fprintf(stderr, \
+   "[FAIL] Test FAILED on line %d\n", __LINE__);   \
+   _exit(1);   \
+   }   \
+} while (0)
+
 /* The test harness uses this, yes it's gross */
 #define MAGIC_SKIP_RETURN_VALUE99
 
-- 
2.25.1

[PATCH 5/5] selftests/powerpc: Add test for pkey siginfo verification

2020-07-16 Thread Sandipan Das

Commit c46241a370a61 ("powerpc/pkeys: Check vma before
returning key fault error to the user") fixes a bug which
causes the kernel to set the wrong pkey in siginfo when a
pkey fault occurs after two competing threads that have
allocated different pkeys, one fully permissive and the
other restrictive, attempt to protect a common page at the
same time. This adds a test to detect the bug.

Signed-off-by: Sandipan Das 
---
 tools/testing/selftests/powerpc/mm/.gitignore |   1 +
 tools/testing/selftests/powerpc/mm/Makefile   |   5 +-
 .../selftests/powerpc/mm/pkey_siginfo.c   | 332 ++
 3 files changed, 337 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/powerpc/mm/pkey_siginfo.c

diff --git a/tools/testing/selftests/powerpc/mm/.gitignore 
b/tools/testing/selftests/powerpc/mm/.gitignore
index 8f841f925baa5..36ec2c4ccdea4 100644
--- a/tools/testing/selftests/powerpc/mm/.gitignore
+++ b/tools/testing/selftests/powerpc/mm/.gitignore
@@ -9,3 +9,4 @@ large_vm_fork_separation
 bad_accesses
 tlbie_test
 pkey_exec_prot
+pkey_siginfo
diff --git a/tools/testing/selftests/powerpc/mm/Makefile 
b/tools/testing/selftests/powerpc/mm/Makefile
index f9fa0ba7435c4..558b7ccc93932 100644
--- a/tools/testing/selftests/powerpc/mm/Makefile
+++ b/tools/testing/selftests/powerpc/mm/Makefile
@@ -3,7 +3,8 @@ noarg:
$(MAKE) -C ../
 
 TEST_GEN_PROGS := hugetlb_vs_thp_test subpage_prot prot_sao segv_errors 
wild_bctr \
- large_vm_fork_separation bad_accesses pkey_exec_prot
+ large_vm_fork_separation bad_accesses pkey_exec_prot \
+ pkey_siginfo
 TEST_GEN_PROGS_EXTENDED := tlbie_test
 TEST_GEN_FILES := tempfile
 
@@ -18,8 +19,10 @@ $(OUTPUT)/wild_bctr: CFLAGS += -m64
 $(OUTPUT)/large_vm_fork_separation: CFLAGS += -m64
 $(OUTPUT)/bad_accesses: CFLAGS += -m64
 $(OUTPUT)/pkey_exec_prot: CFLAGS += -m64
+$(OUTPUT)/pkey_siginfo: CFLAGS += -m64
 
 $(OUTPUT)/tempfile:
dd if=/dev/zero of=$@ bs=64k count=1
 
 $(OUTPUT)/tlbie_test: LDLIBS += -lpthread
+$(OUTPUT)/pkey_siginfo: LDLIBS += -lpthread
diff --git a/tools/testing/selftests/powerpc/mm/pkey_siginfo.c 
b/tools/testing/selftests/powerpc/mm/pkey_siginfo.c
new file mode 100644
index 0..58605c53d495d
--- /dev/null
+++ b/tools/testing/selftests/powerpc/mm/pkey_siginfo.c
@@ -0,0 +1,332 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright 2020, Sandipan Das, IBM Corp.
+ *
+ * Test if the signal information reports the correct memory protection
+ * key upon getting a key access violation fault for a page that was
+ * attempted to be protected by two different keys from two competing
+ * threads at the same time.
+ */
+
+#define _GNU_SOURCE
+#include 
+#include 
+#include 
+#include 
+
+#include 
+#include 
+#include 
+
+#include "pkeys.h"
+
+#define PPC_INST_NOP   0x6000
+#define PPC_INST_BLR   0x4e800020
+#define PROT_RWX   (PROT_READ | PROT_WRITE | PROT_EXEC)
+
+#define NUM_ITERATIONS 100
+
+static volatile sig_atomic_t perm_pkey, rest_pkey;
+static volatile sig_atomic_t rights, fault_count;
+static volatile unsigned int *volatile fault_addr;
+static pthread_barrier_t iteration_barrier;
+
+static void segv_handler(int signum, siginfo_t *sinfo, void *ctx)
+{
+   void *pgstart;
+   size_t pgsize;
+   int pkey;
+
+   pkey = siginfo_pkey(sinfo);
+
+   /* Check if this fault originated from a pkey access violation */
+   if (sinfo->si_code != SEGV_PKUERR) {
+   sigsafe_err("got a fault for an unexpected reason\n");
+   _exit(1);
+   }
+
+   /* Check if this fault originated from the expected address */
+   if (sinfo->si_addr != (void *) fault_addr) {
+   sigsafe_err("got a fault for an unexpected address\n");
+   _exit(1);
+   }
+
+   /* Check if this fault originated from the restrictive pkey */
+   if (pkey != rest_pkey) {
+   sigsafe_err("got a fault for an unexpected pkey\n");
+   _exit(1);
+   }
+
+   /* Check if too many faults have occurred for the same iteration */
+   if (fault_count > 0) {
+   sigsafe_err("got too many faults for the same address\n");
+   _exit(1);
+   }
+
+   pgsize = getpagesize();
+   pgstart = (void *) ((unsigned long) fault_addr & ~(pgsize - 1));
+
+   /*
+* If the current fault occurred due to lack of execute rights,
+* reassociate the page with the exec-only pkey since execute
+* rights cannot be changed directly for the faulting pkey as
+* IAMR is inaccessible from userspace.
+*
+* Otherwise, if the current fault occurred due to lack of
+* read-write rights, change the AMR permission bits for the
+* pkey.
+*
+* This will let the test continue.
+*/
+   if (rights == PKEY_DISABLE_EXECUTE &&
+   mprotect(pgstart, pgsize, PROT_EXEC))
+   _exit(1);
+   else
+

1 2 >

1 - 100 of 135 matches

Mail list logo