[PATCH 7/9] powerpc/pseries/eeh: Rework device EEH PE determination

2020-09-09 Thread Oliver O'Halloran
The process Linux uses for determining if a device supports EEH or not
appears to be at odds with what PAPR+ says the OS should be doing. The
current flow is something like:

1. Assume pe_config_addr is equal to the device's config_addr.
2. Attempt to enable EEH on that PE
3. Verify EEH was enabled (POWER4 bug workaround)
4. Try to find the pe_config_addr using the ibm,get-config-addr-info2 RTAS
   call.
5. If that fails walk the pci_dn tree upwards trying to find a parent
   device with EEH support. If we find one then add the device to that PE.

The first major flaw with this is that in order to enable EEH on a PE we
need to know the PE's configuration address since that's an input to the
ibm,set-eeh-option RTAS call which is used to enable EEH for the PE. We
hack around that by assuming that the PE address is equal to the device's
RTAS config address with the register fields set to zero (see
rtas_config_addr()). This assumption happens to be valid if:

a) The PCI device is the 0th function, and
b) The device is on the PE's root bus.

However, this does also appear to work for devices where these
conditions are not true. At a guess PowerVM's RTAS has some workarounds to
accommodate Linux's quirks. However, it's a bit sketchy and the code is
confusing since it's not implementing what PAPR claims is the correct way.

This patch re-works how we handle EEH init so that we find the PE config
address using the ibm,get-config-addr-info2 RTAS call, then use that to
finish the EEH init process. It also drops the Power4 workaround since as
of commit 471d7ff8b51b ("powerpc/64s: Remove POWER4 support") the kernel
does not support running on a Power4 CPU.

1. Find the pe_config_addr using the RTAS call.
2. Enable the PE (if needed)
3. Insert the edev into the tree and create an eeh_pe if needed.

The other change made here is ignoring unsupported devices entirely.
Currently the device's BARs are saved to the eeh_dev even if the device is
not part of an EEH PE. Not being part of a PE means that an EEH recovery
pass will never see that device so saving the BARs is pointless.

Signed-off-by: Oliver O'Halloran 
---
 arch/powerpc/platforms/pseries/eeh_pseries.c | 57 
 1 file changed, 22 insertions(+), 35 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/eeh_pseries.c 
b/arch/powerpc/platforms/pseries/eeh_pseries.c
index 10303de3d8d5..c2ecc0db2f94 100644
--- a/arch/powerpc/platforms/pseries/eeh_pseries.c
+++ b/arch/powerpc/platforms/pseries/eeh_pseries.c
@@ -357,10 +357,10 @@ static struct eeh_pe *pseries_eeh_pe_get_parent(struct 
eeh_dev *edev)
  */
 void pseries_eeh_init_edev(struct pci_dn *pdn)
 {
+   struct eeh_pe pe, *parent;
struct eeh_dev *edev;
-   struct eeh_pe pe;
+   int addr;
u32 pcie_flags;
-   int enable = 0;
int ret;
 
if (WARN_ON_ONCE(!eeh_has_flag(EEH_PROBE_MODE_DEVTREE)))
@@ -417,51 +417,38 @@ void pseries_eeh_init_edev(struct pci_dn *pdn)
}
}
 
-   /* Initialize the fake PE */
+   /* first up, find the pe_config_addr for the PE containing the device */
+   addr = pseries_eeh_get_pe_config_addr(pdn);
+   if (addr == 0) {
+   eeh_edev_dbg(edev, "Unable to find pe_config_addr\n");
+   goto err;
+   }
+
+   /* Try enable EEH on the fake PE */
memset(, 0, sizeof(struct eeh_pe));
pe.phb = pdn->phb;
-   pe.config_addr = (pdn->busno << 16) | (pdn->devfn << 8);
+   pe.addr = addr;
 
-   /* Enable EEH on the device */
eeh_edev_dbg(edev, "Enabling EEH on device\n");
ret = eeh_ops->set_option(, EEH_OPT_ENABLE);
if (ret) {
eeh_edev_dbg(edev, "EEH failed to enable on device (code 
%d)\n", ret);
-   } else {
-   struct eeh_pe *parent;
+   goto err;
+   }
 
-   /* Retrieve PE address */
-   edev->pe_config_addr = pseries_eeh_get_pe_config_addr(pdn);
-   pe.addr = edev->pe_config_addr;
+   edev->pe_config_addr = addr;
 
-   /* Some older systems (Power4) allow the ibm,set-eeh-option
-* call to succeed even on nodes where EEH is not supported.
-* Verify support explicitly.
-*/
-   ret = eeh_ops->get_state(, NULL);
-   if (ret > 0 && ret != EEH_STATE_NOT_SUPPORT)
-   enable = 1;
+   eeh_add_flag(EEH_ENABLED);
 
-   /*
-* This device doesn't support EEH, but it may have an
-* EEH parent. In this case any error on the device will
-* freeze the PE of it's upstream bridge, so added it to
-* the upstream PE.
-*/
-   parent = pseries_eeh_pe_get_parent(edev);
-   if (parent && !enable)
-   edev->pe_config_addr = parent->addr;
+   parent = pseries_eeh_pe_get_parent(edev);
+   

[PATCH 6/9] powerpc/pseries/eeh: Clean up pe_config_addr lookups

2020-09-09 Thread Oliver O'Halloran
De-duplicate, and fix up the comments, and make the prototype just take a
pci_dn since the job of the function is to return the pe_config_addr of the
PE which contains a given device.

Signed-off-by: Oliver O'Halloran 
---
 arch/powerpc/platforms/pseries/eeh_pseries.c | 80 +++-
 1 file changed, 11 insertions(+), 69 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/eeh_pseries.c 
b/arch/powerpc/platforms/pseries/eeh_pseries.c
index b1561961c7ff..10303de3d8d5 100644
--- a/arch/powerpc/platforms/pseries/eeh_pseries.c
+++ b/arch/powerpc/platforms/pseries/eeh_pseries.c
@@ -33,8 +33,6 @@
 #include 
 #include 
 
-static int pseries_eeh_get_pe_addr(struct pci_dn *pdn);
-
 /* RTAS tokens */
 static int ibm_set_eeh_option;
 static int ibm_set_slot_reset;
@@ -86,7 +84,8 @@ void pseries_pcibios_bus_add_device(struct pci_dev *pdev)
 
 
 /**
- * pseries_eeh_get_config_addr - Retrieve config address
+ * pseries_eeh_get_pe_config_addr - Find the pe_config_addr for a device
+ * @pdn: pci_dn of the input device
  *
  * Retrieve the assocated config address. Actually, there're 2 RTAS
  * function calls dedicated for the purpose. We need implement
@@ -97,16 +96,17 @@ void pseries_pcibios_bus_add_device(struct pci_dev *pdev)
  * It's notable that zero'ed return value means invalid PE config
  * address.
  */
-static int pseries_eeh_get_config_addr(struct pci_controller *phb, int 
config_addr)
+static int pseries_eeh_get_pe_config_addr(struct pci_dn *pdn)
 {
+   int config_addr = rtas_config_addr(pdn->busno, pdn->devfn, 0);
+   struct pci_controller *phb = pdn->phb;
int ret = 0;
int rets[3];
 
if (ibm_get_config_addr_info2 != RTAS_UNKNOWN_SERVICE) {
/*
-* First of all, we need to make sure there has one PE
-* associated with the device. Otherwise, PE address is
-* meaningless.
+* First of all, use function 1 to determine if this device is
+* part of a PE or not. ret[0] being zero indicates it's not.
 */
ret = rtas_call(ibm_get_config_addr_info2, 4, 2, rets,
config_addr, BUID_HI(phb->buid),
@@ -431,7 +431,7 @@ void pseries_eeh_init_edev(struct pci_dn *pdn)
struct eeh_pe *parent;
 
/* Retrieve PE address */
-   edev->pe_config_addr = pseries_eeh_get_pe_addr(pdn);
+   edev->pe_config_addr = pseries_eeh_get_pe_config_addr(pdn);
pe.addr = edev->pe_config_addr;
 
/* Some older systems (Power4) allow the ibm,set-eeh-option
@@ -551,64 +551,6 @@ static int pseries_eeh_set_option(struct eeh_pe *pe, int 
option)
return ret;
 }
 
-/**
- * pseries_eeh_get_pe_addr - Retrieve PE address
- * @pe: EEH PE
- *
- * Retrieve the assocated PE address. Actually, there're 2 RTAS
- * function calls dedicated for the purpose. We need implement
- * it through the new function and then the old one. Besides,
- * you should make sure the config address is figured out from
- * FDT node before calling the function.
- *
- * It's notable that zero'ed return value means invalid PE config
- * address.
- */
-static int pseries_eeh_get_pe_addr(struct pci_dn *pdn)
-{
-   int config_addr = rtas_config_addr(pdn->busno, pdn->devfn, 0);
-   unsigned long buid = pdn->phb->buid;
-   int ret = 0;
-   int rets[3];
-
-   if (ibm_get_config_addr_info2 != RTAS_UNKNOWN_SERVICE) {
-   /*
-* First of all, we need to make sure there has one PE
-* associated with the device. Otherwise, PE address is
-* meaningless.
-*/
-   ret = rtas_call(ibm_get_config_addr_info2, 4, 2, rets,
-   config_addr, BUID_HI(buid), BUID_LO(buid), 1);
-   if (ret || (rets[0] == 0))
-   return 0;
-
-   /* Retrieve the associated PE config address */
-   ret = rtas_call(ibm_get_config_addr_info2, 4, 2, rets,
-   config_addr, BUID_HI(buid), BUID_LO(buid), 0);
-   if (ret) {
-   pr_warn("%s: Failed to get address for PHB#%x-PE#%x\n",
-   __func__, pdn->phb->global_number, config_addr);
-   return 0;
-   }
-
-   return rets[0];
-   }
-
-   if (ibm_get_config_addr_info != RTAS_UNKNOWN_SERVICE) {
-   ret = rtas_call(ibm_get_config_addr_info, 4, 2, rets,
-   config_addr, BUID_HI(buid), BUID_LO(buid), 0);
-   if (ret) {
-   pr_warn("%s: Failed to get address for PHB#%x-PE#%x\n",
-   __func__, pdn->phb->global_number, config_addr);
-   return 0;
-   }
-
-   return rets[0];
-   }
-
-   return ret;
-}
-
 /**
  * pseries_eeh_get_state - 

[PATCH 5/9] powerpc/eeh: Move EEH initialisation to an arch initcall

2020-09-09 Thread Oliver O'Halloran
The initialisation of EEH mostly happens in a core_initcall_sync initcall,
followed by registering a bus notifier later on in an arch_initcall.
Anything involving initcall dependencies is mostly incomprehensible unless
you've spent a while staring at code so here's the full sequence:

ppc_md.setup_arch   <-- pci_controllers are created here

...time passes...

core_initcall   <-- pci_dns are created from DT nodes
core_initcall_sync  <-- platforms call eeh_init()
postcore_initcall   <-- PCI bus type is registered
postcore_initcall_sync
arch_initcall   <-- EEH pci_bus notifier registered
subsys_initcall <-- PHBs are scanned here

There's no real requirement to do the EEH setup at the core_initcall_sync
level. It just needs to be done after pci_dn's are created and before we
start scanning PHBs. Simplify the flow a bit by moving the platform EEH
initialisation to an arch_initcall so we can fold the bus notifier
registration into eeh_init().

Signed-off-by: Oliver O'Halloran 
---
 arch/powerpc/kernel/eeh.c| 64 ++--
 arch/powerpc/platforms/powernv/eeh-powernv.c |  2 +-
 arch/powerpc/platforms/pseries/eeh_pseries.c |  2 +-
 3 files changed, 34 insertions(+), 34 deletions(-)

diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index 98faf139e676..c9e25cfce8f0 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -940,6 +940,30 @@ static struct notifier_block eeh_reboot_nb = {
.notifier_call = eeh_reboot_notifier,
 };
 
+static int eeh_device_notifier(struct notifier_block *nb,
+  unsigned long action, void *data)
+{
+   struct device *dev = data;
+
+   switch (action) {
+   /*
+* Note: It's not possible to perform EEH device addition (i.e.
+* {pseries,pnv}_pcibios_bus_add_device()) here because it depends on
+* the device's resources, which have not yet been set up.
+*/
+   case BUS_NOTIFY_DEL_DEVICE:
+   eeh_remove_device(to_pci_dev(dev));
+   break;
+   default:
+   break;
+   }
+   return NOTIFY_DONE;
+}
+
+static struct notifier_block eeh_device_nb = {
+   .notifier_call = eeh_device_notifier,
+};
+
 /**
  * eeh_init - System wide EEH initialization
  *
@@ -960,7 +984,14 @@ int eeh_init(struct eeh_ops *ops)
/* Register reboot notifier */
ret = register_reboot_notifier(_reboot_nb);
if (ret) {
-   pr_warn("%s: Failed to register notifier (%d)\n",
+   pr_warn("%s: Failed to register reboot notifier (%d)\n",
+   __func__, ret);
+   return ret;
+   }
+
+   ret = bus_register_notifier(_bus_type, _device_nb);
+   if (ret) {
+   pr_warn("%s: Failed to register bus notifier (%d)\n",
__func__, ret);
return ret;
}
@@ -975,37 +1006,6 @@ int eeh_init(struct eeh_ops *ops)
return eeh_event_init();
 }
 
-static int eeh_device_notifier(struct notifier_block *nb,
-  unsigned long action, void *data)
-{
-   struct device *dev = data;
-
-   switch (action) {
-   /*
-* Note: It's not possible to perform EEH device addition (i.e.
-* {pseries,pnv}_pcibios_bus_add_device()) here because it depends on
-* the device's resources, which have not yet been set up.
-*/
-   case BUS_NOTIFY_DEL_DEVICE:
-   eeh_remove_device(to_pci_dev(dev));
-   break;
-   default:
-   break;
-   }
-   return NOTIFY_DONE;
-}
-
-static struct notifier_block eeh_device_nb = {
-   .notifier_call = eeh_device_notifier,
-};
-
-static __init int eeh_set_bus_notifier(void)
-{
-   bus_register_notifier(_bus_type, _device_nb);
-   return 0;
-}
-arch_initcall(eeh_set_bus_notifier);
-
 /**
  * eeh_probe_device() - Perform EEH initialization for the indicated pci device
  * @dev: pci device for which to set up EEH
diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c 
b/arch/powerpc/platforms/powernv/eeh-powernv.c
index 03e566874595..d03c5873defc 100644
--- a/arch/powerpc/platforms/powernv/eeh-powernv.c
+++ b/arch/powerpc/platforms/powernv/eeh-powernv.c
@@ -1721,4 +1721,4 @@ static int __init eeh_powernv_init(void)
 
return ret;
 }
-machine_core_initcall_sync(powernv, eeh_powernv_init);
+machine_arch_initcall(powernv, eeh_powernv_init);
diff --git a/arch/powerpc/platforms/pseries/eeh_pseries.c 
b/arch/powerpc/platforms/pseries/eeh_pseries.c
index fd328632..b1561961c7ff 100644
--- a/arch/powerpc/platforms/pseries/eeh_pseries.c
+++ b/arch/powerpc/platforms/pseries/eeh_pseries.c
@@ -989,4 +989,4 @@ static int __init eeh_pseries_init(void)
ret);
return ret;
 }
-machine_core_initcall_sync(pseries, eeh_pseries_init);
+machine_arch_initcall(pseries, eeh_pseries_init);
-- 
2.26.2



[PATCH 4/9] powerpc/eeh: Delete eeh_ops->init

2020-09-09 Thread Oliver O'Halloran
No longer used since the platforms perform their EEH initialisation before
calling eeh_init().

Signed-off-by: Oliver O'Halloran 
---
 arch/powerpc/include/asm/eeh.h | 1 -
 arch/powerpc/kernel/eeh.c  | 8 
 2 files changed, 9 deletions(-)

diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index 765bcf63edea..85030c05e67e 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -216,7 +216,6 @@ enum {
 
 struct eeh_ops {
char *name;
-   int (*init)(void);
struct eeh_dev *(*probe)(struct pci_dev *pdev);
int (*set_option)(struct eeh_pe *pe, int option);
int (*get_state)(struct eeh_pe *pe, int *delay);
diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index 28a0ea5d9faa..98faf139e676 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -965,14 +965,6 @@ int eeh_init(struct eeh_ops *ops)
return ret;
}
 
-   if (eeh_ops->init)
-   ret = eeh_ops->init();
-   if (ret) {
-   pr_warn("%s: platform EEH init failed (%d)\n",
-   __func__, ret);
-   return ret;
-   }
-
/* Initialize PHB PEs */
list_for_each_entry_safe(hose, tmp, _list, list_node)
eeh_phb_pe_create(hose);
-- 
2.26.2



[PATCH 3/9] powerpc/pseries: Stop using eeh_ops->init()

2020-09-09 Thread Oliver O'Halloran
Fold pseries_eeh_init() into eeh_pseries_init() rather than having
eeh_init() call it via eeh_ops->init(). It's simpler and it'll let us
delete eeh_ops.init.

Signed-off-by: Oliver O'Halloran 
---
 arch/powerpc/platforms/pseries/eeh_pseries.c | 155 +--
 1 file changed, 71 insertions(+), 84 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/eeh_pseries.c 
b/arch/powerpc/platforms/pseries/eeh_pseries.c
index 3cc569e8b6d4..fd328632 100644
--- a/arch/powerpc/platforms/pseries/eeh_pseries.c
+++ b/arch/powerpc/platforms/pseries/eeh_pseries.c
@@ -239,88 +239,6 @@ static unsigned char slot_errbuf[RTAS_ERROR_LOG_MAX];
 static DEFINE_SPINLOCK(slot_errbuf_lock);
 static int eeh_error_buf_size;
 
-/**
- * pseries_eeh_init - EEH platform dependent initialization
- *
- * EEH platform dependent initialization on pseries.
- */
-static int pseries_eeh_init(void)
-{
-   struct pci_controller *phb;
-   struct pci_dn *pdn;
-   int addr, config_addr;
-
-   /* figure out EEH RTAS function call tokens */
-   ibm_set_eeh_option  = rtas_token("ibm,set-eeh-option");
-   ibm_set_slot_reset  = rtas_token("ibm,set-slot-reset");
-   ibm_read_slot_reset_state2  = 
rtas_token("ibm,read-slot-reset-state2");
-   ibm_read_slot_reset_state   = 
rtas_token("ibm,read-slot-reset-state");
-   ibm_slot_error_detail   = rtas_token("ibm,slot-error-detail");
-   ibm_get_config_addr_info2   = 
rtas_token("ibm,get-config-addr-info2");
-   ibm_get_config_addr_info= 
rtas_token("ibm,get-config-addr-info");
-   ibm_configure_pe= rtas_token("ibm,configure-pe");
-
-   /*
-* ibm,configure-pe and ibm,configure-bridge have the same semantics,
-* however ibm,configure-pe can be faster.  If we can't find
-* ibm,configure-pe then fall back to using ibm,configure-bridge.
-*/
-   if (ibm_configure_pe == RTAS_UNKNOWN_SERVICE)
-   ibm_configure_pe= rtas_token("ibm,configure-bridge");
-
-   /*
-* Necessary sanity check. We needn't check "get-config-addr-info"
-* and its variant since the old firmware probably support address
-* of domain/bus/slot/function for EEH RTAS operations.
-*/
-   if (ibm_set_eeh_option == RTAS_UNKNOWN_SERVICE  ||
-   ibm_set_slot_reset == RTAS_UNKNOWN_SERVICE  ||
-   (ibm_read_slot_reset_state2 == RTAS_UNKNOWN_SERVICE &&
-ibm_read_slot_reset_state == RTAS_UNKNOWN_SERVICE) ||
-   ibm_slot_error_detail == RTAS_UNKNOWN_SERVICE   ||
-   ibm_configure_pe == RTAS_UNKNOWN_SERVICE) {
-   pr_info("EEH functionality not supported\n");
-   return -EINVAL;
-   }
-
-   /* Initialize error log lock and size */
-   spin_lock_init(_errbuf_lock);
-   eeh_error_buf_size = rtas_token("rtas-error-log-max");
-   if (eeh_error_buf_size == RTAS_UNKNOWN_SERVICE) {
-   pr_info("%s: unknown EEH error log size\n",
-   __func__);
-   eeh_error_buf_size = 1024;
-   } else if (eeh_error_buf_size > RTAS_ERROR_LOG_MAX) {
-   pr_info("%s: EEH error log size %d exceeds the maximal %d\n",
-   __func__, eeh_error_buf_size, RTAS_ERROR_LOG_MAX);
-   eeh_error_buf_size = RTAS_ERROR_LOG_MAX;
-   }
-
-   /* Set EEH probe mode */
-   eeh_add_flag(EEH_PROBE_MODE_DEVTREE | EEH_ENABLE_IO_FOR_LOG);
-
-   /* Set EEH machine dependent code */
-   ppc_md.pcibios_bus_add_device = pseries_pcibios_bus_add_device;
-
-   if (is_kdump_kernel() || reset_devices) {
-   pr_info("Issue PHB reset ...\n");
-   list_for_each_entry(phb, _list, list_node) {
-   pdn = list_first_entry(_DN(phb->dn)->child_list, 
struct pci_dn, list);
-   addr = (pdn->busno << 16) | (pdn->devfn << 8);
-   config_addr = pseries_eeh_get_config_addr(phb, addr);
-   /* invalid PE config addr */
-   if (config_addr == 0)
-   continue;
-
-   pseries_eeh_phb_reset(phb, config_addr, 
EEH_RESET_FUNDAMENTAL);
-   pseries_eeh_phb_reset(phb, config_addr, 
EEH_RESET_DEACTIVATE);
-   pseries_eeh_phb_configure_bridge(phb, config_addr);
-   }
-   }
-
-   return 0;
-}
-
 static int pseries_eeh_cap_start(struct pci_dn *pdn)
 {
u32 status;
@@ -967,7 +885,6 @@ static int pseries_notify_resume(struct eeh_dev *edev)
 
 static struct eeh_ops pseries_eeh_ops = {
.name   = "pseries",
-   .init   = pseries_eeh_init,
.probe  = pseries_eeh_probe,
.set_option = pseries_eeh_set_option,
.get_state  = pseries_eeh_get_state,
@@ -992,7 +909,77 @@ static 

[PATCH 2/9] powerpc/powernv: Stop using eeh_ops->init()

2020-09-09 Thread Oliver O'Halloran
Fold pnv_eeh_init() into eeh_powernv_init() rather than having eeh_init()
call it via eeh_ops->init(). It's simpler and it'll let us delete
eeh_ops.init.

Signed-off-by: Oliver O'Halloran 
---
 arch/powerpc/platforms/powernv/eeh-powernv.c | 94 ++--
 1 file changed, 45 insertions(+), 49 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c 
b/arch/powerpc/platforms/powernv/eeh-powernv.c
index a550f837ccb5..03e566874595 100644
--- a/arch/powerpc/platforms/powernv/eeh-powernv.c
+++ b/arch/powerpc/platforms/powernv/eeh-powernv.c
@@ -44,54 +44,6 @@ void pnv_pcibios_bus_add_device(struct pci_dev *pdev)
eeh_probe_device(pdev);
 }
 
-static int pnv_eeh_init(void)
-{
-   struct pci_controller *hose;
-   struct pnv_phb *phb;
-   int max_diag_size = PNV_PCI_DIAG_BUF_SIZE;
-
-   if (!firmware_has_feature(FW_FEATURE_OPAL)) {
-   pr_warn("%s: OPAL is required !\n",
-   __func__);
-   return -EINVAL;
-   }
-
-   /* Set probe mode */
-   eeh_add_flag(EEH_PROBE_MODE_DEV);
-
-   /*
-* P7IOC blocks PCI config access to frozen PE, but PHB3
-* doesn't do that. So we have to selectively enable I/O
-* prior to collecting error log.
-*/
-   list_for_each_entry(hose, _list, list_node) {
-   phb = hose->private_data;
-
-   if (phb->model == PNV_PHB_MODEL_P7IOC)
-   eeh_add_flag(EEH_ENABLE_IO_FOR_LOG);
-
-   if (phb->diag_data_size > max_diag_size)
-   max_diag_size = phb->diag_data_size;
-
-   /*
-* PE#0 should be regarded as valid by EEH core
-* if it's not the reserved one. Currently, we
-* have the reserved PE#255 and PE#127 for PHB3
-* and P7IOC separately. So we should regard
-* PE#0 as valid for PHB3 and P7IOC.
-*/
-   if (phb->ioda.reserved_pe_idx != 0)
-   eeh_add_flag(EEH_VALID_PE_ZERO);
-
-   break;
-   }
-
-   eeh_set_pe_aux_size(max_diag_size);
-   ppc_md.pcibios_bus_add_device = pnv_pcibios_bus_add_device;
-
-   return 0;
-}
-
 static irqreturn_t pnv_eeh_event(int irq, void *data)
 {
/*
@@ -1674,7 +1626,6 @@ static int pnv_eeh_restore_config(struct eeh_dev *edev)
 
 static struct eeh_ops pnv_eeh_ops = {
.name   = "powernv",
-   .init   = pnv_eeh_init,
.probe  = pnv_eeh_probe,
.set_option = pnv_eeh_set_option,
.get_state  = pnv_eeh_get_state,
@@ -1715,8 +1666,53 @@ DECLARE_PCI_FIXUP_HEADER(PCI_ANY_ID, PCI_ANY_ID, 
pnv_pci_fixup_vf_mps);
  */
 static int __init eeh_powernv_init(void)
 {
+   int max_diag_size = PNV_PCI_DIAG_BUF_SIZE;
+   struct pci_controller *hose;
+   struct pnv_phb *phb;
int ret = -EINVAL;
 
+   if (!firmware_has_feature(FW_FEATURE_OPAL)) {
+   pr_warn("%s: OPAL is required !\n", __func__);
+   return -EINVAL;
+   }
+
+   /* Set probe mode */
+   eeh_add_flag(EEH_PROBE_MODE_DEV);
+
+   /*
+* P7IOC blocks PCI config access to frozen PE, but PHB3
+* doesn't do that. So we have to selectively enable I/O
+* prior to collecting error log.
+*/
+   list_for_each_entry(hose, _list, list_node) {
+   phb = hose->private_data;
+
+   if (phb->model == PNV_PHB_MODEL_P7IOC)
+   eeh_add_flag(EEH_ENABLE_IO_FOR_LOG);
+
+   if (phb->diag_data_size > max_diag_size)
+   max_diag_size = phb->diag_data_size;
+
+   /*
+* PE#0 should be regarded as valid by EEH core
+* if it's not the reserved one. Currently, we
+* have the reserved PE#255 and PE#127 for PHB3
+* and P7IOC separately. So we should regard
+* PE#0 as valid for PHB3 and P7IOC.
+*/
+   if (phb->ioda.reserved_pe_idx != 0)
+   eeh_add_flag(EEH_VALID_PE_ZERO);
+
+   break;
+   }
+
+   /*
+* eeh_init() allocates the eeh_pe and its aux data buf so the
+* size needs to be set before calling eeh_init().
+*/
+   eeh_set_pe_aux_size(max_diag_size);
+   ppc_md.pcibios_bus_add_device = pnv_pcibios_bus_add_device;
+
ret = eeh_init(_eeh_ops);
if (!ret)
pr_info("EEH: PowerNV platform initialized\n");
-- 
2.26.2



[PATCH 1/9] powerpc/eeh: Rework EEH initialisation

2020-09-09 Thread Oliver O'Halloran
Drop the EEH register / unregister ops thing and have the platform pass the
ops structure into eeh_init() directly. This takes one initcall out of the
EEH setup path and it means we're only doing EEH setup on the platforms
which actually support it. It's also less code and generally easier to
follow.

No functional changes.

Signed-off-by: Oliver O'Halloran 
---
 arch/powerpc/include/asm/eeh.h   |  3 +-
 arch/powerpc/kernel/eeh.c| 87 
 arch/powerpc/platforms/powernv/eeh-powernv.c |  4 +-
 arch/powerpc/platforms/pseries/eeh_pseries.c |  5 +-
 4 files changed, 21 insertions(+), 78 deletions(-)

diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index d5f369bcd130..765bcf63edea 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -295,8 +295,7 @@ const char *eeh_pe_loc_get(struct eeh_pe *pe);
 struct pci_bus *eeh_pe_bus_get(struct eeh_pe *pe);
 
 void eeh_show_enabled(void);
-int __init eeh_ops_register(struct eeh_ops *ops);
-int __exit eeh_ops_unregister(const char *name);
+int __init eeh_init(struct eeh_ops *ops);
 int eeh_check_failure(const volatile void __iomem *token);
 int eeh_dev_check_failure(struct eeh_dev *edev);
 void eeh_addr_cache_init(void);
diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index 94682382fc8c..28a0ea5d9faa 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -929,56 +929,6 @@ void eeh_save_bars(struct eeh_dev *edev)
edev->config_space[1] |= PCI_COMMAND_MASTER;
 }
 
-/**
- * eeh_ops_register - Register platform dependent EEH operations
- * @ops: platform dependent EEH operations
- *
- * Register the platform dependent EEH operation callback
- * functions. The platform should call this function before
- * any other EEH operations.
- */
-int __init eeh_ops_register(struct eeh_ops *ops)
-{
-   if (!ops->name) {
-   pr_warn("%s: Invalid EEH ops name for %p\n",
-   __func__, ops);
-   return -EINVAL;
-   }
-
-   if (eeh_ops && eeh_ops != ops) {
-   pr_warn("%s: EEH ops of platform %s already existing (%s)\n",
-   __func__, eeh_ops->name, ops->name);
-   return -EEXIST;
-   }
-
-   eeh_ops = ops;
-
-   return 0;
-}
-
-/**
- * eeh_ops_unregister - Unreigster platform dependent EEH operations
- * @name: name of EEH platform operations
- *
- * Unregister the platform dependent EEH operation callback
- * functions.
- */
-int __exit eeh_ops_unregister(const char *name)
-{
-   if (!name || !strlen(name)) {
-   pr_warn("%s: Invalid EEH ops name\n",
-   __func__);
-   return -EINVAL;
-   }
-
-   if (eeh_ops && !strcmp(eeh_ops->name, name)) {
-   eeh_ops = NULL;
-   return 0;
-   }
-
-   return -EEXIST;
-}
-
 static int eeh_reboot_notifier(struct notifier_block *nb,
   unsigned long action, void *unused)
 {
@@ -991,25 +941,22 @@ static struct notifier_block eeh_reboot_nb = {
 };
 
 /**
- * eeh_init - EEH initialization
- *
- * Initialize EEH by trying to enable it for all of the adapters in the system.
- * As a side effect we can determine here if eeh is supported at all.
- * Note that we leave EEH on so failed config cycles won't cause a machine
- * check.  If a user turns off EEH for a particular adapter they are really
- * telling Linux to ignore errors.  Some hardware (e.g. POWER5) won't
- * grant access to a slot if EEH isn't enabled, and so we always enable
- * EEH for all slots/all devices.
+ * eeh_init - System wide EEH initialization
  *
- * The eeh-force-off option disables EEH checking globally, for all slots.
- * Even if force-off is set, the EEH hardware is still enabled, so that
- * newer systems can boot.
+ * It's the platform's job to call this from an arch_initcall().
  */
-static int eeh_init(void)
+int eeh_init(struct eeh_ops *ops)
 {
struct pci_controller *hose, *tmp;
int ret = 0;
 
+   /* the platform should only initialise EEH once */
+   if (WARN_ON(eeh_ops))
+   return -EEXIST;
+   if (WARN_ON(!ops))
+   return -ENOENT;
+   eeh_ops = ops;
+
/* Register reboot notifier */
ret = register_reboot_notifier(_reboot_nb);
if (ret) {
@@ -1018,13 +965,13 @@ static int eeh_init(void)
return ret;
}
 
-   /* call platform initialization function */
-   if (!eeh_ops) {
-   pr_warn("%s: Platform EEH operation not found\n",
-   __func__);
-   return -EEXIST;
-   } else if ((ret = eeh_ops->init()))
+   if (eeh_ops->init)
+   ret = eeh_ops->init();
+   if (ret) {
+   pr_warn("%s: platform EEH init failed (%d)\n",
+   __func__, ret);
return ret;
+   }
 
/* Initialize 

EEH cleanups and reworks

2020-09-09 Thread Oliver O'Halloran
This is really two series joined together since they end up conflicting
with each other slightly and I figured this is easier for all
involved.

Patches 1-5 streamline how the generic and platform specific parts of EEH
are initialised at boot so more of the setup process happens in linear code
rather than in initcalls.

Patches 6-9 re-work how the per-device EEH probing works on pseries to
make it line up better with the process outlined in PAPR. It also
removes the pe->config_addr field which has always confused me since
it's not really clear what or why it's needed (spoiler: it's not).

Oliver





Re: [PATCH] scsi: ibmvfc: Fix error return in ibmvfc_probe()

2020-09-09 Thread Martin K. Petersen


Jing,

> Fix to return error code PTR_ERR() from the error handling case instead
> of 0.

Applied to 5.10/scsi-staging. Thanks!

-- 
Martin K. Petersen  Oracle Linux Engineering


[powerpc:merge] BUILD SUCCESS 4b552a4cbf286ff9dcdab19153f3c1c7d1680fab

2020-09-09 Thread kernel test robot
tree/branch: https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git  
merge
branch HEAD: 4b552a4cbf286ff9dcdab19153f3c1c7d1680fab  Automatic merge of 
'master', 'next' and 'fixes' (2020-09-09 22:40)

elapsed time: 726m

configs tested: 129
configs skipped: 11

The following configs have been built successfully.
More configs may be tested in the coming days.

gcc tested configs:
arm defconfig
arm64allyesconfig
arm64   defconfig
arm  allyesconfig
arm  allmodconfig
armzeus_defconfig
powerpc linkstation_defconfig
sparc64 defconfig
h8300   h8s-sim_defconfig
mips  fuloong2e_defconfig
sh apsh4a3a_defconfig
arm  footbridge_defconfig
sh ecovec24_defconfig
mipsmalta_kvm_guest_defconfig
shshmin_defconfig
powerpccell_defconfig
powerpc powernv_defconfig
powerpc  mgcoge_defconfig
armneponset_defconfig
m68kstmark2_defconfig
arm   mainstone_defconfig
mips   mtx1_defconfig
sh  r7785rp_defconfig
arc   tb10x_defconfig
armspear3xx_defconfig
arm   sama5_defconfig
arm   netwinder_defconfig
microblaze  mmu_defconfig
sh   se7721_defconfig
arm lpc18xx_defconfig
mips   jazz_defconfig
m68k   m5249evb_defconfig
pariscgeneric-32bit_defconfig
arm assabet_defconfig
armdove_defconfig
arc  axs101_defconfig
mips   bmips_be_defconfig
powerpc mpc83xx_defconfig
m68k   sun3_defconfig
powerpc mpc512x_defconfig
shsh7757lcr_defconfig
s390 allyesconfig
armcerfcube_defconfig
sh shx3_defconfig
armclps711x_defconfig
arcvdk_hs38_defconfig
openrisc alldefconfig
sh ap325rxa_defconfig
armmvebu_v7_defconfig
arm   spear13xx_defconfig
openrisc simple_smp_defconfig
arm  collie_defconfig
sh   se7750_defconfig
arm hackkit_defconfig
arm  gemini_defconfig
sh microdev_defconfig
arm  ixp4xx_defconfig
sh  landisk_defconfig
arc haps_hs_smp_defconfig
arm   tegra_defconfig
sh   se7206_defconfig
xtensaxip_kc705_defconfig
mips  rb532_defconfig
ia64 allmodconfig
ia64defconfig
ia64 allyesconfig
m68kdefconfig
m68k allyesconfig
m68k allmodconfig
nios2   defconfig
arc  allyesconfig
nds32 allnoconfig
c6x  allyesconfig
nds32   defconfig
nios2allyesconfig
cskydefconfig
alpha   defconfig
alphaallyesconfig
xtensa   allyesconfig
h8300allyesconfig
arc defconfig
sh   allmodconfig
parisc  defconfig
parisc   allyesconfig
s390defconfig
i386 allyesconfig
sparcallyesconfig
sparc   defconfig
i386defconfig
mips allyesconfig
mips allmodconfig
powerpc defconfig
powerpc  allyesconfig
powerpc  allmodconfig
powerpc   allnoconfig
x86_64   randconfig-a004-20200909
x86_64   randconfig-a006-20200909
x86_64   randconfig-a003-20200909
x86_64   randconfig-a001-20200909
x86_64   randconfig-a005-20200909
x86_64

[powerpc:next-test] BUILD SUCCESS a24f73a061494d718b254ec7814cd10010ac2ec3

2020-09-09 Thread kernel test robot
nios2allyesconfig
alpha   defconfig
alphaallyesconfig
xtensa   allyesconfig
h8300allyesconfig
arc defconfig
sh   allmodconfig
parisc  defconfig
s390 allyesconfig
parisc   allyesconfig
s390defconfig
sparcallyesconfig
sparc   defconfig
mips allyesconfig
mips allmodconfig
powerpc defconfig
powerpc  allyesconfig
powerpc  allmodconfig
powerpc   allnoconfig
x86_64   randconfig-a004-20200909
x86_64   randconfig-a006-20200909
x86_64   randconfig-a003-20200909
x86_64   randconfig-a001-20200909
x86_64   randconfig-a005-20200909
x86_64   randconfig-a002-20200909
i386 randconfig-a004-20200909
i386 randconfig-a005-20200909
i386 randconfig-a006-20200909
i386 randconfig-a002-20200909
i386 randconfig-a001-20200909
i386 randconfig-a003-20200909
i386 randconfig-a016-20200909
i386 randconfig-a015-20200909
i386 randconfig-a011-20200909
i386 randconfig-a013-20200909
i386 randconfig-a014-20200909
i386 randconfig-a012-20200909
riscvallyesconfig
riscv allnoconfig
riscv   defconfig
x86_64   rhel
x86_64   allyesconfig
x86_64rhel-7.6-kselftests
x86_64  defconfig
x86_64   rhel-8.3
x86_64  kexec

clang tested configs:
x86_64   randconfig-a013-20200909
x86_64   randconfig-a016-20200909
x86_64   randconfig-a011-20200909
x86_64   randconfig-a012-20200909
x86_64   randconfig-a015-20200909
x86_64   randconfig-a014-20200909

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-...@lists.01.org


[powerpc:fixes-test] BUILD SUCCESS 0460534b532e5518c657c7d6492b9337d975eaa3

2020-09-09 Thread kernel test robot
 allnoconfig
nds32   defconfig
nios2allyesconfig
alphaallyesconfig
xtensa   allyesconfig
h8300allyesconfig
arc defconfig
sh   allmodconfig
parisc  defconfig
s390 allyesconfig
parisc   allyesconfig
s390defconfig
sparcallyesconfig
sparc   defconfig
mips allyesconfig
mips allmodconfig
powerpc defconfig
powerpc  allyesconfig
powerpc  allmodconfig
powerpc   allnoconfig
x86_64   randconfig-a004-20200909
x86_64   randconfig-a006-20200909
x86_64   randconfig-a003-20200909
x86_64   randconfig-a001-20200909
x86_64   randconfig-a005-20200909
x86_64   randconfig-a002-20200909
i386 randconfig-a004-20200909
i386 randconfig-a005-20200909
i386 randconfig-a006-20200909
i386 randconfig-a002-20200909
i386 randconfig-a001-20200909
i386 randconfig-a003-20200909
i386 randconfig-a016-20200909
i386 randconfig-a015-20200909
i386 randconfig-a011-20200909
i386 randconfig-a013-20200909
i386 randconfig-a014-20200909
i386 randconfig-a012-20200909
x86_64   randconfig-a004-20200910
x86_64   randconfig-a006-20200910
x86_64   randconfig-a003-20200910
x86_64   randconfig-a002-20200910
x86_64   randconfig-a005-20200910
x86_64   randconfig-a001-20200910
riscvallyesconfig
riscv allnoconfig
riscv   defconfig
riscvallmodconfig
x86_64   rhel
x86_64   allyesconfig
x86_64rhel-7.6-kselftests
x86_64  defconfig
x86_64   rhel-8.3
x86_64  kexec

clang tested configs:
x86_64   randconfig-a013-20200909
x86_64   randconfig-a016-20200909
x86_64   randconfig-a011-20200909
x86_64   randconfig-a012-20200909
x86_64   randconfig-a015-20200909
x86_64   randconfig-a014-20200909

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-...@lists.01.org


Re: [trivial PATCH] treewide: Convert switch/case fallthrough; to break;

2020-09-09 Thread Joe Perches
On Wed, 2020-09-09 at 19:36 -0300, Jason Gunthorpe wrote:
> On Wed, Sep 09, 2020 at 01:06:39PM -0700, Joe Perches wrote:
> > fallthrough to a separate case/default label break; isn't very readable.
> > 
> > Convert pseudo-keyword fallthrough; statements to a simple break; when
> > the next label is case or default and the only statement in the next
> > label block is break;
> > 
> > Found using:
> > 
> > $ grep-2.5.4 -rP --include=*.[ch] -n 
> > "fallthrough;(\s*(case\s+\w+|default)\s*:\s*){1,7}break;" *
> > 
> > Miscellanea:
> > 
> > o Move or coalesce a couple label blocks above a default: block.
> > 
> > Signed-off-by: Joe Perches 
> > ---
> > 
> > Compiled allyesconfig x86-64 only.
> > A few files for other arches were not compiled.
> 
> IB part looks OK, I prefer it like this
> 
> You could do the same for continue as well, I saw a few of those..

I saw some continue uses as well but wasn't sure
and didn't look to see if the switch/case with
continue was in a for/while loop.




Re: [trivial PATCH] treewide: Convert switch/case fallthrough; to break;

2020-09-09 Thread Jason Gunthorpe
On Wed, Sep 09, 2020 at 01:06:39PM -0700, Joe Perches wrote:
> fallthrough to a separate case/default label break; isn't very readable.
> 
> Convert pseudo-keyword fallthrough; statements to a simple break; when
> the next label is case or default and the only statement in the next
> label block is break;
> 
> Found using:
> 
> $ grep-2.5.4 -rP --include=*.[ch] -n 
> "fallthrough;(\s*(case\s+\w+|default)\s*:\s*){1,7}break;" *
> 
> Miscellanea:
> 
> o Move or coalesce a couple label blocks above a default: block.
> 
> Signed-off-by: Joe Perches 
> ---
> 
> Compiled allyesconfig x86-64 only.
> A few files for other arches were not compiled.

IB part looks OK, I prefer it like this

You could do the same for continue as well, I saw a few of those..

Thanks,
Jason


Re: [trivial PATCH] treewide: Convert switch/case fallthrough; to break;

2020-09-09 Thread Keith Busch
On Wed, Sep 09, 2020 at 01:06:39PM -0700, Joe Perches wrote:
> diff --git a/crypto/tcrypt.c b/crypto/tcrypt.c
> index eea0f453cfb6..8aac5bc60f4c 100644
> --- a/crypto/tcrypt.c
> +++ b/crypto/tcrypt.c
> @@ -2464,7 +2464,7 @@ static int do_test(const char *alg, u32 type, u32 mask, 
> int m, u32 num_mb)
>   test_hash_speed("streebog512", sec,
>   generic_hash_speed_template);
>   if (mode > 300 && mode < 400) break;
> - fallthrough;
> + break;
>   case 399:
>   break;

Just imho, this change makes the preceding 'if' look even more
pointless. Maybe the fallthrough was a deliberate choice? Not that my
opinion matters here as I don't know this module, but it looked a bit
odd to me.


Re: remove the last set_fs() in common code, and remove it for x86 and powerpc v3

2020-09-09 Thread Linus Torvalds
On Wed, Sep 9, 2020 at 11:42 AM Segher Boessenkool
 wrote:
>
> It will not work like this in GCC, no.  The LLVM people know about that.
> I do not know why they insist on pushing this, being incompatible and
> everything.

Umm. Since they'd be the ones supporting this, *gcc* would be the
incompatible one, not clang.

Like it or not, clang is becoming a major kernel compiler. It's
already basically used for all android uses afaik.

So I'd phrase it differently. If gcc is planning on doing some
different model for asm goto with outputs, that would be the
incompatible case.

I'm not sure how gcc could do it differently. The only possible
difference I see is

 (a) not doing it at all

 (b) doing the "all goto targets have the outputs" case

and honestly, (b) is actually inferior for the error cases, even if to
a compiler person it might feel like the "RightThing(tm)" to do.
Because when an exception happens, the outputs simply won't be
initialized.

Anyway, for either of those cases, the kernel won't care either way.
We'll have to support the non-goto case for many years even if
everybody were to magically implement it today, so it's not like this
is a "you have to do it" thing.

   Linus


Re: [trivial PATCH] treewide: Convert switch/case fallthrough; to break;

2020-09-09 Thread Gustavo A. R. Silva



On 9/9/20 15:06, Joe Perches wrote:
> fallthrough to a separate case/default label break; isn't very readable.
> 
> Convert pseudo-keyword fallthrough; statements to a simple break; when
> the next label is case or default and the only statement in the next
> label block is break;
> 
> Found using:
> 
> $ grep-2.5.4 -rP --include=*.[ch] -n 
> "fallthrough;(\s*(case\s+\w+|default)\s*:\s*){1,7}break;" *
> 
> Miscellanea:
> 
> o Move or coalesce a couple label blocks above a default: block.
> 
> Signed-off-by: Joe Perches 

Acked-by: Gustavo A. R. Silva 

Thanks
--
Gustavo

> ---
> 
> Compiled allyesconfig x86-64 only.
> A few files for other arches were not compiled.
> 
>  arch/arm/mach-mmp/pm-pxa910.c |  2 +-
>  arch/arm64/kvm/handle_exit.c  |  2 +-
>  arch/mips/kernel/cpu-probe.c  |  2 +-
>  arch/mips/math-emu/cp1emu.c   |  2 +-
>  arch/s390/pci/pci.c   |  2 +-
>  crypto/tcrypt.c   |  4 ++--
>  drivers/ata/sata_mv.c |  2 +-
>  drivers/atm/lanai.c   |  2 +-
>  drivers/gpu/drm/i915/display/intel_sprite.c   |  2 +-
>  drivers/gpu/drm/nouveau/nvkm/engine/disp/hdmi.c   |  2 +-
>  drivers/hid/wacom_wac.c   |  2 +-
>  drivers/i2c/busses/i2c-i801.c |  2 +-
>  drivers/infiniband/ulp/rtrs/rtrs-clt.c| 14 +++---
>  drivers/infiniband/ulp/rtrs/rtrs-srv.c|  6 +++---
>  drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c   |  2 +-
>  drivers/irqchip/irq-vic.c |  4 ++--
>  drivers/md/dm.c   |  2 +-
>  drivers/media/dvb-frontends/drxd_hard.c   |  2 +-
>  drivers/media/i2c/ov5640.c|  2 +-
>  drivers/media/i2c/ov6650.c|  5 ++---
>  drivers/media/i2c/smiapp/smiapp-core.c|  2 +-
>  drivers/media/i2c/tvp5150.c   |  2 +-
>  drivers/media/pci/ddbridge/ddbridge-core.c|  2 +-
>  drivers/media/usb/cpia2/cpia2_core.c  |  2 +-
>  drivers/mfd/iqs62x.c  |  3 +--
>  drivers/mmc/host/atmel-mci.c  |  2 +-
>  drivers/mtd/nand/raw/nandsim.c|  2 +-
>  drivers/net/ethernet/intel/e1000e/phy.c   |  2 +-
>  drivers/net/ethernet/intel/fm10k/fm10k_pf.c   |  2 +-
>  drivers/net/ethernet/intel/i40e/i40e_adminq.c |  2 +-
>  drivers/net/ethernet/intel/i40e/i40e_txrx.c   |  2 +-
>  drivers/net/ethernet/intel/iavf/iavf_txrx.c   |  2 +-
>  drivers/net/ethernet/intel/igb/e1000_phy.c|  2 +-
>  drivers/net/ethernet/intel/ixgbe/ixgbe_82599.c|  2 +-
>  drivers/net/ethernet/intel/ixgbe/ixgbe_main.c |  2 +-
>  drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c|  2 +-
>  drivers/net/ethernet/intel/ixgbevf/vf.c   |  2 +-
>  drivers/net/ethernet/netronome/nfp/nfpcore/nfp6000_pcie.c |  2 +-
>  drivers/net/ethernet/qlogic/qed/qed_mcp.c |  2 +-
>  drivers/net/ethernet/sfc/falcon/farch.c   |  2 +-
>  drivers/net/ethernet/sfc/farch.c  |  2 +-
>  drivers/net/phy/adin.c|  3 +--
>  drivers/net/usb/pegasus.c |  4 ++--
>  drivers/net/usb/usbnet.c  |  2 +-
>  drivers/net/wireless/ath/ath5k/eeprom.c   |  2 +-
>  drivers/net/wireless/mediatek/mt7601u/dma.c   |  8 
>  drivers/nvme/host/core.c  | 12 ++--
>  drivers/pcmcia/db1xxx_ss.c|  4 ++--
>  drivers/power/supply/abx500_chargalg.c|  2 +-
>  drivers/power/supply/charger-manager.c|  2 +-
>  drivers/rtc/rtc-pcf85063.c|  2 +-
>  drivers/s390/scsi/zfcp_fsf.c  |  2 +-
>  drivers/scsi/aic7xxx/aic79xx_core.c   |  4 ++--
>  drivers/scsi/aic94xx/aic94xx_tmf.c|  2 +-
>  drivers/scsi/lpfc/lpfc_sli.c  |  2 +-
>  drivers/scsi/smartpqi/smartpqi_init.c |  2 +-
>  drivers/scsi/sr.c |  2 +-
>  drivers/tty/serial/sunsu.c|  2 +-
>  drivers/tty/serial/sunzilog.c |  2 +-
>  drivers/tty/vt/vt_ioctl.c |  2 +-
>  drivers/usb/dwc3/core.c   |  2 +-
>  drivers/usb/gadget/legacy/inode.c | 

[trivial PATCH] treewide: Convert switch/case fallthrough; to break;

2020-09-09 Thread Joe Perches
fallthrough to a separate case/default label break; isn't very readable.

Convert pseudo-keyword fallthrough; statements to a simple break; when
the next label is case or default and the only statement in the next
label block is break;

Found using:

$ grep-2.5.4 -rP --include=*.[ch] -n 
"fallthrough;(\s*(case\s+\w+|default)\s*:\s*){1,7}break;" *

Miscellanea:

o Move or coalesce a couple label blocks above a default: block.

Signed-off-by: Joe Perches 
---

Compiled allyesconfig x86-64 only.
A few files for other arches were not compiled.

 arch/arm/mach-mmp/pm-pxa910.c |  2 +-
 arch/arm64/kvm/handle_exit.c  |  2 +-
 arch/mips/kernel/cpu-probe.c  |  2 +-
 arch/mips/math-emu/cp1emu.c   |  2 +-
 arch/s390/pci/pci.c   |  2 +-
 crypto/tcrypt.c   |  4 ++--
 drivers/ata/sata_mv.c |  2 +-
 drivers/atm/lanai.c   |  2 +-
 drivers/gpu/drm/i915/display/intel_sprite.c   |  2 +-
 drivers/gpu/drm/nouveau/nvkm/engine/disp/hdmi.c   |  2 +-
 drivers/hid/wacom_wac.c   |  2 +-
 drivers/i2c/busses/i2c-i801.c |  2 +-
 drivers/infiniband/ulp/rtrs/rtrs-clt.c| 14 +++---
 drivers/infiniband/ulp/rtrs/rtrs-srv.c|  6 +++---
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c   |  2 +-
 drivers/irqchip/irq-vic.c |  4 ++--
 drivers/md/dm.c   |  2 +-
 drivers/media/dvb-frontends/drxd_hard.c   |  2 +-
 drivers/media/i2c/ov5640.c|  2 +-
 drivers/media/i2c/ov6650.c|  5 ++---
 drivers/media/i2c/smiapp/smiapp-core.c|  2 +-
 drivers/media/i2c/tvp5150.c   |  2 +-
 drivers/media/pci/ddbridge/ddbridge-core.c|  2 +-
 drivers/media/usb/cpia2/cpia2_core.c  |  2 +-
 drivers/mfd/iqs62x.c  |  3 +--
 drivers/mmc/host/atmel-mci.c  |  2 +-
 drivers/mtd/nand/raw/nandsim.c|  2 +-
 drivers/net/ethernet/intel/e1000e/phy.c   |  2 +-
 drivers/net/ethernet/intel/fm10k/fm10k_pf.c   |  2 +-
 drivers/net/ethernet/intel/i40e/i40e_adminq.c |  2 +-
 drivers/net/ethernet/intel/i40e/i40e_txrx.c   |  2 +-
 drivers/net/ethernet/intel/iavf/iavf_txrx.c   |  2 +-
 drivers/net/ethernet/intel/igb/e1000_phy.c|  2 +-
 drivers/net/ethernet/intel/ixgbe/ixgbe_82599.c|  2 +-
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c |  2 +-
 drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c|  2 +-
 drivers/net/ethernet/intel/ixgbevf/vf.c   |  2 +-
 drivers/net/ethernet/netronome/nfp/nfpcore/nfp6000_pcie.c |  2 +-
 drivers/net/ethernet/qlogic/qed/qed_mcp.c |  2 +-
 drivers/net/ethernet/sfc/falcon/farch.c   |  2 +-
 drivers/net/ethernet/sfc/farch.c  |  2 +-
 drivers/net/phy/adin.c|  3 +--
 drivers/net/usb/pegasus.c |  4 ++--
 drivers/net/usb/usbnet.c  |  2 +-
 drivers/net/wireless/ath/ath5k/eeprom.c   |  2 +-
 drivers/net/wireless/mediatek/mt7601u/dma.c   |  8 
 drivers/nvme/host/core.c  | 12 ++--
 drivers/pcmcia/db1xxx_ss.c|  4 ++--
 drivers/power/supply/abx500_chargalg.c|  2 +-
 drivers/power/supply/charger-manager.c|  2 +-
 drivers/rtc/rtc-pcf85063.c|  2 +-
 drivers/s390/scsi/zfcp_fsf.c  |  2 +-
 drivers/scsi/aic7xxx/aic79xx_core.c   |  4 ++--
 drivers/scsi/aic94xx/aic94xx_tmf.c|  2 +-
 drivers/scsi/lpfc/lpfc_sli.c  |  2 +-
 drivers/scsi/smartpqi/smartpqi_init.c |  2 +-
 drivers/scsi/sr.c |  2 +-
 drivers/tty/serial/sunsu.c|  2 +-
 drivers/tty/serial/sunzilog.c |  2 +-
 drivers/tty/vt/vt_ioctl.c |  2 +-
 drivers/usb/dwc3/core.c   |  2 +-
 drivers/usb/gadget/legacy/inode.c |  2 +-
 drivers/usb/gadget/udc/pxa25x_udc.c   |  4 ++--
 drivers/usb/host/ohci-hcd.c   |  2 +-
 drivers/usb/isp1760/isp1760-hcd.c |  2 +-
 drivers/usb/musb/cppi_dma.c

Re: [PATCH] scsi: ibmvfc: Fix error return in ibmvfc_probe()

2020-09-09 Thread Tyrel Datwyler
On 9/7/20 1:39 AM, Jing Xiangfeng wrote:
> Fix to return error code PTR_ERR() from the error handling case instead
> of 0.
> 
> Signed-off-by: Jing Xiangfeng 

Acked-by: Tyrel Datwyler 

> ---
>  drivers/scsi/ibmvscsi/ibmvfc.c | 1 +
>  1 file changed, 1 insertion(+)
> 
> diff --git a/drivers/scsi/ibmvscsi/ibmvfc.c b/drivers/scsi/ibmvscsi/ibmvfc.c
> index ea7c8930592d..70daa0605082 100644
> --- a/drivers/scsi/ibmvscsi/ibmvfc.c
> +++ b/drivers/scsi/ibmvscsi/ibmvfc.c
> @@ -4928,6 +4928,7 @@ static int ibmvfc_probe(struct vio_dev *vdev, const 
> struct vio_device_id *id)
>   if (IS_ERR(vhost->work_thread)) {
>   dev_err(dev, "Couldn't create kernel thread: %ld\n",
>   PTR_ERR(vhost->work_thread));
> + rc = PTR_ERR(vhost->work_thread);
>   goto free_host_mem;
>   }
>  
> 



Re: remove the last set_fs() in common code, and remove it for x86 and powerpc v3

2020-09-09 Thread Segher Boessenkool
On Wed, Sep 09, 2020 at 10:31:34AM -0700, Linus Torvalds wrote:
> And apparently there are people working on this on the gcc side too,
> so it won't just be clang-specific. Nor kernel-specific in that Nick
> tells me some other projects are looking at using that asm goto with
> outputs too.

It will not work like this in GCC, no.  The LLVM people know about that.
I do not know why they insist on pushing this, being incompatible and
everything.


Segher


Re: [RFC PATCH v2 1/3] mm/gup: fix gup_fast with dynamic page table folding

2020-09-09 Thread Jason Gunthorpe
On Wed, Sep 09, 2020 at 07:25:34PM +0200, Gerald Schaefer wrote:
> I actually had to draw myself a picture to get some hold of
> this, or rather a walk-through with a certain pud-crossing
> range in a folded 3-level scenario. Not sure if I would have
> understood my explanation above w/o that, but I hope you can
> make some sense out of it. Or draw yourself a picture :-)

What I don't understand is how does anything work with S390 today?

If the fix is only to change pxx_addr_end() then generic code
like mm/pagewalk.c will iterate over a *different list* of page table
entries. 

Its choice of entries to look at is entirely driven by pxx_addr_end().

Which suggests to me that mm/pagewalk.c also doesn't work properly
today on S390 and this issue is not really about stack variables?

Fundamentally, pXX_offset() and pXX_addr_end() must be consistent
with each other: if pXX_offset() is folded then pXX_addr_end() must
cause a single iteration of that level.

Jason


Re: remove the last set_fs() in common code, and remove it for x86 and powerpc v3

2020-09-09 Thread Linus Torvalds
On Thu, Sep 3, 2020 at 7:28 AM Al Viro  wrote:
>
> I can live with this series; do you want that in vfs.git#for-next?

Well, it's apparently there now (at least it's in your base.set_fs
branch, I didn't check actual -next).

So this is just a heads-up that I plan to merge the "asm goto" changes
on top of this during 5.10. Nick did the patch to make my patch-set
work either with or without the actual asm goto support, and I've been
running it privately now for several months.

And apparently there are people working on this on the gcc side too,
so it won't just be clang-specific. Nor kernel-specific in that Nick
tells me some other projects are looking at using that asm goto with
outputs too.

Anyway, the actual patch to use asm goto with outputs is fairly small
and not that interesting to people (since no released compiler
supports it), but part of the infrastructure to make it tiny is to
just get rid of the inlined "__get_user()" and "__put_user()" stuff.
I've ranted against those functions for a few years by now, so part of
this is to stop inlining them and make people think they are "good",
but part of it is also that those macros and inline functions are the
main remaining ones that mess with this all.

I'm attaching the two __get_user/__put_user patches here in case
anybody cares, but these are the pre-rebased ones, I'll make them work
with the new world order as it happens. The main change is:

 (a) unify around a common special calling convention:
   - %bx is clobbered
   - %cx contains the user address on input, and the error value on output
   - %ax/%dx contains the actual value (input for put, output for get,
of course)

 (b) unify around using just a "call", using the model that
get/put_user already did.
   - use "*_nocheck" for the double-underscore versions
   - this still has to use inline asm because the calling convention is odd
   - otherwise basically just a "call __{get,put}_user_[nocheck_]X"
where X is the size.

IOW, we unify around one single calling convention, and one single
model for actually getting things done.

I still want to remove the double-underscore versions entirely some
day - they have absolutely zero advantages compared to the full "do
address_ok as part of the operation" - but that's a separate thing. At
least they can be unified.

And the reason for this all is obviously that now *only* the
"unsafe_{get,put}_user()" cases with the error label output are the
"fast inlined" cases. They are the only ones that _can_ be done
quickly inline, since the slow clac/stac is not part of them. Plus
they already have that unified usage model of the error label, even if
unsafe_get_user() currently does it manually because "asm goto" with
outputs doesn't work in existing compilers.

Comments?

I suspect people won't care, but I thought I'd post these so that
there won't be any surprises during the next merge window when I apply
them after merging the set_fs() removal branch..

 Linus
From 52c7574a0d15722df52158a3d766803662d9a6ff Mon Sep 17 00:00:00 2001
From: Linus Torvalds 
Date: Wed, 8 Apr 2020 12:50:01 -0700
Subject: [PATCH 1/6] x86: Make __get_user() generate an out-of-line call

Instead of inlining the whole stac/lfence/mov/clac sequence (which also
requires individual exception table entries and several asm instruction
alternatives entries), just generate "call __get_user_nocheck_X" for the
__get_user() cases.

We can use all the same infrastructure that we already do for the
regular "get_user()", and the end result is simpler source code, and
much simpler code generation.

It also means that when I introduce asm goto with input for
"unsafe_get_user()", there are no nasty interactions with the
__get_user() code.

Cc: Al Viro 
Cc: Thomas Gleixner 
Cc: Ingo Molnar 
Cc: Borislav Petkov 
Cc: Peter Zijlstra 
Signed-off-by: Linus Torvalds 
---
 arch/x86/include/asm/uaccess.h | 128 ++---
 arch/x86/lib/getuser.S |  60 
 2 files changed, 114 insertions(+), 74 deletions(-)

diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index ecefaffd15d4..cf5a3f61db3b 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -96,25 +96,14 @@ static inline bool pagefault_disabled(void);
 	likely(!__range_not_ok(addr, size, user_addr_max()));		\
 })
 
-/*
- * These are the main single-value transfer routines.  They automatically
- * use the right size if we just have the right pointer type.
- *
- * This gets kind of ugly. We want to return _two_ values in "get_user()"
- * and yet we don't want to do any pointers, because that is too much
- * of a performance impact. Thus we have a few rather ugly macros here,
- * and hide all the ugliness from the user.
- *
- * The "__xxx" versions of the user access functions are versions that
- * do not verify the address space, that must have been done previously
- * with a separate "access_ok()" call (this is used when we do multiple
- * 

Re: [RFC PATCH v2 1/3] mm/gup: fix gup_fast with dynamic page table folding

2020-09-09 Thread Gerald Schaefer
On Wed, 9 Sep 2020 09:18:46 -0700
Dave Hansen  wrote:

> On 9/9/20 5:29 AM, Gerald Schaefer wrote:
> > This only works well as long as there are real pagetable pointers involved,
> > that can also be used for iteration. For gup_fast, or any other future
> > pagetable walkers using the READ_ONCE logic w/o lock, that is not true.
> > There are pointers involved to local pXd values on the stack, because of
> > the READ_ONCE logic, and our middle-level iteration will suddenly iterate
> > over such stack pointers instead of pagetable pointers.
> 
> By "There are pointers involved to local pXd values on the stack", did
> you mean "locate" instead of "local"?  That sentence confused me.
> 
> Which code is it, exactly that allocates these troublesome on-stack pXd
> values, btw?

It is the gup_pXd_range() call sequence in mm/gup.c. It starts in
gup_pgd_range() with "pgdp = pgd_offset(current->mm, addr)" and then
the "pgd_t pgd = READ_ONCE(*pgdp)" which creates the first local
stack variable "pgd".

The next-level call to gup_p4d_range() gets this "pgd" value as
input, but not the original pgdp pointer where it was read from.
This is already the essential difference to other pagetable walkers
like e.g. walk_pXd_range() in mm/pagewalk.c, where the original
pointer is passed through. With READ_ONCE, that pointer must not
be further de-referenced, so instead the value is passed over.

In gup_p4d_range() we then have "p4dp = p4d_offset(&pgd, addr)",
with &pgd being a pointer to the passed over pgd value, so that's
the first pXd pointer that does not point directly to the pXd in
the page table, but a local stack variable.

With folded p4d, p4d_offset(&pgd, addr) will simply return
the passed-in &pgd pointer, so we now also have p4dp point to that.
That continues with "p4d_t p4d = READ_ONCE(*p4dp)", and that second
stack variable passed to gup_huge_pud() and so on. Due to inlining,
all those variables will not really be passed anywhere, but simply
sit on the stack.

So far, IIUC, that would also happen on x86 (or everywhere else
actually) for folded levels, i.e. some pXd_offset() calls would
simply return the passed in (stack) value pointer. This works
as designed, and it will not lead to the "iteration over stack
pointer" for anybody but s390, because the pXd_addr_end()
boundaries usually take care that you always return to pgd
level for iteration, and that is the only level with a real
pagetable pointer. For s390, we stay at the first non-folded
level and do the iteration there, which is fine for other
pagetable walkers using the original pointers, but not for
the READ_ONCE-style gup_fast.

I actually had to draw myself a picture to get some hold of
this, or rather a walk-through with a certain pud-crossing
range in a folded 3-level scenario. Not sure if I would have
understood my explanation above w/o that, but I hope you can
make some sense out of it. Or draw yourself a picture :-)


Re: [RFC PATCH v2 1/3] mm/gup: fix gup_fast with dynamic page table folding

2020-09-09 Thread Dave Hansen
On 9/9/20 5:29 AM, Gerald Schaefer wrote:
> This only works well as long as there are real pagetable pointers involved,
> that can also be used for iteration. For gup_fast, or any other future
> pagetable walkers using the READ_ONCE logic w/o lock, that is not true.
> There are pointers involved to local pXd values on the stack, because of
> the READ_ONCE logic, and our middle-level iteration will suddenly iterate
> over such stack pointers instead of pagetable pointers.

By "There are pointers involved to local pXd values on the stack", did
you mean "locate" instead of "local"?  That sentence confused me.

Which code is it, exactly that allocates these troublesome on-stack pXd
values, btw?

> This will be addressed by making the pXd_addr_end() dynamic, for which
> we need to see the pXd value in order to determine its level / type.

Thanks for the explanation!


Re: [RFC PATCH v2 0/3] mm/gup: fix gup_fast with dynamic page table folding

2020-09-09 Thread Gerald Schaefer
On Tue, 8 Sep 2020 19:36:50 +0200
Gerald Schaefer  wrote:

[..]
> 
> It seems now that the generalization is very well accepted so far,
> apart from some apparent issues on arm. Also, merging 2 + 3 and
> putting them first seems to be acceptable, so we could do that for
> v3, if there are no objections.
> 
> Of course, we first need to address the few remaining issues for
> arm(32?), which do look quite confusing to me so far. BTW, sorry for
> the compile error with patch 3, I guess we did the cross-compile only
> for 1 + 2 applied, to see the bloat-o-meter changes. But I guess
> patch 3 already proved its usefulness by that :-)

Umm, replace "arm" with "power", sorry. No issues on arm so far, but
also no ack I think.

Thanks to Christophe for the power change, and to Mike for volunteering
for some cross compilation and cross-arch testing. Will send v3 with
merged and re-ordered patches after some more testing.


Re: [PATCH v3] powerpc/pseries: explicitly reschedule during drmem_lmb list traversal

2020-09-09 Thread Michael Ellerman
On Thu, 13 Aug 2020 10:11:31 -0500, Nathan Lynch wrote:
> The drmem lmb list can have hundreds of thousands of entries, and
> unfortunately lookups take the form of linear searches. As long as
> this is the case, traversals have the potential to monopolize the CPU
> and provoke lockup reports, workqueue stalls, and the like unless
> they explicitly yield.
> 
> Rather than placing cond_resched() calls within various
> for_each_drmem_lmb() loop blocks in the code, put it in the iteration
> expression of the loop macro itself so users can't omit it.
> 
> [...]

Applied to powerpc/next.

[1/1] powerpc/pseries: explicitly reschedule during drmem_lmb list traversal
  https://git.kernel.org/powerpc/c/9d6792ffe140240ae54c881cc4183f9acc24b4df

cheers


Re: [PATCH 0/5] powerpc: Remove five unused variables

2020-09-09 Thread Michael Ellerman
On Tue, 19 Nov 2019 14:14:29 +0800, zhengbin wrote:
> zhengbin (5):
>   powerpc/fadump: Remove set but not used variable 'elf'
>   powerpc/perf: Remove set but not used variable 'target'
>   powerpc/powernv: Remove set but not used variable 'total_vfs'
>   powerpc/powernv: Remove set but not used variable 'pdn'
>   powerpc/powernv: Remove set but not used variable 'parent'
> 
> [...]

Patches 1, 2 & 5 applied to powerpc/next.

[1/5] powerpc/fadump: Remove set but not used variable 'elf'
  https://git.kernel.org/powerpc/c/738e6cad0ace88edec8f4ffa082749ad5df26409
[2/5] powerpc/perf: Remove set but not used variable 'target'
  https://git.kernel.org/powerpc/c/ef23cf9a89a7aec19a29d548d1e219d436b23b6e
[5/5] powerpc/powernv: Remove set but not used variable 'parent'
  https://git.kernel.org/powerpc/c/18102e4bcc47f5b5ac70e2e4461d022c1ee6df24

cheers


Re: [PATCH v2 0/4] ocxl: Cleanup AFU interrupt allocation

2020-09-09 Thread Michael Ellerman
On Fri, 3 Apr 2020 17:38:34 +0200, Frederic Barrat wrote:
> Short series to cleanup AFU interrupt allocation for opencapi.
> Current code was using its own allocation service, calling opal
> directly to get the trigger page. This is not needed and we can use
> xive to achieve the same thing. The only caveat is that the trigger
> page address is only valid after the interrupt has been mapped, but
> that is not a problem with the way the code is using it.
> No functional change.
> 
> [...]

Applied to powerpc/next.

[1/4] scsi: cxlflash: Access interrupt trigger page from xive directly
  https://git.kernel.org/powerpc/c/1e89da5ef9c28c673e86048c89ef9495618d987d
[2/4] ocxl: Access interrupt trigger page from xive directly
  https://git.kernel.org/powerpc/c/ad857d47df6a1adc9798558701dd5426643b859f
[3/4] ocxl: Don't return trigger page when allocating an interrupt
  https://git.kernel.org/powerpc/c/dde6f18a8779dcd88d9fd5d6336032fee7e07fcd
[4/4] ocxl: Remove custom service to allocate interrupts
  https://git.kernel.org/powerpc/c/374f6178f3483dcad151fc14b2fad92ed6652f07

cheers


Re: [PATCH] powerpc: hwrng; fix missing of_node_put()

2020-09-09 Thread Michael Ellerman
On Mon, 2 Jul 2018 11:08:16 +0200, Nicholas Mc Guire wrote:
>  The call to of_find_compatible_node() returns a node pointer with refcount
> incremented thus it must be explicitly decremented here before returning.

Applied to powerpc/next.

[1/1] powerpc/pseries: Fix missing of_node_put() in rng_init()
  https://git.kernel.org/powerpc/c/67c3e59443f5fc77be39e2ce0db75fbfa78c7965

cheers


Re: [PATCH] powerpc: icp-hv: fix missing of_node_put in success path

2020-09-09 Thread Michael Ellerman
On Wed, 4 Jul 2018 10:03:27 +0200, Nicholas Mc Guire wrote:
>  Both of_find_compatible_node() and of_find_node_by_type() will
> return a refcounted node on success - thus for the success path
> the node must be explicitly released with a of_node_put().

Applied to powerpc/next.

[1/1] powerpc/icp-hv: Fix missing of_node_put() in success path
  https://git.kernel.org/powerpc/c/d3e669f31ec35856f5e85df9224ede5bdbf1bc7b

cheers


Re: [PATCH 1/6] powerpc/powernv/smp: Fix spurious DBG() warning

2020-09-09 Thread Michael Ellerman
On Tue, 4 Aug 2020 10:54:05 +1000, Oliver O'Halloran wrote:
> When building with W=1 we get the following warning:
> 
>  arch/powerpc/platforms/powernv/smp.c: In function 
> ‘pnv_smp_cpu_kill_self’:
>  arch/powerpc/platforms/powernv/smp.c:276:16: error: suggest braces around
>   empty body in an ‘if’ statement [-Werror=empty-body]
>276 |  cpu, srr1);
>|^
>  cc1: all warnings being treated as errors
> 
> [...]

Applied to powerpc/next.

[1/6] powerpc/powernv/smp: Fix spurious DBG() warning
  https://git.kernel.org/powerpc/c/f6bac19cf65c5be21d14a0c9684c8f560f2096dd
[2/6] powerpc/powernv: Include asm/powernv.h from the local powernv.h
  https://git.kernel.org/powerpc/c/8471c1dd93de9a2278d41c527b76291e4ace8f1c
[3/6] powerpc/powernv: Staticify functions without prototypes
  https://git.kernel.org/powerpc/c/3b70464aa78917e88c1d4bfc2100c344c0eda8e0
[4/6] powerpc/powernv: Fix spurious kerneldoc warnings in opal-prd.c
  https://git.kernel.org/powerpc/c/fb248c3121af713f31736af359608491544cfc23
[5/6] powerpc/powernv: Remove set but not used variable 'parent'
  https://git.kernel.org/powerpc/c/18102e4bcc47f5b5ac70e2e4461d022c1ee6df24
[6/6] powerpc/nx: Don't pack struct coprocessor_request_block
  https://git.kernel.org/powerpc/c/3ced132a055c4e5046d21732393ae6848ff309e0

cheers


Re: [PATCH] cxl: Rework error message for incompatible slots

2020-09-09 Thread Michael Ellerman
On Tue, 7 Apr 2020 13:56:01 +0200, Frederic Barrat wrote:
> Improve the error message shown if a capi adapter is plugged on a
> capi-incompatible slot directly under the PHB (no intermediate switch).

Applied to powerpc/next.

[1/1] cxl: Rework error message for incompatible slots
  https://git.kernel.org/powerpc/c/40ac790d99c6dd16b367d5c2339e446a5f1b0593

cheers


Re: [PATCH] powerpc/oprofile: fix spelling mistake "contex" -> "context"

2020-09-09 Thread Michael Ellerman
On Tue, 4 Aug 2020 18:43:16 +0100, Colin King wrote:
> There is a spelling mistake in a pr_debug message. Fix it.

Applied to powerpc/next.

[1/1] powerpc/oprofile: fix spelling mistake "contex" -> "context"
  https://git.kernel.org/powerpc/c/346427e668163e85cbbe14e4d9a2ddd49df1536c

cheers


Re: [v3 1/2] dts: ppc: t4240rdb: remove interrupts property

2020-09-09 Thread Michael Ellerman
On Wed, 27 May 2020 11:42:27 +0800, Biwen Li wrote:
> Since the interrupt pin for RTC DS1374 is not connected
> to the CPU on T4240RDB, remove the interrupt property
> from the device tree.
> 
> This also fix the following warning for hwclock.util-linux:
> $ hwclock.util-linux
> hwclock.util-linux: select() to /dev/rtc0
> to wait for clock tick timed out

Applied to powerpc/next.

[1/2] powerpc/dts/t4240rdb: remove interrupts property
  https://git.kernel.org/powerpc/c/8c7614d648037b0776e0b76cb62911be3b059ea4
[2/2] powerc/dtc/t1024rdb: remove interrupts property
  https://git.kernel.org/powerpc/c/843dc8ee23d1b353fa9cc24da3e52be0111d5931

cheers


Re: [PATCH 1/2] powerpc/vmemmap: Fix memory leak with vmemmap list allocation failures.

2020-09-09 Thread Michael Ellerman
On Fri, 31 Jul 2020 17:04:59 +0530, Aneesh Kumar K.V wrote:
> If we fail to allocate vmemmap list, we don't keep track of allocated
> vmemmap block buf. Hence on section deactivate we skip vmemmap block
> buf free. This results in memory leak.

Applied to powerpc/next.

[1/2] powerpc/vmemmap: Fix memory leak with vmemmap list allocation failures.
  https://git.kernel.org/powerpc/c/ccaea15296f9773abd43aaa17ee4b88848e4a505
[2/2] powerpc/vmemmap: Don't warn if we don't find a mapping vmemmap list entry
  https://git.kernel.org/powerpc/c/1c0a7ac0ec63ee626f669c9a4e278f6ae1dbfcf2

cheers


Re: [PATCH] arch/powerpc: use simple i2c probe function

2020-09-09 Thread Michael Ellerman
On Fri, 7 Aug 2020 17:27:13 +0200, Stephen Kitt wrote:
> The i2c probe functions here don't use the id information provided in
> their second argument, so the single-parameter i2c probe function
> ("probe_new") can be used instead.
> 
> This avoids scanning the identifier tables during probes.

Applied to powerpc/next.

[1/1] powerpc: Use simple i2c probe function
  https://git.kernel.org/powerpc/c/6c9100ea39d209e1625ba0fe06134192d9c4752a

cheers


Re: [PATCH 0/2] powerpc: unrel_branch_check.sh: enable llvm-objdump

2020-09-09 Thread Michael Ellerman
On Wed, 12 Aug 2020 18:10:34 +1000, Stephen Rothwell wrote:
> These 2 patches enable this script to work properly when llvm-objtool
> is being used.
> 
> They depend on my previous series that makes this script suck less.

Applied to powerpc/next.

[1/2] powerpc: unrel_branch_check.sh: use nm to find symbol value
  https://git.kernel.org/powerpc/c/b71dca9891b330d5c2d3ff5d41704aa6f64f8e32
[2/2] powerpc: unrel_branch_check.sh: enable the use of llvm-objdump v9, 10 or 11
  https://git.kernel.org/powerpc/c/6b1992bcdee8b86a74362192d4d8906731918bcc

cheers


Re: [PATCH 0/7] powerpc: unrel_branch_check.sh: make it suck less

2020-09-09 Thread Michael Ellerman
On Wed, 12 Aug 2020 00:04:27 +1000, Stephen Rothwell wrote:
> Michael Ellerman: "who wants to make
> arch/powerpc/tools/unrel_branch_check.sh suck less"
> 
> This series is based off the current powerpc/next branch and keeps the
> same functionality as the original except that it suppresses some error
> messages for early failures that still cause this script to succeed
> (as it always did).
> 
> [...]

Applied to powerpc/next.

[1/7] powerpc: unrel_branch_check.sh: fix shellcheck complaints
  https://git.kernel.org/powerpc/c/d9de6b0da85c9f51734f7648f6e860b89f94c801
[2/7] powerpc: unrel_branch_check.sh: simplify and combine some executions
  https://git.kernel.org/powerpc/c/20ff8ec182160df86571a8af5773ff1e52837d73
[3/7] powerpc: unrel_branch_check.sh: simplify objdump's asm output
  https://git.kernel.org/powerpc/c/4e71106c343c625c0bf72a65b244e35e7d2cd037
[4/7] powerpc: unrel_branch_check.sh: convert grep | sed | awk to just sed
  https://git.kernel.org/powerpc/c/3d97abbc9f6fe90973551f3e3eef47ffef863114
[5/7] powerpc: unrel_branch_check.sh: simplify and tidy up the final loop
  https://git.kernel.org/powerpc/c/b84eaab6ede6477484edc043456cf7d7cfc7f8b3
[6/7] powerpc: unrel_branch_check.sh: fix up the file header
  https://git.kernel.org/powerpc/c/3745ae63b405b09c86718f95d96c4b2d2827b087
[7/7] powerpc: unrel_branch_check.sh: exit silently for early errors
  https://git.kernel.org/powerpc/c/af13a2244d59c4d63a25abd8257cbaef9d9ffebc

cheers


Re: [PATCH] powerpc/tools: Remove 90 line limit in checkpatch script

2020-09-09 Thread Michael Ellerman
On Fri, 28 Aug 2020 12:05:42 +1000, Russell Currey wrote:
> As of commit bdc48fa11e46, scripts/checkpatch.pl now has a default line
> length warning of 100 characters.  The powerpc wrapper script was using
> a length of 90 instead of 80 in order to make checkpatch less
> restrictive, but now it's making it more restrictive instead.
> 
> I think it makes sense to just use the default value now.

Applied to powerpc/next.

[1/1] powerpc/tools: Remove 90 line limit in checkpatch script
  https://git.kernel.org/powerpc/c/0fb4871bcc8997acbb8edf14b301fc150101d6c0

cheers


Re: [PATCH] powerpc/64s: handle ISA v3.1 local copy-paste context switches

2020-09-09 Thread Michael Ellerman
On Tue, 25 Aug 2020 17:55:35 +1000, Nicholas Piggin wrote:
> In ISA v3.1 the copy-paste facility has a new memory move functionality
> which allows the copy buffer to be pasted to domestic memory (RAM) as
> opposed to foreign memory (accelerator).
> 
> This means the POWER9 trick of avoiding the cp_abort on context switch if
> the process had not mapped foreign memory does not work on POWER10. Do the
> cp_abort unconditionally there.
> 
> [...]

Applied to powerpc/next.

[1/1] powerpc/64s: handle ISA v3.1 local copy-paste context switches
  https://git.kernel.org/powerpc/c/dc462267d2d7aacffc3c1d99b02d7a7c59db7c66

cheers


Re: [PATCH] powerpc/pseries/eeh: Fix dumb linebreaks

2020-09-09 Thread Michael Ellerman
On Tue, 18 Aug 2020 14:45:57 +1000, Oliver O'Halloran wrote:
> These annoy me every time I see them. Why are they here? They're not even
> needed for 80cols compliance.

Applied to powerpc/next.

[1/1] powerpc/pseries/eeh: Fix dumb linebreaks
  https://git.kernel.org/powerpc/c/10bf59d923c2766ec8d6f0243481c865c7db9979

cheers


Re: [PATCH v3] powerpc/pseries: explicitly reschedule during drmem_lmb list traversal

2020-09-09 Thread Michael Ellerman
On Thu, 13 Aug 2020 10:11:31 -0500, Nathan Lynch wrote:
> The drmem lmb list can have hundreds of thousands of entries, and
> unfortunately lookups take the form of linear searches. As long as
> this is the case, traversals have the potential to monopolize the CPU
> and provoke lockup reports, workqueue stalls, and the like unless
> they explicitly yield.
> 
> Rather than placing cond_resched() calls within various
> for_each_drmem_lmb() loop blocks in the code, put it in the iteration
> expression of the loop macro itself so users can't omit it.
> 
> [...]

Applied to powerpc/next.

[1/1] powerpc/pseries: explicitly reschedule during drmem_lmb list traversal
  https://git.kernel.org/powerpc/c/9d6792ffe140240ae54c881cc4183f9acc24b4df

cheers


Re: [PATCH] powerpc/pseries: explicitly reschedule during drmem_lmb list traversal

2020-09-09 Thread Michael Ellerman
On Tue, 28 Jul 2020 12:37:41 -0500, Nathan Lynch wrote:
> The drmem lmb list can have hundreds of thousands of entries, and
> unfortunately lookups take the form of linear searches. As long as
> this is the case, traversals have the potential to monopolize the CPU
> and provoke lockup reports, workqueue stalls, and the like unless
> they explicitly yield.
> 
> Rather than placing cond_resched() calls within various
> for_each_drmem_lmb() loop blocks in the code, put it in the iteration
> expression of the loop macro itself so users can't omit it.

Applied to powerpc/next.

[1/1] powerpc/pseries: explicitly reschedule during drmem_lmb list traversal
  https://git.kernel.org/powerpc/c/9d6792ffe140240ae54c881cc4183f9acc24b4df

cheers


Re: [PATCH] powerpc/64: Remove unused generic_secondary_thread_init()

2020-09-09 Thread Michael Ellerman
On Wed, 19 Aug 2020 11:57:04 +1000, Michael Ellerman wrote:
> The last caller was removed in 2014 in commit fb5a515704d7 ("powerpc:
> Remove platforms/wsp and associated pieces").
> 
> As Jordan noticed even though there are no callers, the code above in
> fsl_secondary_thread_init() falls through into
> generic_secondary_thread_init(). So we can remove the _GLOBAL but not
> the body of the function.
> 
> [...]

Applied to powerpc/next.

[1/1] powerpc/64: Remove unused generic_secondary_thread_init()
  https://git.kernel.org/powerpc/c/529d2bd56ada4b8a4904909042792879868208cd

cheers


Re: [PATCH 1/9] selftests/powerpc: Make using_hash_mmu() work on Cell & PowerMac

2020-09-09 Thread Michael Ellerman
On Wed, 19 Aug 2020 11:57:19 +1000, Michael Ellerman wrote:
> These platforms don't show the MMU in /proc/cpuinfo, but they always
> use hash, so teach using_hash_mmu() that.

Applied to powerpc/next.

[1/9] selftests/powerpc: Make using_hash_mmu() work on Cell & PowerMac
  https://git.kernel.org/powerpc/c/34c103342be3f9397e656da7c5cc86e97b91f514
[2/9] selftests/powerpc: Give the bad_accesses test longer to run
  https://git.kernel.org/powerpc/c/17c98a541dc9bb1162877af41cddbdca043f9a59
[3/9] selftests/powerpc: Move set_dscr() into rfi_flush.c
  https://git.kernel.org/powerpc/c/d89002397cfb2b65267d6688fe671ee1cf7c5f0d
[4/9] selftests/powerpc: Include asm/cputable.h from utils.h
  https://git.kernel.org/powerpc/c/178282a054dced1a08a9683d41ac08cbace2b2fe
[5/9] selftests/powerpc: Don't run DSCR tests on old systems
  https://git.kernel.org/powerpc/c/4c3c3c502575556c4bc1b401235e641863b1bce6
[6/9] selftests/powerpc: Skip security tests on older CPUs
  https://git.kernel.org/powerpc/c/3a31518a242dcb262b008d3bb5d4b1cf50cf4026
[7/9] selftests/powerpc: Skip L3 bank test on older CPUs
  https://git.kernel.org/powerpc/c/4871a10b7b5f6b0632bff229884dad1cb1e8dc37
[8/9] selftests/powerpc: Don't touch VMX/VSX on older CPUs
  https://git.kernel.org/powerpc/c/09275d717d1b2d7d5ed91f2140bb34246514a1b4
[9/9] selftests/powerpc: Properly handle failure in switch_endian_test
  https://git.kernel.org/powerpc/c/003d6f5fd2cc3b529f3e6c529bc4bb0792930212

cheers


Re: [PATCH 1/3] selftests/powerpc: Fix TM tests when CPU 0 is offline

2020-09-09 Thread Michael Ellerman
On Thu, 13 Aug 2020 11:34:43 +1000, Michael Ellerman wrote:
> Several of the TM tests fail spuriously if CPU 0 is offline, because
> they blindly try to affinitise to CPU 0.
> 
> Fix them by picking any online CPU and using that instead.

Applied to powerpc/next.

[1/3] selftests/powerpc: Fix TM tests when CPU 0 is offline
  https://git.kernel.org/powerpc/c/c0176429b7b07893a5c1fd38baff055c919ba9e3
[2/3] selftests/powerpc: Don't use setaffinity in tm-tmspr
  https://git.kernel.org/powerpc/c/769628710c33b18ede837bb488e1d24084b35592
[3/3] selftests/powerpc: Run tm-tmspr test for longer
  https://git.kernel.org/powerpc/c/b5a646a681f5d67ea5190a71d6e84a91efe63b7a

cheers


Re: [PATCH v5 0/4] Allow bigger 64bit window by removing default DMA window

2020-09-09 Thread Michael Ellerman
On Wed, 5 Aug 2020 00:04:51 -0300, Leonardo Bras wrote:
> There are some devices in which a hypervisor may only allow 1 DMA window
> to exist at a time, and in those cases, a DDW is never created to them,
> since the default DMA window keeps using this resource.
> 
> LoPAR recommends this procedure:
> 1. Remove the default DMA window,
> 2. Query for which configs the DDW can be created,
> 3. Create a DDW.
> 
> [...]

Applied to powerpc/next.

[1/4] powerpc/pseries/iommu: Create defines for operations in ibm,ddw-applicable
  https://git.kernel.org/powerpc/c/cac3e629086f1b2e31c87a6c9b0130d29843ae86
[2/4] powerpc/pseries/iommu: Update call to ibm,query-pe-dma-windows
  https://git.kernel.org/powerpc/c/80f0251231131d164eddab78d2b6c1b8e37d0093
[3/4] powerpc/pseries/iommu: Move window-removing part of remove_ddw into remove_dma_window
  https://git.kernel.org/powerpc/c/74d0b3994e147a2b503170b5e02f1d07dc086586
[4/4] powerpc/pseries/iommu: Allow bigger 64bit window by removing default DMA window
  https://git.kernel.org/powerpc/c/8c0d51592f6f0123953633d1ecf21e843fce0bfd

cheers


Re: [PATCH v3] powerpc: Warn about use of smt_snooze_delay

2020-09-09 Thread Michael Ellerman
On Wed, 2 Sep 2020 09:30:11 +0930, Joel Stanley wrote:
> It's not done anything for a long time. Save the percpu variable, and
> emit a warning to remind users to not expect it to do anything.
> 
> This uses pr_warn_once instead of pr_warn_ratelimit as testing
> 'ppc64_cpu --smt=off' on a 24 core / 4 SMT system showed the warning to
> be noisy, as the online/offline loop is slow.
> 
> [...]

Applied to powerpc/next.

[1/1] powerpc: Warn about use of smt_snooze_delay
  https://git.kernel.org/powerpc/c/a02f6d42357acf6e5de6ffc728e6e77faf3ad217

cheers


Re: [PATCH v2] powerpc: Warn about use of smt_snooze_delay

2020-09-09 Thread Michael Ellerman
On Tue, 30 Jun 2020 11:29:35 +0930, Joel Stanley wrote:
> It's not done anything for a long time. Save the percpu variable, and
> emit a warning to remind users to not expect it to do anything.
> 
> Fixes: 3fa8cad82b94 ("powerpc/pseries/cpuidle: smt-snooze-delay cleanup.")
> Cc: sta...@vger.kernel.org # v3.14
> Signed-off-by: Joel Stanley 
> --
> v2:
>  Use pr_warn instead of WARN
>  Reword and print process name with pid in message
>  Leave CPU_FTR_SMT test in
>  Add Fixes line
> 
> [...]

Applied to powerpc/next.

[1/1] powerpc: Warn about use of smt_snooze_delay
  https://git.kernel.org/powerpc/c/a02f6d42357acf6e5de6ffc728e6e77faf3ad217

cheers


Re: [PATCH] powerpc/powernv: Print helpful message when cores guarded

2020-09-09 Thread Michael Ellerman
On Thu, 1 Aug 2019 14:46:30 +0930, Joel Stanley wrote:
> Often the firmware will guard out cores after a crash. This is often
> undesirable, and is not immediately noticeable.
> 
> This adds an informative message when CPU device tree nodes are marked
> bad in the device tree.

Applied to powerpc/next.

[1/1] powerpc/powernv: Print helpful message when cores guarded
  https://git.kernel.org/powerpc/c/8f55984f530d7275531e17f36ea29229c2c410dd

cheers


Re: [PATCH v2] powerpc: Update documentation of ISA versions for Power10

2020-09-09 Thread Michael Ellerman
On Thu, 27 Aug 2020 14:05:56 +1000, Jordan Niethe wrote:
> Update the CPU to ISA Version Mapping document to include Power10 and
> ISA v3.1.

Applied to powerpc/next.

[1/1] powerpc: Update documentation of ISA versions for Power10
  https://git.kernel.org/powerpc/c/51a1588154cb1ddc4fe8fa786324dca398f1a458

cheers


Re: [PATCH] selftests/powerpc: Fix prefixes in alignment_handler signal handler

2020-09-09 Thread Michael Ellerman
On Mon, 24 Aug 2020 23:12:31 +1000, Jordan Niethe wrote:
> The signal handler in the alignment handler self test has the ability to
> jump over the instruction that triggered the signal. It does this by
> incrementing the PT_NIP in the user context by 4. If it were a prefixed
> instruction this will mean that the suffix is then executed which is
> incorrect. Instead check if the major opcode indicates a prefixed
> instruction (e.g. it is 1) and if so increment PT_NIP by 8.
> 
> [...]

Applied to powerpc/next.

[1/1] selftests/powerpc: Fix prefixes in alignment_handler signal handler
  https://git.kernel.org/powerpc/c/db96221a683342fd4775fd820a4d5376cd2f2ed0

cheers


Re: [PATCH] powerpc/boot: Update Makefile comment for 64bit wrapper

2020-09-09 Thread Michael Ellerman
On Tue, 25 Aug 2020 13:51:47 +1000, Jordan Niethe wrote:
> As of commit 147c05168fc8 ("powerpc/boot: Add support for 64bit little
> endian wrapper") the comment in the Makefile is misleading. The wrapper
> packaging 64bit kernel may be built as a 32 or 64 bit elf. Update the
> comment to reflect this.

Applied to powerpc/next.

[1/1] powerpc/boot: Update Makefile comment for 64bit wrapper
  https://git.kernel.org/powerpc/c/364b236a0b6e86439b9025d961da8602db23d5bf

cheers


Re: [PATCH v2] powerpc: Remove flush_instruction_cache() on 8xx

2020-09-09 Thread Michael Ellerman
On Fri, 14 Aug 2020 05:49:29 +0000 (UTC), Christophe Leroy wrote:
> flush_instruction_cache() is never used on 8xx, remove it.

Applied to powerpc/next.

[1/1] powerpc: Remove flush_instruction_cache() on 8xx
  https://git.kernel.org/powerpc/c/76d46a1e2fe2c35f24c07b7cc8a41afbf98b349e

cheers


Re: [PATCH][V2] macintosh: windfarm: remove detatch debug containing spelling mistakes

2020-09-09 Thread Michael Ellerman
On Thu, 6 Aug 2020 11:29:01 +0100, Colin King wrote:
> There are spelling mistakes in two debug messages. As recommended
> by Wolfram Sang, these can be removed as there is plenty of debug
> in the driver core.

Applied to powerpc/next.

[1/1] macintosh: windfarm: remove detatch debug containing spelling mistakes
  https://git.kernel.org/powerpc/c/7db0a07273e8f581d0b3e8a102d3d9dd99f43528

cheers


Re: [PATCH v3 1/2] powerpc/uaccess: Use flexible addressing with __put_user()/__get_user()

2020-09-09 Thread Michael Ellerman
On Wed, 12 Aug 2020 12:25:16 +0000 (UTC), Christophe Leroy wrote:
> At the time being, __put_user()/__get_user() and friends only use
> D-form addressing, with 0 offset. Ex:
> 
>   lwz reg1, 0(reg2)
> 
> Give the compiler the opportunity to use other addressing modes
> whenever possible, to get more optimised code.
> 
> [...]

Applied to powerpc/next.

[1/2] powerpc/uaccess: Use flexible addressing with __put_user()/__get_user()
  https://git.kernel.org/powerpc/c/c20beffeec3cb6f6f52d9aef27f91a3f453a91f4
[2/2] powerpc/uaccess: Add pre-update addressing to __get_user_asm() and __put_user_asm()
  https://git.kernel.org/powerpc/c/2f279eeb68b8eda43a95255db701b4faaeedbe0f

cheers


Re: [PATCH v2] powerpc: Drop _nmask_and_or_msr()

2020-09-09 Thread Michael Ellerman
On Fri, 14 Aug 2020 06:54:49 +0000 (UTC), Christophe Leroy wrote:
> _nmask_and_or_msr() is only used at two places to set MSR_IP.
> 
> The SYNC is unnecessary as the users are not PowerPC 601.
> 
> Can be easily written in C.
> 
> Do it, and drop _nmask_and_or_msr()

Applied to powerpc/next.

[1/1] powerpc: Drop _nmask_and_or_msr()
  https://git.kernel.org/powerpc/c/e53281bc21f061f96c9004f534bc3e807d70cb73

cheers


Re: [PATCH v2 1/4] powerpc: Remove flush_instruction_cache for book3s/32

2020-09-09 Thread Michael Ellerman
On Fri, 14 Aug 2020 05:56:24 +0000 (UTC), Christophe Leroy wrote:
> The only callers of flush_instruction_cache() are:
> 
> arch/powerpc/kernel/swsusp_booke.S:   bl flush_instruction_cache
> arch/powerpc/mm/nohash/40x.c: flush_instruction_cache();
> arch/powerpc/mm/nohash/44x.c: flush_instruction_cache();
> arch/powerpc/mm/nohash/fsl_booke.c:   flush_instruction_cache();
> arch/powerpc/platforms/44x/machine_check.c:   
> flush_instruction_cache();
> arch/powerpc/platforms/44x/machine_check.c:   
> flush_instruction_cache();
> 
> [...]

Applied to powerpc/next.

[1/4] powerpc: Remove flush_instruction_cache for book3s/32
  https://git.kernel.org/powerpc/c/e426ab39f41045a4c163031272b2f48d944b69c0
[2/4] powerpc: Move flush_instruction_cache() prototype in asm/cacheflush.h
  https://git.kernel.org/powerpc/c/f663f3312051402d32952c44d156a20c0b854753
[3/4] powerpc: Rewrite 4xx flush_cache_instruction() in C
  https://git.kernel.org/powerpc/c/de39b19452e784de5f90ae899851ab29a29bb42c
[4/4] powerpc: Rewrite FSL_BOOKE flush_cache_instruction() in C
  https://git.kernel.org/powerpc/c/704dfe931df951895dea98bd1d9cacbb601b6451

cheers


Re: [PATCH v1] powerpc/process: Remove unnecessary #ifdef CONFIG_FUNCTION_GRAPH_TRACER

2020-09-09 Thread Michael Ellerman
On Mon, 17 Aug 2020 05:46:39 +0000 (UTC), Christophe Leroy wrote:
> ftrace_graph_ret_addr() is always defined and returns 'ip' when
> CONFIG_FUNCTION_GRAPH_TRACER is not set.
> 
> So the #ifdef is not needed, remove it.

Applied to powerpc/next.

[1/1] powerpc/process: Remove unnecessary #ifdef CONFIG_FUNCTION_GRAPH_TRACER
  https://git.kernel.org/powerpc/c/353bce211e00d183344f464ba1ee0e1ffb0e2a6c

cheers


Re: [PATCH] powerpc/irq: Drop forward declaration of struct irqaction

2020-09-09 Thread Michael Ellerman
On Thu, 6 Aug 2020 12:19:46 +0000 (UTC), Christophe Leroy wrote:
> Since the commit identified below, the forward declaration of
> struct irqaction is useless. Drop it.

Applied to powerpc/next.

[1/1] powerpc/irq: Drop forward declaration of struct irqaction
  https://git.kernel.org/powerpc/c/b134cfc3e3276ccd5d29e39de5c848a45b08e410

cheers


Re: [PATCH] powerpc/32s: Fix assembler warning about r0

2020-09-09 Thread Michael Ellerman
On Thu, 6 Aug 2020 06:01:42 +0000 (UTC), Christophe Leroy wrote:
> The assembler says:
>   arch/powerpc/kernel/head_32.S:1095: Warning: invalid register expression
> 
> It's objecting to the use of r0 as the RA argument. That's because
> when RA = 0 the literal value 0 is used, rather than the content of
> r0, making the use of r0 in the source potentially confusing.
> 
> [...]

Applied to powerpc/next.

[1/1] powerpc/32s: Fix assembler warning about r0
  https://git.kernel.org/powerpc/c/b51ba4fe2e134b631f9c8f45423707aab71449b5

cheers


Re: [PATCH] powerpc/hwirq: Remove stale forward irq_chip declaration

2020-09-09 Thread Michael Ellerman
On Thu, 6 Aug 2020 12:19:06 +0000 (UTC), Christophe Leroy wrote:
> Since commit identified below, the forward declaration of
> struct irq_chip is useless (was struct hw_interrupt_type at that time)
> 
> Remove it, together with the associated comment.

Applied to powerpc/next.

[1/1] powerpc/hwirq: Remove stale forward irq_chip declaration
  https://git.kernel.org/powerpc/c/169b9afee572853522901b7cbf34842c0494a887

cheers


Re: [PATCH 1/2] powerpc/fpu: Drop cvt_fd() and cvt_df()

2020-09-09 Thread Michael Ellerman
On Thu, 6 Aug 2020 12:20:34 +0000 (UTC), Christophe Leroy wrote:
> Those two functions have been unused since commit identified below.
> Drop them.

Applied to powerpc/next.

[1/2] powerpc/fpu: Drop cvt_fd() and cvt_df()
  https://git.kernel.org/powerpc/c/63442de4301188129e1fcff144fbfb966ad5eb19
[2/2] powerpc: drop hard_reset_now() and poweroff_now() declaration
  https://git.kernel.org/powerpc/c/82eb1792426f8a171cdaa6cfccb63c39f55bc9bd

cheers


Re: [PATCH v2 1/2] powerpc/perf: consolidate GPCI hcall structs into asm/hvcall.h

2020-09-09 Thread Michael Ellerman
On Mon, 27 Jul 2020 13:46:04 -0500, Scott Cheloha wrote:
> The H_GetPerformanceCounterInfo (GPCI) hypercall input/output structs are
> useful to modules outside of perf/, so move them into asm/hvcall.h to live
> alongside the other powerpc hypercall structs.
> 
> Leave the perf-specific GPCI stuff in perf/hv-gpci.h.

Applied to powerpc/next.

[1/2] powerpc/perf: consolidate GPCI hcall structs into asm/hvcall.h
  https://git.kernel.org/powerpc/c/59562b5c33d6ff3685509ed58b2ed3c5b5712704
[2/2] powerpc/pseries: new lparcfg key/value pair: partition_affinity_score
  https://git.kernel.org/powerpc/c/5d1bc776428f34941a6237afb9454061b5b5e1e1

cheers


Re: [PATCH v3] pseries/drmem: don't cache node id in drmem_lmb struct

2020-09-09 Thread Michael Ellerman
On Mon, 10 Aug 2020 20:51:15 -0500, Scott Cheloha wrote:
> At memory hot-remove time we can retrieve an LMB's nid from its
> corresponding memory_block.  There is no need to store the nid
> in multiple locations.
> 
> Note that lmb_to_memblock() uses find_memory_block() to get the
> corresponding memory_block.  As find_memory_block() runs in sub-linear
> time this approach is negligibly slower than what we do at present.
> 
> [...]

Applied to powerpc/next.

[1/1] pseries/drmem: don't cache node id in drmem_lmb struct
  https://git.kernel.org/powerpc/c/e5e179aa3a39c818db8fbc2dce8d2cd24adaf657

cheers


Re: [RFC PATCH v2 1/3] mm/gup: fix gup_fast with dynamic page table folding

2020-09-09 Thread Gerald Schaefer
On Tue, 8 Sep 2020 07:30:50 -0700
Dave Hansen  wrote:

> On 9/7/20 11:00 AM, Gerald Schaefer wrote:
> > Commit 1a42010cdc26 ("s390/mm: convert to the generic get_user_pages_fast
> > code") introduced a subtle but severe bug on s390 with gup_fast, due to
> > dynamic page table folding.
> 
> Would it be fair to say that the "fake" page table entries s390
> allocates on the stack are what's causing the trouble here?  That might
> be a nice thing to open up with here.  "Dynamic page table folding"
> really means nothing to me.

Sorry, I guess my previous reply does not really explain "what the heck
is dynamic page table folding?".

On s390, we can have different number of page table levels for different
processes / mms. We always start with 3 levels, and update dynamically
on process demand to 4 or 5 levels, hence the dynamic folding. Still,
the PxD_SIZE/SHIFT is defined statically, so that e.g. pXd_addr_end() will
not reflect this dynamic behavior.

For the various pagetable walkers using pXd_addr_end() (w/o READ_ONCE
logic) this is no problem. With static folding, iteration over the folded
levels will always happen at pgd level (top-level folding). For s390,
we stay at the respective level and iterate there (dynamic middle-level
folding), only return to pgd level if there really were 5 levels.

This only works well as long as there are real pagetable pointers involved,
that can also be used for iteration. For gup_fast, or any other future
pagetable walkers using the READ_ONCE logic w/o lock, that is not true.
There are pointers involved to local pXd values on the stack, because of
the READ_ONCE logic, and our middle-level iteration will suddenly iterate
over such stack pointers instead of pagetable pointers.

This will be addressed by making the pXd_addr_end() dynamic, for which
we need to see the pXd value in order to determine its level / type.


Re: [PATCH v2 3/7] mm/memory_hotplug: prepare passing flags to add_memory() and friends

2020-09-09 Thread David Hildenbrand
On 09.09.20 13:37, David Hildenbrand wrote:
> On 09.09.20 13:24, Michael Ellerman wrote:
>> David Hildenbrand  writes:
>>> On 09.09.20 09:17, Greg Kroah-Hartman wrote:
>>>> On Tue, Sep 08, 2020 at 10:10:08PM +0200, David Hildenbrand wrote:
>>>>> We soon want to pass flags, e.g., to mark added System RAM resources.
>>>>> mergeable. Prepare for that.
>>>>
>>>> What are these random "flags", and how do we know what should be passed
>>>> to them?
>>>>
>>>> Why not make this an enumerated type so that we know it all works
>>>> properly, like the GPF_* flags are?  Passing around a random unsigned
>>>> long feels very odd/broken...
>>>
>>> Agreed, an enum (mhp_flags) seems to give a better hint what can
>>> actually be passed. Thanks!
>>
>> You probably know this but ...
>>
>> Just using a C enum doesn't get you any type safety.
>>
>> You can get some checking via sparse by using __bitwise, which is what
>> gfp_t does. You don't actually have to use an enum for that, it works
>> with #defines also.
> 
> Yeah, we seem to be using different approaches. And there is always a
> way to mess things up :)
> 
> gfp_t is one (extreme) example, enum memblock_flags is another example.
> I tend to prefer an enum in this particular case, because it's simple
> and at least tells the user which values are expected.
> 

Gave it another try, looks like mhp_t (like gfp_t) is actually nicer.

-- 
Thanks,

David / dhildenb



Re: [PATCH v2 3/7] mm/memory_hotplug: prepare passing flags to add_memory() and friends

2020-09-09 Thread David Hildenbrand
On 09.09.20 13:24, Michael Ellerman wrote:
> David Hildenbrand  writes:
>> On 09.09.20 09:17, Greg Kroah-Hartman wrote:
>>> On Tue, Sep 08, 2020 at 10:10:08PM +0200, David Hildenbrand wrote:
>>>> We soon want to pass flags, e.g., to mark added System RAM resources.
>>>> mergeable. Prepare for that.
>>>
>>> What are these random "flags", and how do we know what should be passed
>>> to them?
>>>
>>> Why not make this an enumerated type so that we know it all works
>>> properly, like the GPF_* flags are?  Passing around a random unsigned
>>> long feels very odd/broken...
>>
>> Agreed, an enum (mhp_flags) seems to give a better hint what can
>> actually be passed. Thanks!
> 
> You probably know this but ...
> 
> Just using a C enum doesn't get you any type safety.
> 
> You can get some checking via sparse by using __bitwise, which is what
> gfp_t does. You don't actually have to use an enum for that, it works
> with #defines also.

Yeah, we seem to be using different approaches. And there is always a
way to mess things up :)

gfp_t is one (extreme) example, enum memblock_flags is another example.
I tend to prefer an enum in this particular case, because it's simple
and at least tells the user which values are expected.

Thoughts?

> 
> Or you can wrap the flag in a struct, the way atomic_t does, and then
> the compiler will prevent passing plain integers in place of your custom
> type.



-- 
Thanks,

David / dhildenb



Re: [PATCH v4 00/13] mm/debug_vm_pgtable fixes

2020-09-09 Thread Gerald Schaefer
On Wed, 9 Sep 2020 13:38:25 +0530
Anshuman Khandual  wrote:

> 
> 
> On 09/04/2020 08:56 PM, Gerald Schaefer wrote:
> > On Fri, 4 Sep 2020 12:18:05 +0530
> > Anshuman Khandual  wrote:
> > 
> >>
> >>
> >> On 09/02/2020 05:12 PM, Aneesh Kumar K.V wrote:
> >>> This patch series includes fixes for debug_vm_pgtable test code so that
> >>> they follow page table updates rules correctly. The first two patches 
> >>> introduce
> >>> changes w.r.t ppc64. The patches are included in this series for 
> >>> completeness. We can
> >>> merge them via ppc64 tree if required.
> >>>
> >>> Hugetlb test is disabled on ppc64 because that needs larger change to 
> >>> satisfy
> >>> page table update rules.
> >>>
> >>> These tests are broken w.r.t page table update rules and results in kernel
> >>> crash as below. 
> >>>
> >>> [   21.083519] kernel BUG at arch/powerpc/mm/pgtable.c:304!
> >>> cpu 0x0: Vector: 700 (Program Check) at [c00c6d1e76c0]
> >>> pc: c009a5ec: assert_pte_locked+0x14c/0x380
> >>> lr: c05c: pte_update+0x11c/0x190
> >>> sp: c00c6d1e7950
> >>>msr: 82029033
> >>>   current = 0xc00c6d172c80
> >>>   paca= 0xc3ba   irqmask: 0x03   irq_happened: 0x01
> >>> pid   = 1, comm = swapper/0
> >>> kernel BUG at arch/powerpc/mm/pgtable.c:304!
> >>> [link register   ] c05c pte_update+0x11c/0x190
> >>> [c00c6d1e7950] 0001 (unreliable)
> >>> [c00c6d1e79b0] c05eee14 pte_update+0x44/0x190
> >>> [c00c6d1e7a10] c1a2ca9c pte_advanced_tests+0x160/0x3d8
> >>> [c00c6d1e7ab0] c1a2d4fc debug_vm_pgtable+0x7e8/0x1338
> >>> [c00c6d1e7ba0] c00116ec do_one_initcall+0xac/0x5f0
> >>> [c00c6d1e7c80] c19e4fac kernel_init_freeable+0x4dc/0x5a4
> >>> [c00c6d1e7db0] c0012474 kernel_init+0x24/0x160
> >>> [c00c6d1e7e20] c000cbd0 ret_from_kernel_thread+0x5c/0x6c
> >>>
> >>> With DEBUG_VM disabled
> >>>
> >>> [   20.530152] BUG: Kernel NULL pointer dereference on read at 0x
> >>> [   20.530183] Faulting instruction address: 0xc00df330
> >>> cpu 0x33: Vector: 380 (Data SLB Access) at [c00c6d19f700]
> >>> pc: c00df330: memset+0x68/0x104
> >>> lr: c009f6d8: hash__pmdp_huge_get_and_clear+0xe8/0x1b0
> >>> sp: c00c6d19f990
> >>>msr: 82009033
> >>>dar: 0
> >>>   current = 0xc00c6d177480
> >>>   paca= 0xc0001ec4f400   irqmask: 0x03   irq_happened: 0x01
> >>> pid   = 1, comm = swapper/0
> >>> [link register   ] c009f6d8 
> >>> hash__pmdp_huge_get_and_clear+0xe8/0x1b0
> >>> [c00c6d19f990] c009f748 
> >>> hash__pmdp_huge_get_and_clear+0x158/0x1b0 (unreliable)
> >>> [c00c6d19fa10] c19ebf30 pmd_advanced_tests+0x1f0/0x378
> >>> [c00c6d19fab0] c19ed088 debug_vm_pgtable+0x79c/0x1244
> >>> [c00c6d19fba0] c00116ec do_one_initcall+0xac/0x5f0
> >>> [c00c6d19fc80] c19a4fac kernel_init_freeable+0x4dc/0x5a4
> >>> [c00c6d19fdb0] c0012474 kernel_init+0x24/0x160
> >>> [c00c6d19fe20] c000cbd0 ret_from_kernel_thread+0x5c/0x6c
> >>>
> >>> Changes from v3:
> >>> * Address review feedback
> >>> * Move page table deposit and withdraw patch after adding pmdlock to avoid 
> >>> bisect failure.
> >>
> >> This version
> >>
> >> - Builds on x86, arm64, s390, arc, powerpc and riscv (defconfig with 
> >> DEBUG_VM_PGTABLE)
> >> - Runs on arm64 and x86 without any regression, atleast nothing that I 
> >> have noticed
> >> - Will be great if this could get tested on s390, arc, riscv, ppc32 
> >> platforms as well
> > 
> > When I quickly tested v3, it worked fine, but now it turned out to
> > only work fine "sometimes", both v3 and v4. I need to look into it
> > further, but so far it seems related to the hugetlb_advanced_tests().
> > 
> > I guess there was already some discussion on this test, but we did
> > not receive all of the thread(s). Please always add at least
> > linux-s...@vger.kernel.org and maybe myself and Vasily Gorbik 
> > 
> > for further discussions.
> 
> IIRC, the V3 series previously had all these addresses copied properly
> but this version once again missed copying all required addresses.

I also had issues with the de.ibm.com address, which might also have
made some mails disappear, and others might simply have been overlooked
by me. Don't bother, my bad.

> 
> > 
> > That being said, sorry for duplications, this might already have been
> > discussed. Preliminary analysis showed that it only seems to go wrong
> > for certain random vaddr values. I cannot make any sense of that yet,
> > but what seems strange to me is that the hugetlb_advanced_tests()
> > take a (real) pte_t pointer as input, and also use that for all
> > kinds of operations (set_huge_pte_at, huge_ptep_get_and_clear, etc.).
> > 
> > Although all the hugetlb code in the kernel is (mis)using pte_t
> > pointers instead of the correct pmd/pud_t pointers 

Re: [PATCH v2 3/7] mm/memory_hotplug: prepare passing flags to add_memory() and friends

2020-09-09 Thread Michael Ellerman
David Hildenbrand  writes:
> On 09.09.20 09:17, Greg Kroah-Hartman wrote:
>> On Tue, Sep 08, 2020 at 10:10:08PM +0200, David Hildenbrand wrote:
>>> We soon want to pass flags, e.g., to mark added System RAM resources
>>> mergeable. Prepare for that.
>> 
>> What are these random "flags", and how do we know what should be passed
>> to them?
>> 
>> Why not make this an enumerated type so that we know it all works
>> properly, like the GFP_* flags are?  Passing around a random unsigned
>> long feels very odd/broken...
>
> Agreed, an enum (mhp_flags) seems to give a better hint what can
> actually be passed. Thanks!

You probably know this but ...

Just using a C enum doesn't get you any type safety.

You can get some checking via sparse by using __bitwise, which is what
gfp_t does. You don't actually have to use an enum for that, it works
with #defines also.

Or you can wrap the flag in a struct, the way atomic_t does, and then
the compiler will prevent passing plain integers in place of your custom
type.

cheers


Re: [PATCH v4 00/13] mm/debug_vm_pgtable fixes

2020-09-09 Thread Gerald Schaefer
On Wed, 09 Sep 2020 11:38:39 +0530
"Aneesh Kumar K.V"  wrote:

> Gerald Schaefer  writes:
> 
> > On Fri, 4 Sep 2020 18:01:15 +0200
> > Gerald Schaefer  wrote:
> >
> > [...]
> >> 
> >> BTW2, a quick test with this change (so far) made the issues on s390
> >> go away:
> >> 
> >> @@ -1069,7 +1074,7 @@ static int __init debug_vm_pgtable(void)
> >> spin_unlock(ptl);
> >> 
> >>  #ifndef CONFIG_PPC_BOOK3S_64
> >> -   hugetlb_advanced_tests(mm, vma, ptep, pte_aligned, vaddr, prot);
> >> +   hugetlb_advanced_tests(mm, vma, (pte_t *) pmdp, pmd_aligned, 
> >> vaddr, prot);
> >>  #endif
> >> 
> >> spin_lock(&mm->page_table_lock);
> >> 
> >> That would more match the "pte_t pointer" usage for hugetlb code,
> >> i.e. just cast a pmd_t pointer to it. Also changed to pmd_aligned,
> >> but I think the root cause is the pte_t pointer.
> >> 
> >> Not entirely sure though if that would really be the correct fix.
> >> I somehow lost whatever little track I had about what these tests
> >> really want to check, and if that would still be valid with that
> >> change.
> >
> > Uh oh, wasn't aware that this (or some predecessor) already went
> > upstream, and broke our debug kernel today.
> 
> Not sure i followed the above. Are you finding that s390 kernel crash
> after this patch series or the original patchset? As noted in my patch
> the hugetlb test is broken and we should fix that. A quick fix is to
> comment out that test for s390 too as i have done for PPC64.

We see it with both, it basically is broken since there is a hugetlb
test using real pte pointers. It doesn't always show, depending on
random vaddr, so it slipped through earlier testing.

I guess we also would have had one or the other chance to notice
that earlier, through better review, or better reading of previous
mails. I must admit that I neglected this a bit.


Re: [PATCH v4 00/13] mm/debug_vm_pgtable fixes

2020-09-09 Thread Gerald Schaefer
On Wed, 9 Sep 2020 13:45:48 +0530
Anshuman Khandual  wrote:

[...]
> > 
> > That would more match the "pte_t pointer" usage for hugetlb code,
> > i.e. just cast a pmd_t pointer to it. Also changed to pmd_aligned,
> > but I think the root cause is the pte_t pointer.
> 
> Ideally, the pte_t pointer used here should be from huge_pte_alloc()
> not from pte_alloc_map_lock() as is the case currently.

Ah, good point. I assumed that this would also always return casted
pmd etc. pointers, and never pte pointers. Unfortunately, that doesn't
seem to be true for all architectures, e.g. ia64, parisc, (some) powerpc,
where they really do a pte_alloc_map() for some reason.

I guess that means you cannot simply cast the pmd pointer, as suggested,
although I really do not understand how any architecture can work with
real ptes for hugepages. But that's fair, s390 also does some things that
nobody would expect or understand for other architectures...

So, for using huge_pte_alloc() you'd also need some size, maybe
iterating over hstates with for_each_hstate() could be an option,
if they are already initialized at that point. Then you have the
size(s) with huge_page_size(hstate) and can actually call the
hugetlb tests for all supported sizes, and with proper pointer
from huge_pte_alloc().


Re: Flushing transparent hugepages

2020-09-09 Thread Aneesh Kumar K.V
Matthew Wilcox  writes:

> PowerPC has special handling of hugetlbfs pages.  Well, that's what
> the config option says, but actually it handles THP as well.  If
> the config option is enabled.
>
> #ifdef CONFIG_HUGETLB_PAGE
> if (PageCompound(page)) {
> flush_dcache_icache_hugepage(page);
> return;
> }
> #endif

I do have a change posted sometime back to avoid that confusion.
http://patchwork.ozlabs.org/project/linuxppc-dev/patch/20200320103256.229365-1-aneesh.ku...@linux.ibm.com/

But IIUC we use the head page flags (PG_arch_1) to track whether we need
the flush or not.

>
> By the way, THPs can be mapped askew -- that is, at an offset which
> means you can't use a PMD to map a PMD sized page.
>
> Anyway, we don't really have consensus between the various architectures
> on how to handle either THPs or hugetlb pages.  It's not contemplated
> in Documentation/core-api/cachetlb.rst so there's no real surprise
> we've diverged.
>
> What would you _like_ to see?  Would you rather flush_dcache_page()
> were called once for each subpage, or would you rather maintain
> the page-needs-flushing state once per compound page?  We could also
> introduce flush_dcache_thp() if some architectures would prefer it one
> way and one the other, although that brings into question what to do
> for hugetlbfs pages.
>
> It might not be a bad idea to centralise the handling of all this stuff
> somewhere.  Sounds like the kind of thing Arnd would like to do ;-) I'll
> settle for getting enough clear feedback about what the various arch
> maintainers want that I can write a documentation update for cachetlb.rst.


Re: [PATCH kernel] powerpc/dma: Fix dma_map_ops::get_required_mask

2020-09-09 Thread Alexey Kardashevskiy



On 09/09/2020 17:58, Christoph Hellwig wrote:
> On Tue, Sep 08, 2020 at 11:10:03PM +1000, Alexey Kardashevskiy wrote:
>>>> a-ha, this makes more sense, thanks. Then I guess we need to revert that
>>>> one bit from your f1565c24b596, don't we?
>>>
>>> Why?  That was the original intent of the API, but now we also use it
>>> internally to check the addressing capabilities.
>>
>> The bigger the mask the better, no? As it is now, it's limited by the window
>> size, which happens to be bigger than 4GB but smaller than the full 64bit (48bit
>> on my system)
> 
> Yes, the bigger mask is better.  But I don't see why you'd want to
> revert the dma bypass code for that entirely.
> 

I want dma_get_required_mask() to return the bigger mask always.

Now it depends on (in dma_alloc_direct()):
1. dev->dma_ops_bypass: set via pci_set_(coherent_)dma_mask();
2. dev->coherent_dma_mask - the same;
3. dev->bus_dma_limit - usually not set at all.

So until we set the mask, dma_get_required_mask() returns the smaller mask.
So aacraid and the like (which call dma_get_required_mask() before setting
it) will remain prone to breakage.


[forgot to cc: other folks last time, fixed now]

-- 
Alexey


Re: [PATCH v4 00/13] mm/debug_vm_pgtable fixes

2020-09-09 Thread Anshuman Khandual
On 09/04/2020 11:23 PM, Gerald Schaefer wrote:
> On Fri, 4 Sep 2020 18:01:15 +0200
> Gerald Schaefer  wrote:
> 
>> On Fri, 4 Sep 2020 17:26:47 +0200
>> Gerald Schaefer  wrote:
>>
>>> On Fri, 4 Sep 2020 12:18:05 +0530
>>> Anshuman Khandual  wrote:
>>>


 On 09/02/2020 05:12 PM, Aneesh Kumar K.V wrote:
> This patch series includes fixes for debug_vm_pgtable test code so that
> they follow page table updates rules correctly. The first two patches 
> introduce
> changes w.r.t ppc64. The patches are included in this series for 
> completeness. We can
> merge them via ppc64 tree if required.
>
> Hugetlb test is disabled on ppc64 because that needs larger change to 
> satisfy
> page table update rules.
>
> These tests are broken w.r.t page table update rules and results in kernel
> crash as below. 
>
> [   21.083519] kernel BUG at arch/powerpc/mm/pgtable.c:304!
> cpu 0x0: Vector: 700 (Program Check) at [c00c6d1e76c0]
> pc: c009a5ec: assert_pte_locked+0x14c/0x380
> lr: c05c: pte_update+0x11c/0x190
> sp: c00c6d1e7950
>msr: 82029033
>   current = 0xc00c6d172c80
>   paca= 0xc3ba   irqmask: 0x03   irq_happened: 0x01
> pid   = 1, comm = swapper/0
> kernel BUG at arch/powerpc/mm/pgtable.c:304!
> [link register   ] c05c pte_update+0x11c/0x190
> [c00c6d1e7950] 0001 (unreliable)
> [c00c6d1e79b0] c05eee14 pte_update+0x44/0x190
> [c00c6d1e7a10] c1a2ca9c pte_advanced_tests+0x160/0x3d8
> [c00c6d1e7ab0] c1a2d4fc debug_vm_pgtable+0x7e8/0x1338
> [c00c6d1e7ba0] c00116ec do_one_initcall+0xac/0x5f0
> [c00c6d1e7c80] c19e4fac kernel_init_freeable+0x4dc/0x5a4
> [c00c6d1e7db0] c0012474 kernel_init+0x24/0x160
> [c00c6d1e7e20] c000cbd0 ret_from_kernel_thread+0x5c/0x6c
>
> With DEBUG_VM disabled
>
> [   20.530152] BUG: Kernel NULL pointer dereference on read at 0x
> [   20.530183] Faulting instruction address: 0xc00df330
> cpu 0x33: Vector: 380 (Data SLB Access) at [c00c6d19f700]
> pc: c00df330: memset+0x68/0x104
> lr: c009f6d8: hash__pmdp_huge_get_and_clear+0xe8/0x1b0
> sp: c00c6d19f990
>msr: 82009033
>dar: 0
>   current = 0xc00c6d177480
>   paca= 0xc0001ec4f400   irqmask: 0x03   irq_happened: 0x01
> pid   = 1, comm = swapper/0
> [link register   ] c009f6d8 
> hash__pmdp_huge_get_and_clear+0xe8/0x1b0
> [c00c6d19f990] c009f748 
> hash__pmdp_huge_get_and_clear+0x158/0x1b0 (unreliable)
> [c00c6d19fa10] c19ebf30 pmd_advanced_tests+0x1f0/0x378
> [c00c6d19fab0] c19ed088 debug_vm_pgtable+0x79c/0x1244
> [c00c6d19fba0] c00116ec do_one_initcall+0xac/0x5f0
> [c00c6d19fc80] c19a4fac kernel_init_freeable+0x4dc/0x5a4
> [c00c6d19fdb0] c0012474 kernel_init+0x24/0x160
> [c00c6d19fe20] c000cbd0 ret_from_kernel_thread+0x5c/0x6c
>
> Changes from v3:
> * Address review feedback
> * Move page table deposit and withdraw patch after adding pmdlock to avoid 
> bisect failure.

 This version

 - Builds on x86, arm64, s390, arc, powerpc and riscv (defconfig with 
 DEBUG_VM_PGTABLE)
 - Runs on arm64 and x86 without any regression, atleast nothing that I 
 have noticed
 - Will be great if this could get tested on s390, arc, riscv, ppc32 
 platforms as well
>>>
>>> When I quickly tested v3, it worked fine, but now it turned out to
>>> only work fine "sometimes", both v3 and v4. I need to look into it
>>> further, but so far it seems related to the hugetlb_advanced_tests().
>>>
>>> I guess there was already some discussion on this test, but we did
>>> not receive all of the thread(s). Please always add at least
>>> linux-s...@vger.kernel.org and maybe myself and Vasily Gorbik 
>>> 
>>> for further discussions.
>>
>> BTW, with myself I mean the new address gerald.schae...@linux.ibm.com.
>> The old gerald.schae...@de.ibm.com seems to work (again), but is not
>> very reliable.
>>
>> BTW2, a quick test with this change (so far) made the issues on s390
>> go away:
>>
>> @@ -1069,7 +1074,7 @@ static int __init debug_vm_pgtable(void)
>> spin_unlock(ptl);
>>
>>  #ifndef CONFIG_PPC_BOOK3S_64
>> -   hugetlb_advanced_tests(mm, vma, ptep, pte_aligned, vaddr, prot);
>> +   hugetlb_advanced_tests(mm, vma, (pte_t *) pmdp, pmd_aligned, vaddr, 
>> prot);
>>  #endif
>>
>> spin_lock(&mm->page_table_lock);
>>
>> That would more match the "pte_t pointer" usage for hugetlb code,
>> i.e. just cast a pmd_t pointer to it. Also changed to pmd_aligned,
>> but I think the root cause is the pte_t pointer.
>>
>> Not entirely sure 

Re: [RFC PATCH v2 2/3] mm: make pXd_addr_end() functions page-table entry aware

2020-09-09 Thread Christophe Leroy
On Tue, 2020-09-08 at 16:15 +0200, Alexander Gordeev wrote:
> On Tue, Sep 08, 2020 at 10:16:49AM +0200, Christophe Leroy wrote:
> > >Yes, and also two more sources :/
> > >   arch/powerpc/mm/kasan/8xx.c
> > >   arch/powerpc/mm/kasan/kasan_init_32.c
> > >
> > >But these two are not quite obvious wrt pgd_addr_end() used
> > >while traversing pmds. Could you please clarify a bit?
> > >
> > >
> > >diff --git a/arch/powerpc/mm/kasan/8xx.c b/arch/powerpc/mm/kasan/8xx.c
> > >index 2784224..89c5053 100644
> > >--- a/arch/powerpc/mm/kasan/8xx.c
> > >+++ b/arch/powerpc/mm/kasan/8xx.c
> > >@@ -15,8 +15,8 @@
> > >   for (k_cur = k_start; k_cur != k_end; k_cur = k_next, pmd += 2, block 
> > > += SZ_8M) {
> > >   pte_basic_t *new;
> > >-  k_next = pgd_addr_end(k_cur, k_end);
> > >-  k_next = pgd_addr_end(k_next, k_end);
> > >+  k_next = pmd_addr_end(k_cur, k_end);
> > >+  k_next = pmd_addr_end(k_next, k_end);
> > 
> > No, I don't think so.
> > On powerpc32 we have only two levels, so pgd and pmd are more or
> > less the same.
> > But pmd_addr_end() as defined in include/asm-generic/pgtable-nopmd.h
> > is a no-op, so I don't think it will work.
> > 
> > It is likely that this function should iterate on pgd, then you get
> > pmd = pmd_offset(pud_offset(p4d_offset(pgd)));
> 
> It looks like the code iterates over single pmd table while using
> pgd_addr_end() only to skip all the middle levels and bail out
> from the loop.
> 
> I would be wary for switching from pmds to pgds, since we are
> trying to minimize impact (especially functional) and the
> rework does not seem that obvious.
> 

I've just tested the following change, it works and should fix the
oddity:

diff --git a/arch/powerpc/mm/kasan/8xx.c b/arch/powerpc/mm/kasan/8xx.c
index 2784224054f8..8e53ddf57b84 100644
--- a/arch/powerpc/mm/kasan/8xx.c
+++ b/arch/powerpc/mm/kasan/8xx.c
@@ -9,11 +9,12 @@
 static int __init
 kasan_init_shadow_8M(unsigned long k_start, unsigned long k_end, void
*block)
 {
-   pmd_t *pmd = pmd_off_k(k_start);
+   pgd_t *pgd = pgd_offset_k(k_start);
unsigned long k_cur, k_next;
 
-   for (k_cur = k_start; k_cur != k_end; k_cur = k_next, pmd += 2, block
+= SZ_8M) {
+   for (k_cur = k_start; k_cur != k_end; k_cur = k_next, pgd += 2, block
+= SZ_8M) {
pte_basic_t *new;
+   pmd_t *pmd = pmd_offset(pud_offset(p4d_offset(pgd, k_cur), 
k_cur),
k_cur);
 
k_next = pgd_addr_end(k_cur, k_end);
k_next = pgd_addr_end(k_next, k_end);
diff --git a/arch/powerpc/mm/kasan/kasan_init_32.c
b/arch/powerpc/mm/kasan/kasan_init_32.c
index fb294046e00e..e5f524fa71a7 100644
--- a/arch/powerpc/mm/kasan/kasan_init_32.c
+++ b/arch/powerpc/mm/kasan/kasan_init_32.c
@@ -30,13 +30,12 @@ static void __init kasan_populate_pte(pte_t *ptep,
pgprot_t prot)
 
 int __init kasan_init_shadow_page_tables(unsigned long k_start,
unsigned long k_end)
 {
-   pmd_t *pmd;
+   pgd_t *pgd = pgd_offset_k(k_start);
unsigned long k_cur, k_next;
 
-   pmd = pmd_off_k(k_start);
-
-   for (k_cur = k_start; k_cur != k_end; k_cur = k_next, pmd++) {
+   for (k_cur = k_start; k_cur != k_end; k_cur = k_next, pgd++) {
pte_t *new;
+   pmd_t *pmd = pmd_offset(pud_offset(p4d_offset(pgd, k_cur), 
k_cur),
k_cur);
 
k_next = pgd_addr_end(k_cur, k_end);
if ((void *)pmd_page_vaddr(*pmd) != kasan_early_shadow_pte)
@@ -189,16 +188,18 @@ void __init kasan_early_init(void)
unsigned long addr = KASAN_SHADOW_START;
unsigned long end = KASAN_SHADOW_END;
unsigned long next;
-   pmd_t *pmd = pmd_off_k(addr);
+   pgd_t *pgd = pgd_offset_k(addr);
 
BUILD_BUG_ON(KASAN_SHADOW_START & ~PGDIR_MASK);
 
kasan_populate_pte(kasan_early_shadow_pte, PAGE_KERNEL);
 
do {
+   pmd_t *pmd = pmd_offset(pud_offset(p4d_offset(pgd, addr), addr),
addr);
+
next = pgd_addr_end(addr, end);
pmd_populate_kernel(&init_mm, pmd, kasan_early_shadow_pte);
-   } while (pmd++, addr = next, addr != end);
+   } while (pgd++, addr = next, addr != end);
 
if (early_mmu_has_feature(MMU_FTR_HPTE_TABLE))
kasan_early_hash_table();
---
Christophe



Re: [PATCH v4 00/13] mm/debug_vm_pgtable fixes

2020-09-09 Thread Anshuman Khandual



On 09/04/2020 09:31 PM, Gerald Schaefer wrote:
> On Fri, 4 Sep 2020 17:26:47 +0200
> Gerald Schaefer  wrote:
> 
>> On Fri, 4 Sep 2020 12:18:05 +0530
>> Anshuman Khandual  wrote:
>>
>>>
>>>
>>> On 09/02/2020 05:12 PM, Aneesh Kumar K.V wrote:
 This patch series includes fixes for debug_vm_pgtable test code so that
 they follow page table updates rules correctly. The first two patches 
 introduce
 changes w.r.t ppc64. The patches are included in this series for 
 completeness. We can
 merge them via ppc64 tree if required.

 Hugetlb test is disabled on ppc64 because that needs larger change to 
 satisfy
 page table update rules.

 These tests are broken w.r.t page table update rules and results in kernel
 crash as below. 

 [   21.083519] kernel BUG at arch/powerpc/mm/pgtable.c:304!
 cpu 0x0: Vector: 700 (Program Check) at [c00c6d1e76c0]
 pc: c009a5ec: assert_pte_locked+0x14c/0x380
 lr: c05c: pte_update+0x11c/0x190
 sp: c00c6d1e7950
msr: 82029033
   current = 0xc00c6d172c80
   paca= 0xc3ba   irqmask: 0x03   irq_happened: 0x01
 pid   = 1, comm = swapper/0
 kernel BUG at arch/powerpc/mm/pgtable.c:304!
 [link register   ] c05c pte_update+0x11c/0x190
 [c00c6d1e7950] 0001 (unreliable)
 [c00c6d1e79b0] c05eee14 pte_update+0x44/0x190
 [c00c6d1e7a10] c1a2ca9c pte_advanced_tests+0x160/0x3d8
 [c00c6d1e7ab0] c1a2d4fc debug_vm_pgtable+0x7e8/0x1338
 [c00c6d1e7ba0] c00116ec do_one_initcall+0xac/0x5f0
 [c00c6d1e7c80] c19e4fac kernel_init_freeable+0x4dc/0x5a4
 [c00c6d1e7db0] c0012474 kernel_init+0x24/0x160
 [c00c6d1e7e20] c000cbd0 ret_from_kernel_thread+0x5c/0x6c

 With DEBUG_VM disabled

 [   20.530152] BUG: Kernel NULL pointer dereference on read at 0x
 [   20.530183] Faulting instruction address: 0xc00df330
 cpu 0x33: Vector: 380 (Data SLB Access) at [c00c6d19f700]
 pc: c00df330: memset+0x68/0x104
 lr: c009f6d8: hash__pmdp_huge_get_and_clear+0xe8/0x1b0
 sp: c00c6d19f990
msr: 82009033
dar: 0
   current = 0xc00c6d177480
   paca= 0xc0001ec4f400   irqmask: 0x03   irq_happened: 0x01
 pid   = 1, comm = swapper/0
 [link register   ] c009f6d8 
 hash__pmdp_huge_get_and_clear+0xe8/0x1b0
 [c00c6d19f990] c009f748 
 hash__pmdp_huge_get_and_clear+0x158/0x1b0 (unreliable)
 [c00c6d19fa10] c19ebf30 pmd_advanced_tests+0x1f0/0x378
 [c00c6d19fab0] c19ed088 debug_vm_pgtable+0x79c/0x1244
 [c00c6d19fba0] c00116ec do_one_initcall+0xac/0x5f0
 [c00c6d19fc80] c19a4fac kernel_init_freeable+0x4dc/0x5a4
 [c00c6d19fdb0] c0012474 kernel_init+0x24/0x160
 [c00c6d19fe20] c000cbd0 ret_from_kernel_thread+0x5c/0x6c

 Changes from v3:
 * Address review feedback
 * Move page table deposit and withdraw patch after adding pmdlock to avoid 
 bisect failure.
>>>
>>> This version
>>>
>>> - Builds on x86, arm64, s390, arc, powerpc and riscv (defconfig with 
>>> DEBUG_VM_PGTABLE)
>>> - Runs on arm64 and x86 without any regression, atleast nothing that I have 
>>> noticed
>>> - Will be great if this could get tested on s390, arc, riscv, ppc32 
>>> platforms as well
>>
>> When I quickly tested v3, it worked fine, but now it turned out to
>> only work fine "sometimes", both v3 and v4. I need to look into it
>> further, but so far it seems related to the hugetlb_advanced_tests().
>>
>> I guess there was already some discussion on this test, but we did
>> not receive all of the thread(s). Please always add at least
>> linux-s...@vger.kernel.org and maybe myself and Vasily Gorbik 
>> 
>> for further discussions.
> 
> BTW, with myself I mean the new address gerald.schae...@linux.ibm.com.
> The old gerald.schae...@de.ibm.com seems to work (again), but is not
> very reliable.

Sure, noted.

> 
> BTW2, a quick test with this change (so far) made the issues on s390
> go away:
> 
> @@ -1069,7 +1074,7 @@ static int __init debug_vm_pgtable(void)
> spin_unlock(ptl);
>  
>  #ifndef CONFIG_PPC_BOOK3S_64
> -   hugetlb_advanced_tests(mm, vma, ptep, pte_aligned, vaddr, prot);
> +   hugetlb_advanced_tests(mm, vma, (pte_t *) pmdp, pmd_aligned, vaddr, 
> prot);
>  #endif
>  
> spin_lock(&mm->page_table_lock);
> 
> That would more match the "pte_t pointer" usage for hugetlb code,
> i.e. just cast a pmd_t pointer to it. Also changed to pmd_aligned,
> but I think the root cause is the pte_t pointer.

Ideally, the pte_t pointer used here should be from huge_pte_alloc()
not from pte_alloc_map_lock() as is the case currently.

> 
> Not entirely sure though if that would really 

Re: [PATCH v4 00/13] mm/debug_vm_pgtable fixes

2020-09-09 Thread Anshuman Khandual



On 09/04/2020 08:56 PM, Gerald Schaefer wrote:
> On Fri, 4 Sep 2020 12:18:05 +0530
> Anshuman Khandual  wrote:
> 
>>
>>
>> On 09/02/2020 05:12 PM, Aneesh Kumar K.V wrote:
>>> This patch series includes fixes for debug_vm_pgtable test code so that
>>> they follow page table updates rules correctly. The first two patches 
>>> introduce
>>> changes w.r.t ppc64. The patches are included in this series for 
>>> completeness. We can
>>> merge them via ppc64 tree if required.
>>>
>>> Hugetlb test is disabled on ppc64 because that needs larger change to 
>>> satisfy
>>> page table update rules.
>>>
>>> These tests are broken w.r.t page table update rules and results in kernel
>>> crash as below. 
>>>
>>> [   21.083519] kernel BUG at arch/powerpc/mm/pgtable.c:304!
>>> cpu 0x0: Vector: 700 (Program Check) at [c00c6d1e76c0]
>>> pc: c009a5ec: assert_pte_locked+0x14c/0x380
>>> lr: c05c: pte_update+0x11c/0x190
>>> sp: c00c6d1e7950
>>>msr: 82029033
>>>   current = 0xc00c6d172c80
>>>   paca= 0xc3ba   irqmask: 0x03   irq_happened: 0x01
>>> pid   = 1, comm = swapper/0
>>> kernel BUG at arch/powerpc/mm/pgtable.c:304!
>>> [link register   ] c05c pte_update+0x11c/0x190
>>> [c00c6d1e7950] 0001 (unreliable)
>>> [c00c6d1e79b0] c05eee14 pte_update+0x44/0x190
>>> [c00c6d1e7a10] c1a2ca9c pte_advanced_tests+0x160/0x3d8
>>> [c00c6d1e7ab0] c1a2d4fc debug_vm_pgtable+0x7e8/0x1338
>>> [c00c6d1e7ba0] c00116ec do_one_initcall+0xac/0x5f0
>>> [c00c6d1e7c80] c19e4fac kernel_init_freeable+0x4dc/0x5a4
>>> [c00c6d1e7db0] c0012474 kernel_init+0x24/0x160
>>> [c00c6d1e7e20] c000cbd0 ret_from_kernel_thread+0x5c/0x6c
>>>
>>> With DEBUG_VM disabled
>>>
>>> [   20.530152] BUG: Kernel NULL pointer dereference on read at 0x
>>> [   20.530183] Faulting instruction address: 0xc00df330
>>> cpu 0x33: Vector: 380 (Data SLB Access) at [c00c6d19f700]
>>> pc: c00df330: memset+0x68/0x104
>>> lr: c009f6d8: hash__pmdp_huge_get_and_clear+0xe8/0x1b0
>>> sp: c00c6d19f990
>>>msr: 82009033
>>>dar: 0
>>>   current = 0xc00c6d177480
>>>   paca= 0xc0001ec4f400   irqmask: 0x03   irq_happened: 0x01
>>> pid   = 1, comm = swapper/0
>>> [link register   ] c009f6d8 hash__pmdp_huge_get_and_clear+0xe8/0x1b0
>>> [c00c6d19f990] c009f748 
>>> hash__pmdp_huge_get_and_clear+0x158/0x1b0 (unreliable)
>>> [c00c6d19fa10] c19ebf30 pmd_advanced_tests+0x1f0/0x378
>>> [c00c6d19fab0] c19ed088 debug_vm_pgtable+0x79c/0x1244
>>> [c00c6d19fba0] c00116ec do_one_initcall+0xac/0x5f0
>>> [c00c6d19fc80] c19a4fac kernel_init_freeable+0x4dc/0x5a4
>>> [c00c6d19fdb0] c0012474 kernel_init+0x24/0x160
>>> [c00c6d19fe20] c000cbd0 ret_from_kernel_thread+0x5c/0x6c
>>>
>>> Changes from v3:
>>> * Address review feedback
>>> * Move page table deposit and withdraw patch after adding pmdlock to avoid 
>>> bisect failure.
>>
>> This version
>>
>> - Builds on x86, arm64, s390, arc, powerpc and riscv (defconfig with 
>> DEBUG_VM_PGTABLE)
>> - Runs on arm64 and x86 without any regression, atleast nothing that I have 
>> noticed
>> - Will be great if this could get tested on s390, arc, riscv, ppc32 
>> platforms as well
> 
> When I quickly tested v3, it worked fine, but now it turned out to
> only work fine "sometimes", both v3 and v4. I need to look into it
> further, but so far it seems related to the hugetlb_advanced_tests().
> 
> I guess there was already some discussion on this test, but we did
> not receive all of the thread(s). Please always add at least
> linux-s...@vger.kernel.org and maybe myself and Vasily Gorbik 
> 
> for further discussions.

IIRC, the V3 series previously had all these addresses copied properly
but this version once again missed copying all required addresses.

> 
> That being said, sorry for duplications, this might already have been
> discussed. Preliminary analysis showed that it only seems to go wrong
> for certain random vaddr values. I cannot make any sense of that yet,
> but what seems strange to me is that the hugetlb_advanced_tests()
> take a (real) pte_t pointer as input, and also use that for all
> kinds of operations (set_huge_pte_at, huge_ptep_get_and_clear, etc.).
> 
> Although all the hugetlb code in the kernel is (mis)using pte_t
> pointers instead of the correct pmd/pud_t pointers like THP, that
> is just for historic reasons. The pointers will actually never point
> to a real pte_t (i.e. page table entry), but of course to a pmd
> or pud entry, depending on hugepage size.

HugeTLB logically operates on a PTE entry irrespective of its real
page table level position. Nonetheless, IIUC, vaddr here should have
been aligned to the real page table level in which the entry is being
mapped currently.

> 
> What is passed in as 

Re: [PATCH v2 3/7] mm/memory_hotplug: prepare passing flags to add_memory() and friends

2020-09-09 Thread David Hildenbrand
On 09.09.20 09:17, Greg Kroah-Hartman wrote:
> On Tue, Sep 08, 2020 at 10:10:08PM +0200, David Hildenbrand wrote:
>> We soon want to pass flags, e.g., to mark added System RAM resources
>> mergeable. Prepare for that.
> 
> What are these random "flags", and how do we know what should be passed
> to them?
> 
> Why not make this an enumerated type so that we know it all works
> properly, like the GFP_* flags are?  Passing around a random unsigned
> long feels very odd/broken...

Agreed, an enum (mhp_flags) seems to give a better hint what can
actually be passed. Thanks!

-- 
Thanks,

David / dhildenb



Re: [PATCH v2 3/7] mm/memory_hotplug: prepare passing flags to add_memory() and friends

2020-09-09 Thread Greg Kroah-Hartman
On Tue, Sep 08, 2020 at 10:10:08PM +0200, David Hildenbrand wrote:
> We soon want to pass flags, e.g., to mark added System RAM resources
> mergeable. Prepare for that.

What are these random "flags", and how do we know what should be passed
to them?

Why not make this an enumerated type so that we know it all works
properly, like the GFP_* flags are?  Passing around a random unsigned
long feels very odd/broken...

thanks,

greg k-h


Re: [PATCH v1 4/5] powerpc/fault: Avoid heavy search_exception_tables() verification

2020-09-09 Thread Christophe Leroy




Le 09/09/2020 à 08:04, Aneesh Kumar K.V a écrit :

Christophe Leroy  writes:


search_exception_tables() is a heavy operation, we have to avoid it.
When KUAP is selected, we'll know the fault has been blocked by KUAP.
Otherwise, it behaves just as if the address was already in the TLBs
and no fault was generated.

Signed-off-by: Christophe Leroy 
---
  arch/powerpc/mm/fault.c | 20 +---
  1 file changed, 5 insertions(+), 15 deletions(-)

diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 525e0c2b5406..edde169ba3a6 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -214,24 +214,14 @@ static bool bad_kernel_fault(struct pt_regs *regs, 
unsigned long error_code,
if (address >= TASK_SIZE)
return true;
  
-	if (!is_exec && (error_code & DSISR_PROTFAULT) &&

-   !search_exception_tables(regs->nip)) {
+   // Read/write fault blocked by KUAP is bad, it can never succeed.
+   if (bad_kuap_fault(regs, address, is_write)) {
pr_crit_ratelimited("Kernel attempted to access user page (%lx) - 
exploit attempt? (uid: %d)\n",
-   address,
-   from_kuid(&init_user_ns, current_uid()));
-   }
-
-   // Fault on user outside of certain regions (eg. copy_tofrom_user()) is 
bad
-   if (!search_exception_tables(regs->nip))
-   return true;


We still need to keep this ? Without that we detect the lack of
exception tables pretty late.


Is that a problem at all to detect the lack of exception tables late ?
That case is very unlikely and will lead to failure anyway. So, is it 
worth impacting performance of the likely case which will always have an 
exception table and where we expect the exception to run as fast as 
possible ?


The other architectures I have looked at (arm64 and x86) only have the 
exception table search together with the down_read_trylock(&mm->mmap_sem).


Christophe


Re: [PATCH v2] powerpc/pci: unmap legacy INTx interrupts when a PHB is removed

2020-09-09 Thread Alexey Kardashevskiy



On 07/08/2020 20:18, Cédric Le Goater wrote:
> When a passthrough IO adapter is removed from a pseries machine using
> hash MMU and the XIVE interrupt mode, the POWER hypervisor expects the
> guest OS to clear all page table entries related to the adapter. If
> some are still present, the RTAS call which isolates the PCI slot
> returns error 9001 "valid outstanding translations" and the removal of
> the IO adapter fails. This is because when the PHBs are scanned, Linux
> maps automatically the INTx interrupts in the Linux interrupt number
> space but these are never removed.
> 
> To solve this problem, we introduce a PPC platform specific
> pcibios_remove_bus() routine which clears all interrupt mappings when
> the bus is removed. This also clears the associated page table entries
> of the ESB pages when using XIVE.
> 
> For this purpose, we record the logical interrupt numbers of the
> mapped interrupt under the PHB structure and let pcibios_remove_bus()
> do the clean up.
> 
> Since some PCI adapters, like GPUs, use the "interrupt-map" property
> to describe interrupt mappings other than the legacy INTx interrupts,
> we can not restrict the size of the mapping array to PCI_NUM_INTX. The
> number of interrupt mappings is computed from the "interrupt-map"
> property and the mapping array is allocated accordingly.
> 
> Cc: "Oliver O'Halloran" 
> Cc: Alexey Kardashevskiy 
> Signed-off-by: Cédric Le Goater 

I thought we could reuse some of the common OF code for the DT parsing
but we cannot (easily) so it is good as it is:

Reviewed-by: Alexey Kardashevskiy 


> ---
> 
>  Changes since v2:
> 
>  - merged 2 patches.
>  
>  arch/powerpc/include/asm/pci-bridge.h |   6 ++
>  arch/powerpc/kernel/pci-common.c  | 114 ++
>  2 files changed, 120 insertions(+)
> 
> diff --git a/arch/powerpc/include/asm/pci-bridge.h 
> b/arch/powerpc/include/asm/pci-bridge.h
> index b92e81b256e5..ca75cf264ddf 100644
> --- a/arch/powerpc/include/asm/pci-bridge.h
> +++ b/arch/powerpc/include/asm/pci-bridge.h
> @@ -48,6 +48,9 @@ struct pci_controller_ops {
>  
>  /*
>   * Structure of a PCI controller (host bridge)
> + *
> + * @irq_count: number of interrupt mappings
> + * @irq_map: interrupt mappings
>   */
>  struct pci_controller {
>   struct pci_bus *bus;
> @@ -127,6 +130,9 @@ struct pci_controller {
>  
>   void *private_data;
>   struct npu *npu;
> +
> + unsigned int irq_count;
> + unsigned int *irq_map;
>  };
>  
>  /* These are used for config access before all the PCI probing
> diff --git a/arch/powerpc/kernel/pci-common.c 
> b/arch/powerpc/kernel/pci-common.c
> index be108616a721..deb831f0ae13 100644
> --- a/arch/powerpc/kernel/pci-common.c
> +++ b/arch/powerpc/kernel/pci-common.c
> @@ -353,6 +353,115 @@ struct pci_controller 
> *pci_find_controller_for_domain(int domain_nr)
>   return NULL;
>  }
>  
> +/*
> + * Assumption is made on the interrupt parent. All interrupt-map
> + * entries are considered to have the same parent.
> + */
> +static int pcibios_irq_map_count(struct pci_controller *phb)
> +{
> + const __be32 *imap;
> + int imaplen;
> + struct device_node *parent;
> + u32 intsize, addrsize, parintsize, paraddrsize;
> +
> + if (of_property_read_u32(phb->dn, "#interrupt-cells", &intsize))
> + return 0;
> + if (of_property_read_u32(phb->dn, "#address-cells", &addrsize))
> + return 0;
> +
> + imap = of_get_property(phb->dn, "interrupt-map", &imaplen);
> + if (!imap) {
> + pr_debug("%pOF : no interrupt-map\n", phb->dn);
> + return 0;
> + }
> + imaplen /= sizeof(u32);
> + pr_debug("%pOF : imaplen=%d\n", phb->dn, imaplen);
> +
> + if (imaplen < (addrsize + intsize + 1))
> + return 0;
> +
> + imap += intsize + addrsize;
> + parent = of_find_node_by_phandle(be32_to_cpup(imap));
> + if (!parent) {
> + pr_debug("%pOF : no imap parent found !\n", phb->dn);
> + return 0;
> + }
> +
> + if (of_property_read_u32(parent, "#interrupt-cells", &parintsize)) {
> + pr_debug("%pOF : parent lacks #interrupt-cells!\n", phb->dn);
> + return 0;
> + }
> +
> + if (of_property_read_u32(parent, "#address-cells", &paraddrsize))
> + paraddrsize = 0;
> +
> + return imaplen / (addrsize + intsize + 1 + paraddrsize + parintsize);
> +}
> +
> +static void pcibios_irq_map_init(struct pci_controller *phb)
> +{
> + phb->irq_count = pcibios_irq_map_count(phb);
> + if (phb->irq_count < PCI_NUM_INTX)
> + phb->irq_count = PCI_NUM_INTX;
> +
> + pr_debug("%pOF : interrupt map #%d\n", phb->dn, phb->irq_count);
> +
> + phb->irq_map = kcalloc(phb->irq_count, sizeof(unsigned int),
> +GFP_KERNEL);
> +}
> +
> +static void pci_irq_map_register(struct pci_dev *pdev, unsigned int virq)
> +{
> + struct pci_controller *phb = pci_bus_to_host(pdev->bus);
> + int i;
> +
> + if (!phb->irq_map)
> + 

Re: [PATCH v4 00/13] mm/debug_vm_pgtable fixes

2020-09-09 Thread Aneesh Kumar K.V
Gerald Schaefer  writes:

> On Fri, 4 Sep 2020 18:01:15 +0200
> Gerald Schaefer  wrote:
>
> [...]
>> 
>> BTW2, a quick test with this change (so far) made the issues on s390
>> go away:
>> 
>> @@ -1069,7 +1074,7 @@ static int __init debug_vm_pgtable(void)
>> spin_unlock(ptl);
>> 
>>  #ifndef CONFIG_PPC_BOOK3S_64
>> -   hugetlb_advanced_tests(mm, vma, ptep, pte_aligned, vaddr, prot);
>> +   hugetlb_advanced_tests(mm, vma, (pte_t *) pmdp, pmd_aligned, vaddr, 
>> prot);
>>  #endif
>> 
>> spin_lock(&mm->page_table_lock);
>> 
>> That would more match the "pte_t pointer" usage for hugetlb code,
>> i.e. just cast a pmd_t pointer to it. Also changed to pmd_aligned,
>> but I think the root cause is the pte_t pointer.
>> 
>> Not entirely sure though if that would really be the correct fix.
>> I somehow lost whatever little track I had about what these tests
>> really want to check, and if that would still be valid with that
>> change.
>
> Uh oh, wasn't aware that this (or some predecessor) already went
> upstream, and broke our debug kernel today.

Not sure I followed the above. Are you finding that the s390 kernel crashes
after this patch series or the original patchset? As noted in my patch
the hugetlb test is broken and we should fix that. A quick fix is to
comment out that test for s390 too as I have done for PPC64.


-aneesh


Re: [PATCH v1 4/5] powerpc/fault: Avoid heavy search_exception_tables() verification

2020-09-09 Thread Aneesh Kumar K.V
Christophe Leroy  writes:

> search_exception_tables() is a heavy operation, we have to avoid it.
> When KUAP is selected, we'll know the fault has been blocked by KUAP.
> Otherwise, it behaves just as if the address was already in the TLBs
> and no fault was generated.
>
> Signed-off-by: Christophe Leroy 
> ---
>  arch/powerpc/mm/fault.c | 20 +---
>  1 file changed, 5 insertions(+), 15 deletions(-)
>
> diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
> index 525e0c2b5406..edde169ba3a6 100644
> --- a/arch/powerpc/mm/fault.c
> +++ b/arch/powerpc/mm/fault.c
> @@ -214,24 +214,14 @@ static bool bad_kernel_fault(struct pt_regs *regs, 
> unsigned long error_code,
>   if (address >= TASK_SIZE)
>   return true;
>  
> - if (!is_exec && (error_code & DSISR_PROTFAULT) &&
> - !search_exception_tables(regs->nip)) {
> + // Read/write fault blocked by KUAP is bad, it can never succeed.
> + if (bad_kuap_fault(regs, address, is_write)) {
>   pr_crit_ratelimited("Kernel attempted to access user page (%lx) 
> - exploit attempt? (uid: %d)\n",
> - address,
> - from_kuid(&init_user_ns, current_uid()));
> - }
> -
> - // Fault on user outside of certain regions (eg. copy_tofrom_user()) is 
> bad
> - if (!search_exception_tables(regs->nip))
> - return true;

We still need to keep this ? Without that we detect the lack of
exception tables pretty late.



> -
> - // Read/write fault in a valid region (the exception table search passed
> - // above), but blocked by KUAP is bad, it can never succeed.
> - if (bad_kuap_fault(regs, address, is_write))
> + address, from_kuid(&init_user_ns, 
> current_uid()));
>   return true;
> + }
>  
> - // What's left? Kernel fault on user in well defined regions (extable
> - // matched), and allowed by KUAP in the faulting context.
> + // What's left? Kernel fault on user and allowed by KUAP in the 
> faulting context.
>   return false;
>  }
>  
> -- 
> 2.25.0