[PATCH v3] powerpc/eeh: avoid possible crash when edev->pdev changes
If a PCI device is removed during eeh_pe_report_edev(), edev->pdev will change and can cause a crash, hold the PCI rescan/remove lock while taking a copy of edev->pdev->bus. Signed-off-by: Ganesh Goudar --- v2: Hold rescan lock till we get the bus address. v3: Now that we are taking copy of bus, holding the lock, update the commit message accordingly. --- arch/powerpc/kernel/eeh_pe.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c index d1030bc52564..d283d281d28e 100644 --- a/arch/powerpc/kernel/eeh_pe.c +++ b/arch/powerpc/kernel/eeh_pe.c @@ -849,6 +849,7 @@ struct pci_bus *eeh_pe_bus_get(struct eeh_pe *pe) { struct eeh_dev *edev; struct pci_dev *pdev; + struct pci_bus *bus = NULL; if (pe->type & EEH_PE_PHB) return pe->phb->bus; @@ -859,9 +860,11 @@ struct pci_bus *eeh_pe_bus_get(struct eeh_pe *pe) /* Retrieve the parent PCI bus of first (top) PCI device */ edev = list_first_entry_or_null(&pe->edevs, struct eeh_dev, entry); + pci_lock_rescan_remove(); pdev = eeh_dev_to_pci_dev(edev); if (pdev) - return pdev->bus; + bus = pdev->bus; + pci_unlock_rescan_remove(); - return NULL; + return bus; } -- 2.44.0
[PATCH v2 0/1] Parallel EEH recovery between PHBs
This change is based on Sam Bobroff's patches which aimed to allow recovery to happen in parallel between PHBs and PEs, Due to various reasons the patches did not get in. But having parallel recovery between PHBs is fairly simple and gives significant improvement on powervm, Since powervm maintains flat hierarchy for PCI devices. This patch enables PHBs to have separate event queues and shorten the time taken for EEH recovery by making the recovery to run in parallel between PHBs. On powervm with 64 VFs from same PHB, I see approximately 48% reduction in time taken in EEH recovery. On powernv the improvement is not so significant. Ganesh Goudar (1): powerpc/eeh: Enable PHBs to recovery in parallel arch/powerpc/include/asm/eeh_event.h | 7 arch/powerpc/include/asm/pci-bridge.h | 4 ++ arch/powerpc/kernel/eeh_driver.c | 27 +++- arch/powerpc/kernel/eeh_event.c | 59 +-- arch/powerpc/kernel/eeh_pe.c | 4 ++ 5 files changed, 78 insertions(+), 23 deletions(-) -- 2.44.0
[PATCH v2 1/1] powerpc/eeh: Enable PHBs to recovery in parallel
Currnetly, with a single event queue EEH recovery is entirely serialized and takes place within a single kernel thread. This can cause recovery to take a long time when there are many devices. Have the recovery event queue per PHB and allow the recovery to happen in parallel for all the PHBs. Signed-off-by: Ganesh Goudar --- v2: Include missing hunk, which modifies __eeh_send_failure_event. --- arch/powerpc/include/asm/eeh_event.h | 7 arch/powerpc/include/asm/pci-bridge.h | 4 ++ arch/powerpc/kernel/eeh_driver.c | 27 +++- arch/powerpc/kernel/eeh_event.c | 59 +-- arch/powerpc/kernel/eeh_pe.c | 4 ++ 5 files changed, 78 insertions(+), 23 deletions(-) diff --git a/arch/powerpc/include/asm/eeh_event.h b/arch/powerpc/include/asm/eeh_event.h index dadde7d52f46..6af1b5bb6103 100644 --- a/arch/powerpc/include/asm/eeh_event.h +++ b/arch/powerpc/include/asm/eeh_event.h @@ -8,6 +8,8 @@ #define ASM_POWERPC_EEH_EVENT_H #ifdef __KERNEL__ +#include + /* * structure holding pci controller data that describes a * change in the isolation status of a PCI slot. A pointer @@ -15,15 +17,20 @@ * callback. 
*/ struct eeh_event { + struct work_struct work; struct list_headlist; /* to form event queue */ struct eeh_pe *pe;/* EEH PE */ }; +extern spinlock_t eeh_eventlist_lock; + int eeh_event_init(void); +int eeh_phb_event(struct eeh_pe *pe); int eeh_send_failure_event(struct eeh_pe *pe); int __eeh_send_failure_event(struct eeh_pe *pe); void eeh_remove_event(struct eeh_pe *pe, bool force); void eeh_handle_normal_event(struct eeh_pe *pe); +void eeh_handle_normal_event_work(struct work_struct *work); void eeh_handle_special_event(void); #endif /* __KERNEL__ */ diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h index 2aa3a091ef20..61884d9398bf 100644 --- a/arch/powerpc/include/asm/pci-bridge.h +++ b/arch/powerpc/include/asm/pci-bridge.h @@ -138,6 +138,10 @@ struct pci_controller { /* iommu_ops support */ struct iommu_device iommu; + + bool eeh_in_progress; + struct list_head eeh_eventlist; + spinlock_t eeh_eventlist_lock; }; /* These are used for config access before all the PCI probing diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c index 7efe04c68f0f..4cf5fd409369 100644 --- a/arch/powerpc/kernel/eeh_driver.c +++ b/arch/powerpc/kernel/eeh_driver.c @@ -1116,6 +1116,30 @@ void eeh_handle_normal_event(struct eeh_pe *pe) eeh_pe_state_clear(pe, EEH_PE_RECOVERING, true); } +void eeh_handle_normal_event_work(struct work_struct *work) +{ + unsigned long flags; + struct eeh_event *event = container_of(work, struct eeh_event, work); + struct pci_controller *phb = event->pe->phb; + + eeh_handle_normal_event(event->pe); + + kfree(event); + spin_lock_irqsave(>eeh_eventlist_lock, flags); + WARN_ON_ONCE(!phb->eeh_in_progress); + if (list_empty(>eeh_eventlist)) { + phb->eeh_in_progress = false; + pr_debug("EEH: No more work to do\n"); + } else { + pr_warn("EEH: More work to do\n"); + event = list_entry(phb->eeh_eventlist.next, + struct eeh_event, list); + list_del(>list); + queue_work(system_unbound_wq, >work); + } 
+ spin_unlock_irqrestore(>eeh_eventlist_lock, flags); +} + /** * eeh_handle_special_event - Handle EEH events without a specific failing PE * @@ -1185,8 +1209,7 @@ void eeh_handle_special_event(void) */ if (rc == EEH_NEXT_ERR_FROZEN_PE || rc == EEH_NEXT_ERR_FENCED_PHB) { - eeh_pe_state_mark(pe, EEH_PE_RECOVERING); - eeh_handle_normal_event(pe); + eeh_phb_event(pe); } else { eeh_for_each_pe(pe, tmp_pe) eeh_pe_for_each_dev(tmp_pe, edev, tmp_edev) diff --git a/arch/powerpc/kernel/eeh_event.c b/arch/powerpc/kernel/eeh_event.c index c23a454af08a..8a9d6358d39f 100644 --- a/arch/powerpc/kernel/eeh_event.c +++ b/arch/powerpc/kernel/eeh_event.c @@ -22,7 +22,7 @@ * work-queue, where a worker thread can drive recovery. */ -static DEFINE_SPINLOCK(eeh_eventlist_lock); +DEFINE_SPINLOCK(eeh_eventlist_lock); static DECLARE_COMPLETION(eeh_eventlist_event); static LIST_HEAD(eeh_eventlist); @@ -91,6 +91,42 @@ int eeh_event_init(void) return 0; } +int eeh_phb_event(struct eeh_pe *pe) +{ + struct eeh_event *event; + unsigned long flags; + struct pci_controller *phb; + + event = kzalloc(sizeof(*event), GFP_ATOMIC); + if (!event) + return -ENOMEM; + + if (pe) { +
[PATCH v2] powerpc/eeh: avoid possible crash when edev->pdev changes
If a PCI device is removed during eeh_pe_report_edev(), edev->pdev will change and can cause a crash, hold the PCI rescan/remove lock while taking a copy of edev->pdev. Signed-off-by: Ganesh Goudar --- v2: Hold rescan lock till we get the bus address. --- arch/powerpc/kernel/eeh_pe.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c index d1030bc52564..d283d281d28e 100644 --- a/arch/powerpc/kernel/eeh_pe.c +++ b/arch/powerpc/kernel/eeh_pe.c @@ -849,6 +849,7 @@ struct pci_bus *eeh_pe_bus_get(struct eeh_pe *pe) { struct eeh_dev *edev; struct pci_dev *pdev; + struct pci_bus *bus = NULL; if (pe->type & EEH_PE_PHB) return pe->phb->bus; @@ -859,9 +860,11 @@ struct pci_bus *eeh_pe_bus_get(struct eeh_pe *pe) /* Retrieve the parent PCI bus of first (top) PCI device */ edev = list_first_entry_or_null(&pe->edevs, struct eeh_dev, entry); + pci_lock_rescan_remove(); pdev = eeh_dev_to_pci_dev(edev); if (pdev) - return pdev->bus; + bus = pdev->bus; + pci_unlock_rescan_remove(); - return NULL; + return bus; } -- 2.44.0
Re: [PATCH] powerpc/eeh: avoid possible crash when edev->pdev changes
On 6/11/24 8:18 AM, Michael Ellerman wrote: Hi Ganesh, Ganesh Goudar writes: If a PCI device is removed during eeh_pe_report_edev(), edev->pdev will change and can cause a crash, hold the PCI rescan/remove lock while taking a copy of edev->pdev. Signed-off-by: Ganesh Goudar --- arch/powerpc/kernel/eeh_pe.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c index d1030bc52564..49f968733912 100644 --- a/arch/powerpc/kernel/eeh_pe.c +++ b/arch/powerpc/kernel/eeh_pe.c @@ -859,7 +859,9 @@ struct pci_bus *eeh_pe_bus_get(struct eeh_pe *pe) /* Retrieve the parent PCI bus of first (top) PCI device */ edev = list_first_entry_or_null(&pe->edevs, struct eeh_dev, entry); + pci_lock_rescan_remove(); pdev = eeh_dev_to_pci_dev(edev); + pci_unlock_rescan_remove(); if (pdev) return pdev->bus; What prevents pdev being freed/reused immediately after you drop the rescan/remove lock? Yeah, I should have released the lock after getting bus address, I will send v2. AFAICS eeh_dev_to_pci_dev() doesn't take an additional reference to the pdev or anything. Yes, I think we have to evaluate the possible eventualities of not taking the reference in all the cases. But we need this lock here because, if the PCI error is encountered in the hotplug remove path, we need the pci rescan lock to avoid race between hotplug remove path and the bottom half of EEH recovery, this lets the hotplug remove to complete since it is already holding the lock and drop the recovery process as the device is no longer present.
[PATCH] powerpc/eeh: avoid possible crash when edev->pdev changes
If a PCI device is removed during eeh_pe_report_edev(), edev->pdev will change and can cause a crash, hold the PCI rescan/remove lock while taking a copy of edev->pdev. Signed-off-by: Ganesh Goudar --- arch/powerpc/kernel/eeh_pe.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c index d1030bc52564..49f968733912 100644 --- a/arch/powerpc/kernel/eeh_pe.c +++ b/arch/powerpc/kernel/eeh_pe.c @@ -859,7 +859,9 @@ struct pci_bus *eeh_pe_bus_get(struct eeh_pe *pe) /* Retrieve the parent PCI bus of first (top) PCI device */ edev = list_first_entry_or_null(&pe->edevs, struct eeh_dev, entry); + pci_lock_rescan_remove(); pdev = eeh_dev_to_pci_dev(edev); + pci_unlock_rescan_remove(); if (pdev) return pdev->bus; -- 2.44.0
[PATCH v2] powerpc/eeh: Permanently disable the removed device
When a device is hot removed on powernv, the hotplug driver clears the device's state. However, on pseries, if a device is removed by phyp after reaching the error threshold, the kernel remains unaware, leading to the device not being torn down. This prevents necessary remediation actions like failover. Permanently disable the device if the presence check fails. Also, in eeh_dev_check_failure in we may consider the error as false positive if the device is hotpluged out as the get_state call returns EEH_STATE_NOT_SUPPORT and we may end up not clearing the device state, so log the event if the state is not moved to permanent failure state. Signed-off-by: Ganesh Goudar --- V2: * Elobrate the commit message. * Fix formatting issues in commit message and comments. --- arch/powerpc/kernel/eeh.c| 11 ++- arch/powerpc/kernel/eeh_driver.c | 13 +++-- 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index ab316e155ea9..6670063a7a6c 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -506,9 +506,18 @@ int eeh_dev_check_failure(struct eeh_dev *edev) * We will punt with the following conditions: Failure to get * PE's state, EEH not support and Permanently unavailable * state, PE is in good state. +* +* On the pSeries, after reaching the threshold, get_state might +* return EEH_STATE_NOT_SUPPORT. However, it's possible that the +* device state remains uncleared if the device is not marked +* pci_channel_io_perm_failure. Therefore, consider logging the +* event to let device removal happen. 
+* */ if ((ret < 0) || - (ret == EEH_STATE_NOT_SUPPORT) || eeh_state_active(ret)) { + (ret == EEH_STATE_NOT_SUPPORT && +dev->error_state == pci_channel_io_perm_failure) || + eeh_state_active(ret)) { eeh_stats.false_positives++; pe->false_positives++; rc = 0; diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c index 48773d2d9be3..7efe04c68f0f 100644 --- a/arch/powerpc/kernel/eeh_driver.c +++ b/arch/powerpc/kernel/eeh_driver.c @@ -865,9 +865,18 @@ void eeh_handle_normal_event(struct eeh_pe *pe) devices++; if (!devices) { - pr_debug("EEH: Frozen PHB#%x-PE#%x is empty!\n", + pr_warn("EEH: Frozen PHB#%x-PE#%x is empty!\n", pe->phb->global_number, pe->addr); - goto out; /* nothing to recover */ + /* +* The device is removed, tear down its state, on powernv +* hotplug driver would take care of it but not on pseries, +* permanently disable the card as it is hot removed. +* +* In the case of powernv, note that the removal of device +* is covered by pci rescan lock, so no problem even if hotplug +* driver attempts to remove the device. +*/ + goto recover_failed; } /* Log the event */ -- 2.44.0
Re: [PATCH] powerpc/eeh: Permanently disable the removed device
On 4/9/24 14:37, Michael Ellerman wrote: Hi Ganesh, Ganesh Goudar writes: When a device is hot removed on powernv, the hotplug driver clears the device's state. However, on pseries, if a device is removed by phyp after reaching the error threshold, the kernel remains unaware, leading to the device not being torn down. This prevents necessary remediation actions like failover. Permanently disable the device if the presence check fails. You can wrap your changelogs a bit wider, 70 or 80 columns is fine. ok diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index ab316e155ea9..8d1606406d3f 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -508,7 +508,9 @@ int eeh_dev_check_failure(struct eeh_dev *edev) * state, PE is in good state. */ if ((ret < 0) || - (ret == EEH_STATE_NOT_SUPPORT) || eeh_state_active(ret)) { + (ret == EEH_STATE_NOT_SUPPORT && +dev->error_state == pci_channel_io_perm_failure) || + eeh_state_active(ret)) { eeh_stats.false_positives++; pe->false_positives++; rc = 0; How does this hunk relate the changelog? This is adding an extra condition to the false positive check, so there's a risk this causes devices to go into failure when previously they didn't, right? So please explain why it's a good change. The comment above the if needs updating too. We need this change to log the event and get the device removed, I will explain this in commit message. diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c index 48773d2d9be3..10317badf471 100644 --- a/arch/powerpc/kernel/eeh_driver.c +++ b/arch/powerpc/kernel/eeh_driver.c @@ -867,7 +867,13 @@ void eeh_handle_normal_event(struct eeh_pe *pe) if (!devices) { pr_debug("EEH: Frozen PHB#%x-PE#%x is empty!\n", pe->phb->global_number, pe->addr); - goto out; /* nothing to recover */ The other cases that go to recover_failed usually print something at warn level, so this probably should too. 
So either make the above a pr_warn(), or change it to a warn with a more helpful message. ok + /* +* The device is removed, Tear down its state, +* On powernv hotplug driver would take care of +* it but not on pseries, Permanently disable the +* card as it is hot removed. +*/ Formatting and punctuation is weird. It can be wider, and capital letter is only required after a full stop, not a comma. ok, i will take care of it. Also you say that the powernv hotplug driver "would" take care of it, that's past tense, is that what you mean? Does the powernv hotplug driver still take care of it after this change? And (how) does that driver cope with it happening here also? Yes, hotplug driver can still remove the device and the removal of device is covered by pci rescan lock. + goto recover_failed; } cheers
[PATCH] powerpc/eeh: Permanently disable the removed device
When a device is hot removed on powernv, the hotplug driver clears the device's state. However, on pseries, if a device is removed by phyp after reaching the error threshold, the kernel remains unaware, leading to the device not being torn down. This prevents necessary remediation actions like failover. Permanently disable the device if the presence check fails. Signed-off-by: Ganesh Goudar --- arch/powerpc/kernel/eeh.c| 4 +++- arch/powerpc/kernel/eeh_driver.c | 8 +++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index ab316e155ea9..8d1606406d3f 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -508,7 +508,9 @@ int eeh_dev_check_failure(struct eeh_dev *edev) * state, PE is in good state. */ if ((ret < 0) || - (ret == EEH_STATE_NOT_SUPPORT) || eeh_state_active(ret)) { + (ret == EEH_STATE_NOT_SUPPORT && +dev->error_state == pci_channel_io_perm_failure) || + eeh_state_active(ret)) { eeh_stats.false_positives++; pe->false_positives++; rc = 0; diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c index 48773d2d9be3..10317badf471 100644 --- a/arch/powerpc/kernel/eeh_driver.c +++ b/arch/powerpc/kernel/eeh_driver.c @@ -867,7 +867,13 @@ void eeh_handle_normal_event(struct eeh_pe *pe) if (!devices) { pr_debug("EEH: Frozen PHB#%x-PE#%x is empty!\n", pe->phb->global_number, pe->addr); - goto out; /* nothing to recover */ + /* +* The device is removed, Tear down its state, +* On powernv hotplug driver would take care of +* it but not on pseries, Permanently disable the +* card as it is hot removed. +*/ + goto recover_failed; } /* Log the event */ -- 2.44.0
[PATCH 1/1] powerpc/eeh: Enable PHBs to recovery in parallel
Currnetly, With a single event queue EEH recovery is entirely serialized and takes place within a single kernel thread. This can cause recovery to take a long time when there are many devices. Have the recovery event queue per PHB and allow the recovery to happen independently from other PHBs. Signed-off-by: Ganesh Goudar --- arch/powerpc/include/asm/eeh_event.h | 7 + arch/powerpc/include/asm/pci-bridge.h | 4 +++ arch/powerpc/kernel/eeh_driver.c | 27 +-- arch/powerpc/kernel/eeh_event.c | 38 ++- arch/powerpc/kernel/eeh_pe.c | 4 +++ 5 files changed, 77 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/include/asm/eeh_event.h b/arch/powerpc/include/asm/eeh_event.h index dadde7d52f46..6af1b5bb6103 100644 --- a/arch/powerpc/include/asm/eeh_event.h +++ b/arch/powerpc/include/asm/eeh_event.h @@ -8,6 +8,8 @@ #define ASM_POWERPC_EEH_EVENT_H #ifdef __KERNEL__ +#include + /* * structure holding pci controller data that describes a * change in the isolation status of a PCI slot. A pointer @@ -15,15 +17,20 @@ * callback. 
*/ struct eeh_event { + struct work_struct work; struct list_headlist; /* to form event queue */ struct eeh_pe *pe;/* EEH PE */ }; +extern spinlock_t eeh_eventlist_lock; + int eeh_event_init(void); +int eeh_phb_event(struct eeh_pe *pe); int eeh_send_failure_event(struct eeh_pe *pe); int __eeh_send_failure_event(struct eeh_pe *pe); void eeh_remove_event(struct eeh_pe *pe, bool force); void eeh_handle_normal_event(struct eeh_pe *pe); +void eeh_handle_normal_event_work(struct work_struct *work); void eeh_handle_special_event(void); #endif /* __KERNEL__ */ diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h index 2aa3a091ef20..61884d9398bf 100644 --- a/arch/powerpc/include/asm/pci-bridge.h +++ b/arch/powerpc/include/asm/pci-bridge.h @@ -138,6 +138,10 @@ struct pci_controller { /* iommu_ops support */ struct iommu_device iommu; + + bool eeh_in_progress; + struct list_head eeh_eventlist; + spinlock_t eeh_eventlist_lock; }; /* These are used for config access before all the PCI probing diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c index 48773d2d9be3..d5612303766e 100644 --- a/arch/powerpc/kernel/eeh_driver.c +++ b/arch/powerpc/kernel/eeh_driver.c @@ -1107,6 +1107,30 @@ void eeh_handle_normal_event(struct eeh_pe *pe) eeh_pe_state_clear(pe, EEH_PE_RECOVERING, true); } +void eeh_handle_normal_event_work(struct work_struct *work) +{ + unsigned long flags; + struct eeh_event *event = container_of(work, struct eeh_event, work); + struct pci_controller *phb = event->pe->phb; + + eeh_handle_normal_event(event->pe); + + kfree(event); + spin_lock_irqsave(>eeh_eventlist_lock, flags); + WARN_ON_ONCE(!phb->eeh_in_progress); + if (list_empty(>eeh_eventlist)) { + phb->eeh_in_progress = false; + pr_debug("EEH: No more work to do\n"); + } else { + pr_warn("EEH: More work to do\n"); + event = list_entry(phb->eeh_eventlist.next, + struct eeh_event, list); + list_del(>list); + queue_work(system_unbound_wq, >work); + } 
+ spin_unlock_irqrestore(>eeh_eventlist_lock, flags); +} + /** * eeh_handle_special_event - Handle EEH events without a specific failing PE * @@ -1176,8 +1200,7 @@ void eeh_handle_special_event(void) */ if (rc == EEH_NEXT_ERR_FROZEN_PE || rc == EEH_NEXT_ERR_FENCED_PHB) { - eeh_pe_state_mark(pe, EEH_PE_RECOVERING); - eeh_handle_normal_event(pe); + eeh_phb_event(pe); } else { eeh_for_each_pe(pe, tmp_pe) eeh_pe_for_each_dev(tmp_pe, edev, tmp_edev) diff --git a/arch/powerpc/kernel/eeh_event.c b/arch/powerpc/kernel/eeh_event.c index c23a454af08a..86c0a988389e 100644 --- a/arch/powerpc/kernel/eeh_event.c +++ b/arch/powerpc/kernel/eeh_event.c @@ -22,7 +22,7 @@ * work-queue, where a worker thread can drive recovery. */ -static DEFINE_SPINLOCK(eeh_eventlist_lock); +DEFINE_SPINLOCK(eeh_eventlist_lock); static DECLARE_COMPLETION(eeh_eventlist_event); static LIST_HEAD(eeh_eventlist); @@ -91,6 +91,42 @@ int eeh_event_init(void) return 0; } +int eeh_phb_event(struct eeh_pe *pe) +{ + struct eeh_event *event; + unsigned long flags; + struct pci_controller *phb; + + event = kzalloc(sizeof(*event), GFP_ATOMIC); + if (!event) + return -ENOMEM; + + if (pe) { + phb = pe->phb; + event->pe = pe; +
[PATCH 0/1] Parallel EEH recovery between PHBs
This change is based on Sam Bobroff's patches which aimed to allow recovery to happen in parallel between PHBs and PEs, Due to various reasons the patches did not get in. But having parallel recovery between PHBs is fairly simple and gives significant improvement on powervm, Since powervm maintains flat hierarchy for PCI devices. This patch enables PHBs to have separate event queues and shorten the time taken for EEH recovery by making the recovery to run in parallel between PHBs. On powervm with 64 VFs from same PHB, I see approximately 48% reduction in time taken in EEH recovery. On powernv the improvement is not so significant. Ganesh Goudar (1): powerpc/eeh: Enable PHBs to recovery in parallel arch/powerpc/include/asm/eeh_event.h | 7 + arch/powerpc/include/asm/pci-bridge.h | 4 +++ arch/powerpc/kernel/eeh_driver.c | 27 +-- arch/powerpc/kernel/eeh_event.c | 38 ++- arch/powerpc/kernel/eeh_pe.c | 4 +++ 5 files changed, 77 insertions(+), 3 deletions(-) -- 2.43.2
[RFC PATCH v2 3/3] powerpc/eeh: Asynchronous recovery
Based on the original work from Sam Bobroff. Currently, EEH recovery is entirely serialized and takes place within a single kernel thread. This can cause recovery to take a long time when there are many devices. To shorten recovery time, this change allows recovery to proceed in parallel in two ways: - Each PHB is given it's own recovery event queue and can be recovered independently from other PHBs. - Driver handlers are called in parallel, but with the constraint that handlers higher up (closer to the PHB) in the PE hierarchy must be called before those lower down. To maintain the constraint, above, the driver handlers are called by traversing the tree of affected PEs from the top, stopping to call handlers (in parallel) when a PE with devices is discovered. When the calls for that PE are complete, traversal continues at each child PE. Signed-off-by: Ganesh Goudar --- arch/powerpc/include/asm/eeh.h| 1 + arch/powerpc/include/asm/eeh_event.h | 7 + arch/powerpc/include/asm/pci-bridge.h | 4 + arch/powerpc/kernel/eeh.c | 5 - arch/powerpc/kernel/eeh_driver.c | 323 +++--- arch/powerpc/kernel/eeh_event.c | 69 +++--- arch/powerpc/kernel/eeh_pe.c | 4 + 7 files changed, 294 insertions(+), 119 deletions(-) diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h index 95708c801f27..a99472635350 100644 --- a/arch/powerpc/include/asm/eeh.h +++ b/arch/powerpc/include/asm/eeh.h @@ -128,6 +128,7 @@ static inline bool eeh_pe_passed(struct eeh_pe *pe) #define EEH_DEV_NO_HANDLER (1 << 8)/* No error handler */ #define EEH_DEV_SYSFS (1 << 9)/* Sysfs created*/ #define EEH_DEV_REMOVED(1 << 10) /* Removed permanently */ +#define EEH_DEV_RECOVERING (1 << 11) /* Recovering */ struct eeh_dev { int mode; /* EEH mode */ diff --git a/arch/powerpc/include/asm/eeh_event.h b/arch/powerpc/include/asm/eeh_event.h index a1fe736bc4cf..b21f49e87b7b 100644 --- a/arch/powerpc/include/asm/eeh_event.h +++ b/arch/powerpc/include/asm/eeh_event.h @@ -8,6 +8,8 @@ #define 
ASM_POWERPC_EEH_EVENT_H #ifdef __KERNEL__ +#include + /* * structure holding pci controller data that describes a * change in the isolation status of a PCI slot. A pointer @@ -15,16 +17,21 @@ * callback. */ struct eeh_event { + struct work_struct work; struct list_headlist; /* to form event queue */ struct eeh_pe *pe;/* EEH PE */ unsigned intid; /* Event ID */ }; +extern spinlock_t eeh_eventlist_lock; + int eeh_event_init(void); +int eeh_phb_event(struct eeh_pe *pe); int eeh_send_failure_event(struct eeh_pe *pe); int __eeh_send_failure_event(struct eeh_pe *pe); void eeh_remove_event(struct eeh_pe *pe, bool force); void eeh_handle_normal_event(unsigned int event_id, struct eeh_pe *pe); +void eeh_handle_normal_event_work(struct work_struct *work); void eeh_handle_special_event(void); #endif /* __KERNEL__ */ diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h index 2aa3a091ef20..61884d9398bf 100644 --- a/arch/powerpc/include/asm/pci-bridge.h +++ b/arch/powerpc/include/asm/pci-bridge.h @@ -138,6 +138,10 @@ struct pci_controller { /* iommu_ops support */ struct iommu_device iommu; + + bool eeh_in_progress; + struct list_head eeh_eventlist; + spinlock_t eeh_eventlist_lock; }; /* These are used for config access before all the PCI probing diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index 733fb290f4b7..12536d892826 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -579,11 +579,6 @@ int eeh_dev_check_failure(struct eeh_dev *edev) * bridges. */ eeh_pe_mark_isolated(pe); - - /* Most EEH events are due to device driver bugs. Having -* a stack trace will help the device-driver authors figure -* out what happened. So print that out. 
-*/ pr_debug("EEH: %s: Frozen PHB#%x-PE#%x detected\n", __func__, pe->phb->global_number, pe->addr); eeh_send_failure_event(pe); diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c index cdf2de0eba57..49f8b99dfb25 100644 --- a/arch/powerpc/kernel/eeh_driver.c +++ b/arch/powerpc/kernel/eeh_driver.c @@ -12,12 +12,17 @@ #include #include #include +#include +#include +#include #include #include #include #include #include +static atomic_t eeh_wu_id = ATOMIC_INIT(0); + struct eeh_rmv_data { struct list_head removed_vf_list; int removed_dev_count; @@ -248,73 +253,59 @@ static void eeh_set_irq_state(struct eeh_pe *root, bool enable) } typedef enum pci_ers_re
[RFC PATCH v2 2/3] powerpc/eeh: Provide a unique ID for each EEH recovery
Based on the original work from Sam Bobroff. Give a unique ID to each recovery event, to ease log parsing and prepare for parallel recovery. Also add some new messages with a very simple format that may be useful to log-parsers. Signed-off-by: Ganesh Goudar --- arch/powerpc/include/asm/eeh_event.h | 3 +- arch/powerpc/include/asm/ppc-pci.h | 2 +- arch/powerpc/kernel/eeh.c| 42 +++--- arch/powerpc/kernel/eeh_driver.c | 189 +++ arch/powerpc/kernel/eeh_event.c | 12 +- include/linux/mmzone.h | 2 +- 6 files changed, 147 insertions(+), 103 deletions(-) diff --git a/arch/powerpc/include/asm/eeh_event.h b/arch/powerpc/include/asm/eeh_event.h index dadde7d52f46..a1fe736bc4cf 100644 --- a/arch/powerpc/include/asm/eeh_event.h +++ b/arch/powerpc/include/asm/eeh_event.h @@ -17,13 +17,14 @@ struct eeh_event { struct list_headlist; /* to form event queue */ struct eeh_pe *pe;/* EEH PE */ + unsigned intid; /* Event ID */ }; int eeh_event_init(void); int eeh_send_failure_event(struct eeh_pe *pe); int __eeh_send_failure_event(struct eeh_pe *pe); void eeh_remove_event(struct eeh_pe *pe, bool force); -void eeh_handle_normal_event(struct eeh_pe *pe); +void eeh_handle_normal_event(unsigned int event_id, struct eeh_pe *pe); void eeh_handle_special_event(void); #endif /* __KERNEL__ */ diff --git a/arch/powerpc/include/asm/ppc-pci.h b/arch/powerpc/include/asm/ppc-pci.h index d9fcff575027..5b82e76dbd19 100644 --- a/arch/powerpc/include/asm/ppc-pci.h +++ b/arch/powerpc/include/asm/ppc-pci.h @@ -40,7 +40,7 @@ extern int rtas_setup_phb(struct pci_controller *phb); void eeh_addr_cache_insert_dev(struct pci_dev *dev); void eeh_addr_cache_rmv_dev(struct pci_dev *dev); struct eeh_dev *eeh_addr_cache_get_dev(unsigned long addr); -void eeh_slot_error_detail(struct eeh_pe *pe, int severity); +void eeh_slot_error_detail(unsigned int event_id, struct eeh_pe *pe, int severity); int eeh_pci_enable(struct eeh_pe *pe, int function); int eeh_pe_reset_full(struct eeh_pe *pe, bool include_passed); void 
eeh_save_bars(struct eeh_dev *edev); diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index 6907722c6c1e..733fb290f4b7 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -194,7 +194,8 @@ EXPORT_SYMBOL_GPL(eeh_recovery_must_be_locked); * for the indicated PCI device, and puts them into a buffer * for RTAS error logging. */ -static size_t eeh_dump_dev_log(struct eeh_dev *edev, char *buf, size_t len) +static size_t eeh_dump_dev_log(unsigned int event_id, struct eeh_dev *edev, + char *buf, size_t len) { u32 cfg; int cap, i; @@ -204,27 +205,29 @@ static size_t eeh_dump_dev_log(struct eeh_dev *edev, char *buf, size_t len) n += scnprintf(buf+n, len-n, "%04x:%02x:%02x.%01x\n", edev->pe->phb->global_number, edev->bdfn >> 8, PCI_SLOT(edev->bdfn), PCI_FUNC(edev->bdfn)); - pr_warn("EEH: of node=%04x:%02x:%02x.%01x\n", + pr_warn("EEH(%u): of node=%04x:%02x:%02x.%01x\n", + event_id, edev->pe->phb->global_number, edev->bdfn >> 8, PCI_SLOT(edev->bdfn), PCI_FUNC(edev->bdfn)); eeh_ops->read_config(edev, PCI_VENDOR_ID, 4, ); n += scnprintf(buf+n, len-n, "dev/vend:%08x\n", cfg); - pr_warn("EEH: PCI device/vendor: %08x\n", cfg); + pr_warn("EEH(%u): PCI device/vendor: %08x\n",event_id, cfg); eeh_ops->read_config(edev, PCI_COMMAND, 4, ); n += scnprintf(buf+n, len-n, "cmd/stat:%x\n", cfg); - pr_warn("EEH: PCI cmd/status register: %08x\n", cfg); + pr_warn("EEH(%u): PCI cmd/status register: %08x\n", event_id, cfg); /* Gather bridge-specific registers */ if (edev->mode & EEH_DEV_BRIDGE) { eeh_ops->read_config(edev, PCI_SEC_STATUS, 2, ); n += scnprintf(buf+n, len-n, "sec stat:%x\n", cfg); - pr_warn("EEH: Bridge secondary status: %04x\n", cfg); + pr_warn("EEH(%u): Bridge secondary status: %04x\n", + event_id, cfg); eeh_ops->read_config(edev, PCI_BRIDGE_CONTROL, 2, ); n += scnprintf(buf+n, len-n, "brdg ctl:%x\n", cfg); - pr_warn("EEH: Bridge control: %04x\n", cfg); + pr_warn("EEH(%u): Bridge control: %04x\n", event_id, cfg); } /* Dump out the PCI-X 
command and status regs */ @@ -232,18 +235,19 @@ static size_t eeh_dump_dev_log(struct eeh_dev *edev, char *buf, size_t len) if (cap) { eeh_ops->
[RFC PATCH v2 1/3] powerpc/eeh: Synchronization for safety
Based on the original work from Sam Bobroff. There is currently little synchronization between EEH error detection (eeh_dev_check_failure()), EEH error recovery (eeh_handle_{normal,special}_event()) and the PCI subsystem (device addition and removal), and so there are race conditions that lead to crashes (often access to free'd memory or LIST_POISON). However, a solution must consider: - EEH error detection can occur in interrupt context, which prevents the use of a mutex. - EEH recovery may need to sleep, which prevents the use of a spinlock. - EEH recovery uses PCI operations that may require the PCI rescan/remove lock and/or device lock to be held - PCI operations may hold the rescan/remove and/or device lock when calling into EEH functions. - Device driver callbacks may perform arbitrary PCI operations during recovery, including device removal. In this patch the existing mutex and spinlock are combined with the EEH_PE_RECOVERING flag to provide some assurances that are then used to reduce the race conditions. The fields to be protected are the ones that provide the structure of the trees of struct eeh_pe that are held for each PHB: the parent pointer and child lists and the list of struct eeh_dev, as well as the pe and pdev pointers within struct eeh_dev. The existing way of using EEH_PE_RECOVERING is kept and slightly extended: No struct eeh_pe will be removed while it has the flag set on it. Additionally, when adding new PEs, they are marked EEH_PE_RECOVERING if their parent PE is marked: this allows the recovery thread to assume that all PEs underneath the one it's processing will continue to exist during recovery. Both the mutex and spinlock are held while any protected field is changed or a PE is deleted, so holding either of them (elsewhere) will keep them stable and safe to access. Additionally, if EEH_PE_RECOVERING is set on a PE then the locks can be released and re-acquired safely, as long as the protected fields aren't used while no locks are held. 
This is used during recovery to release locks for long sleeps (i.e. during eeh_wait_state() when we may sleep up to 5 minutes), or to maintain lock ordering. The spinlock is used in error detection (which cannot use a mutex, see above) and also where it's possible that the mutex is already held. The mutex is used in areas that don't have that restriction, and where blocking may be required. Care must be taken when ordering these locks against the PCI rescan/remove lock and the device locks to avoid deadlocking. Signed-off-by: Ganesh Goudar --- arch/powerpc/include/asm/eeh.h | 12 +- arch/powerpc/kernel/eeh.c| 112 ++-- arch/powerpc/kernel/eeh_driver.c | 288 ++- arch/powerpc/kernel/eeh_pe.c | 30 +- arch/powerpc/platforms/powernv/eeh-powernv.c | 12 +- arch/powerpc/platforms/pseries/eeh_pseries.c | 5 +- arch/powerpc/platforms/pseries/pci_dlpar.c | 5 +- drivers/pci/hotplug/pnv_php.c| 5 +- drivers/pci/hotplug/rpadlpar_core.c | 2 + drivers/vfio/vfio_iommu_spapr_tce.c | 10 +- 10 files changed, 365 insertions(+), 116 deletions(-) diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h index 514dd056c2c8..95708c801f27 100644 --- a/arch/powerpc/include/asm/eeh.h +++ b/arch/powerpc/include/asm/eeh.h @@ -233,7 +233,7 @@ extern int eeh_subsystem_flags; extern u32 eeh_max_freezes; extern bool eeh_debugfs_no_recover; extern struct eeh_ops *eeh_ops; -extern raw_spinlock_t confirm_error_lock; +extern raw_spinlock_t eeh_pe_tree_spinlock; static inline void eeh_add_flag(int flag) { @@ -257,12 +257,12 @@ static inline bool eeh_enabled(void) static inline void eeh_serialize_lock(unsigned long *flags) { - raw_spin_lock_irqsave(_error_lock, *flags); + raw_spin_lock_irqsave(_pe_tree_spinlock, *flags); } static inline void eeh_serialize_unlock(unsigned long flags) { - raw_spin_unlock_irqrestore(_error_lock, flags); + raw_spin_unlock_irqrestore(_pe_tree_spinlock, flags); } static inline bool eeh_state_active(int state) @@ -271,11 +271,15 @@ static inline bool 
eeh_state_active(int state) == (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE); } +void eeh_recovery_lock(void); +void eeh_recovery_unlock(void); +void eeh_recovery_must_be_locked(void); + typedef void (*eeh_edev_traverse_func)(struct eeh_dev *edev, void *flag); typedef void *(*eeh_pe_traverse_func)(struct eeh_pe *pe, void *flag); void eeh_set_pe_aux_size(int size); int eeh_phb_pe_create(struct pci_controller *phb); -int eeh_wait_state(struct eeh_pe *pe, int max_wait); +int eeh_wait_state(struct eeh_pe *pe, int max_wait, bool unlock); struct eeh_pe *eeh_phb_pe_get(struct pci_controller *phb); struct eeh_pe *eeh_pe_next(struct eeh_pe *pe, struct eeh_pe *root); struct eeh_pe *eeh_pe_get(struct pci_controller *phb, int pe_no); diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
[RFC PATCH v2 0/3] Asynchronous EEH recovery
Hi, EEH recovery is currently serialized, and these patches shorten the time taken for EEH recovery by making the recovery run in parallel. The original author of these patches is Sam Bobroff; I have rebased and tested them. On powervm, with 64 VFs from the same PHB, I see approximately a 48% reduction in the time taken for EEH recovery. On powernv, with 9 network cards, where 2 cards are installed on one PHB and 1 card on each of the remaining PHBs, providing 20 PFs in total, I see approximately a 33% reduction in the time taken for EEH recovery. These patches were originally posted as separate RFCs by Sam, and I rebased and posted them almost a year back. I stopped pursuing these patches as I was not able to test them on powernv, due to issues in the drivers of the cards I was testing on, which are now resolved. Since I am re-posting this after a long time, I am posting it as a fresh RFC; please comment. Thanks. V2: * Since we now have an event list per PHB, have a per-PHB event list lock. * Appropriate names given to the locks. * Remove stale comments (a few more to be removed). * Initialize event_id to 0 instead of 1. * And some cosmetic changes. Ganesh Goudar (3): powerpc/eeh: Synchronization for safety powerpc/eeh: Provide a unique ID for each EEH recovery powerpc/eeh: Asynchronous recovery arch/powerpc/include/asm/eeh.h | 13 +- arch/powerpc/include/asm/eeh_event.h | 10 +- arch/powerpc/include/asm/pci-bridge.h| 4 + arch/powerpc/include/asm/ppc-pci.h | 2 +- arch/powerpc/kernel/eeh.c| 159 +++-- arch/powerpc/kernel/eeh_driver.c | 580 +++ arch/powerpc/kernel/eeh_event.c | 75 ++- arch/powerpc/kernel/eeh_pe.c | 34 +- arch/powerpc/platforms/powernv/eeh-powernv.c | 12 +- arch/powerpc/platforms/pseries/eeh_pseries.c | 5 +- arch/powerpc/platforms/pseries/pci_dlpar.c | 5 +- drivers/pci/hotplug/pnv_php.c| 5 +- drivers/pci/hotplug/rpadlpar_core.c | 2 + drivers/vfio/vfio_iommu_spapr_tce.c | 10 +- include/linux/mmzone.h | 2 +- 15 files changed, 693 insertions(+), 225 deletions(-) -- 2.40.1
Re: [RFC 0/3] Asynchronous EEH recovery
On 6/13/23 8:06 AM, Oliver O'Halloran wrote: On Tue, Jun 13, 2023 at 11:44 AM Ganesh Goudar wrote: Hi, EEH recovery is currently serialized and these patches shorten the time taken for EEH recovery by making the recovery to run in parallel. The original author of these patches is Sam Bobroff, I have rebased and tested these patches. On powervm with 64 VFs from same PHB, I see approximately 48% reduction in time taken in EEH recovery. On powernv with 9 network cards, Where 2 cards installed on one PHB and 1 card on each of the rest of the PHBs, Providing 20 PFs in total. I see approximately 33% reduction in time taken in EEH recovery. These patches were originally posted as separate RFCs by Sam, And I rebased and posted these patches almost a year back, I stopped pursuing these patches as I was not able test this on powernv, Due to the issues in drivers of cards I was testing this on, Which are now resolved. Since I am re-posting this after long time, Posting this as a fresh RFC, Please comment. What changes have you made since the last time you posted this series? If the patches are the same then the comments I posted last time still apply. Hi Oliver, You asked about the way we are testing this on powervm, expressed concerns about having this on powernv, suggested having this feature just for powervm for now, and also expressed concerns about having two locks. On powervm, using a two-port card, we are instantiating 64 VFs for an LPAR and injecting the error on the bus from PHYP to observe the behavior. I was able to test this on powernv with 16 PFs from 8 cards installed on separate PHBs, where I saw considerable performance improvement. Regarding the two-locks idea, I may not have tested it for all scenarios, but so far I have not faced any issue. Are you suggesting a different approach? Thanks
[RFC 2/3] powerpc/eeh: Provide a unique ID for each EEH recovery
Based on the original work from Sam Bobroff. Give a unique ID to each recovery event, to ease log parsing and prepare for parallel recovery. Also add some new messages with a very simple format that may be useful to log-parsers. Signed-off-by: Ganesh Goudar --- arch/powerpc/include/asm/eeh_event.h | 3 +- arch/powerpc/include/asm/ppc-pci.h | 2 +- arch/powerpc/kernel/eeh.c| 42 +++--- arch/powerpc/kernel/eeh_driver.c | 189 +++ arch/powerpc/kernel/eeh_event.c | 12 +- include/linux/mmzone.h | 2 +- 6 files changed, 147 insertions(+), 103 deletions(-) diff --git a/arch/powerpc/include/asm/eeh_event.h b/arch/powerpc/include/asm/eeh_event.h index dadde7d52f46..a1fe736bc4cf 100644 --- a/arch/powerpc/include/asm/eeh_event.h +++ b/arch/powerpc/include/asm/eeh_event.h @@ -17,13 +17,14 @@ struct eeh_event { struct list_headlist; /* to form event queue */ struct eeh_pe *pe;/* EEH PE */ + unsigned intid; /* Event ID */ }; int eeh_event_init(void); int eeh_send_failure_event(struct eeh_pe *pe); int __eeh_send_failure_event(struct eeh_pe *pe); void eeh_remove_event(struct eeh_pe *pe, bool force); -void eeh_handle_normal_event(struct eeh_pe *pe); +void eeh_handle_normal_event(unsigned int event_id, struct eeh_pe *pe); void eeh_handle_special_event(void); #endif /* __KERNEL__ */ diff --git a/arch/powerpc/include/asm/ppc-pci.h b/arch/powerpc/include/asm/ppc-pci.h index d9fcff575027..5b82e76dbd19 100644 --- a/arch/powerpc/include/asm/ppc-pci.h +++ b/arch/powerpc/include/asm/ppc-pci.h @@ -40,7 +40,7 @@ extern int rtas_setup_phb(struct pci_controller *phb); void eeh_addr_cache_insert_dev(struct pci_dev *dev); void eeh_addr_cache_rmv_dev(struct pci_dev *dev); struct eeh_dev *eeh_addr_cache_get_dev(unsigned long addr); -void eeh_slot_error_detail(struct eeh_pe *pe, int severity); +void eeh_slot_error_detail(unsigned int event_id, struct eeh_pe *pe, int severity); int eeh_pci_enable(struct eeh_pe *pe, int function); int eeh_pe_reset_full(struct eeh_pe *pe, bool include_passed); void 
eeh_save_bars(struct eeh_dev *edev); diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index 2c90c37524ed..148d5df0e606 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -200,7 +200,8 @@ EXPORT_SYMBOL_GPL(eeh_recovery_must_be_locked); * for the indicated PCI device, and puts them into a buffer * for RTAS error logging. */ -static size_t eeh_dump_dev_log(struct eeh_dev *edev, char *buf, size_t len) +static size_t eeh_dump_dev_log(unsigned int event_id, struct eeh_dev *edev, + char *buf, size_t len) { u32 cfg; int cap, i; @@ -210,27 +211,29 @@ static size_t eeh_dump_dev_log(struct eeh_dev *edev, char *buf, size_t len) n += scnprintf(buf+n, len-n, "%04x:%02x:%02x.%01x\n", edev->pe->phb->global_number, edev->bdfn >> 8, PCI_SLOT(edev->bdfn), PCI_FUNC(edev->bdfn)); - pr_warn("EEH: of node=%04x:%02x:%02x.%01x\n", + pr_warn("EEH(%u): of node=%04x:%02x:%02x.%01x\n", + event_id, edev->pe->phb->global_number, edev->bdfn >> 8, PCI_SLOT(edev->bdfn), PCI_FUNC(edev->bdfn)); eeh_ops->read_config(edev, PCI_VENDOR_ID, 4, ); n += scnprintf(buf+n, len-n, "dev/vend:%08x\n", cfg); - pr_warn("EEH: PCI device/vendor: %08x\n", cfg); + pr_warn("EEH(%u): PCI device/vendor: %08x\n",event_id, cfg); eeh_ops->read_config(edev, PCI_COMMAND, 4, ); n += scnprintf(buf+n, len-n, "cmd/stat:%x\n", cfg); - pr_warn("EEH: PCI cmd/status register: %08x\n", cfg); + pr_warn("EEH(%u): PCI cmd/status register: %08x\n", event_id, cfg); /* Gather bridge-specific registers */ if (edev->mode & EEH_DEV_BRIDGE) { eeh_ops->read_config(edev, PCI_SEC_STATUS, 2, ); n += scnprintf(buf+n, len-n, "sec stat:%x\n", cfg); - pr_warn("EEH: Bridge secondary status: %04x\n", cfg); + pr_warn("EEH(%u): Bridge secondary status: %04x\n", + event_id, cfg); eeh_ops->read_config(edev, PCI_BRIDGE_CONTROL, 2, ); n += scnprintf(buf+n, len-n, "brdg ctl:%x\n", cfg); - pr_warn("EEH: Bridge control: %04x\n", cfg); + pr_warn("EEH(%u): Bridge control: %04x\n", event_id, cfg); } /* Dump out the PCI-X 
command and status regs */ @@ -238,18 +241,19 @@ static size_t eeh_dump_dev_log(struct eeh_dev *edev, char *buf, size_t len) if (cap) { eeh_ops->
[RFC 3/3] powerpc/eeh: Asynchronous recovery
Based on the original work from Sam Bobroff. Currently, EEH recovery is entirely serialized and takes place within a single kernel thread. This can cause recovery to take a long time when there are many devices. To shorten recovery time, this change allows recovery to proceed in parallel in two ways: - Each PHB is given it's own recovery event queue and can be recovered independently from other PHBs. - Driver handlers are called in parallel, but with the constraint that handlers higher up (closer to the PHB) in the PE hierarchy must be called before those lower down. To maintain the constraint, above, the driver handlers are called by traversing the tree of affected PEs from the top, stopping to call handlers (in parallel) when a PE with devices is discovered. When the calls for that PE are complete, traversal continues at each child PE. Signed-off-by: Ganesh Goudar --- arch/powerpc/include/asm/eeh.h| 1 + arch/powerpc/include/asm/eeh_event.h | 7 + arch/powerpc/include/asm/pci-bridge.h | 3 + arch/powerpc/kernel/eeh_driver.c | 323 +++--- arch/powerpc/kernel/eeh_event.c | 65 +++--- arch/powerpc/kernel/eeh_pe.c | 3 + 6 files changed, 288 insertions(+), 114 deletions(-) diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h index d0f09e691498..06d7dabdccfe 100644 --- a/arch/powerpc/include/asm/eeh.h +++ b/arch/powerpc/include/asm/eeh.h @@ -128,6 +128,7 @@ static inline bool eeh_pe_passed(struct eeh_pe *pe) #define EEH_DEV_NO_HANDLER (1 << 8)/* No error handler */ #define EEH_DEV_SYSFS (1 << 9)/* Sysfs created*/ #define EEH_DEV_REMOVED(1 << 10) /* Removed permanently */ +#define EEH_DEV_RECOVERING (1 << 11) /* Recovering */ struct eeh_dev { int mode; /* EEH mode */ diff --git a/arch/powerpc/include/asm/eeh_event.h b/arch/powerpc/include/asm/eeh_event.h index a1fe736bc4cf..b21f49e87b7b 100644 --- a/arch/powerpc/include/asm/eeh_event.h +++ b/arch/powerpc/include/asm/eeh_event.h @@ -8,6 +8,8 @@ #define ASM_POWERPC_EEH_EVENT_H #ifdef __KERNEL__ 
+#include + /* * structure holding pci controller data that describes a * change in the isolation status of a PCI slot. A pointer @@ -15,16 +17,21 @@ * callback. */ struct eeh_event { + struct work_struct work; struct list_headlist; /* to form event queue */ struct eeh_pe *pe;/* EEH PE */ unsigned intid; /* Event ID */ }; +extern spinlock_t eeh_eventlist_lock; + int eeh_event_init(void); +int eeh_phb_event(struct eeh_pe *pe); int eeh_send_failure_event(struct eeh_pe *pe); int __eeh_send_failure_event(struct eeh_pe *pe); void eeh_remove_event(struct eeh_pe *pe, bool force); void eeh_handle_normal_event(unsigned int event_id, struct eeh_pe *pe); +void eeh_handle_normal_event_work(struct work_struct *work); void eeh_handle_special_event(void); #endif /* __KERNEL__ */ diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h index 2aa3a091ef20..55a5ff9ae30b 100644 --- a/arch/powerpc/include/asm/pci-bridge.h +++ b/arch/powerpc/include/asm/pci-bridge.h @@ -138,6 +138,9 @@ struct pci_controller { /* iommu_ops support */ struct iommu_device iommu; + + bool eeh_in_progress; + struct list_head eeh_eventlist; }; /* These are used for config access before all the PCI probing diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c index cdf2de0eba57..a484d6ef33a1 100644 --- a/arch/powerpc/kernel/eeh_driver.c +++ b/arch/powerpc/kernel/eeh_driver.c @@ -12,12 +12,17 @@ #include #include #include +#include +#include +#include #include #include #include #include #include +static atomic_t eeh_wu_id = ATOMIC_INIT(0); + struct eeh_rmv_data { struct list_head removed_vf_list; int removed_dev_count; @@ -248,73 +253,59 @@ static void eeh_set_irq_state(struct eeh_pe *root, bool enable) } typedef enum pci_ers_result (*eeh_report_fn)(unsigned int event_id, +unsigned int id, struct pci_dev *, struct pci_driver *); static void eeh_pe_report_pdev(unsigned int event_id, - struct pci_dev *pdev, eeh_report_fn fn, + unsigned int id, + struct 
pci_dev *pdev, + const char *fn_name, eeh_report_fn fn, enum pci_ers_result *result, - const char *handler_name) + bool late, bool removed, bool passed) { - struct eeh_dev *edev; struct pci_driv
[RFC 1/3] powerpc/eeh: Synchronization for safety
Based on the original work from Sam Bobroff. There is currently little synchronization between EEH error detection (eeh_dev_check_failure()), EEH error recovery (eeh_handle_{normal,special}_event()) and the PCI subsystem (device addition and removal), and so there are race conditions that lead to crashes (often access to free'd memory or LIST_POISON). However, a solution must consider: - EEH error detection can occur in interrupt context, which prevents the use of a mutex. - EEH recovery may need to sleep, which prevents the use of a spinlock. - EEH recovery uses PCI operations that may require the PCI rescan/remove lock and/or device lock to be held - PCI operations may hold the rescan/remove and/or device lock when calling into EEH functions. - Device driver callbacks may perform arbitrary PCI operations during recovery, including device removal. In this patch the existing mutex and spinlock are combined with the EEH_PE_RECOVERING flag to provide some assurances that are then used to reduce the race conditions. The fields to be protected are the ones that provide the structure of the trees of struct eeh_pe that are held for each PHB: the parent pointer and child lists and the list of struct eeh_dev, as well as the pe and pdev pointers within struct eeh_dev. The existing way of using EEH_PE_RECOVERING is kept and slightly extended: No struct eeh_pe will be removed while it has the flag set on it. Additionally, when adding new PEs, they are marked EEH_PE_RECOVERING if their parent PE is marked: this allows the recovery thread to assume that all PEs underneath the one it's processing will continue to exist during recovery. Both the mutex and spinlock are held while any protected field is changed or a PE is deleted, so holding either of them (elsewhere) will keep them stable and safe to access. Additionally, if EEH_PE_RECOVERING is set on a PE then the locks can be released and re-acquired safely, as long as the protected fields aren't used while no locks are held. 
This is used during recovery to release locks for long sleeps (i.e. during eeh_wait_state() when we may sleep up to 5 minutes), or to maintain lock ordering. The spinlock is used in error detection (which cannot use a mutex, see above) and also where it's possible that the mutex is already held. The mutex is used in areas that don't have that restriction, and where blocking may be required. Care must be taken when ordering these locks against the PCI rescan/remove lock and the device locks to avoid deadlocking. Signed-off-by: Ganesh Goudar --- arch/powerpc/include/asm/eeh.h | 6 +- arch/powerpc/kernel/eeh.c| 112 ++-- arch/powerpc/kernel/eeh_driver.c | 288 ++- arch/powerpc/kernel/eeh_pe.c | 30 +- arch/powerpc/platforms/powernv/eeh-powernv.c | 12 +- arch/powerpc/platforms/pseries/eeh_pseries.c | 5 +- arch/powerpc/platforms/pseries/pci_dlpar.c | 5 +- drivers/pci/hotplug/pnv_php.c| 5 +- drivers/pci/hotplug/rpadlpar_core.c | 2 + drivers/vfio/vfio_iommu_spapr_tce.c | 10 +- 10 files changed, 365 insertions(+), 110 deletions(-) diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h index 514dd056c2c8..d0f09e691498 100644 --- a/arch/powerpc/include/asm/eeh.h +++ b/arch/powerpc/include/asm/eeh.h @@ -271,11 +271,15 @@ static inline bool eeh_state_active(int state) == (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE); } +void eeh_recovery_lock(void); +void eeh_recovery_unlock(void); +void eeh_recovery_must_be_locked(void); + typedef void (*eeh_edev_traverse_func)(struct eeh_dev *edev, void *flag); typedef void *(*eeh_pe_traverse_func)(struct eeh_pe *pe, void *flag); void eeh_set_pe_aux_size(int size); int eeh_phb_pe_create(struct pci_controller *phb); -int eeh_wait_state(struct eeh_pe *pe, int max_wait); +int eeh_wait_state(struct eeh_pe *pe, int max_wait, bool unlock); struct eeh_pe *eeh_phb_pe_get(struct pci_controller *phb); struct eeh_pe *eeh_pe_next(struct eeh_pe *pe, struct eeh_pe *root); struct eeh_pe *eeh_pe_get(struct pci_controller *phb, int pe_no); 
diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index ab316e155ea9..2c90c37524ed 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -108,7 +108,25 @@ bool eeh_debugfs_no_recover; /* Platform dependent EEH operations */ struct eeh_ops *eeh_ops = NULL; -/* Lock to avoid races due to multiple reports of an error */ +/* + * confirm_error_lock and eeh_dev_mutex are used together to provide + * safety during EEH operations. + * + * Generally, the spinlock is used in error detection where it's not possible + * to use a mutex or where there is potential to deadlock with the mutex, and + * the mutex is used during recovery and other PCI related operations. One must + * be held when reading and both must be held when making changes to the + * protected fields: eeh_pe.parent
[RFC 0/3] Asynchronous EEH recovery
Hi, EEH recovery is currently serialized, and these patches shorten the time taken for EEH recovery by making the recovery run in parallel. The original author of these patches is Sam Bobroff; I have rebased and tested them. On powervm, with 64 VFs from the same PHB, I see approximately a 48% reduction in the time taken for EEH recovery. On powernv, with 9 network cards, where 2 cards are installed on one PHB and 1 card on each of the remaining PHBs, providing 20 PFs in total, I see approximately a 33% reduction in the time taken for EEH recovery. These patches were originally posted as separate RFCs by Sam, and I rebased and posted them almost a year back. I stopped pursuing these patches as I was not able to test them on powernv, due to issues in the drivers of the cards I was testing on, which are now resolved. Since I am re-posting this after a long time, I am posting it as a fresh RFC; please comment. Thanks. Ganesh Goudar (3): powerpc/eeh: Synchronization for safety powerpc/eeh: Provide a unique ID for each EEH recovery powerpc/eeh: Asynchronous recovery arch/powerpc/include/asm/eeh.h | 7 +- arch/powerpc/include/asm/eeh_event.h | 10 +- arch/powerpc/include/asm/pci-bridge.h| 3 + arch/powerpc/include/asm/ppc-pci.h | 2 +- arch/powerpc/kernel/eeh.c| 154 +++-- arch/powerpc/kernel/eeh_driver.c | 580 +++ arch/powerpc/kernel/eeh_event.c | 71 ++- arch/powerpc/kernel/eeh_pe.c | 33 +- arch/powerpc/platforms/powernv/eeh-powernv.c | 12 +- arch/powerpc/platforms/pseries/eeh_pseries.c | 5 +- arch/powerpc/platforms/pseries/pci_dlpar.c | 5 +- drivers/pci/hotplug/pnv_php.c| 5 +- drivers/pci/hotplug/rpadlpar_core.c | 2 + drivers/vfio/vfio_iommu_spapr_tce.c | 10 +- include/linux/mmzone.h | 2 +- 15 files changed, 687 insertions(+), 214 deletions(-) -- 2.40.1
[PATCH] powerpc/eeh: Set channel state after notifying the drivers
When a PCI error is encountered 6th time in an hour we set the channel state to perm_failure and notify the driver about the permanent failure. However, after upstream commit 38ddc011478e ("powerpc/eeh: Make permanently failed devices non-actionable"), EEH handler stops calling any routine once the device is marked as permanent failure. This issue can lead to fatal consequences like kernel hang with certain PCI devices. Following log is observed with lpfc driver, with and without this change, Without this change kernel hangs, If PCI error is encountered 6 times for a device in an hour. Without the change EEH: Beginning: 'error_detected(permanent failure)' PCI 0132:60:00.0#60: EEH: not actionable (1,1,1) PCI 0132:60:00.1#60: EEH: not actionable (1,1,1) EEH: Finished:'error_detected(permanent failure)' With the change EEH: Beginning: 'error_detected(permanent failure)' EEH: Invoking lpfc->error_detected(permanent failure) EEH: lpfc driver reports: 'disconnect' EEH: Invoking lpfc->error_detected(permanent failure) EEH: lpfc driver reports: 'disconnect' EEH: Finished:'error_detected(permanent failure)' To fix the issue, set channel state to permanent failure after notifying the drivers. Fixes: 38ddc011478e ("powerpc/eeh: Make permanently failed devices non-actionable") Suggested-by: Mahesh Salgaonkar Signed-off-by: Ganesh Goudar --- arch/powerpc/kernel/eeh_driver.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c index f279295179bd..438568a472d0 100644 --- a/arch/powerpc/kernel/eeh_driver.c +++ b/arch/powerpc/kernel/eeh_driver.c @@ -1065,10 +1065,10 @@ void eeh_handle_normal_event(struct eeh_pe *pe) eeh_slot_error_detail(pe, EEH_LOG_PERM); /* Notify all devices that they're about to go down. 
*/ - eeh_set_channel_state(pe, pci_channel_io_perm_failure); eeh_set_irq_state(pe, false); eeh_pe_report("error_detected(permanent failure)", pe, eeh_report_failure, NULL); + eeh_set_channel_state(pe, pci_channel_io_perm_failure); /* Mark the PE to be removed permanently */ eeh_pe_state_mark(pe, EEH_PE_REMOVED); @@ -1185,10 +1185,10 @@ void eeh_handle_special_event(void) /* Notify all devices to be down */ eeh_pe_state_clear(pe, EEH_PE_PRI_BUS, true); - eeh_set_channel_state(pe, pci_channel_io_perm_failure); eeh_pe_report( "error_detected(permanent failure)", pe, eeh_report_failure, NULL); + eeh_set_channel_state(pe, pci_channel_io_perm_failure); pci_lock_rescan_remove(); list_for_each_entry(hose, _list, list_node) { -- 2.39.1
Re: [PATCH v2] powerpc/mce: log the error for all unrecoverable errors
On 1/31/23 4:59 PM, Michael Ellerman wrote: Ganesh Goudar writes: For all unrecoverable errors we are missing to log the error, Since machine_check_log_err() is not getting called for unrecoverable errors. Raise irq work in save_mce_event() for unrecoverable errors, So that we log the error from MCE event handling block in timer handler. But the patch also removes the irq work raise from machine_check_ue_event(). That's currently done unconditionally, regardless of the disposition. So doesn't this change also drop logging of recoverable UEs? Maybe that's OK, but the change log should explain it. Yes, its ok, exception vector code will do that for recoverable errors, ill explain this in commit message. Log without this change MCE: CPU27: machine check (Severe) Real address Load/Store (foreign/control memory) [Not recovered] MCE: CPU27: PID: 10580 Comm: inject-ra-err NIP: [1df4] MCE: CPU27: Initiator CPU MCE: CPU27: Unknown Log with this change MCE: CPU24: machine check (Severe) Real address Load/Store (foreign/control memory) [Not recovered] MCE: CPU24: PID: 1589811 Comm: inject-ra-err NIP: [1e48] MCE: CPU24: Initiator CPU MCE: CPU24: Unknown RTAS: event: 5, Type: Platform Error (224), Severity: 3 Signed-off-by: Ganesh Goudar Reviewed-by: Mahesh Salgaonkar --- V2: Rephrasing the commit message. --- arch/powerpc/kernel/mce.c | 8 +++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c index 6c5d30fba766..a1cb2172eb7b 100644 --- a/arch/powerpc/kernel/mce.c +++ b/arch/powerpc/kernel/mce.c @@ -131,6 +131,13 @@ void save_mce_event(struct pt_regs *regs, long handled, if (mce->error_type == MCE_ERROR_TYPE_UE) mce->u.ue_error.ignore_event = mce_err->ignore_event; + /* +* Raise irq work, So that we don't miss to log the error for +* unrecoverable errors. 
+*/ + if (mce->disposition == MCE_DISPOSITION_NOT_RECOVERED) + mce_irq_work_queue(); + if (!addr) return; @@ -235,7 +242,6 @@ static void machine_check_ue_event(struct machine_check_event *evt) evt, sizeof(*evt)); /* Queue work to process this event later. */ This comment is meaningless without the function call it's commenting about, ie. the comment should be removed too. ok. Thanks.
[PATCH v3] powerpc/mce: log the error for all unrecoverable errors
For all unrecoverable errors we are missing to log the error, Since machine_check_log_err() is not getting called for unrecoverable errors. machine_check_log_err() is called from deferred handler, To run deferred handlers we have to do irq work raise from the exception handler. For recoverable errors exception vector code takes care of running deferred handlers. For unrecoverable errors raise irq work in save_mce_event(), So that we log the error from MCE deferred handler. Log without this change MCE: CPU27: machine check (Severe) Real address Load/Store (foreign/control memory) [Not recovered] MCE: CPU27: PID: 10580 Comm: inject-ra-err NIP: [1df4] MCE: CPU27: Initiator CPU MCE: CPU27: Unknown Log with this change MCE: CPU24: machine check (Severe) Real address Load/Store (foreign/control memory) [Not recovered] MCE: CPU24: PID: 1589811 Comm: inject-ra-err NIP: [1e48] MCE: CPU24: Initiator CPU MCE: CPU24: Unknown RTAS: event: 5, Type: Platform Error (224), Severity: 3 Signed-off-by: Ganesh Goudar Reviewed-by: Mahesh Salgaonkar --- V3: Rephrasing the commit message. --- arch/powerpc/kernel/mce.c | 10 +++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c index 6c5d30fba766..219f28637a3e 100644 --- a/arch/powerpc/kernel/mce.c +++ b/arch/powerpc/kernel/mce.c @@ -131,6 +131,13 @@ void save_mce_event(struct pt_regs *regs, long handled, if (mce->error_type == MCE_ERROR_TYPE_UE) mce->u.ue_error.ignore_event = mce_err->ignore_event; + /* +* Raise irq work, So that we don't miss to log the error for +* unrecoverable errors. +*/ + if (mce->disposition == MCE_DISPOSITION_NOT_RECOVERED) + mce_irq_work_queue(); + if (!addr) return; @@ -233,9 +240,6 @@ static void machine_check_ue_event(struct machine_check_event *evt) } memcpy(_paca->mce_info->mce_ue_event_queue[index], evt, sizeof(*evt)); - - /* Queue work to process this event later. */ - mce_irq_work_queue(); } /* -- 2.39.1
[PATCH v2] powerpc/mce: log the error for all unrecoverable errors
For all unrecoverable errors we are missing to log the error, Since machine_check_log_err() is not getting called for unrecoverable errors. Raise irq work in save_mce_event() for unrecoverable errors, So that we log the error from MCE event handling block in timer handler. Log without this change MCE: CPU27: machine check (Severe) Real address Load/Store (foreign/control memory) [Not recovered] MCE: CPU27: PID: 10580 Comm: inject-ra-err NIP: [1df4] MCE: CPU27: Initiator CPU MCE: CPU27: Unknown Log with this change MCE: CPU24: machine check (Severe) Real address Load/Store (foreign/control memory) [Not recovered] MCE: CPU24: PID: 1589811 Comm: inject-ra-err NIP: [1e48] MCE: CPU24: Initiator CPU MCE: CPU24: Unknown RTAS: event: 5, Type: Platform Error (224), Severity: 3 Signed-off-by: Ganesh Goudar Reviewed-by: Mahesh Salgaonkar --- V2: Rephrasing the commit message. --- arch/powerpc/kernel/mce.c | 8 +++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c index 6c5d30fba766..a1cb2172eb7b 100644 --- a/arch/powerpc/kernel/mce.c +++ b/arch/powerpc/kernel/mce.c @@ -131,6 +131,13 @@ void save_mce_event(struct pt_regs *regs, long handled, if (mce->error_type == MCE_ERROR_TYPE_UE) mce->u.ue_error.ignore_event = mce_err->ignore_event; + /* +* Raise irq work, So that we don't miss to log the error for +* unrecoverable errors. +*/ + if (mce->disposition == MCE_DISPOSITION_NOT_RECOVERED) + mce_irq_work_queue(); + if (!addr) return; @@ -235,7 +242,6 @@ static void machine_check_ue_event(struct machine_check_event *evt) evt, sizeof(*evt)); /* Queue work to process this event later. */ - mce_irq_work_queue(); } /* -- 2.38.1
[PATCH] powerpc/mce: log the error for all unrecoverable errors
machine_check_log_err() is not getting called for all unrecoverable errors, And we are missing to log the error. Raise irq work in save_mce_event() for unrecoverable errors, So that we log the error from MCE event handling block in timer handler. Signed-off-by: Ganesh Goudar --- arch/powerpc/kernel/mce.c | 8 +++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c index 6c5d30fba766..a1cb2172eb7b 100644 --- a/arch/powerpc/kernel/mce.c +++ b/arch/powerpc/kernel/mce.c @@ -131,6 +131,13 @@ void save_mce_event(struct pt_regs *regs, long handled, if (mce->error_type == MCE_ERROR_TYPE_UE) mce->u.ue_error.ignore_event = mce_err->ignore_event; + /* +* Raise irq work, So that we don't miss to log the error for +* unrecoverable errors. +*/ + if (mce->disposition == MCE_DISPOSITION_NOT_RECOVERED) + mce_irq_work_queue(); + if (!addr) return; @@ -235,7 +242,6 @@ static void machine_check_ue_event(struct machine_check_event *evt) evt, sizeof(*evt)); /* Queue work to process this event later. */ - mce_irq_work_queue(); } /* -- 2.37.1
[PATCH v3] powerpc/pseries/mce: Avoid instrumentation in realmode
Part of machine check error handling is done in realmode, As of now instrumentation is not possible for any code that runs in realmode. When MCE is injected on KASAN enabled kernel, crash is observed, Hence force inline or mark no instrumentation for functions which can run in realmode, to avoid KASAN instrumentation. Signed-off-by: Ganesh Goudar --- v2: Force inline few more functions. v3: Adding noinstr to few functions instead of __always_inline. --- arch/powerpc/include/asm/hw_irq.h| 8 arch/powerpc/include/asm/interrupt.h | 2 +- arch/powerpc/include/asm/rtas.h | 4 ++-- arch/powerpc/kernel/rtas.c | 4 ++-- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/arch/powerpc/include/asm/hw_irq.h b/arch/powerpc/include/asm/hw_irq.h index 983551859891..c4d542b4a623 100644 --- a/arch/powerpc/include/asm/hw_irq.h +++ b/arch/powerpc/include/asm/hw_irq.h @@ -111,7 +111,7 @@ static inline void __hard_RI_enable(void) #ifdef CONFIG_PPC64 #include -static inline notrace unsigned long irq_soft_mask_return(void) +noinstr static unsigned long irq_soft_mask_return(void) { unsigned long flags; @@ -128,7 +128,7 @@ static inline notrace unsigned long irq_soft_mask_return(void) * for the critical section and as a clobber because * we changed paca->irq_soft_mask */ -static inline notrace void irq_soft_mask_set(unsigned long mask) +noinstr static void irq_soft_mask_set(unsigned long mask) { /* * The irq mask must always include the STD bit if any are set. 
@@ -155,7 +155,7 @@ static inline notrace void irq_soft_mask_set(unsigned long mask) : "memory"); } -static inline notrace unsigned long irq_soft_mask_set_return(unsigned long mask) +noinstr static unsigned long irq_soft_mask_set_return(unsigned long mask) { unsigned long flags; @@ -191,7 +191,7 @@ static inline notrace unsigned long irq_soft_mask_or_return(unsigned long mask) return flags; } -static inline unsigned long arch_local_save_flags(void) +static __always_inline unsigned long arch_local_save_flags(void) { return irq_soft_mask_return(); } diff --git a/arch/powerpc/include/asm/interrupt.h b/arch/powerpc/include/asm/interrupt.h index 8069dbc4b8d1..090895051712 100644 --- a/arch/powerpc/include/asm/interrupt.h +++ b/arch/powerpc/include/asm/interrupt.h @@ -92,7 +92,7 @@ static inline bool is_implicit_soft_masked(struct pt_regs *regs) return search_kernel_soft_mask_table(regs->nip); } -static inline void srr_regs_clobbered(void) +static __always_inline void srr_regs_clobbered(void) { local_paca->srr_valid = 0; local_paca->hsrr_valid = 0; diff --git a/arch/powerpc/include/asm/rtas.h b/arch/powerpc/include/asm/rtas.h index 00531af17ce0..52d29d664fdf 100644 --- a/arch/powerpc/include/asm/rtas.h +++ b/arch/powerpc/include/asm/rtas.h @@ -201,13 +201,13 @@ inline uint32_t rtas_ext_event_company_id(struct rtas_ext_event_log_v6 *ext_log) #define PSERIES_ELOG_SECT_ID_MCE (('M' << 8) | 'C') static -inline uint16_t pseries_errorlog_id(struct pseries_errorlog *sect) +__always_inline uint16_t pseries_errorlog_id(struct pseries_errorlog *sect) { return be16_to_cpu(sect->id); } static -inline uint16_t pseries_errorlog_length(struct pseries_errorlog *sect) +__always_inline uint16_t pseries_errorlog_length(struct pseries_errorlog *sect) { return be16_to_cpu(sect->length); } diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c index 693133972294..f9d78245c0e8 100644 --- a/arch/powerpc/kernel/rtas.c +++ b/arch/powerpc/kernel/rtas.c @@ -48,7 +48,7 @@ /* This is 
here deliberately so it's only used in this file */ void enter_rtas(unsigned long); -static inline void do_enter_rtas(unsigned long args) +static __always_inline void do_enter_rtas(unsigned long args) { unsigned long msr; @@ -435,7 +435,7 @@ static char *__fetch_rtas_last_error(char *altbuf) #endif -static void +noinstr static void va_rtas_call_unlocked(struct rtas_args *args, int token, int nargs, int nret, va_list list) { -- 2.37.1
Re: [PACTH v2] powerpc/pseries/mce: Avoid instrumentation in realmode
On 9/7/22 09:49, Nicholas Piggin wrote: On Mon Sep 5, 2022 at 4:38 PM AEST, Ganesh Goudar wrote: Part of machine check error handling is done in realmode, As of now instrumentation is not possible for any code that runs in realmode. When MCE is injected on KASAN enabled kernel, crash is observed, Hence force inline or mark no instrumentation for functions which can run in realmode, to avoid KASAN instrumentation. Signed-off-by: Ganesh Goudar --- v2: Force inline few more functions. --- arch/powerpc/include/asm/hw_irq.h| 8 arch/powerpc/include/asm/interrupt.h | 2 +- arch/powerpc/include/asm/rtas.h | 4 ++-- arch/powerpc/kernel/rtas.c | 4 ++-- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/arch/powerpc/include/asm/hw_irq.h b/arch/powerpc/include/asm/hw_irq.h index 26ede09c521d..3264991fe524 100644 --- a/arch/powerpc/include/asm/hw_irq.h +++ b/arch/powerpc/include/asm/hw_irq.h @@ -111,7 +111,7 @@ static inline void __hard_RI_enable(void) #ifdef CONFIG_PPC64 #include -static inline notrace unsigned long irq_soft_mask_return(void) +static __always_inline notrace unsigned long irq_soft_mask_return(void) { return READ_ONCE(local_paca->irq_soft_mask); } @@ -121,7 +121,7 @@ static inline notrace unsigned long irq_soft_mask_return(void) * for the critical section and as a clobber because * we changed paca->irq_soft_mask */ -static inline notrace void irq_soft_mask_set(unsigned long mask) +static __always_inline notrace void irq_soft_mask_set(unsigned long mask) { /* * The irq mask must always include the STD bit if any are set. This doesn't give a reason why it's __always_inline, and having the notrace attribute makes it possibly confusing. I think it would be easy for someone to break without realising. Could you add a noinstr to these instead / as well? Yeah we can add noinstr. Missed to see your comment, Sorry for the delayed reply What about adding a 'realmode' function annotation that includes noinstr? You mean to define a new function annotation?
Re: [RFC 0/3] Asynchronous EEH recovery
On 9/2/22 05:49, Jason Gunthorpe wrote: On Tue, Aug 16, 2022 at 08:57:13AM +0530, Ganesh Goudar wrote: Hi, EEH recovery is currently serialized and these patches shorten the time taken for EEH recovery by making the recovery run in parallel. The original author of these patches is Sam Bobroff, I have rebased and tested these patches. How did you test this? This is tested on SRIOV VFs. I understand that VFIO on 6.0 does not work at all on power? I am waiting for power maintainers to pick up this series to fix it: https://lore.kernel.org/kvm/20220714081822.3717693-1-...@ozlabs.ru/ Jason
[PACTH v2] powerpc/pseries/mce: Avoid instrumentation in realmode
Part of machine check error handling is done in realmode, As of now instrumentation is not possible for any code that runs in realmode. When MCE is injected on KASAN enabled kernel, crash is observed, Hence force inline or mark no instrumentation for functions which can run in realmode, to avoid KASAN instrumentation. Signed-off-by: Ganesh Goudar --- v2: Force inline few more functions. --- arch/powerpc/include/asm/hw_irq.h| 8 arch/powerpc/include/asm/interrupt.h | 2 +- arch/powerpc/include/asm/rtas.h | 4 ++-- arch/powerpc/kernel/rtas.c | 4 ++-- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/arch/powerpc/include/asm/hw_irq.h b/arch/powerpc/include/asm/hw_irq.h index 26ede09c521d..3264991fe524 100644 --- a/arch/powerpc/include/asm/hw_irq.h +++ b/arch/powerpc/include/asm/hw_irq.h @@ -111,7 +111,7 @@ static inline void __hard_RI_enable(void) #ifdef CONFIG_PPC64 #include -static inline notrace unsigned long irq_soft_mask_return(void) +static __always_inline notrace unsigned long irq_soft_mask_return(void) { return READ_ONCE(local_paca->irq_soft_mask); } @@ -121,7 +121,7 @@ static inline notrace unsigned long irq_soft_mask_return(void) * for the critical section and as a clobber because * we changed paca->irq_soft_mask */ -static inline notrace void irq_soft_mask_set(unsigned long mask) +static __always_inline notrace void irq_soft_mask_set(unsigned long mask) { /* * The irq mask must always include the STD bit if any are set. 
@@ -144,7 +144,7 @@ static inline notrace void irq_soft_mask_set(unsigned long mask) barrier(); } -static inline notrace unsigned long irq_soft_mask_set_return(unsigned long mask) +static __always_inline notrace unsigned long irq_soft_mask_set_return(unsigned long mask) { unsigned long flags = irq_soft_mask_return(); @@ -162,7 +162,7 @@ static inline notrace unsigned long irq_soft_mask_or_return(unsigned long mask) return flags; } -static inline unsigned long arch_local_save_flags(void) +static __always_inline unsigned long arch_local_save_flags(void) { return irq_soft_mask_return(); } diff --git a/arch/powerpc/include/asm/interrupt.h b/arch/powerpc/include/asm/interrupt.h index 8069dbc4b8d1..090895051712 100644 --- a/arch/powerpc/include/asm/interrupt.h +++ b/arch/powerpc/include/asm/interrupt.h @@ -92,7 +92,7 @@ static inline bool is_implicit_soft_masked(struct pt_regs *regs) return search_kernel_soft_mask_table(regs->nip); } -static inline void srr_regs_clobbered(void) +static __always_inline void srr_regs_clobbered(void) { local_paca->srr_valid = 0; local_paca->hsrr_valid = 0; diff --git a/arch/powerpc/include/asm/rtas.h b/arch/powerpc/include/asm/rtas.h index 00531af17ce0..52d29d664fdf 100644 --- a/arch/powerpc/include/asm/rtas.h +++ b/arch/powerpc/include/asm/rtas.h @@ -201,13 +201,13 @@ inline uint32_t rtas_ext_event_company_id(struct rtas_ext_event_log_v6 *ext_log) #define PSERIES_ELOG_SECT_ID_MCE (('M' << 8) | 'C') static -inline uint16_t pseries_errorlog_id(struct pseries_errorlog *sect) +__always_inline uint16_t pseries_errorlog_id(struct pseries_errorlog *sect) { return be16_to_cpu(sect->id); } static -inline uint16_t pseries_errorlog_length(struct pseries_errorlog *sect) +__always_inline uint16_t pseries_errorlog_length(struct pseries_errorlog *sect) { return be16_to_cpu(sect->length); } diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c index 693133972294..f9d78245c0e8 100644 --- a/arch/powerpc/kernel/rtas.c +++ 
b/arch/powerpc/kernel/rtas.c @@ -48,7 +48,7 @@ /* This is here deliberately so it's only used in this file */ void enter_rtas(unsigned long); -static inline void do_enter_rtas(unsigned long args) +static __always_inline void do_enter_rtas(unsigned long args) { unsigned long msr; @@ -435,7 +435,7 @@ static char *__fetch_rtas_last_error(char *altbuf) #endif -static void +noinstr static void va_rtas_call_unlocked(struct rtas_args *args, int token, int nargs, int nret, va_list list) { -- 2.37.1
[PATCH] powerpc/pseries/mce: Avoid instrumentation in realmode
Part of machine check error handling is done in realmode, As of now instrumentation is not possible for any code that runs in realmode. When MCE is injected on KASAN enabled kernel, crash is observed, Hence force inline or mark no instrumentation for functions which can run in realmode to avoid KASAN instrumentation. Signed-off-by: Ganesh Goudar --- arch/powerpc/include/asm/interrupt.h | 2 +- arch/powerpc/include/asm/rtas.h | 4 ++-- arch/powerpc/kernel/rtas.c | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/include/asm/interrupt.h b/arch/powerpc/include/asm/interrupt.h index 8069dbc4b8d1..090895051712 100644 --- a/arch/powerpc/include/asm/interrupt.h +++ b/arch/powerpc/include/asm/interrupt.h @@ -92,7 +92,7 @@ static inline bool is_implicit_soft_masked(struct pt_regs *regs) return search_kernel_soft_mask_table(regs->nip); } -static inline void srr_regs_clobbered(void) +static __always_inline void srr_regs_clobbered(void) { local_paca->srr_valid = 0; local_paca->hsrr_valid = 0; diff --git a/arch/powerpc/include/asm/rtas.h b/arch/powerpc/include/asm/rtas.h index 00531af17ce0..52d29d664fdf 100644 --- a/arch/powerpc/include/asm/rtas.h +++ b/arch/powerpc/include/asm/rtas.h @@ -201,13 +201,13 @@ inline uint32_t rtas_ext_event_company_id(struct rtas_ext_event_log_v6 *ext_log) #define PSERIES_ELOG_SECT_ID_MCE (('M' << 8) | 'C') static -inline uint16_t pseries_errorlog_id(struct pseries_errorlog *sect) +__always_inline uint16_t pseries_errorlog_id(struct pseries_errorlog *sect) { return be16_to_cpu(sect->id); } static -inline uint16_t pseries_errorlog_length(struct pseries_errorlog *sect) +__always_inline uint16_t pseries_errorlog_length(struct pseries_errorlog *sect) { return be16_to_cpu(sect->length); } diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c index 693133972294..f9d78245c0e8 100644 --- a/arch/powerpc/kernel/rtas.c +++ b/arch/powerpc/kernel/rtas.c @@ -48,7 +48,7 @@ /* This is here deliberately so it's only 
used in this file */ void enter_rtas(unsigned long); -static inline void do_enter_rtas(unsigned long args) +static __always_inline void do_enter_rtas(unsigned long args) { unsigned long msr; @@ -435,7 +435,7 @@ static char *__fetch_rtas_last_error(char *altbuf) #endif -static void +noinstr static void va_rtas_call_unlocked(struct rtas_args *args, int token, int nargs, int nret, va_list list) { -- 2.37.1
Re: [6.0-rc1] Kernel crash while running MCE tests
On 8/22/22 11:01, Sachin Sant wrote: On 19-Aug-2022, at 10:12 AM, Ganesh wrote We'll have to make sure everything get_pseries_errorlog() is either forced inline, or marked noinstr. Making the following functions always_inline and noinstr is fixing the issue. __always_inline pseries_errorlog_id() __always_inline pseries_errorlog_length() __always_inline do_enter_rtas() __always_inline srr_regs_clobbered() noinstr va_rtas_call_unlocked() Shall I post the patch? Yes, thanks. I can help with testing. Sure, thanks.
Re: [6.0-rc1] Kernel crash while running MCE tests
On 8/22/22 11:19, Michael Ellerman wrote: So I guess the compiler has decided not to inline it (why?!), and it is not marked noinstr, so it gets KASAN instrumentation which crashes in real mode. We'll have to make sure everything get_pseries_errorlog() is either forced inline, or marked noinstr. Making the following functions always_inline and noinstr is fixing the issue. __always_inline pseries_errorlog_id() __always_inline pseries_errorlog_length() __always_inline do_enter_rtas() __always_inline srr_regs_clobbered() noinstr va_rtas_call_unlocked() Why do we need it? Because of fwnmi_release_errinfo()? Yes. Shall I post the patch? Yeah. cheers
Re: [6.0-rc1] Kernel crash while running MCE tests
On 8/17/22 11:28, Michael Ellerman wrote: Sachin Sant writes: Following crash is seen while running powerpc/mce subtest on a Power10 LPAR. 1..1 # selftests: powerpc/mce: inject-ra-err [ 155.240591] BUG: Unable to handle kernel data access on read at 0xc00e00022d55b503 [ 155.240618] Faulting instruction address: 0xc06f1f0c [ 155.240627] Oops: Kernel access of bad area, sig: 11 [#1] [ 155.240633] LE PAGE_SIZE=64K MMU=Radix SMP NR_CPUS=2048 NUMA pSeries [ 155.240642] Modules linked in: dm_mod mptcp_diag xsk_diag tcp_diag udp_diag raw_diag inet_diag unix_diag af_packet_diag netlink_diag nft_fib_inet nft_fib_ipv4 nft_fib_ipv6 nft_fib nft_reject_inet nf_reject_ipv4 nf_reject_ipv6 nft_reject nft_ct nft_chain_nat nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 bonding rfkill tls ip_set nf_tables nfnetlink sunrpc binfmt_misc pseries_rng drm drm_panel_orientation_quirks xfs libcrc32c sd_mod t10_pi sr_mod crc64_rocksoft_generic cdrom crc64_rocksoft crc64 sg ibmvscsi ibmveth scsi_transport_srp xts vmx_crypto fuse [ 155.240750] CPU: 4 PID: 3645 Comm: inject-ra-err Not tainted 6.0.0-rc1 #2 [ 155.240761] NIP: c06f1f0c LR: c00630d0 CTR: [ 155.240768] REGS: c000ff887890 TRAP: 0300 Not tainted (6.0.0-rc1) [ 155.240776] MSR: 80001003 CR: 48002828 XER: ^ MMU is off, aka. real mode. 
[ 155.240792] CFAR: c00630cc DAR: c00e00022d55b503 DSISR: 4000 IRQMASK: 3 [ 155.240792] GPR00: c00630d0 c000ff887b30 c44afe00 c0116aada818 [ 155.240792] GPR04: 4d43 0008 c00630d0 004d4249 [ 155.240792] GPR08: 0001 18022d55b503 a80e 0348 [ 155.240792] GPR12: c000b700 [ 155.240792] GPR16: [ 155.240792] GPR20: 1b30 [ 155.240792] GPR24: 7fff8dad 7fff8dacf6d8 7fffd1551e98 1001fce8 [ 155.240792] GPR28: c0116aada888 c0116aada800 4d43 c0116aada818 [ 155.240885] NIP [c06f1f0c] __asan_load2+0x5c/0xe0 [ 155.240898] LR [c00630d0] pseries_errorlog_id+0x20/0x40 [ 155.240910] Call Trace: [ 155.240914] [c000ff887b50] [c00630d0] pseries_errorlog_id+0x20/0x40 [ 155.240925] [c000ff887b80] [c15595c8] get_pseries_errorlog+0xa8/0x110 get_pseries_errorlog() is marked noinstr. And pseries_errorlog_id() is: static inline uint16_t pseries_errorlog_id(struct pseries_errorlog *sect) { return be16_to_cpu(sect->id); } So I guess the compiler has decided not to inline it (why?!), and it is not marked noinstr, so it gets KASAN instrumentation which crashes in real mode. We'll have to make sure everything get_pseries_errorlog() is either forced inline, or marked noinstr. Making the following functions always_inline and noinstr is fixing the issue. __always_inline pseries_errorlog_id() __always_inline pseries_errorlog_length() __always_inline do_enter_rtas() __always_inline srr_regs_clobbered() noinstr va_rtas_call_unlocked() Shall I post the patch?
[RFC 3/3] powerpc/eeh: Asynchronous recovery
Based on the original work from Sam Bobroff. Currently, EEH recovery is entirely serialized and takes place within a single kernel thread. This can cause recovery to take a long time when there are many devices. To shorten recovery time, this change allows recovery to proceed in parallel in two ways: - Each PHB is given it's own recovery event queue and can be recovered independently from other PHBs. - Driver handlers are called in parallel, but with the constraint that handlers higher up (closer to the PHB) in the PE hierarchy must be called before those lower down. To maintain the constraint, above, the driver handlers are called by traversing the tree of affected PEs from the top, stopping to call handlers (in parallel) when a PE with devices is discovered. When the calls for that PE are complete, traversal continues at each child PE. Signed-off-by: Ganesh Goudar --- arch/powerpc/include/asm/eeh.h| 1 + arch/powerpc/include/asm/eeh_event.h | 7 + arch/powerpc/include/asm/pci-bridge.h | 3 + arch/powerpc/kernel/eeh_driver.c | 323 +++--- arch/powerpc/kernel/eeh_event.c | 65 +++--- arch/powerpc/kernel/eeh_pe.c | 3 + 6 files changed, 288 insertions(+), 114 deletions(-) diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h index f659c0433de5..2728aee5cb0b 100644 --- a/arch/powerpc/include/asm/eeh.h +++ b/arch/powerpc/include/asm/eeh.h @@ -128,6 +128,7 @@ static inline bool eeh_pe_passed(struct eeh_pe *pe) #define EEH_DEV_NO_HANDLER (1 << 8)/* No error handler */ #define EEH_DEV_SYSFS (1 << 9)/* Sysfs created*/ #define EEH_DEV_REMOVED(1 << 10) /* Removed permanently */ +#define EEH_DEV_RECOVERING (1 << 11) /* Recovering */ struct eeh_dev { int mode; /* EEH mode */ diff --git a/arch/powerpc/include/asm/eeh_event.h b/arch/powerpc/include/asm/eeh_event.h index a1fe736bc4cf..b21f49e87b7b 100644 --- a/arch/powerpc/include/asm/eeh_event.h +++ b/arch/powerpc/include/asm/eeh_event.h @@ -8,6 +8,8 @@ #define ASM_POWERPC_EEH_EVENT_H #ifdef __KERNEL__ 
+#include + /* * structure holding pci controller data that describes a * change in the isolation status of a PCI slot. A pointer @@ -15,16 +17,21 @@ * callback. */ struct eeh_event { + struct work_struct work; struct list_headlist; /* to form event queue */ struct eeh_pe *pe;/* EEH PE */ unsigned intid; /* Event ID */ }; +extern spinlock_t eeh_eventlist_lock; + int eeh_event_init(void); +int eeh_phb_event(struct eeh_pe *pe); int eeh_send_failure_event(struct eeh_pe *pe); int __eeh_send_failure_event(struct eeh_pe *pe); void eeh_remove_event(struct eeh_pe *pe, bool force); void eeh_handle_normal_event(unsigned int event_id, struct eeh_pe *pe); +void eeh_handle_normal_event_work(struct work_struct *work); void eeh_handle_special_event(void); #endif /* __KERNEL__ */ diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h index c85f901227c9..74806009f50a 100644 --- a/arch/powerpc/include/asm/pci-bridge.h +++ b/arch/powerpc/include/asm/pci-bridge.h @@ -131,6 +131,9 @@ struct pci_controller { struct irq_domain *dev_domain; struct irq_domain *msi_domain; struct fwnode_handle*fwnode; + + bool eeh_in_progress; + struct list_head eeh_eventlist; }; /* These are used for config access before all the PCI probing diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c index 894326cc4dfa..3abd5f2d146c 100644 --- a/arch/powerpc/kernel/eeh_driver.c +++ b/arch/powerpc/kernel/eeh_driver.c @@ -12,12 +12,17 @@ #include #include #include +#include +#include +#include #include #include #include #include #include +static atomic_t eeh_wu_id = ATOMIC_INIT(0); + struct eeh_rmv_data { struct list_head removed_vf_list; int removed_dev_count; @@ -248,73 +253,59 @@ static void eeh_set_irq_state(struct eeh_pe *root, bool enable) } typedef enum pci_ers_result (*eeh_report_fn)(unsigned int event_id, +unsigned int id, struct pci_dev *, struct pci_driver *); static void eeh_pe_report_pdev(unsigned int event_id, - struct pci_dev *pdev, 
eeh_report_fn fn, + unsigned int id, + struct pci_dev *pdev, + const char *fn_name, eeh_report_fn fn, enum pci_ers_result *result, - const char *handler_name) + bool late, bool removed, bool passed) { -
[RFC 1/3] powerpc/eeh: Synchronization for safety
Based on the original work from Sam Bobroff. There is currently little synchronization between EEH error detection (eeh_dev_check_failure()), EEH error recovery (eeh_handle_{normal,special}_event()) and the PCI subsystem (device addition and removal), and so there are race conditions that lead to crashes (often access to free'd memory or LIST_POISON). However, a solution must consider: - EEH error detection can occur in interrupt context, which prevents the use of a mutex. - EEH recovery may need to sleep, which prevents the use of a spinlock. - EEH recovery uses PCI operations that may require the PCI rescan/remove lock and/or device lock to be held - PCI operations may hold the rescan/remove and/or device lock when calling into EEH functions. - Device driver callbacks may perform arbitrary PCI operations during recovery, including device removal. In this patch the existing mutex and spinlock are combined with the EEH_PE_RECOVERING flag to provide some assurances that are then used to reduce the race conditions. The fields to be protected are the ones that provide the structure of the trees of struct eeh_pe that are held for each PHB: the parent pointer and child lists and the list of struct eeh_dev, as well as the pe and pdev pointers within struct eeh_dev. The existing way of using EEH_PE_RECOVERING is kept and slightly extended: No struct eeh_pe will be removed while it has the flag set on it. Additionally, when adding new PEs, they are marked EEH_PE_RECOVERING if their parent PE is marked: this allows the recovery thread to assume that all PEs underneath the one it's processing will continue to exist during recovery. Both the mutex and spinlock are held while any protected field is changed or a PE is deleted, so holding either of them (elsewhere) will keep them stable and safe to access. Additionally, if EEH_PE_RECOVERING is set on a PE then the locks can be released and re-acquired safely, as long as the protected fields aren't used while no locks are held. 
This is used during recovery to release locks for long sleeps (i.e. during eeh_wait_state() when we may sleep up to 5 minutes), or to maintain lock ordering. The spinlock is used in error detection (which cannot use a mutex, see above) and also where it's possible that the mutex is already held. The mutex is used in areas that don't have that restriction, and where blocking may be required. Care must be taken when ordering these locks against the PCI rescan/remove lock and the device locks to avoid deadlocking. Signed-off-by: Ganesh Goudar --- arch/powerpc/include/asm/eeh.h | 6 +- arch/powerpc/kernel/eeh.c| 112 ++-- arch/powerpc/kernel/eeh_driver.c | 287 ++- arch/powerpc/kernel/eeh_pe.c | 30 +- arch/powerpc/platforms/powernv/eeh-powernv.c | 12 +- arch/powerpc/platforms/pseries/eeh_pseries.c | 5 +- arch/powerpc/platforms/pseries/pci_dlpar.c | 5 +- drivers/pci/hotplug/pnv_php.c| 5 +- drivers/pci/hotplug/rpadlpar_core.c | 2 + drivers/vfio/vfio_spapr_eeh.c| 10 +- 10 files changed, 364 insertions(+), 110 deletions(-) diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h index 514dd056c2c8..f659c0433de5 100644 --- a/arch/powerpc/include/asm/eeh.h +++ b/arch/powerpc/include/asm/eeh.h @@ -271,11 +271,15 @@ static inline bool eeh_state_active(int state) == (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE); } +void eeh_recovery_lock(void); +void eeh_recovery_unlock(void); +void eeh_recovery_must_be_locked(void); + typedef void (*eeh_edev_traverse_func)(struct eeh_dev *edev, void *flag); typedef void *(*eeh_pe_traverse_func)(struct eeh_pe *pe, void *flag); void eeh_set_pe_aux_size(int size); int eeh_phb_pe_create(struct pci_controller *phb); -int eeh_wait_state(struct eeh_pe *pe, int max_wait); +int eeh_wait_state(struct eeh_pe *pe, int max_waiti, bool unlock); struct eeh_pe *eeh_phb_pe_get(struct pci_controller *phb); struct eeh_pe *eeh_pe_next(struct eeh_pe *pe, struct eeh_pe *root); struct eeh_pe *eeh_pe_get(struct pci_controller *phb, int pe_no); diff 
--git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index ab316e155ea9..2c90c37524ed 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -108,7 +108,25 @@ bool eeh_debugfs_no_recover; /* Platform dependent EEH operations */ struct eeh_ops *eeh_ops = NULL; -/* Lock to avoid races due to multiple reports of an error */ +/* + * confirm_error_lock and eeh_dev_mutex are used together to provide + * safety during EEH operations. + * + * Generally, the spinlock is used in error detection where it's not possible + * to use a mutex or where there is potential to deadlock with the mutex, and + * the mutex is used during recovery and other PCI related operations. One must + * be held when reading and both must be held when making changes to the + * protected fields: eeh_pe.parent
[RFC 0/3] Asynchronous EEH recovery
Hi, EEH recovery is currently serialized and these patches shorten the time taken for EEH recovery by making the recovery run in parallel. The original author of these patches is Sam Bobroff, I have rebased and tested these patches. On powervm with 64 VFs I see approximately 48% reduction in time taken in EEH recovery; yet to be tested on powernv. These patches were originally posted as separate RFCs; I think posting them as a single series would be more helpful. I know the patches are too big, I will try to divide them logically in the next iterations. Thanks Ganesh Goudar (3): powerpc/eeh: Synchronization for safety powerpc/eeh: Provide a unique ID for each EEH recovery powerpc/eeh: Asynchronous recovery arch/powerpc/include/asm/eeh.h | 7 +- arch/powerpc/include/asm/eeh_event.h | 10 +- arch/powerpc/include/asm/pci-bridge.h| 3 + arch/powerpc/include/asm/ppc-pci.h | 2 +- arch/powerpc/kernel/eeh.c| 154 +++-- arch/powerpc/kernel/eeh_driver.c | 578 +++ arch/powerpc/kernel/eeh_event.c | 71 ++- arch/powerpc/kernel/eeh_pe.c | 33 +- arch/powerpc/platforms/powernv/eeh-powernv.c | 12 +- arch/powerpc/platforms/pseries/eeh_pseries.c | 5 +- arch/powerpc/platforms/pseries/pci_dlpar.c | 5 +- drivers/pci/hotplug/pnv_php.c| 5 +- drivers/pci/hotplug/rpadlpar_core.c | 2 + drivers/vfio/vfio_spapr_eeh.c| 10 +- 14 files changed, 685 insertions(+), 212 deletions(-) -- 2.37.1
[RFC 2/3] powerpc/eeh: Provide a unique ID for each EEH recovery
Based on the original work from Sam Bobroff. Give a unique ID to each recovery event, to ease log parsing and prepare for parallel recovery. Also add some new messages with a very simple format that may be useful to log-parsers. Signed-off-by: Ganesh Goudar --- arch/powerpc/include/asm/eeh_event.h | 3 +- arch/powerpc/include/asm/ppc-pci.h | 2 +- arch/powerpc/kernel/eeh.c| 42 +++--- arch/powerpc/kernel/eeh_driver.c | 188 --- arch/powerpc/kernel/eeh_event.c | 12 +- 5 files changed, 146 insertions(+), 101 deletions(-) diff --git a/arch/powerpc/include/asm/eeh_event.h b/arch/powerpc/include/asm/eeh_event.h index dadde7d52f46..a1fe736bc4cf 100644 --- a/arch/powerpc/include/asm/eeh_event.h +++ b/arch/powerpc/include/asm/eeh_event.h @@ -17,13 +17,14 @@ struct eeh_event { struct list_headlist; /* to form event queue */ struct eeh_pe *pe;/* EEH PE */ + unsigned intid; /* Event ID */ }; int eeh_event_init(void); int eeh_send_failure_event(struct eeh_pe *pe); int __eeh_send_failure_event(struct eeh_pe *pe); void eeh_remove_event(struct eeh_pe *pe, bool force); -void eeh_handle_normal_event(struct eeh_pe *pe); +void eeh_handle_normal_event(unsigned int event_id, struct eeh_pe *pe); void eeh_handle_special_event(void); #endif /* __KERNEL__ */ diff --git a/arch/powerpc/include/asm/ppc-pci.h b/arch/powerpc/include/asm/ppc-pci.h index f6cf0159024e..42d175af33cb 100644 --- a/arch/powerpc/include/asm/ppc-pci.h +++ b/arch/powerpc/include/asm/ppc-pci.h @@ -40,7 +40,7 @@ extern int rtas_setup_phb(struct pci_controller *phb); void eeh_addr_cache_insert_dev(struct pci_dev *dev); void eeh_addr_cache_rmv_dev(struct pci_dev *dev); struct eeh_dev *eeh_addr_cache_get_dev(unsigned long addr); -void eeh_slot_error_detail(struct eeh_pe *pe, int severity); +void eeh_slot_error_detail(unsigned int event_id, struct eeh_pe *pe, int severity); int eeh_pci_enable(struct eeh_pe *pe, int function); int eeh_pe_reset_full(struct eeh_pe *pe, bool include_passed); void eeh_save_bars(struct eeh_dev *edev); 
diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index 2c90c37524ed..148d5df0e606 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -200,7 +200,8 @@ EXPORT_SYMBOL_GPL(eeh_recovery_must_be_locked); * for the indicated PCI device, and puts them into a buffer * for RTAS error logging. */ -static size_t eeh_dump_dev_log(struct eeh_dev *edev, char *buf, size_t len) +static size_t eeh_dump_dev_log(unsigned int event_id, struct eeh_dev *edev, + char *buf, size_t len) { u32 cfg; int cap, i; @@ -210,27 +211,29 @@ static size_t eeh_dump_dev_log(struct eeh_dev *edev, char *buf, size_t len) n += scnprintf(buf+n, len-n, "%04x:%02x:%02x.%01x\n", edev->pe->phb->global_number, edev->bdfn >> 8, PCI_SLOT(edev->bdfn), PCI_FUNC(edev->bdfn)); - pr_warn("EEH: of node=%04x:%02x:%02x.%01x\n", + pr_warn("EEH(%u): of node=%04x:%02x:%02x.%01x\n", + event_id, edev->pe->phb->global_number, edev->bdfn >> 8, PCI_SLOT(edev->bdfn), PCI_FUNC(edev->bdfn)); eeh_ops->read_config(edev, PCI_VENDOR_ID, 4, ); n += scnprintf(buf+n, len-n, "dev/vend:%08x\n", cfg); - pr_warn("EEH: PCI device/vendor: %08x\n", cfg); + pr_warn("EEH(%u): PCI device/vendor: %08x\n",event_id, cfg); eeh_ops->read_config(edev, PCI_COMMAND, 4, ); n += scnprintf(buf+n, len-n, "cmd/stat:%x\n", cfg); - pr_warn("EEH: PCI cmd/status register: %08x\n", cfg); + pr_warn("EEH(%u): PCI cmd/status register: %08x\n", event_id, cfg); /* Gather bridge-specific registers */ if (edev->mode & EEH_DEV_BRIDGE) { eeh_ops->read_config(edev, PCI_SEC_STATUS, 2, ); n += scnprintf(buf+n, len-n, "sec stat:%x\n", cfg); - pr_warn("EEH: Bridge secondary status: %04x\n", cfg); + pr_warn("EEH(%u): Bridge secondary status: %04x\n", + event_id, cfg); eeh_ops->read_config(edev, PCI_BRIDGE_CONTROL, 2, ); n += scnprintf(buf+n, len-n, "brdg ctl:%x\n", cfg); - pr_warn("EEH: Bridge control: %04x\n", cfg); + pr_warn("EEH(%u): Bridge control: %04x\n", event_id, cfg); } /* Dump out the PCI-X command and status regs */ @@ -238,18 
+241,19 @@ static size_t eeh_dump_dev_log(struct eeh_dev *edev, char *buf, size_t len) if (cap) { eeh_ops->read_config(edev, cap, 4, ); n += scnpri
Re: [PATCH v3 RESEND 1/3] powerpc/pseries: Parse control memory access error
On 1/7/22 19:44, Ganesh Goudar wrote: Add support to parse and log control memory access error for pseries. These changes are made according to PAPR v2.11 10.3.2.2.12. Signed-off-by: Ganesh Goudar --- arch/powerpc/platforms/pseries/ras.c | 36 1 file changed, 32 insertions(+), 4 deletions(-) mce_err.error_type = MCE_ERROR_TYPE_UNKNOWN; Hi mpe, Any comments on this patch series?
[PATCH v5] powerpc/mce: Avoid using irq_work_queue() in realmode
In realmode mce handler we use irq_work_queue() to defer the processing of mce events, irq_work_queue() can only be called when translation is enabled because it touches memory outside RMA, hence we enable translation before calling irq_work_queue and disable on return, though it is not safe to do in realmode. To avoid this, program the decrementer and call the event processing functions from timer handler. Signed-off-by: Ganesh Goudar --- V2: * Use arch_irq_work_raise to raise decrementer interrupt. * Avoid having atomic variable. V3: * Fix build error. Reported by kernel test bot. V4: * Rename some functions and variables * Remove mces_to_process counter and add a flag to indicate there is a mce info to process. V5: * Fix the build warning, reported by kernel test robot. --- arch/powerpc/include/asm/machdep.h | 2 + arch/powerpc/include/asm/mce.h | 13 + arch/powerpc/include/asm/paca.h | 1 + arch/powerpc/kernel/mce.c| 60 +--- arch/powerpc/kernel/time.c | 2 + arch/powerpc/platforms/pseries/pseries.h | 1 + arch/powerpc/platforms/pseries/ras.c | 32 + arch/powerpc/platforms/pseries/setup.c | 1 + 8 files changed, 53 insertions(+), 59 deletions(-) diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h index e821037f74f0..36d2f34aa352 100644 --- a/arch/powerpc/include/asm/machdep.h +++ b/arch/powerpc/include/asm/machdep.h @@ -99,6 +99,8 @@ struct machdep_calls { /* Called during machine check exception to retrive fixup address. */ bool(*mce_check_early_recovery)(struct pt_regs *regs); + void(*machine_check_log_err)(void); + /* Motherboard/chipset features. This is a kind of general purpose * hook used to control some machine specific features (like reset * lines, chip power control, etc...). 
diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h index 331d944280b8..c9f0936bd3c9 100644 --- a/arch/powerpc/include/asm/mce.h +++ b/arch/powerpc/include/asm/mce.h @@ -235,8 +235,21 @@ extern void machine_check_print_event_info(struct machine_check_event *evt, unsigned long addr_to_pfn(struct pt_regs *regs, unsigned long addr); extern void mce_common_process_ue(struct pt_regs *regs, struct mce_error_info *mce_err); +void mce_irq_work_queue(void); int mce_register_notifier(struct notifier_block *nb); int mce_unregister_notifier(struct notifier_block *nb); + +#ifdef CONFIG_PPC_BOOK3S_64 +void mce_run_irq_context_handlers(void); +#else +static inline void mce_run_irq_context_handlers(void) { }; +#endif /* CONFIG_PPC_BOOK3S_64 */ + +#ifdef CONFIG_PPC_BOOK3S_64 +void set_mce_pending_irq_work(void); +void clear_mce_pending_irq_work(void); +#endif /* CONFIG_PPC_BOOK3S_64 */ + #ifdef CONFIG_PPC_BOOK3S_64 void flush_and_reload_slb(void); void flush_erat(void); diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h index 295573a82c66..8330968ca346 100644 --- a/arch/powerpc/include/asm/paca.h +++ b/arch/powerpc/include/asm/paca.h @@ -288,6 +288,7 @@ struct paca_struct { #endif #ifdef CONFIG_PPC_BOOK3S_64 struct mce_info *mce_info; + u8 mce_pending_irq_work; #endif /* CONFIG_PPC_BOOK3S_64 */ } cacheline_aligned; diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c index 2503dd4713b9..6cd4b1409874 100644 --- a/arch/powerpc/kernel/mce.c +++ b/arch/powerpc/kernel/mce.c @@ -28,19 +28,9 @@ #include "setup.h" -static void machine_check_process_queued_event(struct irq_work *work); -static void machine_check_ue_irq_work(struct irq_work *work); static void machine_check_ue_event(struct machine_check_event *evt); static void machine_process_ue_event(struct work_struct *work); -static struct irq_work mce_event_process_work = { -.func = machine_check_process_queued_event, -}; - -static struct irq_work mce_ue_event_irq_work = 
{ - .func = machine_check_ue_irq_work, -}; - static DECLARE_WORK(mce_ue_event_work, machine_process_ue_event); static BLOCKING_NOTIFIER_HEAD(mce_notifier_list); @@ -89,6 +79,13 @@ static void mce_set_error_info(struct machine_check_event *mce, } } +void mce_irq_work_queue(void) +{ + /* Raise decrementer interrupt */ + arch_irq_work_raise(); + set_mce_pending_irq_work(); +} + /* * Decode and save high level MCE information into per cpu buffer which * is an array of machine_check_event structure. @@ -217,7 +214,7 @@ void release_mce_event(void) get_mce_event(NULL, true); } -static void machine_check_ue_irq_work(struct irq_work *work) +static void machine_check_ue_work(void) { schedule_work(&mce_ue_event_work); } @@ -239,7 +236,7 @@ static void machine_check_ue_event(struct machine_check_event *evt) evt, sizeof(*evt)); /* Queue work to process this e
Re: [PATCH v3 2/2] pseries/mce: Refactor the pseries mce handling code
On 11/24/21 18:40, Nicholas Piggin wrote: Excerpts from Ganesh Goudar's message of November 24, 2021 7:55 pm: Now that we are no longer switching on the mmu in realmode mce handler, Revert the commit 4ff753feab02("powerpc/pseries: Avoid using addr_to_pfn in real mode") partially, which introduced functions mce_handle_err_virtmode/realmode() to separate mce handler code which needed translation to enabled. Signed-off-by: Ganesh Goudar --- arch/powerpc/platforms/pseries/ras.c | 122 +++ 1 file changed, 49 insertions(+), 73 deletions(-) diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c index 8613f9cc5798..62e1519b8355 100644 --- a/arch/powerpc/platforms/pseries/ras.c +++ b/arch/powerpc/platforms/pseries/ras.c @@ -511,58 +511,17 @@ int pSeries_system_reset_exception(struct pt_regs *regs) return 0; /* need to perform reset */ } -static int mce_handle_err_realmode(int disposition, u8 error_type) -{ -#ifdef CONFIG_PPC_BOOK3S_64 - if (disposition == RTAS_DISP_NOT_RECOVERED) { - switch (error_type) { - caseMC_ERROR_TYPE_ERAT: - flush_erat(); - disposition = RTAS_DISP_FULLY_RECOVERED; - break; - caseMC_ERROR_TYPE_SLB: - /* -* Store the old slb content in paca before flushing. -* Print this when we go to virtual mode. -* There are chances that we may hit MCE again if there -* is a parity error on the SLB entry we trying to read -* for saving. Hence limit the slb saving to single -* level of recursion. 
-*/ - if (local_paca->in_mce == 1) - slb_save_contents(local_paca->mce_faulty_slbs); - flush_and_reload_slb(); - disposition = RTAS_DISP_FULLY_RECOVERED; - break; - default: - break; - } - } else if (disposition == RTAS_DISP_LIMITED_RECOVERY) { - /* Platform corrected itself but could be degraded */ - pr_err("MCE: limited recovery, system may be degraded\n"); - disposition = RTAS_DISP_FULLY_RECOVERED; - } -#endif - return disposition; -} - -static int mce_handle_err_virtmode(struct pt_regs *regs, - struct rtas_error_log *errp, - struct pseries_mc_errorlog *mce_log, - int disposition) +static int mce_handle_error(struct pt_regs *regs, struct rtas_error_log *errp) { struct mce_error_info mce_err = { 0 }; + unsigned long eaddr = 0, paddr = 0; + struct pseries_errorlog *pseries_log; + struct pseries_mc_errorlog *mce_log; + int disposition = rtas_error_disposition(errp); int initiator = rtas_error_initiator(errp); int severity = rtas_error_severity(errp); - unsigned long eaddr = 0, paddr = 0; u8 error_type, err_sub_type; - if (!mce_log) - goto out; - - error_type = mce_log->error_type; - err_sub_type = rtas_mc_error_sub_type(mce_log); - if (initiator == RTAS_INITIATOR_UNKNOWN) mce_err.initiator = MCE_INITIATOR_UNKNOWN; else if (initiator == RTAS_INITIATOR_CPU) @@ -588,6 +547,8 @@ static int mce_handle_err_virtmode(struct pt_regs *regs, mce_err.severity = MCE_SEV_SEVERE; else if (severity == RTAS_SEVERITY_ERROR) mce_err.severity = MCE_SEV_SEVERE; + else if (severity == RTAS_SEVERITY_FATAL) + mce_err.severity = MCE_SEV_FATAL; else mce_err.severity = MCE_SEV_FATAL; What's this hunk for? 
@@ -599,7 +560,18 @@ static int mce_handle_err_virtmode(struct pt_regs *regs, mce_err.error_type = MCE_ERROR_TYPE_UNKNOWN; mce_err.error_class = MCE_ECLASS_UNKNOWN; - switch (error_type) { + if (!rtas_error_extended(errp)) + goto out; + + pseries_log = get_pseries_errorlog(errp, PSERIES_ELOG_SECT_ID_MCE); + if (!pseries_log) + goto out; + + mce_log = (struct pseries_mc_errorlog *)pseries_log->data; + error_type = mce_log->error_type; + err_sub_type = rtas_mc_error_sub_type(mce_log); + + switch (mce_log->error_type) { case MC_ERROR_TYPE_UE: mce_err.error_type = MCE_ERROR_TYPE_UE; mce_common_process_ue(regs, _err); @@ -692,41 +664,45 @@ static int mce_handle_err_virtmode(struct pt_regs *regs, mce_err.error_type = MCE_ERROR_TYPE_DCACHE; break; case MC_ERROR_TYPE_I_CACHE: - mce_
Re: [PATCH v3 1/2] powerpc/mce: Avoid using irq_work_queue() in realmode
On 11/24/21 18:33, Nicholas Piggin wrote: Excerpts from Ganesh Goudar's message of November 24, 2021 7:54 pm: In realmode mce handler we use irq_work_queue() to defer the processing of mce events, irq_work_queue() can only be called when translation is enabled because it touches memory outside RMA, hence we enable translation before calling irq_work_queue and disable on return, though it is not safe to do in realmode. To avoid this, program the decrementer and call the event processing functions from timer handler. Signed-off-by: Ganesh Goudar --- V2: * Use arch_irq_work_raise to raise decrementer interrupt. * Avoid having atomic variable. V3: * Fix build error. Reported by kernel test bot. --- arch/powerpc/include/asm/machdep.h | 2 + arch/powerpc/include/asm/mce.h | 2 + arch/powerpc/include/asm/paca.h | 1 + arch/powerpc/kernel/mce.c| 51 +++- arch/powerpc/kernel/time.c | 3 ++ arch/powerpc/platforms/pseries/pseries.h | 1 + arch/powerpc/platforms/pseries/ras.c | 31 +- arch/powerpc/platforms/pseries/setup.c | 1 + 8 files changed, 34 insertions(+), 58 deletions(-) diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h index 9c3c9f04129f..d22b222ba471 100644 --- a/arch/powerpc/include/asm/machdep.h +++ b/arch/powerpc/include/asm/machdep.h @@ -99,6 +99,8 @@ struct machdep_calls { /* Called during machine check exception to retrive fixup address. */ bool(*mce_check_early_recovery)(struct pt_regs *regs); + void(*machine_check_log_err)(void); + /* Motherboard/chipset features. This is a kind of general purpose * hook used to control some machine specific features (like reset * lines, chip power control, etc...). 
diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h index 331d944280b8..6e306aaf58aa 100644 --- a/arch/powerpc/include/asm/mce.h +++ b/arch/powerpc/include/asm/mce.h @@ -235,8 +235,10 @@ extern void machine_check_print_event_info(struct machine_check_event *evt, unsigned long addr_to_pfn(struct pt_regs *regs, unsigned long addr); extern void mce_common_process_ue(struct pt_regs *regs, struct mce_error_info *mce_err); +void machine_check_raise_dec_intr(void); int mce_register_notifier(struct notifier_block *nb); int mce_unregister_notifier(struct notifier_block *nb); +void mce_run_late_handlers(void); #ifdef CONFIG_PPC_BOOK3S_64 void flush_and_reload_slb(void); void flush_erat(void); diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h index dc05a862e72a..d463c796f7fa 100644 --- a/arch/powerpc/include/asm/paca.h +++ b/arch/powerpc/include/asm/paca.h @@ -280,6 +280,7 @@ struct paca_struct { #endif #ifdef CONFIG_PPC_BOOK3S_64 struct mce_info *mce_info; + u32 mces_to_process; #endif /* CONFIG_PPC_BOOK3S_64 */ } cacheline_aligned; diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c index fd829f7f25a4..8e17f29472a0 100644 --- a/arch/powerpc/kernel/mce.c +++ b/arch/powerpc/kernel/mce.c @@ -28,19 +28,9 @@ #include "setup.h" -static void machine_check_process_queued_event(struct irq_work *work); -static void machine_check_ue_irq_work(struct irq_work *work); static void machine_check_ue_event(struct machine_check_event *evt); static void machine_process_ue_event(struct work_struct *work); -static struct irq_work mce_event_process_work = { -.func = machine_check_process_queued_event, -}; - -static struct irq_work mce_ue_event_irq_work = { - .func = machine_check_ue_irq_work, -}; - static DECLARE_WORK(mce_ue_event_work, machine_process_ue_event); static BLOCKING_NOTIFIER_HEAD(mce_notifier_list); @@ -89,6 +79,12 @@ static void mce_set_error_info(struct machine_check_event *mce, } } +/* Raise decrementer interrupt 
*/ +void machine_check_raise_dec_intr(void) +{ + arch_irq_work_raise(); +} It would be better if the name specifically related to irq work, which is more than just dec interrupt. It might be good to set mces_to_process here as well. Sure I would name it something like mce_irq_work_queue, and the paca variable to mce_pending_irq_work... Ok +void mce_run_late_handlers(void) +{ + if (unlikely(local_paca->mces_to_process)) { + if (ppc_md.machine_check_log_err) + ppc_md.machine_check_log_err(); + machine_check_process_queued_event(); + machine_check_ue_work(); + local_paca->mces_to_process--; + } +} The problem with a counter is that you're clearing the irq work pending in the timer interrupt, so you'll never call in here again to clear that (until something else sets irq work). But as far as I can see it does not need to be a counter, just a flag. The machine check ca
[PATCH v4] powerpc/mce: Avoid using irq_work_queue() in realmode
In realmode mce handler we use irq_work_queue() to defer the processing of mce events, irq_work_queue() can only be called when translation is enabled because it touches memory outside RMA, hence we enable translation before calling irq_work_queue and disable on return, though it is not safe to do in realmode. To avoid this, program the decrementer and call the event processing functions from timer handler. Signed-off-by: Ganesh Goudar --- V2: * Use arch_irq_work_raise to raise decrementer interrupt. * Avoid having atomic variable. V3: * Fix build error. Reported by kernel test bot. V4: * Rename some functions and variables * Remove mces_to_process counter and add a flag to indicate there is a mce info to process. --- arch/powerpc/include/asm/machdep.h | 2 + arch/powerpc/include/asm/mce.h | 13 + arch/powerpc/include/asm/paca.h | 1 + arch/powerpc/kernel/mce.c| 60 +--- arch/powerpc/kernel/time.c | 2 + arch/powerpc/platforms/pseries/pseries.h | 1 + arch/powerpc/platforms/pseries/ras.c | 31 +--- arch/powerpc/platforms/pseries/setup.c | 1 + 8 files changed, 53 insertions(+), 58 deletions(-) diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h index 9c3c9f04129f..d22b222ba471 100644 --- a/arch/powerpc/include/asm/machdep.h +++ b/arch/powerpc/include/asm/machdep.h @@ -99,6 +99,8 @@ struct machdep_calls { /* Called during machine check exception to retrive fixup address. */ bool(*mce_check_early_recovery)(struct pt_regs *regs); + void(*machine_check_log_err)(void); + /* Motherboard/chipset features. This is a kind of general purpose * hook used to control some machine specific features (like reset * lines, chip power control, etc...). 
diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h index 331d944280b8..c9f0936bd3c9 100644 --- a/arch/powerpc/include/asm/mce.h +++ b/arch/powerpc/include/asm/mce.h @@ -235,8 +235,21 @@ extern void machine_check_print_event_info(struct machine_check_event *evt, unsigned long addr_to_pfn(struct pt_regs *regs, unsigned long addr); extern void mce_common_process_ue(struct pt_regs *regs, struct mce_error_info *mce_err); +void mce_irq_work_queue(void); int mce_register_notifier(struct notifier_block *nb); int mce_unregister_notifier(struct notifier_block *nb); + +#ifdef CONFIG_PPC_BOOK3S_64 +void mce_run_irq_context_handlers(void); +#else +static inline void mce_run_irq_context_handlers(void) { }; +#endif /* CONFIG_PPC_BOOK3S_64 */ + +#ifdef CONFIG_PPC_BOOK3S_64 +void set_mce_pending_irq_work(void); +void clear_mce_pending_irq_work(void); +#endif /* CONFIG_PPC_BOOK3S_64 */ + #ifdef CONFIG_PPC_BOOK3S_64 void flush_and_reload_slb(void); void flush_erat(void); diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h index dc05a862e72a..963030689cfa 100644 --- a/arch/powerpc/include/asm/paca.h +++ b/arch/powerpc/include/asm/paca.h @@ -280,6 +280,7 @@ struct paca_struct { #endif #ifdef CONFIG_PPC_BOOK3S_64 struct mce_info *mce_info; + u8 mce_pending_irq_work; #endif /* CONFIG_PPC_BOOK3S_64 */ } cacheline_aligned; diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c index fd829f7f25a4..6af798803ece 100644 --- a/arch/powerpc/kernel/mce.c +++ b/arch/powerpc/kernel/mce.c @@ -28,19 +28,9 @@ #include "setup.h" -static void machine_check_process_queued_event(struct irq_work *work); -static void machine_check_ue_irq_work(struct irq_work *work); static void machine_check_ue_event(struct machine_check_event *evt); static void machine_process_ue_event(struct work_struct *work); -static struct irq_work mce_event_process_work = { -.func = machine_check_process_queued_event, -}; - -static struct irq_work mce_ue_event_irq_work = 
{ - .func = machine_check_ue_irq_work, -}; - static DECLARE_WORK(mce_ue_event_work, machine_process_ue_event); static BLOCKING_NOTIFIER_HEAD(mce_notifier_list); @@ -89,6 +79,13 @@ static void mce_set_error_info(struct machine_check_event *mce, } } +void mce_irq_work_queue(void) +{ + /* Raise decrementer interrupt */ + arch_irq_work_raise(); + set_mce_pending_irq_work(); +} + /* * Decode and save high level MCE information into per cpu buffer which * is an array of machine_check_event structure. @@ -217,7 +214,7 @@ void release_mce_event(void) get_mce_event(NULL, true); } -static void machine_check_ue_irq_work(struct irq_work *work) +static void machine_check_ue_work(void) { schedule_work(&mce_ue_event_work); } @@ -239,7 +236,7 @@ static void machine_check_ue_event(struct machine_check_event *evt) evt, sizeof(*evt)); /* Queue work to process this event later. */ - irq_work_queue(&mce_ue_event_irq_work); + mce_irq_
[PATCH v3 RESEND 3/3] powerpc/mce: Modify the real address error logging messages
To avoid ambiguity, modify the strings in real address error logging messages to "foreign/control memory" from "foreign", since the error descriptions in P9 user manual and P10 user manual are different for same type of errors. P9 User Manual for MCE: DSISR:59 Host real address to foreign space during translation. DSISR:60 Host real address to foreign space on a load or store access. P10 User Manual for MCE: DSISR:59 D-side tablewalk used a host real address in the control memory address range. DSISR:60 D-side operand access to control memory address space. Signed-off-by: Ganesh Goudar --- arch/powerpc/kernel/mce.c | 8 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c index fd829f7f25a4..55ccc651d1b0 100644 --- a/arch/powerpc/kernel/mce.c +++ b/arch/powerpc/kernel/mce.c @@ -401,14 +401,14 @@ void machine_check_print_event_info(struct machine_check_event *evt, static const char *mc_ra_types[] = { "Indeterminate", "Instruction fetch (bad)", - "Instruction fetch (foreign)", + "Instruction fetch (foreign/control memory)", "Page table walk ifetch (bad)", - "Page table walk ifetch (foreign)", + "Page table walk ifetch (foreign/control memory)", "Load (bad)", "Store (bad)", "Page table walk Load/Store (bad)", - "Page table walk Load/Store (foreign)", - "Load/Store (foreign)", + "Page table walk Load/Store (foreign/control memory)", + "Load/Store (foreign/control memory)", }; static const char *mc_link_types[] = { "Indeterminate", -- 2.31.1
[PATCH v3 RESEND 1/3] powerpc/pseries: Parse control memory access error
Add support to parse and log control memory access error for pseries. These changes are made according to PAPR v2.11 10.3.2.2.12. Signed-off-by: Ganesh Goudar --- arch/powerpc/platforms/pseries/ras.c | 36 1 file changed, 32 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c index 56092dccfdb8..e62a0ca2611a 100644 --- a/arch/powerpc/platforms/pseries/ras.c +++ b/arch/powerpc/platforms/pseries/ras.c @@ -60,11 +60,17 @@ struct pseries_mc_errorlog { * XX 2: Reserved. *XXX 3: Type of UE error. * -* For error_type != MC_ERROR_TYPE_UE +* For error_type == MC_ERROR_TYPE_SLB/ERAT/TLB * * X 1: Effective address provided. *X 5: Reserved. * XX 2: Type of SLB/ERAT/TLB error. +* +* For error_type == MC_ERROR_TYPE_CTRL_MEM_ACCESS +* +* X 1: Error causing address provided. +*XXX 3: Type of error. +* 4: Reserved. */ u8 sub_err_type; u8 reserved_1[6]; @@ -80,6 +86,7 @@ struct pseries_mc_errorlog { #define MC_ERROR_TYPE_TLB 0x04 #define MC_ERROR_TYPE_D_CACHE 0x05 #define MC_ERROR_TYPE_I_CACHE 0x07 +#define MC_ERROR_TYPE_CTRL_MEM_ACCESS 0x08 /* RTAS pseries MCE error sub types */ #define MC_ERROR_UE_INDETERMINATE 0 @@ -90,6 +97,7 @@ struct pseries_mc_errorlog { #define UE_EFFECTIVE_ADDR_PROVIDED 0x40 #define UE_LOGICAL_ADDR_PROVIDED 0x20 +#define MC_EFFECTIVE_ADDR_PROVIDED 0x80 #define MC_ERROR_SLB_PARITY0 #define MC_ERROR_SLB_MULTIHIT 1 @@ -103,6 +111,9 @@ struct pseries_mc_errorlog { #define MC_ERROR_TLB_MULTIHIT 2 #define MC_ERROR_TLB_INDETERMINATE 3 +#define MC_ERROR_CTRL_MEM_ACCESS_PTABLE_WALK 0 +#define MC_ERROR_CTRL_MEM_ACCESS_OP_ACCESS 1 + static inline u8 rtas_mc_error_sub_type(const struct pseries_mc_errorlog *mlog) { switch (mlog->error_type) { @@ -112,6 +123,8 @@ static inline u8 rtas_mc_error_sub_type(const struct pseries_mc_errorlog *mlog) caseMC_ERROR_TYPE_ERAT: caseMC_ERROR_TYPE_TLB: return (mlog->sub_err_type & 0x03); + caseMC_ERROR_TYPE_CTRL_MEM_ACCESS: + return (mlog->sub_err_type & 0x70) >> 4; 
default: return 0; } @@ -656,7 +669,7 @@ static int mce_handle_err_virtmode(struct pt_regs *regs, mce_err.u.slb_error_type = MCE_SLB_ERROR_INDETERMINATE; break; } - if (mce_log->sub_err_type & 0x80) + if (mce_log->sub_err_type & MC_EFFECTIVE_ADDR_PROVIDED) eaddr = be64_to_cpu(mce_log->effective_address); break; case MC_ERROR_TYPE_ERAT: @@ -673,7 +686,7 @@ static int mce_handle_err_virtmode(struct pt_regs *regs, mce_err.u.erat_error_type = MCE_ERAT_ERROR_INDETERMINATE; break; } - if (mce_log->sub_err_type & 0x80) + if (mce_log->sub_err_type & MC_EFFECTIVE_ADDR_PROVIDED) eaddr = be64_to_cpu(mce_log->effective_address); break; case MC_ERROR_TYPE_TLB: @@ -690,7 +703,7 @@ static int mce_handle_err_virtmode(struct pt_regs *regs, mce_err.u.tlb_error_type = MCE_TLB_ERROR_INDETERMINATE; break; } - if (mce_log->sub_err_type & 0x80) + if (mce_log->sub_err_type & MC_EFFECTIVE_ADDR_PROVIDED) eaddr = be64_to_cpu(mce_log->effective_address); break; case MC_ERROR_TYPE_D_CACHE: @@ -699,6 +712,21 @@ static int mce_handle_err_virtmode(struct pt_regs *regs, case MC_ERROR_TYPE_I_CACHE: mce_err.error_type = MCE_ERROR_TYPE_ICACHE; break; + case MC_ERROR_TYPE_CTRL_MEM_ACCESS: + mce_err.error_type = MCE_ERROR_TYPE_RA; + switch (err_sub_type) { + case MC_ERROR_CTRL_MEM_ACCESS_PTABLE_WALK: + mce_err.u.ra_error_type = + MCE_RA_ERROR_PAGE_TABLE_WALK_LOAD_STORE_FOREIGN; + break; + case MC_ERROR_CTRL_MEM_ACCESS_OP_ACCESS: + mce_err.u.ra_error_type = + MCE_RA_ERROR_LOAD_STORE_FOREIGN; + break; + } + if (mce_log->sub_err_type & MC_EFFECTIVE_ADDR_PROVIDED) + eaddr = be64_to_cpu(mce_log->effective_address); + break;
[PATCH v3 RESEND 2/3] selftests/powerpc: Add test for real address error handling
Add test for real address or control memory address access error handling, using NX-GZIP engine. The error is injected by accessing the control memory address using illegal instruction, on successful handling the process attempting to access control memory address using illegal instruction receives SIGBUS. Signed-off-by: Ganesh Goudar --- tools/testing/selftests/powerpc/Makefile | 3 +- tools/testing/selftests/powerpc/mce/Makefile | 7 ++ .../selftests/powerpc/mce/inject-ra-err.c | 65 +++ tools/testing/selftests/powerpc/mce/vas-api.h | 1 + 4 files changed, 75 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/powerpc/mce/Makefile create mode 100644 tools/testing/selftests/powerpc/mce/inject-ra-err.c create mode 12 tools/testing/selftests/powerpc/mce/vas-api.h diff --git a/tools/testing/selftests/powerpc/Makefile b/tools/testing/selftests/powerpc/Makefile index 0830e63818c1..4830372d7416 100644 --- a/tools/testing/selftests/powerpc/Makefile +++ b/tools/testing/selftests/powerpc/Makefile @@ -31,7 +31,8 @@ SUB_DIRS = alignment \ vphn \ math \ ptrace \ - security + security \ + mce endif diff --git a/tools/testing/selftests/powerpc/mce/Makefile b/tools/testing/selftests/powerpc/mce/Makefile new file mode 100644 index ..2424513982d9 --- /dev/null +++ b/tools/testing/selftests/powerpc/mce/Makefile @@ -0,0 +1,7 @@ +#SPDX-License-Identifier: GPL-2.0-or-later + +TEST_GEN_PROGS := inject-ra-err + +include ../../lib.mk + +$(TEST_GEN_PROGS): ../harness.c diff --git a/tools/testing/selftests/powerpc/mce/inject-ra-err.c b/tools/testing/selftests/powerpc/mce/inject-ra-err.c new file mode 100644 index ..94323c34d9a6 --- /dev/null +++ b/tools/testing/selftests/powerpc/mce/inject-ra-err.c @@ -0,0 +1,65 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "vas-api.h" +#include "utils.h" + +static bool faulted; + +static void sigbus_handler(int n, siginfo_t 
*info, void *ctxt_v) +{ + ucontext_t *ctxt = (ucontext_t *)ctxt_v; + struct pt_regs *regs = ctxt->uc_mcontext.regs; + + faulted = true; + regs->nip += 4; +} + +static int test_ra_error(void) +{ + struct vas_tx_win_open_attr attr; + int fd, *paste_addr; + char *devname = "/dev/crypto/nx-gzip"; + struct sigaction act = { + .sa_sigaction = sigbus_handler, + .sa_flags = SA_SIGINFO, + }; + + memset(&attr, 0, sizeof(attr)); + attr.version = 1; + attr.vas_id = 0; + + SKIP_IF(access(devname, F_OK)); + + fd = open(devname, O_RDWR); + FAIL_IF(fd < 0); + FAIL_IF(ioctl(fd, VAS_TX_WIN_OPEN, &attr) < 0); + FAIL_IF(sigaction(SIGBUS, &act, NULL) != 0); + + paste_addr = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0ULL); + + /* The following assignment triggers exception */ + mb(); + *paste_addr = 1; + mb(); + + FAIL_IF(!faulted); + + return 0; +} + +int main(void) +{ + return test_harness(test_ra_error, "inject-ra-err"); +} + diff --git a/tools/testing/selftests/powerpc/mce/vas-api.h b/tools/testing/selftests/powerpc/mce/vas-api.h new file mode 120000 index ..1455c1bcd351 --- /dev/null +++ b/tools/testing/selftests/powerpc/mce/vas-api.h @@ -0,0 +1 @@ +../../../../../arch/powerpc/include/uapi/asm/vas-api.h \ No newline at end of file -- 2.31.1
[PATCH v3 1/2] powerpc/mce: Avoid using irq_work_queue() in realmode
In realmode mce handler we use irq_work_queue() to defer the processing of mce events, irq_work_queue() can only be called when translation is enabled because it touches memory outside RMA, hence we enable translation before calling irq_work_queue and disable on return, though it is not safe to do in realmode. To avoid this, program the decrementer and call the event processing functions from timer handler. Signed-off-by: Ganesh Goudar --- V2: * Use arch_irq_work_raise to raise decrementer interrupt. * Avoid having atomic variable. V3: * Fix build error. Reported by kernel test bot. --- arch/powerpc/include/asm/machdep.h | 2 + arch/powerpc/include/asm/mce.h | 2 + arch/powerpc/include/asm/paca.h | 1 + arch/powerpc/kernel/mce.c| 51 +++- arch/powerpc/kernel/time.c | 3 ++ arch/powerpc/platforms/pseries/pseries.h | 1 + arch/powerpc/platforms/pseries/ras.c | 31 +- arch/powerpc/platforms/pseries/setup.c | 1 + 8 files changed, 34 insertions(+), 58 deletions(-) diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h index 9c3c9f04129f..d22b222ba471 100644 --- a/arch/powerpc/include/asm/machdep.h +++ b/arch/powerpc/include/asm/machdep.h @@ -99,6 +99,8 @@ struct machdep_calls { /* Called during machine check exception to retrive fixup address. */ bool(*mce_check_early_recovery)(struct pt_regs *regs); + void(*machine_check_log_err)(void); + /* Motherboard/chipset features. This is a kind of general purpose * hook used to control some machine specific features (like reset * lines, chip power control, etc...). 
diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h index 331d944280b8..6e306aaf58aa 100644 --- a/arch/powerpc/include/asm/mce.h +++ b/arch/powerpc/include/asm/mce.h @@ -235,8 +235,10 @@ extern void machine_check_print_event_info(struct machine_check_event *evt, unsigned long addr_to_pfn(struct pt_regs *regs, unsigned long addr); extern void mce_common_process_ue(struct pt_regs *regs, struct mce_error_info *mce_err); +void machine_check_raise_dec_intr(void); int mce_register_notifier(struct notifier_block *nb); int mce_unregister_notifier(struct notifier_block *nb); +void mce_run_late_handlers(void); #ifdef CONFIG_PPC_BOOK3S_64 void flush_and_reload_slb(void); void flush_erat(void); diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h index dc05a862e72a..d463c796f7fa 100644 --- a/arch/powerpc/include/asm/paca.h +++ b/arch/powerpc/include/asm/paca.h @@ -280,6 +280,7 @@ struct paca_struct { #endif #ifdef CONFIG_PPC_BOOK3S_64 struct mce_info *mce_info; + u32 mces_to_process; #endif /* CONFIG_PPC_BOOK3S_64 */ } cacheline_aligned; diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c index fd829f7f25a4..8e17f29472a0 100644 --- a/arch/powerpc/kernel/mce.c +++ b/arch/powerpc/kernel/mce.c @@ -28,19 +28,9 @@ #include "setup.h" -static void machine_check_process_queued_event(struct irq_work *work); -static void machine_check_ue_irq_work(struct irq_work *work); static void machine_check_ue_event(struct machine_check_event *evt); static void machine_process_ue_event(struct work_struct *work); -static struct irq_work mce_event_process_work = { -.func = machine_check_process_queued_event, -}; - -static struct irq_work mce_ue_event_irq_work = { - .func = machine_check_ue_irq_work, -}; - static DECLARE_WORK(mce_ue_event_work, machine_process_ue_event); static BLOCKING_NOTIFIER_HEAD(mce_notifier_list); @@ -89,6 +79,12 @@ static void mce_set_error_info(struct machine_check_event *mce, } } +/* Raise decrementer interrupt 
*/ +void machine_check_raise_dec_intr(void) +{ + arch_irq_work_raise(); +} + /* * Decode and save high level MCE information into per cpu buffer which * is an array of machine_check_event structure. @@ -135,6 +131,8 @@ void save_mce_event(struct pt_regs *regs, long handled, if (mce->error_type == MCE_ERROR_TYPE_UE) mce->u.ue_error.ignore_event = mce_err->ignore_event; + local_paca->mces_to_process++; + if (!addr) return; @@ -217,7 +215,7 @@ void release_mce_event(void) get_mce_event(NULL, true); } -static void machine_check_ue_irq_work(struct irq_work *work) +static void machine_check_ue_work(void) { schedule_work(&mce_ue_event_work); } @@ -239,7 +237,7 @@ static void machine_check_ue_event(struct machine_check_event *evt) evt, sizeof(*evt)); /* Queue work to process this event later. */ - irq_work_queue(&mce_ue_event_irq_work); + machine_check_raise_dec_intr(); } /* @@ -249,7 +247,6 @@ void machine_check_queue_event(void) { int index; struct machine_check_event evt; -
[PATCH v3 2/2] pseries/mce: Refactor the pseries mce handling code
Now that we are no longer switching on the mmu in realmode mce handler, Revert the commit 4ff753feab02("powerpc/pseries: Avoid using addr_to_pfn in real mode") partially, which introduced functions mce_handle_err_virtmode/realmode() to separate mce handler code which needed translation to enabled. Signed-off-by: Ganesh Goudar --- arch/powerpc/platforms/pseries/ras.c | 122 +++ 1 file changed, 49 insertions(+), 73 deletions(-) diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c index 8613f9cc5798..62e1519b8355 100644 --- a/arch/powerpc/platforms/pseries/ras.c +++ b/arch/powerpc/platforms/pseries/ras.c @@ -511,58 +511,17 @@ int pSeries_system_reset_exception(struct pt_regs *regs) return 0; /* need to perform reset */ } -static int mce_handle_err_realmode(int disposition, u8 error_type) -{ -#ifdef CONFIG_PPC_BOOK3S_64 - if (disposition == RTAS_DISP_NOT_RECOVERED) { - switch (error_type) { - caseMC_ERROR_TYPE_ERAT: - flush_erat(); - disposition = RTAS_DISP_FULLY_RECOVERED; - break; - caseMC_ERROR_TYPE_SLB: - /* -* Store the old slb content in paca before flushing. -* Print this when we go to virtual mode. -* There are chances that we may hit MCE again if there -* is a parity error on the SLB entry we trying to read -* for saving. Hence limit the slb saving to single -* level of recursion. 
-*/ - if (local_paca->in_mce == 1) - slb_save_contents(local_paca->mce_faulty_slbs); - flush_and_reload_slb(); - disposition = RTAS_DISP_FULLY_RECOVERED; - break; - default: - break; - } - } else if (disposition == RTAS_DISP_LIMITED_RECOVERY) { - /* Platform corrected itself but could be degraded */ - pr_err("MCE: limited recovery, system may be degraded\n"); - disposition = RTAS_DISP_FULLY_RECOVERED; - } -#endif - return disposition; -} - -static int mce_handle_err_virtmode(struct pt_regs *regs, - struct rtas_error_log *errp, - struct pseries_mc_errorlog *mce_log, - int disposition) +static int mce_handle_error(struct pt_regs *regs, struct rtas_error_log *errp) { struct mce_error_info mce_err = { 0 }; + unsigned long eaddr = 0, paddr = 0; + struct pseries_errorlog *pseries_log; + struct pseries_mc_errorlog *mce_log; + int disposition = rtas_error_disposition(errp); int initiator = rtas_error_initiator(errp); int severity = rtas_error_severity(errp); - unsigned long eaddr = 0, paddr = 0; u8 error_type, err_sub_type; - if (!mce_log) - goto out; - - error_type = mce_log->error_type; - err_sub_type = rtas_mc_error_sub_type(mce_log); - if (initiator == RTAS_INITIATOR_UNKNOWN) mce_err.initiator = MCE_INITIATOR_UNKNOWN; else if (initiator == RTAS_INITIATOR_CPU) @@ -588,6 +547,8 @@ static int mce_handle_err_virtmode(struct pt_regs *regs, mce_err.severity = MCE_SEV_SEVERE; else if (severity == RTAS_SEVERITY_ERROR) mce_err.severity = MCE_SEV_SEVERE; + else if (severity == RTAS_SEVERITY_FATAL) + mce_err.severity = MCE_SEV_FATAL; else mce_err.severity = MCE_SEV_FATAL; @@ -599,7 +560,18 @@ static int mce_handle_err_virtmode(struct pt_regs *regs, mce_err.error_type = MCE_ERROR_TYPE_UNKNOWN; mce_err.error_class = MCE_ECLASS_UNKNOWN; - switch (error_type) { + if (!rtas_error_extended(errp)) + goto out; + + pseries_log = get_pseries_errorlog(errp, PSERIES_ELOG_SECT_ID_MCE); + if (!pseries_log) + goto out; + + mce_log = (struct pseries_mc_errorlog *)pseries_log->data; + error_type 
= mce_log->error_type; + err_sub_type = rtas_mc_error_sub_type(mce_log); + + switch (mce_log->error_type) { case MC_ERROR_TYPE_UE: mce_err.error_type = MCE_ERROR_TYPE_UE; mce_common_process_ue(regs, _err); @@ -692,41 +664,45 @@ static int mce_handle_err_virtmode(struct pt_regs *regs, mce_err.error_type = MCE_ERROR_TYPE_DCACHE; break; case MC_ERROR_TYPE_I_CACHE: - mce_err.error_type = MCE_ERROR_TYPE_ICACHE; + mce_err.error_type = MCE_ERROR_TYPE_DCACHE; break; case
[PATCH v2 2/2] pseries/mce: Refactor the pseries mce handling code
Now that we are no longer switching on the mmu in realmode mce handler, Revert the commit 4ff753feab02("powerpc/pseries: Avoid using addr_to_pfn in real mode") partially, which introduced functions mce_handle_err_virtmode/realmode() to separate mce handler code which needed translation to enabled. Signed-off-by: Ganesh Goudar --- arch/powerpc/platforms/pseries/ras.c | 122 +++ 1 file changed, 49 insertions(+), 73 deletions(-) diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c index 8613f9cc5798..62e1519b8355 100644 --- a/arch/powerpc/platforms/pseries/ras.c +++ b/arch/powerpc/platforms/pseries/ras.c @@ -511,58 +511,17 @@ int pSeries_system_reset_exception(struct pt_regs *regs) return 0; /* need to perform reset */ } -static int mce_handle_err_realmode(int disposition, u8 error_type) -{ -#ifdef CONFIG_PPC_BOOK3S_64 - if (disposition == RTAS_DISP_NOT_RECOVERED) { - switch (error_type) { - caseMC_ERROR_TYPE_ERAT: - flush_erat(); - disposition = RTAS_DISP_FULLY_RECOVERED; - break; - caseMC_ERROR_TYPE_SLB: - /* -* Store the old slb content in paca before flushing. -* Print this when we go to virtual mode. -* There are chances that we may hit MCE again if there -* is a parity error on the SLB entry we trying to read -* for saving. Hence limit the slb saving to single -* level of recursion. 
-*/ - if (local_paca->in_mce == 1) - slb_save_contents(local_paca->mce_faulty_slbs); - flush_and_reload_slb(); - disposition = RTAS_DISP_FULLY_RECOVERED; - break; - default: - break; - } - } else if (disposition == RTAS_DISP_LIMITED_RECOVERY) { - /* Platform corrected itself but could be degraded */ - pr_err("MCE: limited recovery, system may be degraded\n"); - disposition = RTAS_DISP_FULLY_RECOVERED; - } -#endif - return disposition; -} - -static int mce_handle_err_virtmode(struct pt_regs *regs, - struct rtas_error_log *errp, - struct pseries_mc_errorlog *mce_log, - int disposition) +static int mce_handle_error(struct pt_regs *regs, struct rtas_error_log *errp) { struct mce_error_info mce_err = { 0 }; + unsigned long eaddr = 0, paddr = 0; + struct pseries_errorlog *pseries_log; + struct pseries_mc_errorlog *mce_log; + int disposition = rtas_error_disposition(errp); int initiator = rtas_error_initiator(errp); int severity = rtas_error_severity(errp); - unsigned long eaddr = 0, paddr = 0; u8 error_type, err_sub_type; - if (!mce_log) - goto out; - - error_type = mce_log->error_type; - err_sub_type = rtas_mc_error_sub_type(mce_log); - if (initiator == RTAS_INITIATOR_UNKNOWN) mce_err.initiator = MCE_INITIATOR_UNKNOWN; else if (initiator == RTAS_INITIATOR_CPU) @@ -588,6 +547,8 @@ static int mce_handle_err_virtmode(struct pt_regs *regs, mce_err.severity = MCE_SEV_SEVERE; else if (severity == RTAS_SEVERITY_ERROR) mce_err.severity = MCE_SEV_SEVERE; + else if (severity == RTAS_SEVERITY_FATAL) + mce_err.severity = MCE_SEV_FATAL; else mce_err.severity = MCE_SEV_FATAL; @@ -599,7 +560,18 @@ static int mce_handle_err_virtmode(struct pt_regs *regs, mce_err.error_type = MCE_ERROR_TYPE_UNKNOWN; mce_err.error_class = MCE_ECLASS_UNKNOWN; - switch (error_type) { + if (!rtas_error_extended(errp)) + goto out; + + pseries_log = get_pseries_errorlog(errp, PSERIES_ELOG_SECT_ID_MCE); + if (!pseries_log) + goto out; + + mce_log = (struct pseries_mc_errorlog *)pseries_log->data; + error_type 
= mce_log->error_type; + err_sub_type = rtas_mc_error_sub_type(mce_log); + + switch (mce_log->error_type) { case MC_ERROR_TYPE_UE: mce_err.error_type = MCE_ERROR_TYPE_UE; mce_common_process_ue(regs, _err); @@ -692,41 +664,45 @@ static int mce_handle_err_virtmode(struct pt_regs *regs, mce_err.error_type = MCE_ERROR_TYPE_DCACHE; break; case MC_ERROR_TYPE_I_CACHE: - mce_err.error_type = MCE_ERROR_TYPE_ICACHE; + mce_err.error_type = MCE_ERROR_TYPE_DCACHE; break; case
[PATCH v2 1/2] powerpc/mce: Avoid using irq_work_queue() in realmode
In realmode mce handler we use irq_work_queue() to defer the processing of mce events, irq_work_queue() can only be called when translation is enabled because it touches memory outside RMA, hence we enable translation before calling irq_work_queue and disable on return, though it is not safe to do in realmode. To avoid this, program the decrementer and call the event processing functions from timer handler. Signed-off-by: Ganesh Goudar --- V2: * Use arch_irq_work_raise to raise decrementer interrupt. * Avoid having atomic variable. --- arch/powerpc/include/asm/machdep.h | 2 + arch/powerpc/include/asm/mce.h | 2 + arch/powerpc/include/asm/paca.h | 1 + arch/powerpc/kernel/mce.c| 51 +++- arch/powerpc/kernel/time.c | 2 + arch/powerpc/platforms/pseries/pseries.h | 1 + arch/powerpc/platforms/pseries/ras.c | 31 +- arch/powerpc/platforms/pseries/setup.c | 1 + 8 files changed, 33 insertions(+), 58 deletions(-) diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h index 9c3c9f04129f..d22b222ba471 100644 --- a/arch/powerpc/include/asm/machdep.h +++ b/arch/powerpc/include/asm/machdep.h @@ -99,6 +99,8 @@ struct machdep_calls { /* Called during machine check exception to retrive fixup address. */ bool(*mce_check_early_recovery)(struct pt_regs *regs); + void(*machine_check_log_err)(void); + /* Motherboard/chipset features. This is a kind of general purpose * hook used to control some machine specific features (like reset * lines, chip power control, etc...). 
diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h index 331d944280b8..6e306aaf58aa 100644 --- a/arch/powerpc/include/asm/mce.h +++ b/arch/powerpc/include/asm/mce.h @@ -235,8 +235,10 @@ extern void machine_check_print_event_info(struct machine_check_event *evt, unsigned long addr_to_pfn(struct pt_regs *regs, unsigned long addr); extern void mce_common_process_ue(struct pt_regs *regs, struct mce_error_info *mce_err); +void machine_check_raise_dec_intr(void); int mce_register_notifier(struct notifier_block *nb); int mce_unregister_notifier(struct notifier_block *nb); +void mce_run_late_handlers(void); #ifdef CONFIG_PPC_BOOK3S_64 void flush_and_reload_slb(void); void flush_erat(void); diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h index dc05a862e72a..d463c796f7fa 100644 --- a/arch/powerpc/include/asm/paca.h +++ b/arch/powerpc/include/asm/paca.h @@ -280,6 +280,7 @@ struct paca_struct { #endif #ifdef CONFIG_PPC_BOOK3S_64 struct mce_info *mce_info; + u32 mces_to_process; #endif /* CONFIG_PPC_BOOK3S_64 */ } cacheline_aligned; diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c index fd829f7f25a4..8e17f29472a0 100644 --- a/arch/powerpc/kernel/mce.c +++ b/arch/powerpc/kernel/mce.c @@ -28,19 +28,9 @@ #include "setup.h" -static void machine_check_process_queued_event(struct irq_work *work); -static void machine_check_ue_irq_work(struct irq_work *work); static void machine_check_ue_event(struct machine_check_event *evt); static void machine_process_ue_event(struct work_struct *work); -static struct irq_work mce_event_process_work = { -.func = machine_check_process_queued_event, -}; - -static struct irq_work mce_ue_event_irq_work = { - .func = machine_check_ue_irq_work, -}; - static DECLARE_WORK(mce_ue_event_work, machine_process_ue_event); static BLOCKING_NOTIFIER_HEAD(mce_notifier_list); @@ -89,6 +79,12 @@ static void mce_set_error_info(struct machine_check_event *mce, } } +/* Raise decrementer interrupt 
*/ +void machine_check_raise_dec_intr(void) +{ + arch_irq_work_raise(); +} + /* * Decode and save high level MCE information into per cpu buffer which * is an array of machine_check_event structure. @@ -135,6 +131,8 @@ void save_mce_event(struct pt_regs *regs, long handled, if (mce->error_type == MCE_ERROR_TYPE_UE) mce->u.ue_error.ignore_event = mce_err->ignore_event; + local_paca->mces_to_process++; + if (!addr) return; @@ -217,7 +215,7 @@ void release_mce_event(void) get_mce_event(NULL, true); } -static void machine_check_ue_irq_work(struct irq_work *work) +static void machine_check_ue_work(void) { schedule_work(_ue_event_work); } @@ -239,7 +237,7 @@ static void machine_check_ue_event(struct machine_check_event *evt) evt, sizeof(*evt)); /* Queue work to process this event later. */ - irq_work_queue(_ue_event_irq_work); + machine_check_raise_dec_intr(); } /* @@ -249,7 +247,6 @@ void machine_check_queue_event(void) { int index; struct machine_check_event evt; - unsigned long msr; if (!get_mce_eve
Re: [PATCH 1/2] powerpc/mce: Avoid using irq_work_queue() in realmode
On 11/8/21 19:49, Nicholas Piggin wrote: Excerpts from Ganesh Goudar's message of November 8, 2021 6:38 pm: In realmode mce handler we use irq_work_queue() to defer the processing of mce events, irq_work_queue() can only be called when translation is enabled because it touches memory outside RMA, hence we enable translation before calling irq_work_queue and disable on return, though it is not safe to do in realmode. To avoid this, program the decrementer and call the event processing functions from timer handler. Signed-off-by: Ganesh Goudar --- arch/powerpc/include/asm/machdep.h | 2 + arch/powerpc/include/asm/mce.h | 2 + arch/powerpc/include/asm/paca.h | 1 + arch/powerpc/kernel/mce.c| 51 +++- arch/powerpc/kernel/time.c | 3 ++ arch/powerpc/platforms/pseries/pseries.h | 1 + arch/powerpc/platforms/pseries/ras.c | 31 +- arch/powerpc/platforms/pseries/setup.c | 1 + 8 files changed, 34 insertions(+), 58 deletions(-) diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h index 764f2732a821..c89cc03c0f97 100644 --- a/arch/powerpc/include/asm/machdep.h +++ b/arch/powerpc/include/asm/machdep.h @@ -103,6 +103,8 @@ struct machdep_calls { /* Called during machine check exception to retrive fixup address. */ bool(*mce_check_early_recovery)(struct pt_regs *regs); + void(*machine_check_log_err)(void); + /* Motherboard/chipset features. This is a kind of general purpose * hook used to control some machine specific features (like reset * lines, chip power control, etc...). 
diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h index 331d944280b8..187810f13669 100644 --- a/arch/powerpc/include/asm/mce.h +++ b/arch/powerpc/include/asm/mce.h @@ -235,8 +235,10 @@ extern void machine_check_print_event_info(struct machine_check_event *evt, unsigned long addr_to_pfn(struct pt_regs *regs, unsigned long addr); extern void mce_common_process_ue(struct pt_regs *regs, struct mce_error_info *mce_err); +extern void machine_check_raise_dec_intr(void); No new externs on function declarations, they tell me. ok. int mce_register_notifier(struct notifier_block *nb); int mce_unregister_notifier(struct notifier_block *nb); +void mce_run_late_handlers(void); #ifdef CONFIG_PPC_BOOK3S_64 void flush_and_reload_slb(void); void flush_erat(void); diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h index dc05a862e72a..f49180f8c9be 100644 --- a/arch/powerpc/include/asm/paca.h +++ b/arch/powerpc/include/asm/paca.h @@ -280,6 +280,7 @@ struct paca_struct { #endif #ifdef CONFIG_PPC_BOOK3S_64 struct mce_info *mce_info; + atomic_t mces_to_process; #endif /* CONFIG_PPC_BOOK3S_64 */ } cacheline_aligned; diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c index fd829f7f25a4..45baa062ebc0 100644 --- a/arch/powerpc/kernel/mce.c +++ b/arch/powerpc/kernel/mce.c @@ -28,19 +28,9 @@ #include "setup.h" -static void machine_check_process_queued_event(struct irq_work *work); -static void machine_check_ue_irq_work(struct irq_work *work); static void machine_check_ue_event(struct machine_check_event *evt); static void machine_process_ue_event(struct work_struct *work); -static struct irq_work mce_event_process_work = { -.func = machine_check_process_queued_event, -}; - -static struct irq_work mce_ue_event_irq_work = { - .func = machine_check_ue_irq_work, -}; - static DECLARE_WORK(mce_ue_event_work, machine_process_ue_event); static BLOCKING_NOTIFIER_HEAD(mce_notifier_list); @@ -89,6 +79,12 @@ static void 
mce_set_error_info(struct machine_check_event *mce, } } +/* Raise decrementer interrupt */ +void machine_check_raise_dec_intr(void) +{ + set_dec(1); +} The problem here is a timer can be scheduled (e.g., by an external interrupt if it gets taken before the decrementer, then uses a timer) and that set decr > 1. See logic in decrementer_set_next_event. I _think_ the way to get around this would be to have the machine check just use arch_irq_work_raise. Then you could also only call the mce handler inside the test_irq_work_pending() check and avoid the added function call on every timer. That test should also be marked unlikely come to think of it, but that's a side patchlet. Sure, I will use arch_irq_work_raise() and test_irq_work_pending(). + /* * Decode and save high level MCE information into per cpu buffer which * is an array of machine_check_event structure. @@ -135,6 +131,8 @@ void save_mce_event(struct pt_regs *regs, long handled, if (mce->error_type == MCE_ERROR_TYPE_UE) mce->u.ue_error.ignore_event = mce_err->ignore_event; + atomic_inc(_paca->mces_to_proce
Re: [PATCH 1/2] powerpc/mce: Avoid using irq_work_queue() in realmode
ount. So, consider the following sequence of events: 1. Take MCE 1. Save to queue, increment mce_queue_count, increment mces_to_process, set decrementer to fire. 2. Decrementer fires. mce_run_late_handlers is called. 3. mces_to_process = 1, so we call machine_check_log_err(), which prints (on pseries) the info for MCE 1. 4. Take MCE 2. This is saved to the queue, mce_queue_count is incremented, mces_to_process is incremented, and the decrementer is armed again. 5. We then leave the MCE interrupt context and return to the decrementer handling context. The next thing we do is we call m_c_e_process_queued_event(), which clears the entire queue (that is, MCEs 1 and 2): while (local_paca->mce_info->mce_queue_count > 0) { index = local_paca->mce_info->mce_queue_count - 1; evt = _paca->mce_info->mce_event_queue[index]; if (evt->error_type == MCE_ERROR_TYPE_UE && evt->u.ue_error.ignore_event) { local_paca->mce_info->mce_queue_count--; continue; } machine_check_print_event_info(evt, false, false); local_paca->mce_info->mce_queue_count--; } 6. We finish mce_run_late_handlers() and decrement mces_to_process, so it's now 1. 7. The decrementer fires again, mces_to_process is 1, so we start processing again. 8. We call machine_check_log_err again, it will now call the FWNMI code again and possibly print error 2. 9. process_queued_event will be called again but mce_queue_count will be 0 so it it will bail out early. I _think_ the worst that can happen - at least so long as pseries is the only implementaion of machine_check_log_err - is that we will handle MCE 2 before we query the firmware about it. That's probably benign, but I am still concerned with the overall interaction around nested interrupts. The only problem we have here is overwriting mce_data_buf in case of nested mce, and about "handle MCE 2 before we query the firmware about it" It is not possible, isn't it? 
Assume we take MCE 2 while we are in the middle of mce_run_late_handlers(), before the MCE handler relinquishes the CPU to timer handler, we will have everything in place, right? or am I missing something obvious. void machine_check_print_event_info(struct machine_check_event *evt, bool user_mode, bool in_guest) { diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 934d8ae66cc6..2dc09d75d77c 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -597,6 +597,9 @@ DEFINE_INTERRUPT_HANDLER_ASYNC(timer_interrupt) irq_work_run(); } +#ifdef CONFIG_PPC_BOOK3S_64 + mce_run_late_handlers(); +#endif So we're now branching to a function in a different file and doing an atomic read in every timer interrupt. Is this a hot path? Is there any speed implication to doing this? Nick has suggested me to use test_irq_work_pending() and I will remove the atomic read, with v2 we may not have any serious time implications. now = get_tb(); if (now >= *next_tb) { *next_tb = ~(u64)0; @@ -729,40 +724,16 @@ static int mce_handle_error(struct pt_regs *regs, struct rtas_error_log *errp) error_type = mce_log->error_type; disposition = mce_handle_err_realmode(disposition, error_type); - - /* -* Enable translation as we will be accessing per-cpu variables -* in save_mce_event() which may fall outside RMO region, also -* leave it enabled because subsequently we will be queuing work -* to workqueues where again per-cpu variables accessed, besides -* fwnmi_release_errinfo() crashes when called in realmode on -* pseries. -* Note: All the realmode handling like flushing SLB entries for -* SLB multihit is done by now. -*/ out: - msr = mfmsr(); - mtmsr(msr | MSR_IR | MSR_DR); - disposition = mce_handle_err_virtmode(regs, errp, mce_log, disposition); Now you are not in virtual mode/translations on when you are calling mce_handle_err_virtmode(). From the name, I thought that mce_handle_err_virtmode() would assume that you are in virtual mode? 
Does the function assume that? If so is it safe to call it in real mode? If not, should we rename it as part of this patch? patch 2/2, refactors this. - - /* -* Queue irq work to log this rtas event later. -* irq_work_queue uses per-cpu variables, so do this in virt -* mode as well. -*/ - irq_work_queue(_errlog_process_work); - - mtmsr(msr); - return disposition; } Thanks for the review :) . Ganesh
Re: [PATCH v3 1/3] powerpc/pseries: Parse control memory access error
On 9/6/21 14:13, Ganesh Goudar wrote: Add support to parse and log control memory access errors for pseries. These changes are made according to PAPR v2.11 10.3.2.2.12. Signed-off-by: Ganesh Goudar --- v3: Modify the commit log to mention the document according to which changes are made. Define and use a macro to check if the effective address is provided. v2: No changes. --- arch/powerpc/platforms/pseries/ras.c | 36 1 file changed, 32 insertions(+), 4 deletions(-) Hi mpe, Any comments on this patch series?
[PATCH 2/2] pseries/mce: Refactor the pseries mce handling code
Now that we are no longer switching on the mmu in realmode mce handler, Revert the commit 4ff753feab02("powerpc/pseries: Avoid using addr_to_pfn in real mode") partially, which introduced functions mce_handle_err_virtmode/realmode() to separate mce handler code which needed translation to enabled. Signed-off-by: Ganesh Goudar --- arch/powerpc/platforms/pseries/ras.c | 122 +++ 1 file changed, 49 insertions(+), 73 deletions(-) diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c index 8613f9cc5798..62e1519b8355 100644 --- a/arch/powerpc/platforms/pseries/ras.c +++ b/arch/powerpc/platforms/pseries/ras.c @@ -511,58 +511,17 @@ int pSeries_system_reset_exception(struct pt_regs *regs) return 0; /* need to perform reset */ } -static int mce_handle_err_realmode(int disposition, u8 error_type) -{ -#ifdef CONFIG_PPC_BOOK3S_64 - if (disposition == RTAS_DISP_NOT_RECOVERED) { - switch (error_type) { - caseMC_ERROR_TYPE_ERAT: - flush_erat(); - disposition = RTAS_DISP_FULLY_RECOVERED; - break; - caseMC_ERROR_TYPE_SLB: - /* -* Store the old slb content in paca before flushing. -* Print this when we go to virtual mode. -* There are chances that we may hit MCE again if there -* is a parity error on the SLB entry we trying to read -* for saving. Hence limit the slb saving to single -* level of recursion. 
-*/ - if (local_paca->in_mce == 1) - slb_save_contents(local_paca->mce_faulty_slbs); - flush_and_reload_slb(); - disposition = RTAS_DISP_FULLY_RECOVERED; - break; - default: - break; - } - } else if (disposition == RTAS_DISP_LIMITED_RECOVERY) { - /* Platform corrected itself but could be degraded */ - pr_err("MCE: limited recovery, system may be degraded\n"); - disposition = RTAS_DISP_FULLY_RECOVERED; - } -#endif - return disposition; -} - -static int mce_handle_err_virtmode(struct pt_regs *regs, - struct rtas_error_log *errp, - struct pseries_mc_errorlog *mce_log, - int disposition) +static int mce_handle_error(struct pt_regs *regs, struct rtas_error_log *errp) { struct mce_error_info mce_err = { 0 }; + unsigned long eaddr = 0, paddr = 0; + struct pseries_errorlog *pseries_log; + struct pseries_mc_errorlog *mce_log; + int disposition = rtas_error_disposition(errp); int initiator = rtas_error_initiator(errp); int severity = rtas_error_severity(errp); - unsigned long eaddr = 0, paddr = 0; u8 error_type, err_sub_type; - if (!mce_log) - goto out; - - error_type = mce_log->error_type; - err_sub_type = rtas_mc_error_sub_type(mce_log); - if (initiator == RTAS_INITIATOR_UNKNOWN) mce_err.initiator = MCE_INITIATOR_UNKNOWN; else if (initiator == RTAS_INITIATOR_CPU) @@ -588,6 +547,8 @@ static int mce_handle_err_virtmode(struct pt_regs *regs, mce_err.severity = MCE_SEV_SEVERE; else if (severity == RTAS_SEVERITY_ERROR) mce_err.severity = MCE_SEV_SEVERE; + else if (severity == RTAS_SEVERITY_FATAL) + mce_err.severity = MCE_SEV_FATAL; else mce_err.severity = MCE_SEV_FATAL; @@ -599,7 +560,18 @@ static int mce_handle_err_virtmode(struct pt_regs *regs, mce_err.error_type = MCE_ERROR_TYPE_UNKNOWN; mce_err.error_class = MCE_ECLASS_UNKNOWN; - switch (error_type) { + if (!rtas_error_extended(errp)) + goto out; + + pseries_log = get_pseries_errorlog(errp, PSERIES_ELOG_SECT_ID_MCE); + if (!pseries_log) + goto out; + + mce_log = (struct pseries_mc_errorlog *)pseries_log->data; + error_type 
= mce_log->error_type; + err_sub_type = rtas_mc_error_sub_type(mce_log); + + switch (mce_log->error_type) { case MC_ERROR_TYPE_UE: mce_err.error_type = MCE_ERROR_TYPE_UE; mce_common_process_ue(regs, _err); @@ -692,41 +664,45 @@ static int mce_handle_err_virtmode(struct pt_regs *regs, mce_err.error_type = MCE_ERROR_TYPE_DCACHE; break; case MC_ERROR_TYPE_I_CACHE: - mce_err.error_type = MCE_ERROR_TYPE_ICACHE; + mce_err.error_type = MCE_ERROR_TYPE_DCACHE; break; case
[PATCH 1/2] powerpc/mce: Avoid using irq_work_queue() in realmode
In realmode mce handler we use irq_work_queue() to defer the processing of mce events, irq_work_queue() can only be called when translation is enabled because it touches memory outside RMA, hence we enable translation before calling irq_work_queue and disable on return, though it is not safe to do in realmode. To avoid this, program the decrementer and call the event processing functions from timer handler. Signed-off-by: Ganesh Goudar --- arch/powerpc/include/asm/machdep.h | 2 + arch/powerpc/include/asm/mce.h | 2 + arch/powerpc/include/asm/paca.h | 1 + arch/powerpc/kernel/mce.c| 51 +++- arch/powerpc/kernel/time.c | 3 ++ arch/powerpc/platforms/pseries/pseries.h | 1 + arch/powerpc/platforms/pseries/ras.c | 31 +- arch/powerpc/platforms/pseries/setup.c | 1 + 8 files changed, 34 insertions(+), 58 deletions(-) diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h index 764f2732a821..c89cc03c0f97 100644 --- a/arch/powerpc/include/asm/machdep.h +++ b/arch/powerpc/include/asm/machdep.h @@ -103,6 +103,8 @@ struct machdep_calls { /* Called during machine check exception to retrive fixup address. */ bool(*mce_check_early_recovery)(struct pt_regs *regs); + void(*machine_check_log_err)(void); + /* Motherboard/chipset features. This is a kind of general purpose * hook used to control some machine specific features (like reset * lines, chip power control, etc...). 
diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h index 331d944280b8..187810f13669 100644 --- a/arch/powerpc/include/asm/mce.h +++ b/arch/powerpc/include/asm/mce.h @@ -235,8 +235,10 @@ extern void machine_check_print_event_info(struct machine_check_event *evt, unsigned long addr_to_pfn(struct pt_regs *regs, unsigned long addr); extern void mce_common_process_ue(struct pt_regs *regs, struct mce_error_info *mce_err); +extern void machine_check_raise_dec_intr(void); int mce_register_notifier(struct notifier_block *nb); int mce_unregister_notifier(struct notifier_block *nb); +void mce_run_late_handlers(void); #ifdef CONFIG_PPC_BOOK3S_64 void flush_and_reload_slb(void); void flush_erat(void); diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h index dc05a862e72a..f49180f8c9be 100644 --- a/arch/powerpc/include/asm/paca.h +++ b/arch/powerpc/include/asm/paca.h @@ -280,6 +280,7 @@ struct paca_struct { #endif #ifdef CONFIG_PPC_BOOK3S_64 struct mce_info *mce_info; + atomic_t mces_to_process; #endif /* CONFIG_PPC_BOOK3S_64 */ } cacheline_aligned; diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c index fd829f7f25a4..45baa062ebc0 100644 --- a/arch/powerpc/kernel/mce.c +++ b/arch/powerpc/kernel/mce.c @@ -28,19 +28,9 @@ #include "setup.h" -static void machine_check_process_queued_event(struct irq_work *work); -static void machine_check_ue_irq_work(struct irq_work *work); static void machine_check_ue_event(struct machine_check_event *evt); static void machine_process_ue_event(struct work_struct *work); -static struct irq_work mce_event_process_work = { -.func = machine_check_process_queued_event, -}; - -static struct irq_work mce_ue_event_irq_work = { - .func = machine_check_ue_irq_work, -}; - static DECLARE_WORK(mce_ue_event_work, machine_process_ue_event); static BLOCKING_NOTIFIER_HEAD(mce_notifier_list); @@ -89,6 +79,12 @@ static void mce_set_error_info(struct machine_check_event *mce, } } +/* Raise 
decrementer interrupt */ +void machine_check_raise_dec_intr(void) +{ + set_dec(1); +} + /* * Decode and save high level MCE information into per cpu buffer which * is an array of machine_check_event structure. @@ -135,6 +131,8 @@ void save_mce_event(struct pt_regs *regs, long handled, if (mce->error_type == MCE_ERROR_TYPE_UE) mce->u.ue_error.ignore_event = mce_err->ignore_event; + atomic_inc(_paca->mces_to_process); + if (!addr) return; @@ -217,7 +215,7 @@ void release_mce_event(void) get_mce_event(NULL, true); } -static void machine_check_ue_irq_work(struct irq_work *work) +static void machine_check_ue_work(void) { schedule_work(_ue_event_work); } @@ -239,7 +237,7 @@ static void machine_check_ue_event(struct machine_check_event *evt) evt, sizeof(*evt)); /* Queue work to process this event later. */ - irq_work_queue(_ue_event_irq_work); + machine_check_raise_dec_intr(); } /* @@ -249,7 +247,6 @@ void machine_check_queue_event(void) { int index; struct machine_check_event evt; - unsigned long msr; if (!get_mce_event(, MCE_EVENT_RELEASE)) return; @@ -263,20 +260,7 @@ void machine_check_queue_even
Re: [PATCH v1] powerpc/64s: Fix unrecoverable MCE crash
On 9/22/21 7:32 AM, Nicholas Piggin wrote: The machine check handler is not considered NMI on 64s. The early handler is the true NMI handler, and then it schedules the machine_check_exception handler to run when interrupts are enabled. This works fine except the case of an unrecoverable MCE, where the true NMI is taken when MSR[RI] is clear, it can not recover to schedule the next handler, so it calls machine_check_exception directly so something might be done about it. Calling an async handler from NMI context can result in irq state and other things getting corrupted. This can also trigger the BUG at arch/powerpc/include/asm/interrupt.h:168. Fix this by just making the 64s machine_check_exception handler an NMI like it is on other subarchs. Signed-off-by: Nicholas Piggin --- Hi Nick, If I inject control memory access error in LPAR on top of this patch https://patchwork.ozlabs.org/project/linuxppc-dev/patch/20210906084303.183921-1-ganes...@linux.ibm.com/ I see the following warning trace WARNING: CPU: 130 PID: 7122 at arch/powerpc/include/asm/interrupt.h:319 machine_check_exception+0x310/0x340 Modules linked in: CPU: 130 PID: 7122 Comm: inj_access_err Kdump: loaded Tainted: G M 5.15.0-rc2-cma-00054-g4a0d59fbaf71-dirty #22 NIP: c002f980 LR: c002f7e8 CTR: c0a31860 REGS: c039fe51bb20 TRAP: 0700 Tainted: G M (5.15.0-rc2-cma-00054-g4a0d59fbaf71-dirty) MSR: 80029033 CR: 88000222 XER: 2004 CFAR: c002f844 IRQMASK: 0 GPR00: c002f798 c039fe51bdc0 c20d 0001 GPR04: 4002 4000 19af GPR08: 0077e5ad c077ee16c700 0080 GPR12: 88000222 c077ee16c700 GPR16: GPR20: GPR24: c20fecd8 GPR28: 0001 0001 c039fe51be80 NIP [c002f980] machine_check_exception+0x310/0x340 LR [c002f7e8] machine_check_exception+0x178/0x340 Call Trace: [c039fe51bdc0] [c002f798] machine_check_exception+0x128/0x340 (unreliable) [c039fe51be10] [c00086ec] machine_check_common+0x1ac/0x1b0 --- interrupt: 200 at 0x1968 NIP: 1968 LR: 1958 CTR: REGS: c039fe51be80 TRAP: 0200 Tainted: G M 
(5.15.0-rc2-cma-00054-g4a0d59fbaf71-dirty) MSR: 82a0f033 CR: 22000824 XER: CFAR: 021c DAR: 7fffb00c DSISR: 0208 IRQMASK: 0 GPR00: 22000824 7fffc9647770 10027f00 7fffb00c GPR04: GPR08: 7fffb00c 0001 GPR12: 7fffb015a330 GPR16: GPR20: GPR24: 185c GPR28: 7fffc9647d18 0001 19b0 7fffc9647770 NIP [1968] 0x1968 LR [1958] 0x1958 --- interrupt: 200
Re: [PATCH] powerpc/mce: check if event info is valid
On 8/6/21 6:53 PM, Ganesh Goudar wrote: Check if the event info is valid before printing the event information. When a fwnmi-enabled nested kvm guest hits a machine check exception, L0 and L2 would generate machine check event info, but L1 would not generate any machine check event info as it won't go through the 0x200 vector, and it prints some unwanted message. To fix this, the 'in_use' variable in the machine check event info, which is no longer in use, is renamed to 'valid', and the event information is checked for validity before it is logged. Without this patch L1 would print the following message for exceptions encountered in L2, as the event structure will be empty in L1: "Machine Check Exception, Unknown event version 0". Signed-off-by: Ganesh Goudar --- Hi mpe, Any comments on this patch?
Re: [PATCH v2] powerpc/mce: Fix access error in mce handler
On 9/17/21 12:09 PM, Daniel Axtens wrote: Hi Ganesh, We queue an irq work for deferred processing of mce event in realmode mce handler, where translation is disabled. Queuing of the work may result in accessing memory outside RMO region, such access needs the translation to be enabled for an LPAR running with hash mmu else the kernel crashes. After enabling translation in mce_handle_error() we used to leave it enabled to avoid crashing here, but now with the commit 74c3354bc1d89 ("powerpc/pseries/mce: restore msr before returning from handler") we are restoring the MSR to disable translation. Hence to fix this enable the translation before queuing the work. [snip] Fixes: 74c3354bc1d89 ("powerpc/pseries/mce: restore msr before returning from handler") That patch changes arch/powerpc/powerpc/platforms/pseries/ras.c just below this comment: /* * Enable translation as we will be accessing per-cpu variables * in save_mce_event() which may fall outside RMO region, also * leave it enabled because subsequently we will be queuing work * to workqueues where again per-cpu variables accessed, besides * fwnmi_release_errinfo() crashes when called in realmode on * pseries. * Note: All the realmode handling like flushing SLB entries for * SLB multihit is done by now. */ That suggests per-cpu variables need protection. In your patch, you enable translations just around irq_work_queue: The comment is bit old, most of it doesn't make any sense now, yes per-cpu variables cannot be accessed in realmode, but with commit 923b3cf00b3f ("powerpc/mce: Remove per cpu variables from MCE handlers") we moved all of them to paca. + /* Queue irq work to process this event later. Before +* queuing the work enable translation for non radix LPAR, +* as irq_work_queue may try to access memory outside RMO +* region. 
+*/ + if (!radix_enabled() && firmware_has_feature(FW_FEATURE_LPAR)) { + msr = mfmsr(); + mtmsr(msr | MSR_IR | MSR_DR); + irq_work_queue(&mce_event_process_work); + mtmsr(msr); + } else { + irq_work_queue(&mce_event_process_work); + } However, just before that in the function, there are a few things that access per-cpu variables via the local_paca, e.g.: memcpy(&local_paca->mce_info->mce_event_queue[index], &evt, sizeof(evt)); Do we need to widen the window where translations are enabled in order to protect accesses to local_paca? paca will be within Real Mode Area, so it can be accessed with translate off.
[PATCH v2] powerpc/mce: Fix access error in mce handler
We queue an irq work for deferred processing of mce event in realmode mce handler, where translation is disabled. Queuing of the work may result in accessing memory outside RMO region, such access needs the translation to be enabled for an LPAR running with hash mmu else the kernel crashes. After enabling translation in mce_handle_error() we used to leave it enabled to avoid crashing here, but now with the commit 74c3354bc1d89 ("powerpc/pseries/mce: restore msr before returning from handler") we are restoring the MSR to disable translation. Hence to fix this enable the translation before queuing the work. Without this change following trace is seen on injecting SLB multihit in an LPAR running with hash mmu. Oops: Kernel access of bad area, sig: 11 [#1] LE PAGE_SIZE=64K MMU=Hash SMP NR_CPUS=2048 NUMA pSeries CPU: 5 PID: 1883 Comm: insmod Tainted: GOE 5.14.0-mce+ #137 NIP: c0735d60 LR: c0318640 CTR: REGS: c0001ebff9a0 TRAP: 0300 Tainted: G OE (5.14.0-mce+) MSR: 80001003 CR: 28008228 XER: 0001 CFAR: c031863c DAR: c0027fa8fe08 DSISR: 4000 IRQMASK: 0 GPR00: c03186d0 c0001ebffc40 c1b0df00 c16337e8 GPR04: c16337e8 c0027fa8fe08 0023 c16337f0 GPR08: 0023 c12ffe08 c00801460240 GPR12: c0001ec9a900 c0002ac4bd00 GPR16: 05a0 c008006b c008006b05a0 c0ff3068 GPR20: c0002ac4bbc0 0001 c0002ac4bbc0 c00801490298 GPR24: c00801490108 c1636198 c00801470090 c00801470058 GPR28: 0510 c0080100 c0080819 0019 NIP [c0735d60] llist_add_batch+0x0/0x40 LR [c0318640] __irq_work_queue_local+0x70/0xc0 Call Trace: [c0001ebffc40] [c0001ebffc0c] 0xc0001ebffc0c (unreliable) [c0001ebffc60] [c03186d0] irq_work_queue+0x40/0x70 [c0001ebffc80] [c004425c] machine_check_queue_event+0xbc/0xd0 [c0001ebffcf0] [c000838c] machine_check_early_common+0x16c/0x1f4 Fixes: 74c3354bc1d89 ("powerpc/pseries/mce: restore msr before returning from handler") Signed-off-by: Ganesh Goudar --- v2: Change in commit message. 
--- arch/powerpc/kernel/mce.c | 16 ++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c index 47a683cd00d2..9d1e39d42e3e 100644 --- a/arch/powerpc/kernel/mce.c +++ b/arch/powerpc/kernel/mce.c @@ -249,6 +249,7 @@ void machine_check_queue_event(void) { int index; struct machine_check_event evt; + unsigned long msr; if (!get_mce_event(, MCE_EVENT_RELEASE)) return; @@ -262,8 +263,19 @@ void machine_check_queue_event(void) memcpy(_paca->mce_info->mce_event_queue[index], , sizeof(evt)); - /* Queue irq work to process this event later. */ - irq_work_queue(_event_process_work); + /* Queue irq work to process this event later. Before +* queuing the work enable translation for non radix LPAR, +* as irq_work_queue may try to access memory outside RMO +* region. +*/ + if (!radix_enabled() && firmware_has_feature(FW_FEATURE_LPAR)) { + msr = mfmsr(); + mtmsr(msr | MSR_IR | MSR_DR); + irq_work_queue(_event_process_work); + mtmsr(msr); + } else { + irq_work_queue(_event_process_work); + } } void mce_common_process_ue(struct pt_regs *regs, -- 2.31.1
Re: [PATCH] powerpc/mce: Fix access error in mce handler
On 9/8/21 11:10 AM, Michael Ellerman wrote: Ganesh writes: On 9/6/21 6:03 PM, Michael Ellerman wrote: Ganesh Goudar writes Oops: Kernel access of bad area, sig: 11 [#1] LE PAGE_SIZE=64K MMU=Hash SMP NR_CPUS=2048 NUMA pSeries CPU: 5 PID: 1883 Comm: insmod Tainted: GOE 5.14.0-mce+ #137 NIP: c0735d60 LR: c0318640 CTR: REGS: c0001ebff9a0 TRAP: 0300 Tainted: G OE (5.14.0-mce+) MSR: 80001003 CR: 28008228 XER: 0001 CFAR: c031863c DAR: c0027fa8fe08 DSISR: 4000 IRQMASK: 0 GPR00: c03186d0 c0001ebffc40 c1b0df00 c16337e8 GPR04: c16337e8 c0027fa8fe08 0023 c16337f0 GPR08: 0023 c12ffe08 c00801460240 GPR12: c0001ec9a900 c0002ac4bd00 GPR16: 05a0 c008006b c008006b05a0 c0ff3068 GPR20: c0002ac4bbc0 0001 c0002ac4bbc0 c00801490298 GPR24: c00801490108 c1636198 c00801470090 c00801470058 GPR28: 0510 c0080100 c0080819 0019 NIP [c0735d60] llist_add_batch+0x0/0x40 LR [c0318640] __irq_work_queue_local+0x70/0xc0 Call Trace: [c0001ebffc40] [c0001ebffc0c] 0xc0001ebffc0c (unreliable) [c0001ebffc60] [c03186d0] irq_work_queue+0x40/0x70 [c0001ebffc80] [c004425c] machine_check_queue_event+0xbc/0xd0 [c0001ebffcf0] [c000838c] machine_check_early_common+0x16c/0x1f4 Fixes: 74c3354bc1d89 ("powerpc/pseries/mce: restore msr before returning from handler") Please explain in more detail why that commit caused this breakage. After enabling translation in mce_handle_error() we used to leave it enabled to avoid crashing here, but now with this commit we are restoring the MSR to disable translation. Are you sure we left the MMU enabled to avoid crashing there, or we just left it enabled by accident? No, I think we left it enabled intentionally, I mentioned about leaving it enabled in my comment and commit message of a95a0a1654 "powerpc/pseries: Fix MCE handling on pseries". But yeah, previously the MMU was enabled when we got here whereas now it's not, because of that change. Missed to mention it in commit log, I will add it. Thanks. 
diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c index 47a683cd00d2..9d1e39d42e3e 100644 --- a/arch/powerpc/kernel/mce.c +++ b/arch/powerpc/kernel/mce.c @@ -249,6 +249,7 @@ void machine_check_queue_event(void) { int index; struct machine_check_event evt; + unsigned long msr; if (!get_mce_event(, MCE_EVENT_RELEASE)) return; @@ -262,8 +263,19 @@ void machine_check_queue_event(void) memcpy(_paca->mce_info->mce_event_queue[index], , sizeof(evt)); - /* Queue irq work to process this event later. */ - irq_work_queue(_event_process_work); + /* Queue irq work to process this event later. Before +* queuing the work enable translation for non radix LPAR, +* as irq_work_queue may try to access memory outside RMO +* region. +*/ + if (!radix_enabled() && firmware_has_feature(FW_FEATURE_LPAR)) { + msr = mfmsr(); + mtmsr(msr | MSR_IR | MSR_DR); + irq_work_queue(_event_process_work); + mtmsr(msr); + } else { + irq_work_queue(_event_process_work); + } } We already went to virtual mode and queued (different) irq work in arch/powerpc/platforms/pseries/ras.c:mce_handle_error() We also called save_mce_event() which also might have queued irq work, via machine_check_ue_event(). So it really feels like something about the design is wrong if we have to go to virtual mode again and queue more irq work here. I guess we can probably merge this as a backportable fix, doing anything else would be a bigger change. I agree. Looking at ras.c there's the comment: * Enable translation as we will be accessing per-cpu variables * in save_mce_event() which may fall outside RMO region, also But AFAICS it's only irq_work_queue() that touches anything percpu? Yeah, we left the comment unchanged after doing some modifications around it, It needs to be updated, ill send a separate patch for it. Thanks. I see some other comments that look out of date, ie. the one above machine_check_process_queued_event() mentions syscall exit, which is no longer true. ill take care of it. 
There are also comments in pseries/ras.c about fwnmi_release_errinfo() crashing in real mode, but we call it in real mode now so that must be fixed? Yes, it is fixed now. So maybe we should just not be using irq_work_queue(). It's a pretty thin wrapper around set_dec(1), perhaps we just need to hand-roll some real-mode friendly way of doing that. You mean, have a separate queue and run the work from the timer handler?
Re: [PATCH] powerpc/mce: Fix access error in mce handler
On 9/6/21 6:03 PM, Michael Ellerman wrote: Ganesh Goudar writes: We queue an irq work for deferred processing of mce event in realmode mce handler, where translation is disabled. Queuing of the work may result in accessing memory outside RMO region, such access needs the translation to be enabled for an LPAR running with hash mmu else the kernel crashes. So enable the translation before queuing the work. Without this change following trace is seen on injecting machine check error in an LPAR running with hash mmu. What type of error are you injecting? SLB multihit in kernel mode. Oops: Kernel access of bad area, sig: 11 [#1] LE PAGE_SIZE=64K MMU=Hash SMP NR_CPUS=2048 NUMA pSeries CPU: 5 PID: 1883 Comm: insmod Tainted: GOE 5.14.0-mce+ #137 NIP: c0735d60 LR: c0318640 CTR: REGS: c0001ebff9a0 TRAP: 0300 Tainted: G OE (5.14.0-mce+) MSR: 80001003 CR: 28008228 XER: 0001 CFAR: c031863c DAR: c0027fa8fe08 DSISR: 4000 IRQMASK: 0 GPR00: c03186d0 c0001ebffc40 c1b0df00 c16337e8 GPR04: c16337e8 c0027fa8fe08 0023 c16337f0 GPR08: 0023 c12ffe08 c00801460240 GPR12: c0001ec9a900 c0002ac4bd00 GPR16: 05a0 c008006b c008006b05a0 c0ff3068 GPR20: c0002ac4bbc0 0001 c0002ac4bbc0 c00801490298 GPR24: c00801490108 c1636198 c00801470090 c00801470058 GPR28: 0510 c0080100 c0080819 0019 NIP [c0735d60] llist_add_batch+0x0/0x40 LR [c0318640] __irq_work_queue_local+0x70/0xc0 Call Trace: [c0001ebffc40] [c0001ebffc0c] 0xc0001ebffc0c (unreliable) [c0001ebffc60] [c03186d0] irq_work_queue+0x40/0x70 [c0001ebffc80] [c004425c] machine_check_queue_event+0xbc/0xd0 [c0001ebffcf0] [c000838c] machine_check_early_common+0x16c/0x1f4 Fixes: 74c3354bc1d89 ("powerpc/pseries/mce: restore msr before returning from handler") Please explain in more detail why that commit caused this breakage. After enabling translation in mce_handle_error() we used to leave it enabled to avoid crashing here, but now with this commit we are restoring the MSR to disable translation. Missed to mention it in commit log, I will add it. 
diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c index 47a683cd00d2..9d1e39d42e3e 100644 --- a/arch/powerpc/kernel/mce.c +++ b/arch/powerpc/kernel/mce.c @@ -249,6 +249,7 @@ void machine_check_queue_event(void) { int index; struct machine_check_event evt; + unsigned long msr; if (!get_mce_event(, MCE_EVENT_RELEASE)) return; @@ -262,8 +263,19 @@ void machine_check_queue_event(void) memcpy(_paca->mce_info->mce_event_queue[index], , sizeof(evt)); - /* Queue irq work to process this event later. */ - irq_work_queue(_event_process_work); + /* Queue irq work to process this event later. Before +* queuing the work enable translation for non radix LPAR, +* as irq_work_queue may try to access memory outside RMO +* region. +*/ + if (!radix_enabled() && firmware_has_feature(FW_FEATURE_LPAR)) { + msr = mfmsr(); + mtmsr(msr | MSR_IR | MSR_DR); + irq_work_queue(_event_process_work); + mtmsr(msr); + } else { + irq_work_queue(_event_process_work); + } } We already went to virtual mode and queued (different) irq work in arch/powerpc/platforms/pseries/ras.c:mce_handle_error() We also called save_mce_event() which also might have queued irq work, via machine_check_ue_event(). So it really feels like something about the design is wrong if we have to go to virtual mode again and queue more irq work here. I guess we can probably merge this as a backportable fix, doing anything else would be a bigger change. I agree. Looking at ras.c there's the comment: * Enable translation as we will be accessing per-cpu variables * in save_mce_event() which may fall outside RMO region, also But AFAICS it's only irq_work_queue() that touches anything percpu? Yeah, we left the comment unchanged after doing some modifications around it, It needs to be updated, ill send a separate patch for it. So maybe we should just not be using irq_work_queue(). It's a pretty thin wrapper around set_dec(1), perhaps we just need to hand-roll some real-mode friendly way of doing that. 
You mean, have a separate queue and run the work from the timer handler? cheers
[PATCH v3 3/3] powerpc/mce: Modify the real address error logging messages
To avoid ambiguity, modify the strings in real address error logging messages to "foreign/control memory" from "foreign", since the error descriptions in P9 user manual and P10 user manual are different for same type of errors. P9 User Manual for MCE: DSISR:59 Host real address to foreign space during translation. DSISR:60 Host real address to foreign space on a load or store access. P10 User Manual for MCE: DSISR:59 D-side tablewalk used a host real address in the control memory address range. DSISR:60 D-side operand access to control memory address space. Signed-off-by: Ganesh Goudar --- v3: No changes. v2: No changes. --- arch/powerpc/kernel/mce.c | 8 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c index 9d1e39d42e3e..5baf69503349 100644 --- a/arch/powerpc/kernel/mce.c +++ b/arch/powerpc/kernel/mce.c @@ -400,14 +400,14 @@ void machine_check_print_event_info(struct machine_check_event *evt, static const char *mc_ra_types[] = { "Indeterminate", "Instruction fetch (bad)", - "Instruction fetch (foreign)", + "Instruction fetch (foreign/control memory)", "Page table walk ifetch (bad)", - "Page table walk ifetch (foreign)", + "Page table walk ifetch (foreign/control memory)", "Load (bad)", "Store (bad)", "Page table walk Load/Store (bad)", - "Page table walk Load/Store (foreign)", - "Load/Store (foreign)", + "Page table walk Load/Store (foreign/control memory)", + "Load/Store (foreign/control memory)", }; static const char *mc_link_types[] = { "Indeterminate", -- 2.31.1
[PATCH v3 2/3] selftests/powerpc: Add test for real address error handling
Add test for real address or control memory address access error handling, using NX-GZIP engine. The error is injected by accessing the control memory address using illegal instruction, on successful handling the process attempting to access control memory address using illegal instruction receives SIGBUS. Signed-off-by: Ganesh Goudar --- v3: Avoid using shell script to inject error. v2: Fix build error. --- tools/testing/selftests/powerpc/Makefile | 3 +- tools/testing/selftests/powerpc/mce/Makefile | 7 ++ .../selftests/powerpc/mce/inject-ra-err.c | 65 +++ tools/testing/selftests/powerpc/mce/vas-api.h | 1 + 4 files changed, 75 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/powerpc/mce/Makefile create mode 100644 tools/testing/selftests/powerpc/mce/inject-ra-err.c create mode 12 tools/testing/selftests/powerpc/mce/vas-api.h diff --git a/tools/testing/selftests/powerpc/Makefile b/tools/testing/selftests/powerpc/Makefile index 0830e63818c1..4830372d7416 100644 --- a/tools/testing/selftests/powerpc/Makefile +++ b/tools/testing/selftests/powerpc/Makefile @@ -31,7 +31,8 @@ SUB_DIRS = alignment \ vphn \ math \ ptrace \ - security + security \ + mce endif diff --git a/tools/testing/selftests/powerpc/mce/Makefile b/tools/testing/selftests/powerpc/mce/Makefile new file mode 100644 index ..2424513982d9 --- /dev/null +++ b/tools/testing/selftests/powerpc/mce/Makefile @@ -0,0 +1,7 @@ +#SPDX-License-Identifier: GPL-2.0-or-later + +TEST_GEN_PROGS := inject-ra-err + +include ../../lib.mk + +$(TEST_GEN_PROGS): ../harness.c diff --git a/tools/testing/selftests/powerpc/mce/inject-ra-err.c b/tools/testing/selftests/powerpc/mce/inject-ra-err.c new file mode 100644 index ..94323c34d9a6 --- /dev/null +++ b/tools/testing/selftests/powerpc/mce/inject-ra-err.c @@ -0,0 +1,65 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "vas-api.h" +#include "utils.h" 
+ +static bool faulted; + +static void sigbus_handler(int n, siginfo_t *info, void *ctxt_v) +{ + ucontext_t *ctxt = (ucontext_t *)ctxt_v; + struct pt_regs *regs = ctxt->uc_mcontext.regs; + + faulted = true; + regs->nip += 4; +} + +static int test_ra_error(void) +{ + struct vas_tx_win_open_attr attr; + int fd, *paste_addr; + char *devname = "/dev/crypto/nx-gzip"; + struct sigaction act = { + .sa_sigaction = sigbus_handler, + .sa_flags = SA_SIGINFO, + }; + + memset(, 0, sizeof(attr)); + attr.version = 1; + attr.vas_id = 0; + + SKIP_IF(access(devname, F_OK)); + + fd = open(devname, O_RDWR); + FAIL_IF(fd < 0); + FAIL_IF(ioctl(fd, VAS_TX_WIN_OPEN, ) < 0); + FAIL_IF(sigaction(SIGBUS, , NULL) != 0); + + paste_addr = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0ULL); + + /* The following assignment triggers exception */ + mb(); + *paste_addr = 1; + mb(); + + FAIL_IF(!faulted); + + return 0; +} + +int main(void) +{ + return test_harness(test_ra_error, "inject-ra-err"); +} + diff --git a/tools/testing/selftests/powerpc/mce/vas-api.h b/tools/testing/selftests/powerpc/mce/vas-api.h new file mode 12 index ..1455c1bcd351 --- /dev/null +++ b/tools/testing/selftests/powerpc/mce/vas-api.h @@ -0,0 +1 @@ +../../../../../arch/powerpc/include/uapi/asm/vas-api.h \ No newline at end of file -- 2.31.1
[PATCH v3 1/3] powerpc/pseries: Parse control memory access error
Add support to parse and log control memory access error for pseries. These changes are made according to PAPR v2.11 10.3.2.2.12. Signed-off-by: Ganesh Goudar --- v3: Modify the commit log to mention the document according to which changes are made. Define and use a macro to check if the effective address is provided. v2: No changes. --- arch/powerpc/platforms/pseries/ras.c | 36 1 file changed, 32 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c index 56092dccfdb8..e62a0ca2611a 100644 --- a/arch/powerpc/platforms/pseries/ras.c +++ b/arch/powerpc/platforms/pseries/ras.c @@ -60,11 +60,17 @@ struct pseries_mc_errorlog { * XX 2: Reserved. *XXX 3: Type of UE error. * -* For error_type != MC_ERROR_TYPE_UE +* For error_type == MC_ERROR_TYPE_SLB/ERAT/TLB * * X 1: Effective address provided. *X 5: Reserved. * XX 2: Type of SLB/ERAT/TLB error. +* +* For error_type == MC_ERROR_TYPE_CTRL_MEM_ACCESS +* +* X 1: Error causing address provided. +*XXX 3: Type of error. +* 4: Reserved. 
*/ u8 sub_err_type; u8 reserved_1[6]; @@ -80,6 +86,7 @@ struct pseries_mc_errorlog { #define MC_ERROR_TYPE_TLB 0x04 #define MC_ERROR_TYPE_D_CACHE 0x05 #define MC_ERROR_TYPE_I_CACHE 0x07 +#define MC_ERROR_TYPE_CTRL_MEM_ACCESS 0x08 /* RTAS pseries MCE error sub types */ #define MC_ERROR_UE_INDETERMINATE 0 @@ -90,6 +97,7 @@ struct pseries_mc_errorlog { #define UE_EFFECTIVE_ADDR_PROVIDED 0x40 #define UE_LOGICAL_ADDR_PROVIDED 0x20 +#define MC_EFFECTIVE_ADDR_PROVIDED 0x80 #define MC_ERROR_SLB_PARITY0 #define MC_ERROR_SLB_MULTIHIT 1 @@ -103,6 +111,9 @@ struct pseries_mc_errorlog { #define MC_ERROR_TLB_MULTIHIT 2 #define MC_ERROR_TLB_INDETERMINATE 3 +#define MC_ERROR_CTRL_MEM_ACCESS_PTABLE_WALK 0 +#define MC_ERROR_CTRL_MEM_ACCESS_OP_ACCESS 1 + static inline u8 rtas_mc_error_sub_type(const struct pseries_mc_errorlog *mlog) { switch (mlog->error_type) { @@ -112,6 +123,8 @@ static inline u8 rtas_mc_error_sub_type(const struct pseries_mc_errorlog *mlog) caseMC_ERROR_TYPE_ERAT: caseMC_ERROR_TYPE_TLB: return (mlog->sub_err_type & 0x03); + caseMC_ERROR_TYPE_CTRL_MEM_ACCESS: + return (mlog->sub_err_type & 0x70) >> 4; default: return 0; } @@ -656,7 +669,7 @@ static int mce_handle_err_virtmode(struct pt_regs *regs, mce_err.u.slb_error_type = MCE_SLB_ERROR_INDETERMINATE; break; } - if (mce_log->sub_err_type & 0x80) + if (mce_log->sub_err_type & MC_EFFECTIVE_ADDR_PROVIDED) eaddr = be64_to_cpu(mce_log->effective_address); break; case MC_ERROR_TYPE_ERAT: @@ -673,7 +686,7 @@ static int mce_handle_err_virtmode(struct pt_regs *regs, mce_err.u.erat_error_type = MCE_ERAT_ERROR_INDETERMINATE; break; } - if (mce_log->sub_err_type & 0x80) + if (mce_log->sub_err_type & MC_EFFECTIVE_ADDR_PROVIDED) eaddr = be64_to_cpu(mce_log->effective_address); break; case MC_ERROR_TYPE_TLB: @@ -690,7 +703,7 @@ static int mce_handle_err_virtmode(struct pt_regs *regs, mce_err.u.tlb_error_type = MCE_TLB_ERROR_INDETERMINATE; break; } - if (mce_log->sub_err_type & 0x80) + if (mce_log->sub_err_type & 
MC_EFFECTIVE_ADDR_PROVIDED) eaddr = be64_to_cpu(mce_log->effective_address); break; case MC_ERROR_TYPE_D_CACHE: @@ -699,6 +712,21 @@ static int mce_handle_err_virtmode(struct pt_regs *regs, case MC_ERROR_TYPE_I_CACHE: mce_err.error_type = MCE_ERROR_TYPE_ICACHE; break; + case MC_ERROR_TYPE_CTRL_MEM_ACCESS: + mce_err.error_type = MCE_ERROR_TYPE_RA; + switch (err_sub_type) { + case MC_ERROR_CTRL_MEM_ACCESS_PTABLE_WALK: + mce_err.u.ra_error_type = + MCE_RA_ERROR_PAGE_TABLE_WALK_LOAD_STORE_FOREIGN; + break; + case MC_ERROR_CTRL_MEM_ACCESS_OP_ACCESS: + mce_err.u.ra_error_type = + MCE_RA_ERROR_LOAD_STORE_FOREIGN; + break; +
[PATCH] powerpc/mce: Fix access error in mce handler
We queue an irq work for deferred processing of mce event in realmode mce handler, where translation is disabled. Queuing of the work may result in accessing memory outside RMO region, such access needs the translation to be enabled for an LPAR running with hash mmu else the kernel crashes. So enable the translation before queuing the work. Without this change following trace is seen on injecting machine check error in an LPAR running with hash mmu. Oops: Kernel access of bad area, sig: 11 [#1] LE PAGE_SIZE=64K MMU=Hash SMP NR_CPUS=2048 NUMA pSeries CPU: 5 PID: 1883 Comm: insmod Tainted: GOE 5.14.0-mce+ #137 NIP: c0735d60 LR: c0318640 CTR: REGS: c0001ebff9a0 TRAP: 0300 Tainted: G OE (5.14.0-mce+) MSR: 80001003 CR: 28008228 XER: 0001 CFAR: c031863c DAR: c0027fa8fe08 DSISR: 4000 IRQMASK: 0 GPR00: c03186d0 c0001ebffc40 c1b0df00 c16337e8 GPR04: c16337e8 c0027fa8fe08 0023 c16337f0 GPR08: 0023 c12ffe08 c00801460240 GPR12: c0001ec9a900 c0002ac4bd00 GPR16: 05a0 c008006b c008006b05a0 c0ff3068 GPR20: c0002ac4bbc0 0001 c0002ac4bbc0 c00801490298 GPR24: c00801490108 c1636198 c00801470090 c00801470058 GPR28: 0510 c0080100 c0080819 0019 NIP [c0735d60] llist_add_batch+0x0/0x40 LR [c0318640] __irq_work_queue_local+0x70/0xc0 Call Trace: [c0001ebffc40] [c0001ebffc0c] 0xc0001ebffc0c (unreliable) [c0001ebffc60] [c03186d0] irq_work_queue+0x40/0x70 [c0001ebffc80] [c004425c] machine_check_queue_event+0xbc/0xd0 [c0001ebffcf0] [c000838c] machine_check_early_common+0x16c/0x1f4 Fixes: 74c3354bc1d89 ("powerpc/pseries/mce: restore msr before returning from handler") Signed-off-by: Ganesh Goudar --- arch/powerpc/kernel/mce.c | 16 ++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c index 47a683cd00d2..9d1e39d42e3e 100644 --- a/arch/powerpc/kernel/mce.c +++ b/arch/powerpc/kernel/mce.c @@ -249,6 +249,7 @@ void machine_check_queue_event(void) { int index; struct machine_check_event evt; + unsigned long msr; if (!get_mce_event(, 
MCE_EVENT_RELEASE)) return; @@ -262,8 +263,19 @@ void machine_check_queue_event(void) memcpy(_paca->mce_info->mce_event_queue[index], , sizeof(evt)); - /* Queue irq work to process this event later. */ - irq_work_queue(_event_process_work); + /* Queue irq work to process this event later. Before +* queuing the work enable translation for non radix LPAR, +* as irq_work_queue may try to access memory outside RMO +* region. +*/ + if (!radix_enabled() && firmware_has_feature(FW_FEATURE_LPAR)) { + msr = mfmsr(); + mtmsr(msr | MSR_IR | MSR_DR); + irq_work_queue(_event_process_work); + mtmsr(msr); + } else { + irq_work_queue(_event_process_work); + } } void mce_common_process_ue(struct pt_regs *regs, -- 2.31.1
Re: [PATCH v2 2/3] selftests/powerpc: Add test for real address error handling
On 8/26/21 8:57 AM, Michael Ellerman wrote: Ganesh writes: On 8/24/21 6:18 PM, Michael Ellerman wrote: Ganesh Goudar writes: Add test for real address or control memory address access error handling, using NX-GZIP engine. The error is injected by accessing the control memory address using illegal instruction, on successful handling the process attempting to access control memory address using illegal instruction receives SIGBUS. ... diff --git a/tools/testing/selftests/powerpc/mce/inject-ra-err.sh b/tools/testing/selftests/powerpc/mce/inject-ra-err.sh new file mode 100755 index ..3633cdc651a1 --- /dev/null +++ b/tools/testing/selftests/powerpc/mce/inject-ra-err.sh @@ -0,0 +1,18 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0-or-later + +if [[ ! -w /dev/crypto/nx-gzip ]]; then + echo "WARN: Can't access /dev/crypto/nx-gzip, skipping" + exit 0 +fi + +timeout 5 ./inject-ra-err + +# 128 + 7 (SIGBUS) = 135, 128 is a exit code with special meaning. +if [ $? -ne 135 ]; then + echo "FAILED: Real address or Control memory access error not handled" + exit $? +fi + +echo "OK: Real address or Control memory access error is handled" +exit 0 I don't think we really need the shell script, we should be able to do all that in the C code. Can you try this? it works!, We need to set timeout, with 120 sec timeout we may flood the dmesg. Hmm. Does it keep faulting? The regs->nip += 4 is meant to avoid that. Yes, it keeps faulting, if we fail to handle and not send SIGBUS to the process. cheers
Re: [PATCH v2 1/3] powerpc/pseries: Parse control memory access error
On 8/25/21 2:54 AM, Segher Boessenkool wrote: On Tue, Aug 24, 2021 at 04:39:57PM +1000, Michael Ellerman wrote: + case MC_ERROR_CTRL_MEM_ACCESS_PTABLE_WALK: + mce_err.u.ra_error_type = + MCE_RA_ERROR_PAGE_TABLE_WALK_LOAD_STORE_FOREIGN; That name is ridiculously long, but I guess that's not your fault :) We can fix it up in a later patch. It also has surprisingly little information content for the 47 chars length it has :-) What does this even mean?! It means control memory access error/real address error is detected during page table walk. Segher
Re: [PATCH v2 2/3] selftests/powerpc: Add test for real address error handling
On 8/24/21 6:18 PM, Michael Ellerman wrote: Ganesh Goudar writes: Add test for real address or control memory address access error handling, using NX-GZIP engine. The error is injected by accessing the control memory address using illegal instruction, on successful handling the process attempting to access control memory address using illegal instruction receives SIGBUS. ... diff --git a/tools/testing/selftests/powerpc/mce/inject-ra-err.sh b/tools/testing/selftests/powerpc/mce/inject-ra-err.sh new file mode 100755 index ..3633cdc651a1 --- /dev/null +++ b/tools/testing/selftests/powerpc/mce/inject-ra-err.sh @@ -0,0 +1,18 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0-or-later + +if [[ ! -w /dev/crypto/nx-gzip ]]; then + echo "WARN: Can't access /dev/crypto/nx-gzip, skipping" + exit 0 +fi + +timeout 5 ./inject-ra-err + +# 128 + 7 (SIGBUS) = 135, 128 is a exit code with special meaning. +if [ $? -ne 135 ]; then + echo "FAILED: Real address or Control memory access error not handled" + exit $? +fi + +echo "OK: Real address or Control memory access error is handled" +exit 0 I don't think we really need the shell script, we should be able to do all that in the C code. Can you try this? it works!, We need to set timeout, with 120 sec timeout we may flood the dmesg. Thanks. 
cheers diff --git a/tools/testing/selftests/powerpc/mce/Makefile b/tools/testing/selftests/powerpc/mce/Makefile new file mode 100644 index ..2424513982d9 --- /dev/null +++ b/tools/testing/selftests/powerpc/mce/Makefile @@ -0,0 +1,7 @@ +#SPDX-License-Identifier: GPL-2.0-or-later + +TEST_GEN_PROGS := inject-ra-err + +include ../../lib.mk + +$(TEST_GEN_PROGS): ../harness.c diff --git a/tools/testing/selftests/powerpc/mce/inject-ra-err.c b/tools/testing/selftests/powerpc/mce/inject-ra-err.c new file mode 100644 index ..ba0f9c28f786 --- /dev/null +++ b/tools/testing/selftests/powerpc/mce/inject-ra-err.c @@ -0,0 +1,64 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "vas-api.h" +#include "utils.h" + +static bool faulted; + +static void sigbus_handler(int n, siginfo_t *info, void *ctxt_v) +{ + ucontext_t *ctxt = (ucontext_t *)ctxt_v; + struct pt_regs *regs = ctxt->uc_mcontext.regs; + + faulted = true; + regs->nip += 4; +} + +static int test_ra_error(void) +{ + struct vas_tx_win_open_attr attr; + int fd, *paste_addr; + char *devname = "/dev/crypto/nx-gzip"; + struct sigaction act = { + .sa_sigaction = sigbus_handler, + .sa_flags = SA_SIGINFO, + }; + + memset(, 0, sizeof(attr)); + attr.version = 1; + attr.vas_id = 0; + + SKIP_IF(!access(devname, F_OK)); + + fd = open(devname, O_RDWR); + FAIL_IF(fd < 0); + FAIL_IF(ioctl(fd, VAS_TX_WIN_OPEN, ) < 0); + FAIL_IF(sigaction(SIGBUS, , NULL) != 0); + + paste_addr = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0ULL); + + /* The following assignment triggers exception */ + mb(); + *paste_addr = 1; + mb(); + + FAIL_IF(!faulted); + + return 0; +} + +int main(void) +{ + return test_harness(test_ra_error, "inject-ra-err"); +}
Re: [PATCH v2 1/3] powerpc/pseries: Parse control memory access error
On 8/24/21 12:09 PM, Michael Ellerman wrote: Hi Ganesh, Some comments below ... Ganesh Goudar writes: Add support to parse and log control memory access error for pseries. Signed-off-by: Ganesh Goudar --- v2: No changes in this patch. --- arch/powerpc/platforms/pseries/ras.c | 21 + 1 file changed, 21 insertions(+) diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c index 167f2e1b8d39..608c35cad0c3 100644 --- a/arch/powerpc/platforms/pseries/ras.c +++ b/arch/powerpc/platforms/pseries/ras.c @@ -80,6 +80,7 @@ struct pseries_mc_errorlog { #define MC_ERROR_TYPE_TLB 0x04 #define MC_ERROR_TYPE_D_CACHE 0x05 #define MC_ERROR_TYPE_I_CACHE 0x07 +#define MC_ERROR_TYPE_CTRL_MEM_ACCESS 0x08 ... +#define MC_ERROR_CTRL_MEM_ACCESS_PTABLE_WALK 0 +#define MC_ERROR_CTRL_MEM_ACCESS_OP_ACCESS 1 Where do the above values come from? It is from latest PAPR that added support for control memory error. + static inline u8 rtas_mc_error_sub_type(const struct pseries_mc_errorlog *mlog) { switch (mlog->error_type) { @@ -112,6 +116,8 @@ static inline u8 rtas_mc_error_sub_type(const struct pseries_mc_errorlog *mlog) caseMC_ERROR_TYPE_ERAT: caseMC_ERROR_TYPE_TLB: return (mlog->sub_err_type & 0x03); + caseMC_ERROR_TYPE_CTRL_MEM_ACCESS: + return (mlog->sub_err_type & 0x70) >> 4; Can you add to the comment above sub_err_type explaining what these bits are. Sure, for other errors it is explained in pseries_mc_errorlog definition, ill add it there. default: return 0; } @@ -699,6 +705,21 @@ static int mce_handle_err_virtmode(struct pt_regs *regs, case MC_ERROR_TYPE_I_CACHE: mce_err.error_type = MCE_ERROR_TYPE_ICACHE; break; + case MC_ERROR_TYPE_CTRL_MEM_ACCESS: + mce_err.error_type = MCE_ERROR_TYPE_RA; + if (mce_log->sub_err_type & 0x80) This appears many times in the file. Can we add eg. MC_EFFECTIVE_ADDR_PROVIDED? ok, thanks. 
+ eaddr = be64_to_cpu(mce_log->effective_address); + switch (err_sub_type) { + case MC_ERROR_CTRL_MEM_ACCESS_PTABLE_WALK: + mce_err.u.ra_error_type = + MCE_RA_ERROR_PAGE_TABLE_WALK_LOAD_STORE_FOREIGN; That name is ridiculously long, but I guess that's not your fault :) We can fix it up in a later patch. + break; + case MC_ERROR_CTRL_MEM_ACCESS_OP_ACCESS: + mce_err.u.ra_error_type = + MCE_RA_ERROR_LOAD_STORE_FOREIGN; + break; + } + break; cheers
Re: [PATCH v2 1/3] powerpc/pseries: Parse control memory access error
Hi mpe, Any comments on this patchset? On 8/5/21 2:50 PM, Ganesh Goudar wrote: Add support to parse and log control memory access error for pseries. Signed-off-by: Ganesh Goudar --- v2: No changes in this patch. --- arch/powerpc/platforms/pseries/ras.c | 21 + 1 file changed, 21 insertions(+) diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c index 167f2e1b8d39..608c35cad0c3 100644 --- a/arch/powerpc/platforms/pseries/ras.c +++ b/arch/powerpc/platforms/pseries/ras.c @@ -80,6 +80,7 @@ struct pseries_mc_errorlog { #define MC_ERROR_TYPE_TLB 0x04 #define MC_ERROR_TYPE_D_CACHE 0x05 #define MC_ERROR_TYPE_I_CACHE 0x07 +#define MC_ERROR_TYPE_CTRL_MEM_ACCESS 0x08 /* RTAS pseries MCE error sub types */ #define MC_ERROR_UE_INDETERMINATE 0 @@ -103,6 +104,9 @@ struct pseries_mc_errorlog { #define MC_ERROR_TLB_MULTIHIT 2 #define MC_ERROR_TLB_INDETERMINATE3 +#define MC_ERROR_CTRL_MEM_ACCESS_PTABLE_WALK 0 +#define MC_ERROR_CTRL_MEM_ACCESS_OP_ACCESS 1 + static inline u8 rtas_mc_error_sub_type(const struct pseries_mc_errorlog *mlog) { switch (mlog->error_type) { @@ -112,6 +116,8 @@ static inline u8 rtas_mc_error_sub_type(const struct pseries_mc_errorlog *mlog) caseMC_ERROR_TYPE_ERAT: caseMC_ERROR_TYPE_TLB: return (mlog->sub_err_type & 0x03); + caseMC_ERROR_TYPE_CTRL_MEM_ACCESS: + return (mlog->sub_err_type & 0x70) >> 4; default: return 0; } @@ -699,6 +705,21 @@ static int mce_handle_err_virtmode(struct pt_regs *regs, case MC_ERROR_TYPE_I_CACHE: mce_err.error_type = MCE_ERROR_TYPE_ICACHE; break; + case MC_ERROR_TYPE_CTRL_MEM_ACCESS: + mce_err.error_type = MCE_ERROR_TYPE_RA; + if (mce_log->sub_err_type & 0x80) + eaddr = be64_to_cpu(mce_log->effective_address); + switch (err_sub_type) { + case MC_ERROR_CTRL_MEM_ACCESS_PTABLE_WALK: + mce_err.u.ra_error_type = + MCE_RA_ERROR_PAGE_TABLE_WALK_LOAD_STORE_FOREIGN; + break; + case MC_ERROR_CTRL_MEM_ACCESS_OP_ACCESS: + mce_err.u.ra_error_type = + MCE_RA_ERROR_LOAD_STORE_FOREIGN; + break; + } + break; case 
MC_ERROR_TYPE_UNKNOWN: default: mce_err.error_type = MCE_ERROR_TYPE_UNKNOWN;
[PATCH] powerpc/mce: check if event info is valid
Check if the event info is valid before printing the event information. When a fwnmi enabled nested kvm guest hits a machine check exception, L0 and L2 would generate machine check event info, but L1 would not generate any machine check event info as it won't go through the 0x200 vector, and it prints an unwanted message. To fix this, since the 'in_use' variable in the machine check event info is no longer an apt name, rename it to 'valid' and check if the event information is valid before logging the event information. Without this patch L1 would print the following message for exceptions encountered in L2, as the event structure will be empty in L1. "Machine Check Exception, Unknown event version 0". Signed-off-by: Ganesh Goudar --- arch/powerpc/include/asm/mce.h | 2 +- arch/powerpc/kernel/mce.c | 7 +-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h index 331d944280b8..3646f53f228f 100644 --- a/arch/powerpc/include/asm/mce.h +++ b/arch/powerpc/include/asm/mce.h @@ -113,7 +113,7 @@ enum MCE_LinkErrorType { struct machine_check_event { enum MCE_Versionversion:8; - u8 in_use; + u8 valid; enum MCE_Severity severity:8; enum MCE_Initiator initiator:8; enum MCE_ErrorType error_type:8; diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c index 47a683cd00d2..b778394a06b5 100644 --- a/arch/powerpc/kernel/mce.c +++ b/arch/powerpc/kernel/mce.c @@ -114,7 +114,7 @@ void save_mce_event(struct pt_regs *regs, long handled, mce->srr0 = nip; mce->srr1 = regs->msr; mce->gpr3 = regs->gpr[3]; - mce->in_use = 1; + mce->valid = 1; mce->cpu = get_paca()->paca_index; /* Mark it recovered if we have handled it and MSR(RI=1). */ @@ -202,7 +202,7 @@ int get_mce_event(struct machine_check_event *mce, bool release) if (mce) *mce = *mc_evt; if (release) - mc_evt->in_use = 0; + mc_evt->valid = 0; ret = 1; } /* Decrement the count to free the slot.
*/ @@ -413,6 +413,9 @@ void machine_check_print_event_info(struct machine_check_event *evt, "Probable Software error (some chance of hardware cause)", }; + if (!evt->valid) + return; + /* Print things out */ if (evt->version != MCE_V1) { pr_err("Machine Check Exception, Unknown event version %d !\n", -- 2.31.1
[PATCH v2 3/3] powerpc/mce: Modify the real address error logging messages
To avoid ambiguity, modify the strings in real address error logging messages to "foreign/control memory" from "foreign", since the error descriptions in the P9 user manual and P10 user manual are different for the same type of errors. P9 User Manual for MCE: DSISR:59 Host real address to foreign space during translation. DSISR:60 Host real address to foreign space on a load or store access. P10 User Manual for MCE: DSISR:59 D-side tablewalk used a host real address in the control memory address range. DSISR:60 D-side operand access to control memory address space. Signed-off-by: Ganesh Goudar --- v2: No changes in this patch. --- arch/powerpc/kernel/mce.c | 8 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c index 47a683cd00d2..f3ef480bb739 100644 --- a/arch/powerpc/kernel/mce.c +++ b/arch/powerpc/kernel/mce.c @@ -388,14 +388,14 @@ void machine_check_print_event_info(struct machine_check_event *evt, static const char *mc_ra_types[] = { "Indeterminate", "Instruction fetch (bad)", - "Instruction fetch (foreign)", + "Instruction fetch (foreign/control memory)", "Page table walk ifetch (bad)", - "Page table walk ifetch (foreign)", + "Page table walk ifetch (foreign/control memory)", "Load (bad)", "Store (bad)", "Page table walk Load/Store (bad)", - "Page table walk Load/Store (foreign)", - "Load/Store (foreign)", + "Page table walk Load/Store (foreign/control memory)", + "Load/Store (foreign/control memory)", }; static const char *mc_link_types[] = { "Indeterminate", -- 2.31.1
[PATCH v2 2/3] selftests/powerpc: Add test for real address error handling
Add test for real address or control memory address access error handling, using NX-GZIP engine. The error is injected by accessing the control memory address using illegal instruction, on successful handling the process attempting to access control memory address using illegal instruction receives SIGBUS. Signed-off-by: Ganesh Goudar --- v2: Fix build error. --- tools/testing/selftests/powerpc/Makefile | 3 +- tools/testing/selftests/powerpc/mce/Makefile | 6 +++ .../selftests/powerpc/mce/inject-ra-err.c | 42 +++ .../selftests/powerpc/mce/inject-ra-err.sh| 18 tools/testing/selftests/powerpc/mce/vas-api.h | 1 + 5 files changed, 69 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/powerpc/mce/Makefile create mode 100644 tools/testing/selftests/powerpc/mce/inject-ra-err.c create mode 100755 tools/testing/selftests/powerpc/mce/inject-ra-err.sh create mode 12 tools/testing/selftests/powerpc/mce/vas-api.h diff --git a/tools/testing/selftests/powerpc/Makefile b/tools/testing/selftests/powerpc/Makefile index 0830e63818c1..4830372d7416 100644 --- a/tools/testing/selftests/powerpc/Makefile +++ b/tools/testing/selftests/powerpc/Makefile @@ -31,7 +31,8 @@ SUB_DIRS = alignment \ vphn \ math \ ptrace \ - security + security \ + mce endif diff --git a/tools/testing/selftests/powerpc/mce/Makefile b/tools/testing/selftests/powerpc/mce/Makefile new file mode 100644 index ..0f537ce86370 --- /dev/null +++ b/tools/testing/selftests/powerpc/mce/Makefile @@ -0,0 +1,6 @@ +#SPDX-License-Identifier: GPL-2.0-or-later + +TEST_PROGS := inject-ra-err.sh +TEST_GEN_FILES := inject-ra-err + +include ../../lib.mk diff --git a/tools/testing/selftests/powerpc/mce/inject-ra-err.c b/tools/testing/selftests/powerpc/mce/inject-ra-err.c new file mode 100644 index ..05ab11cec3da --- /dev/null +++ b/tools/testing/selftests/powerpc/mce/inject-ra-err.c @@ -0,0 +1,42 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +#include +#include +#include +#include +#include +#include +#include 
+#include +#include +#include "vas-api.h" + +int main(void) +{ + int fd, ret; + int *paste_addr; + struct vas_tx_win_open_attr attr; + char *devname = "/dev/crypto/nx-gzip"; + + memset(, 0, sizeof(attr)); + attr.version = 1; + attr.vas_id = 0; + + fd = open(devname, O_RDWR); + if (fd < 0) { + fprintf(stderr, "Failed to open device %s\n", devname); + return -errno; + } + ret = ioctl(fd, VAS_TX_WIN_OPEN, ); + if (ret < 0) { + fprintf(stderr, "ioctl() n %d, error %d\n", ret, errno); + ret = -errno; + goto out; + } + paste_addr = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0ULL); + /* The following assignment triggers exception */ + *paste_addr = 1; + ret = 0; +out: + close(fd); + return ret; +} diff --git a/tools/testing/selftests/powerpc/mce/inject-ra-err.sh b/tools/testing/selftests/powerpc/mce/inject-ra-err.sh new file mode 100755 index ..3633cdc651a1 --- /dev/null +++ b/tools/testing/selftests/powerpc/mce/inject-ra-err.sh @@ -0,0 +1,18 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0-or-later + +if [[ ! -w /dev/crypto/nx-gzip ]]; then + echo "WARN: Can't access /dev/crypto/nx-gzip, skipping" + exit 0 +fi + +timeout 5 ./inject-ra-err + +# 128 + 7 (SIGBUS) = 135, 128 is a exit code with special meaning. +if [ $? -ne 135 ]; then + echo "FAILED: Real address or Control memory access error not handled" + exit $? +fi + +echo "OK: Real address or Control memory access error is handled" +exit 0 diff --git a/tools/testing/selftests/powerpc/mce/vas-api.h b/tools/testing/selftests/powerpc/mce/vas-api.h new file mode 12 index ..1455c1bcd351 --- /dev/null +++ b/tools/testing/selftests/powerpc/mce/vas-api.h @@ -0,0 +1 @@ +../../../../../arch/powerpc/include/uapi/asm/vas-api.h \ No newline at end of file -- 2.31.1
[PATCH v2 1/3] powerpc/pseries: Parse control memory access error
Add support to parse and log control memory access error for pseries. Signed-off-by: Ganesh Goudar --- v2: No changes in this patch. --- arch/powerpc/platforms/pseries/ras.c | 21 + 1 file changed, 21 insertions(+) diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c index 167f2e1b8d39..608c35cad0c3 100644 --- a/arch/powerpc/platforms/pseries/ras.c +++ b/arch/powerpc/platforms/pseries/ras.c @@ -80,6 +80,7 @@ struct pseries_mc_errorlog { #define MC_ERROR_TYPE_TLB 0x04 #define MC_ERROR_TYPE_D_CACHE 0x05 #define MC_ERROR_TYPE_I_CACHE 0x07 +#define MC_ERROR_TYPE_CTRL_MEM_ACCESS 0x08 /* RTAS pseries MCE error sub types */ #define MC_ERROR_UE_INDETERMINATE 0 @@ -103,6 +104,9 @@ struct pseries_mc_errorlog { #define MC_ERROR_TLB_MULTIHIT 2 #define MC_ERROR_TLB_INDETERMINATE 3 +#define MC_ERROR_CTRL_MEM_ACCESS_PTABLE_WALK 0 +#define MC_ERROR_CTRL_MEM_ACCESS_OP_ACCESS 1 + static inline u8 rtas_mc_error_sub_type(const struct pseries_mc_errorlog *mlog) { switch (mlog->error_type) { @@ -112,6 +116,8 @@ static inline u8 rtas_mc_error_sub_type(const struct pseries_mc_errorlog *mlog) caseMC_ERROR_TYPE_ERAT: caseMC_ERROR_TYPE_TLB: return (mlog->sub_err_type & 0x03); + caseMC_ERROR_TYPE_CTRL_MEM_ACCESS: + return (mlog->sub_err_type & 0x70) >> 4; default: return 0; } @@ -699,6 +705,21 @@ static int mce_handle_err_virtmode(struct pt_regs *regs, case MC_ERROR_TYPE_I_CACHE: mce_err.error_type = MCE_ERROR_TYPE_ICACHE; break; + case MC_ERROR_TYPE_CTRL_MEM_ACCESS: + mce_err.error_type = MCE_ERROR_TYPE_RA; + if (mce_log->sub_err_type & 0x80) + eaddr = be64_to_cpu(mce_log->effective_address); + switch (err_sub_type) { + case MC_ERROR_CTRL_MEM_ACCESS_PTABLE_WALK: + mce_err.u.ra_error_type = + MCE_RA_ERROR_PAGE_TABLE_WALK_LOAD_STORE_FOREIGN; + break; + case MC_ERROR_CTRL_MEM_ACCESS_OP_ACCESS: + mce_err.u.ra_error_type = + MCE_RA_ERROR_LOAD_STORE_FOREIGN; + break; + } + break; case MC_ERROR_TYPE_UNKNOWN: default: mce_err.error_type = MCE_ERROR_TYPE_UNKNOWN; -- 
2.31.1
[PATCH 1/3] powerpc/pseries: Parse control memory access error
Add support to parse and log control memory access error for pseries. Signed-off-by: Ganesh Goudar --- arch/powerpc/platforms/pseries/ras.c | 21 + 1 file changed, 21 insertions(+) diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c index 167f2e1b8d39..608c35cad0c3 100644 --- a/arch/powerpc/platforms/pseries/ras.c +++ b/arch/powerpc/platforms/pseries/ras.c @@ -80,6 +80,7 @@ struct pseries_mc_errorlog { #define MC_ERROR_TYPE_TLB 0x04 #define MC_ERROR_TYPE_D_CACHE 0x05 #define MC_ERROR_TYPE_I_CACHE 0x07 +#define MC_ERROR_TYPE_CTRL_MEM_ACCESS 0x08 /* RTAS pseries MCE error sub types */ #define MC_ERROR_UE_INDETERMINATE 0 @@ -103,6 +104,9 @@ struct pseries_mc_errorlog { #define MC_ERROR_TLB_MULTIHIT 2 #define MC_ERROR_TLB_INDETERMINATE 3 +#define MC_ERROR_CTRL_MEM_ACCESS_PTABLE_WALK 0 +#define MC_ERROR_CTRL_MEM_ACCESS_OP_ACCESS 1 + static inline u8 rtas_mc_error_sub_type(const struct pseries_mc_errorlog *mlog) { switch (mlog->error_type) { @@ -112,6 +116,8 @@ static inline u8 rtas_mc_error_sub_type(const struct pseries_mc_errorlog *mlog) caseMC_ERROR_TYPE_ERAT: caseMC_ERROR_TYPE_TLB: return (mlog->sub_err_type & 0x03); + caseMC_ERROR_TYPE_CTRL_MEM_ACCESS: + return (mlog->sub_err_type & 0x70) >> 4; default: return 0; } @@ -699,6 +705,21 @@ static int mce_handle_err_virtmode(struct pt_regs *regs, case MC_ERROR_TYPE_I_CACHE: mce_err.error_type = MCE_ERROR_TYPE_ICACHE; break; + case MC_ERROR_TYPE_CTRL_MEM_ACCESS: + mce_err.error_type = MCE_ERROR_TYPE_RA; + if (mce_log->sub_err_type & 0x80) + eaddr = be64_to_cpu(mce_log->effective_address); + switch (err_sub_type) { + case MC_ERROR_CTRL_MEM_ACCESS_PTABLE_WALK: + mce_err.u.ra_error_type = + MCE_RA_ERROR_PAGE_TABLE_WALK_LOAD_STORE_FOREIGN; + break; + case MC_ERROR_CTRL_MEM_ACCESS_OP_ACCESS: + mce_err.u.ra_error_type = + MCE_RA_ERROR_LOAD_STORE_FOREIGN; + break; + } + break; case MC_ERROR_TYPE_UNKNOWN: default: mce_err.error_type = MCE_ERROR_TYPE_UNKNOWN; -- 2.31.1
[PATCH 2/3] selftests/powerpc: Add test for real address error handling
Add test for real address or control memory address access error handling, using NX-GZIP engine. The error is injected by accessing the control memory address using illegal instruction, on successful handling the process attempting to access control memory address using illegal instruction receives SIGBUS. Signed-off-by: Ganesh Goudar --- tools/testing/selftests/powerpc/Makefile | 3 +- tools/testing/selftests/powerpc/mce/Makefile | 6 +++ .../selftests/powerpc/mce/inject-ra-err.c | 42 +++ .../selftests/powerpc/mce/inject-ra-err.sh| 19 + 4 files changed, 69 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/powerpc/mce/Makefile create mode 100644 tools/testing/selftests/powerpc/mce/inject-ra-err.c create mode 100755 tools/testing/selftests/powerpc/mce/inject-ra-err.sh diff --git a/tools/testing/selftests/powerpc/Makefile b/tools/testing/selftests/powerpc/Makefile index 0830e63818c1..4830372d7416 100644 --- a/tools/testing/selftests/powerpc/Makefile +++ b/tools/testing/selftests/powerpc/Makefile @@ -31,7 +31,8 @@ SUB_DIRS = alignment \ vphn \ math \ ptrace \ - security + security \ + mce endif diff --git a/tools/testing/selftests/powerpc/mce/Makefile b/tools/testing/selftests/powerpc/mce/Makefile new file mode 100644 index ..0f537ce86370 --- /dev/null +++ b/tools/testing/selftests/powerpc/mce/Makefile @@ -0,0 +1,6 @@ +#SPDX-License-Identifier: GPL-2.0-or-later + +TEST_PROGS := inject-ra-err.sh +TEST_GEN_FILES := inject-ra-err + +include ../../lib.mk diff --git a/tools/testing/selftests/powerpc/mce/inject-ra-err.c b/tools/testing/selftests/powerpc/mce/inject-ra-err.c new file mode 100644 index ..58374bc92e90 --- /dev/null +++ b/tools/testing/selftests/powerpc/mce/inject-ra-err.c @@ -0,0 +1,42 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +int main(void) +{ + int fd, ret; + int *paste_addr; + struct vas_tx_win_open_attr attr; + char *devname = 
"/dev/crypto/nx-gzip"; + + memset(, 0, sizeof(attr)); + attr.version = 1; + attr.vas_id = 0; + + fd = open(devname, O_RDWR); + if (fd < 0) { + fprintf(stderr, "Failed to open device %s\n", devname); + return -errno; + } + ret = ioctl(fd, VAS_TX_WIN_OPEN, ); + if (ret < 0) { + fprintf(stderr, "ioctl() n %d, error %d\n", ret, errno); + ret = -errno; + goto out; + } + paste_addr = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0ULL); + /* The following assignment triggers exception */ + *paste_addr = 1; + ret = 0; +out: + close(fd); + return ret; +} diff --git a/tools/testing/selftests/powerpc/mce/inject-ra-err.sh b/tools/testing/selftests/powerpc/mce/inject-ra-err.sh new file mode 100755 index ..0e9c8ae6ad78 --- /dev/null +++ b/tools/testing/selftests/powerpc/mce/inject-ra-err.sh @@ -0,0 +1,19 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0-or-later + +if [[ ! -w /dev/crypto/nx-gzip ]]; then + echo "WARN: Can't access /dev/crypto/nx-gzip, skipping" + exit 0 +fi + +# Timeout in 5 seconds, If not handled it may run indefinitely. +timeout 5 ./inject-ra-err + +# 128 + 7 (SIGBUS) = 135, 128 is a exit Code With Special Meaning. +if [ $? -ne 135 ]; then + echo "FAILED: Control memory access error not handled" + exit $? +fi + +echo "OK: Control memory access error is handled" +exit 0 -- 2.31.1
[PATCH 3/3] powerpc/mce: Modify the real address error logging messages
To avoid ambiguity, modify the strings in real address error logging messages to "foreign/control memory" from "foreign", since the error descriptions in the P9 user manual and P10 user manual are different for the same type of errors. P9 User Manual for MCE: DSISR:59 Host real address to foreign space during translation. DSISR:60 Host real address to foreign space on a load or store access. P10 User Manual for MCE: DSISR:59 D-side tablewalk used a host real address in the control memory address range. DSISR:60 D-side operand access to control memory address space. Signed-off-by: Ganesh Goudar --- arch/powerpc/kernel/mce.c | 8 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c index 47a683cd00d2..f3ef480bb739 100644 --- a/arch/powerpc/kernel/mce.c +++ b/arch/powerpc/kernel/mce.c @@ -388,14 +388,14 @@ void machine_check_print_event_info(struct machine_check_event *evt, static const char *mc_ra_types[] = { "Indeterminate", "Instruction fetch (bad)", - "Instruction fetch (foreign)", + "Instruction fetch (foreign/control memory)", "Page table walk ifetch (bad)", - "Page table walk ifetch (foreign)", + "Page table walk ifetch (foreign/control memory)", "Load (bad)", "Store (bad)", "Page table walk Load/Store (bad)", - "Page table walk Load/Store (foreign)", - "Load/Store (foreign)", + "Page table walk Load/Store (foreign/control memory)", + "Load/Store (foreign/control memory)", }; static const char *mc_link_types[] = { "Indeterminate", -- 2.31.1
Re: [PATCH] powerpc/mce: save ignore_event flag unconditionally for UE
On 4/22/21 11:31 AM, Ganesh wrote: On 4/7/21 10:28 AM, Ganesh Goudar wrote: When we hit an UE while using machine check safe copy routines, ignore_event flag is set and the event is ignored by mce handler, And the flag is also saved for defered handling and printing of mce event information, But as of now saving of this flag is done on checking if the effective address is provided or physical address is calculated, which is not right. Save ignore_event flag regardless of whether the effective address is provided or physical address is calculated. Without this change following log is seen, when the event is to be ignored. [ 512.971365] MCE: CPU1: machine check (Severe) UE Load/Store [Recovered] [ 512.971509] MCE: CPU1: NIP: [c00b67c0] memcpy+0x40/0x90 [ 512.971655] MCE: CPU1: Initiator CPU [ 512.971739] MCE: CPU1: Unknown [ 512.972209] MCE: CPU1: machine check (Severe) UE Load/Store [Recovered] [ 512.972334] MCE: CPU1: NIP: [c00b6808] memcpy+0x88/0x90 [ 512.972456] MCE: CPU1: Initiator CPU [ 512.972534] MCE: CPU1: Unknown Signed-off-by: Ganesh Goudar --- arch/powerpc/kernel/mce.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) Hi mpe, Any comments on this patch? Please ignore, I see its applied.
Re: [PATCH] powerpc/mce: save ignore_event flag unconditionally for UE
On 4/7/21 10:28 AM, Ganesh Goudar wrote: When we hit an UE while using machine check safe copy routines, ignore_event flag is set and the event is ignored by mce handler, And the flag is also saved for defered handling and printing of mce event information, But as of now saving of this flag is done on checking if the effective address is provided or physical address is calculated, which is not right. Save ignore_event flag regardless of whether the effective address is provided or physical address is calculated. Without this change following log is seen, when the event is to be ignored. [ 512.971365] MCE: CPU1: machine check (Severe) UE Load/Store [Recovered] [ 512.971509] MCE: CPU1: NIP: [c00b67c0] memcpy+0x40/0x90 [ 512.971655] MCE: CPU1: Initiator CPU [ 512.971739] MCE: CPU1: Unknown [ 512.972209] MCE: CPU1: machine check (Severe) UE Load/Store [Recovered] [ 512.972334] MCE: CPU1: NIP: [c00b6808] memcpy+0x88/0x90 [ 512.972456] MCE: CPU1: Initiator CPU [ 512.972534] MCE: CPU1: Unknown Signed-off-by: Ganesh Goudar --- arch/powerpc/kernel/mce.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) Hi mpe, Any comments on this patch?
Re: [PATCH] powerpc/mce: save ignore_event flag unconditionally for UE
On 4/20/21 12:54 PM, Santosh Sivaraj wrote: Hi Ganesh, Ganesh Goudar writes: When we hit an UE while using machine check safe copy routines, ignore_event flag is set and the event is ignored by mce handler, And the flag is also saved for defered handling and printing of mce event information, But as of now saving of this flag is done on checking if the effective address is provided or physical address is calculated, which is not right. Save ignore_event flag regardless of whether the effective address is provided or physical address is calculated. Without this change following log is seen, when the event is to be ignored. [ 512.971365] MCE: CPU1: machine check (Severe) UE Load/Store [Recovered] [ 512.971509] MCE: CPU1: NIP: [c00b67c0] memcpy+0x40/0x90 [ 512.971655] MCE: CPU1: Initiator CPU [ 512.971739] MCE: CPU1: Unknown [ 512.972209] MCE: CPU1: machine check (Severe) UE Load/Store [Recovered] [ 512.972334] MCE: CPU1: NIP: [c00b6808] memcpy+0x88/0x90 [ 512.972456] MCE: CPU1: Initiator CPU [ 512.972534] MCE: CPU1: Unknown Signed-off-by: Ganesh Goudar --- arch/powerpc/kernel/mce.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c index 11f0cae086ed..db9363e131ce 100644 --- a/arch/powerpc/kernel/mce.c +++ b/arch/powerpc/kernel/mce.c @@ -131,6 +131,8 @@ void save_mce_event(struct pt_regs *regs, long handled, * Populate the mce error_type and type-specific error_type. 
*/ mce_set_error_info(mce, mce_err); + if (mce->error_type == MCE_ERROR_TYPE_UE) + mce->u.ue_error.ignore_event = mce_err->ignore_event; if (!addr) return; @@ -159,7 +161,6 @@ void save_mce_event(struct pt_regs *regs, long handled, if (phys_addr != ULONG_MAX) { mce->u.ue_error.physical_address_provided = true; mce->u.ue_error.physical_address = phys_addr; - mce->u.ue_error.ignore_event = mce_err->ignore_event; machine_check_ue_event(mce); } } Small nit: Setting ignore event can happen before the phys_addr check, under the existing check for MCE_ERROR_TYPE_UE, instead of repeating the same condition again. In some cases we may not get effective address also, so it is placed before effective address check. Except for the above nit Reviewed-by: Santosh Sivaraj Thanks, Santosh -- 2.26.2
Re: [PATCH] powerpc/pseries/mce: Fix a typo in error type assignment
On 4/17/21 6:06 PM, Michael Ellerman wrote: Ganesh Goudar writes: The error type is ICACHE and DCACHE, for case MCE_ERROR_TYPE_ICACHE. Do you mean "is ICACHE not DCACHE" ? Right :), Should I send v2 ? cheers Signed-off-by: Ganesh Goudar --- arch/powerpc/platforms/pseries/ras.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c index f8b390a9d9fb..9d4ef65da7f3 100644 --- a/arch/powerpc/platforms/pseries/ras.c +++ b/arch/powerpc/platforms/pseries/ras.c @@ -699,7 +699,7 @@ static int mce_handle_err_virtmode(struct pt_regs *regs, mce_err.error_type = MCE_ERROR_TYPE_DCACHE; break; case MC_ERROR_TYPE_I_CACHE: - mce_err.error_type = MCE_ERROR_TYPE_DCACHE; + mce_err.error_type = MCE_ERROR_TYPE_ICACHE; break; case MC_ERROR_TYPE_UNKNOWN: default: -- 2.26.2
[PATCH] powerpc/pseries/mce: Fix a typo in error type assignment
The error type is ICACHE not DCACHE, for case MC_ERROR_TYPE_I_CACHE. Signed-off-by: Ganesh Goudar --- arch/powerpc/platforms/pseries/ras.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c index f8b390a9d9fb..9d4ef65da7f3 100644 --- a/arch/powerpc/platforms/pseries/ras.c +++ b/arch/powerpc/platforms/pseries/ras.c @@ -699,7 +699,7 @@ static int mce_handle_err_virtmode(struct pt_regs *regs, mce_err.error_type = MCE_ERROR_TYPE_DCACHE; break; case MC_ERROR_TYPE_I_CACHE: - mce_err.error_type = MCE_ERROR_TYPE_DCACHE; + mce_err.error_type = MCE_ERROR_TYPE_ICACHE; break; case MC_ERROR_TYPE_UNKNOWN: default: -- 2.26.2
[PATCH] powerpc/mce: save ignore_event flag unconditionally for UE
When we hit an UE while using machine check safe copy routines, the ignore_event flag is set and the event is ignored by the mce handler, and the flag is also saved for deferred handling and printing of mce event information. But as of now, saving of this flag is done only after checking if the effective address is provided or the physical address is calculated, which is not right. Save the ignore_event flag regardless of whether the effective address is provided or the physical address is calculated. Without this change the following log is seen, when the event is to be ignored. [ 512.971365] MCE: CPU1: machine check (Severe) UE Load/Store [Recovered] [ 512.971509] MCE: CPU1: NIP: [c00b67c0] memcpy+0x40/0x90 [ 512.971655] MCE: CPU1: Initiator CPU [ 512.971739] MCE: CPU1: Unknown [ 512.972209] MCE: CPU1: machine check (Severe) UE Load/Store [Recovered] [ 512.972334] MCE: CPU1: NIP: [c00b6808] memcpy+0x88/0x90 [ 512.972456] MCE: CPU1: Initiator CPU [ 512.972534] MCE: CPU1: Unknown Signed-off-by: Ganesh Goudar --- arch/powerpc/kernel/mce.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c index 11f0cae086ed..db9363e131ce 100644 --- a/arch/powerpc/kernel/mce.c +++ b/arch/powerpc/kernel/mce.c @@ -131,6 +131,8 @@ void save_mce_event(struct pt_regs *regs, long handled, * Populate the mce error_type and type-specific error_type. */ mce_set_error_info(mce, mce_err); + if (mce->error_type == MCE_ERROR_TYPE_UE) + mce->u.ue_error.ignore_event = mce_err->ignore_event; if (!addr) return; @@ -159,7 +161,6 @@ void save_mce_event(struct pt_regs *regs, long handled, if (phys_addr != ULONG_MAX) { mce->u.ue_error.physical_address_provided = true; mce->u.ue_error.physical_address = phys_addr; - mce->u.ue_error.ignore_event = mce_err->ignore_event; machine_check_ue_event(mce); } } -- 2.26.2
[PATCH v5 2/2] powerpc/mce: Remove per cpu variables from MCE handlers
Access to per-cpu variables requires translation to be enabled on pseries machine running in hash mmu mode, Since part of MCE handler runs in realmode and part of MCE handling code is shared between ppc architectures pseries and powernv, it becomes difficult to manage these variables differently on different architectures, So have these variables in paca instead of having them as per-cpu variables to avoid complications. Signed-off-by: Ganesh Goudar --- v2: Dynamically allocate memory for machine check event info. v3: Remove check for hash mmu lpar, use memblock_alloc_try_nid to allocate memory. v4: Spliting the patch into two. v5: Fix build error for PPC32. --- arch/powerpc/include/asm/mce.h | 18 +++ arch/powerpc/include/asm/paca.h| 4 ++ arch/powerpc/kernel/mce.c | 79 ++ arch/powerpc/kernel/setup-common.c | 2 + 4 files changed, 71 insertions(+), 32 deletions(-) diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h index 7d8b6679ec68..331d944280b8 100644 --- a/arch/powerpc/include/asm/mce.h +++ b/arch/powerpc/include/asm/mce.h @@ -206,6 +206,17 @@ struct mce_error_info { #define MAX_MC_EVT 10 +struct mce_info { + int mce_nest_count; + struct machine_check_event mce_event[MAX_MC_EVT]; + /* Queue for delayed MCE events. */ + int mce_queue_count; + struct machine_check_event mce_event_queue[MAX_MC_EVT]; + /* Queue for delayed MCE UE events. 
*/ + int mce_ue_count; + struct machine_check_event mce_ue_event_queue[MAX_MC_EVT]; +}; + /* Release flags for get_mce_event() */ #define MCE_EVENT_RELEASE true #define MCE_EVENT_DONTRELEASE false @@ -234,4 +245,11 @@ long __machine_check_early_realmode_p8(struct pt_regs *regs); long __machine_check_early_realmode_p9(struct pt_regs *regs); long __machine_check_early_realmode_p10(struct pt_regs *regs); #endif /* CONFIG_PPC_BOOK3S_64 */ + +#ifdef CONFIG_PPC_BOOK3S_64 +void mce_init(void); +#else +static inline void mce_init(void) { }; +#endif /* CONFIG_PPC_BOOK3S_64 */ + #endif /* __ASM_PPC64_MCE_H__ */ diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h index 9454d29ff4b4..38e0c55e845d 100644 --- a/arch/powerpc/include/asm/paca.h +++ b/arch/powerpc/include/asm/paca.h @@ -29,6 +29,7 @@ #include #include #include +#include #include @@ -273,6 +274,9 @@ struct paca_struct { #ifdef CONFIG_MMIOWB struct mmiowb_state mmiowb_state; #endif +#ifdef CONFIG_PPC_BOOK3S_64 + struct mce_info *mce_info; +#endif /* CONFIG_PPC_BOOK3S_64 */ } cacheline_aligned; extern void copy_mm_to_paca(struct mm_struct *mm); diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c index 9f3e133b57b7..6ec5c68997ed 100644 --- a/arch/powerpc/kernel/mce.c +++ b/arch/powerpc/kernel/mce.c @@ -17,22 +17,13 @@ #include #include #include +#include #include #include #include -static DEFINE_PER_CPU(int, mce_nest_count); -static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], mce_event); - -/* Queue for delayed MCE events. */ -static DEFINE_PER_CPU(int, mce_queue_count); -static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], mce_event_queue); - -/* Queue for delayed MCE UE events. 
*/ -static DEFINE_PER_CPU(int, mce_ue_count); -static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], - mce_ue_event_queue); +#include "setup.h" static void machine_check_process_queued_event(struct irq_work *work); static void machine_check_ue_irq_work(struct irq_work *work); @@ -103,9 +94,10 @@ void save_mce_event(struct pt_regs *regs, long handled, struct mce_error_info *mce_err, uint64_t nip, uint64_t addr, uint64_t phys_addr) { - int index = __this_cpu_inc_return(mce_nest_count) - 1; - struct machine_check_event *mce = this_cpu_ptr(_event[index]); + int index = local_paca->mce_info->mce_nest_count++; + struct machine_check_event *mce; + mce = _paca->mce_info->mce_event[index]; /* * Return if we don't have enough space to log mce event. * mce_nest_count may go beyond MAX_MC_EVT but that's ok, @@ -191,7 +183,7 @@ void save_mce_event(struct pt_regs *regs, long handled, */ int get_mce_event(struct machine_check_event *mce, bool release) { - int index = __this_cpu_read(mce_nest_count) - 1; + int index = local_paca->mce_info->mce_nest_count - 1; struct machine_check_event *mc_evt; int ret = 0; @@ -201,7 +193,7 @@ int get_mce_event(struct machine_check_event *mce, bool release) /* Check if we have MCE info to process. */ if (index < MAX_MC_EVT) { - mc_evt = this_cpu_ptr(_event[index]); + mc_evt = _paca->mce_info->mce_event[index]; /* Copy the event structure and release the original */
[PATCH v5 1/2] powerpc/mce: Reduce the size of event arrays
The maximum recursive depth of MCE is 4. Considering the maximum depth allowed, reduce the size of the event arrays to 10 from 100. This saves us ~19kB of memory and has no fatal consequences. Signed-off-by: Ganesh Goudar --- v4: This patch is a fragment of the original patch which is split into two. v5: No changes. --- arch/powerpc/include/asm/mce.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h index e6c27ae843dc..7d8b6679ec68 100644 --- a/arch/powerpc/include/asm/mce.h +++ b/arch/powerpc/include/asm/mce.h @@ -204,7 +204,7 @@ struct mce_error_info { boolignore_event; }; -#define MAX_MC_EVT 100 +#define MAX_MC_EVT 10 /* Release flags for get_mce_event() */ #define MCE_EVENT_RELEASE true -- 2.26.2
Re: [PATCH v4 2/2] powerpc/mce: Remove per cpu variables from MCE handlers
On 1/25/21 2:54 PM, Christophe Leroy wrote: Le 22/01/2021 à 13:32, Ganesh Goudar a écrit : Access to per-cpu variables requires translation to be enabled on pseries machine running in hash mmu mode, Since part of MCE handler runs in realmode and part of MCE handling code is shared between ppc architectures pseries and powernv, it becomes difficult to manage these variables differently on different architectures, So have these variables in paca instead of having them as per-cpu variables to avoid complications. Signed-off-by: Ganesh Goudar --- v2: Dynamically allocate memory for machine check event info v3: Remove check for hash mmu lpar, use memblock_alloc_try_nid to allocate memory. v4: Spliting the patch into two. --- arch/powerpc/include/asm/mce.h | 18 +++ arch/powerpc/include/asm/paca.h | 4 ++ arch/powerpc/kernel/mce.c | 79 ++ arch/powerpc/kernel/setup-common.c | 2 +- 4 files changed, 70 insertions(+), 33 deletions(-) diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index 71f38e9248be..17dc451f0e45 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -916,7 +916,6 @@ void __init setup_arch(char **cmdline_p) /* On BookE, setup per-core TLB data structures. */ setup_tlb_core_data(); #endif - This line removal is really required for this patch ? I will correct it, Thanks for catching. /* Print various info about the machine that has been gathered so far. */ print_system_info(); @@ -938,6 +937,7 @@ void __init setup_arch(char **cmdline_p) exc_lvl_early_init(); emergency_stack_init(); + mce_init(); You have to include mce.h to avoid build failure on PPC32. Sure, thanks smp_release_cpus(); initmem_init();
[PATCH v4 2/2] powerpc/mce: Remove per cpu variables from MCE handlers
Access to per-cpu variables requires translation to be enabled on pseries machine running in hash mmu mode, Since part of MCE handler runs in realmode and part of MCE handling code is shared between ppc architectures pseries and powernv, it becomes difficult to manage these variables differently on different architectures, So have these variables in paca instead of having them as per-cpu variables to avoid complications. Signed-off-by: Ganesh Goudar --- v2: Dynamically allocate memory for machine check event info v3: Remove check for hash mmu lpar, use memblock_alloc_try_nid to allocate memory. v4: Spliting the patch into two. --- arch/powerpc/include/asm/mce.h | 18 +++ arch/powerpc/include/asm/paca.h| 4 ++ arch/powerpc/kernel/mce.c | 79 ++ arch/powerpc/kernel/setup-common.c | 2 +- 4 files changed, 70 insertions(+), 33 deletions(-) diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h index 7d8b6679ec68..331d944280b8 100644 --- a/arch/powerpc/include/asm/mce.h +++ b/arch/powerpc/include/asm/mce.h @@ -206,6 +206,17 @@ struct mce_error_info { #define MAX_MC_EVT 10 +struct mce_info { + int mce_nest_count; + struct machine_check_event mce_event[MAX_MC_EVT]; + /* Queue for delayed MCE events. */ + int mce_queue_count; + struct machine_check_event mce_event_queue[MAX_MC_EVT]; + /* Queue for delayed MCE UE events. 
*/ + int mce_ue_count; + struct machine_check_event mce_ue_event_queue[MAX_MC_EVT]; +}; + /* Release flags for get_mce_event() */ #define MCE_EVENT_RELEASE true #define MCE_EVENT_DONTRELEASE false @@ -234,4 +245,11 @@ long __machine_check_early_realmode_p8(struct pt_regs *regs); long __machine_check_early_realmode_p9(struct pt_regs *regs); long __machine_check_early_realmode_p10(struct pt_regs *regs); #endif /* CONFIG_PPC_BOOK3S_64 */ + +#ifdef CONFIG_PPC_BOOK3S_64 +void mce_init(void); +#else +static inline void mce_init(void) { }; +#endif /* CONFIG_PPC_BOOK3S_64 */ + #endif /* __ASM_PPC64_MCE_H__ */ diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h index 9454d29ff4b4..38e0c55e845d 100644 --- a/arch/powerpc/include/asm/paca.h +++ b/arch/powerpc/include/asm/paca.h @@ -29,6 +29,7 @@ #include #include #include +#include #include @@ -273,6 +274,9 @@ struct paca_struct { #ifdef CONFIG_MMIOWB struct mmiowb_state mmiowb_state; #endif +#ifdef CONFIG_PPC_BOOK3S_64 + struct mce_info *mce_info; +#endif /* CONFIG_PPC_BOOK3S_64 */ } cacheline_aligned; extern void copy_mm_to_paca(struct mm_struct *mm); diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c index 9f3e133b57b7..6ec5c68997ed 100644 --- a/arch/powerpc/kernel/mce.c +++ b/arch/powerpc/kernel/mce.c @@ -17,22 +17,13 @@ #include #include #include +#include #include #include #include -static DEFINE_PER_CPU(int, mce_nest_count); -static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], mce_event); - -/* Queue for delayed MCE events. */ -static DEFINE_PER_CPU(int, mce_queue_count); -static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], mce_event_queue); - -/* Queue for delayed MCE UE events. 
*/ -static DEFINE_PER_CPU(int, mce_ue_count); -static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], - mce_ue_event_queue); +#include "setup.h" static void machine_check_process_queued_event(struct irq_work *work); static void machine_check_ue_irq_work(struct irq_work *work); @@ -103,9 +94,10 @@ void save_mce_event(struct pt_regs *regs, long handled, struct mce_error_info *mce_err, uint64_t nip, uint64_t addr, uint64_t phys_addr) { - int index = __this_cpu_inc_return(mce_nest_count) - 1; - struct machine_check_event *mce = this_cpu_ptr(_event[index]); + int index = local_paca->mce_info->mce_nest_count++; + struct machine_check_event *mce; + mce = _paca->mce_info->mce_event[index]; /* * Return if we don't have enough space to log mce event. * mce_nest_count may go beyond MAX_MC_EVT but that's ok, @@ -191,7 +183,7 @@ void save_mce_event(struct pt_regs *regs, long handled, */ int get_mce_event(struct machine_check_event *mce, bool release) { - int index = __this_cpu_read(mce_nest_count) - 1; + int index = local_paca->mce_info->mce_nest_count - 1; struct machine_check_event *mc_evt; int ret = 0; @@ -201,7 +193,7 @@ int get_mce_event(struct machine_check_event *mce, bool release) /* Check if we have MCE info to process. */ if (index < MAX_MC_EVT) { - mc_evt = this_cpu_ptr(_event[index]); + mc_evt = _paca->mce_info->mce_event[index]; /* Copy the event structure and release the original */ if (mce)
[PATCH v4 1/2] powerpc/mce: Reduce the size of event arrays
Maximum recursive depth of MCE is 4. Considering the maximum depth allowed, reduce the size of event to 10 from 100. This saves us ~19kB of memory and has no fatal consequences. Signed-off-by: Ganesh Goudar --- v4: This patch is a fragment of the original patch which is split into two. --- arch/powerpc/include/asm/mce.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h index e6c27ae843dc..7d8b6679ec68 100644 --- a/arch/powerpc/include/asm/mce.h +++ b/arch/powerpc/include/asm/mce.h @@ -204,7 +204,7 @@ struct mce_error_info { bool ignore_event; }; -#define MAX_MC_EVT 100 +#define MAX_MC_EVT 10 /* Release flags for get_mce_event() */ #define MCE_EVENT_RELEASE true -- 2.26.2
Re: [PATCH v3] powerpc/mce: Remove per cpu variables from MCE handlers
On 1/19/21 9:28 AM, Nicholas Piggin wrote: Excerpts from Ganesh Goudar's message of January 15, 2021 10:58 pm: Access to per-cpu variables requires translation to be enabled on pseries machine running in hash mmu mode, Since part of MCE handler runs in realmode and part of MCE handling code is shared between ppc architectures pseries and powernv, it becomes difficult to manage these variables differently on different architectures, So have these variables in paca instead of having them as per-cpu variables to avoid complications. Seems okay. Maximum recursive depth of MCE is 4, Considering the maximum depth allowed reduce the size of event to 10 from 100. Could you make this a separate patch, with memory saving numbers? "Delayed" MCEs are not necessarily the same as recursive (several sequential MCEs can occur before the first event is processed). But I agree 100 is pretty overboard (as is 4 recursive MCEs really). Sure. Signed-off-by: Ganesh Goudar --- v2: Dynamically allocate memory for machine check event info v3: Remove check for hash mmu lpar, use memblock_alloc_try_nid to allocate memory. --- arch/powerpc/include/asm/mce.h | 21 - arch/powerpc/include/asm/paca.h| 4 ++ arch/powerpc/kernel/mce.c | 76 +- arch/powerpc/kernel/setup-common.c | 2 +- 4 files changed, 69 insertions(+), 34 deletions(-) diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h index e6c27ae843dc..8d6e3a7a9f37 100644 --- a/arch/powerpc/include/asm/mce.h +++ b/arch/powerpc/include/asm/mce.h @@ -204,7 +204,18 @@ struct mce_error_info { boolignore_event; }; -#define MAX_MC_EVT 100 +#define MAX_MC_EVT 10 + +struct mce_info { + int mce_nest_count; + struct machine_check_event mce_event[MAX_MC_EVT]; + /* Queue for delayed MCE events. */ + int mce_queue_count; + struct machine_check_event mce_event_queue[MAX_MC_EVT]; + /* Queue for delayed MCE UE events. 
*/ + int mce_ue_count; + struct machine_check_event mce_ue_event_queue[MAX_MC_EVT]; +}; /* Release flags for get_mce_event() */ #define MCE_EVENT_RELEASE true @@ -233,5 +244,13 @@ long __machine_check_early_realmode_p7(struct pt_regs *regs); long __machine_check_early_realmode_p8(struct pt_regs *regs); long __machine_check_early_realmode_p9(struct pt_regs *regs); long __machine_check_early_realmode_p10(struct pt_regs *regs); +#define get_mce_info() local_paca->mce_info I don't think this adds anything. Could you open code it? ok Thanks, Nick
[PATCH v3] powerpc/mce: Remove per cpu variables from MCE handlers
Access to per-cpu variables requires translation to be enabled on pseries machine running in hash mmu mode, Since part of MCE handler runs in realmode and part of MCE handling code is shared between ppc architectures pseries and powernv, it becomes difficult to manage these variables differently on different architectures, So have these variables in paca instead of having them as per-cpu variables to avoid complications. Maximum recursive depth of MCE is 4, Considering the maximum depth allowed reduce the size of event to 10 from 100. Signed-off-by: Ganesh Goudar --- v2: Dynamically allocate memory for machine check event info v3: Remove check for hash mmu lpar, use memblock_alloc_try_nid to allocate memory. --- arch/powerpc/include/asm/mce.h | 21 - arch/powerpc/include/asm/paca.h| 4 ++ arch/powerpc/kernel/mce.c | 76 +- arch/powerpc/kernel/setup-common.c | 2 +- 4 files changed, 69 insertions(+), 34 deletions(-) diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h index e6c27ae843dc..8d6e3a7a9f37 100644 --- a/arch/powerpc/include/asm/mce.h +++ b/arch/powerpc/include/asm/mce.h @@ -204,7 +204,18 @@ struct mce_error_info { boolignore_event; }; -#define MAX_MC_EVT 100 +#define MAX_MC_EVT 10 + +struct mce_info { + int mce_nest_count; + struct machine_check_event mce_event[MAX_MC_EVT]; + /* Queue for delayed MCE events. */ + int mce_queue_count; + struct machine_check_event mce_event_queue[MAX_MC_EVT]; + /* Queue for delayed MCE UE events. 
*/ + int mce_ue_count; + struct machine_check_event mce_ue_event_queue[MAX_MC_EVT]; +}; /* Release flags for get_mce_event() */ #define MCE_EVENT_RELEASE true @@ -233,5 +244,13 @@ long __machine_check_early_realmode_p7(struct pt_regs *regs); long __machine_check_early_realmode_p8(struct pt_regs *regs); long __machine_check_early_realmode_p9(struct pt_regs *regs); long __machine_check_early_realmode_p10(struct pt_regs *regs); +#define get_mce_info() local_paca->mce_info +#endif /* CONFIG_PPC_BOOK3S_64 */ + +#ifdef CONFIG_PPC_BOOK3S_64 +void mce_init(void); +#else +static inline void mce_init(void) { }; #endif /* CONFIG_PPC_BOOK3S_64 */ + #endif /* __ASM_PPC64_MCE_H__ */ diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h index 9454d29ff4b4..38e0c55e845d 100644 --- a/arch/powerpc/include/asm/paca.h +++ b/arch/powerpc/include/asm/paca.h @@ -29,6 +29,7 @@ #include #include #include +#include #include @@ -273,6 +274,9 @@ struct paca_struct { #ifdef CONFIG_MMIOWB struct mmiowb_state mmiowb_state; #endif +#ifdef CONFIG_PPC_BOOK3S_64 + struct mce_info *mce_info; +#endif /* CONFIG_PPC_BOOK3S_64 */ } cacheline_aligned; extern void copy_mm_to_paca(struct mm_struct *mm); diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c index 9f3e133b57b7..feeb3231b541 100644 --- a/arch/powerpc/kernel/mce.c +++ b/arch/powerpc/kernel/mce.c @@ -17,22 +17,13 @@ #include #include #include +#include #include #include #include -static DEFINE_PER_CPU(int, mce_nest_count); -static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], mce_event); - -/* Queue for delayed MCE events. */ -static DEFINE_PER_CPU(int, mce_queue_count); -static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], mce_event_queue); - -/* Queue for delayed MCE UE events. 
*/ -static DEFINE_PER_CPU(int, mce_ue_count); -static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], - mce_ue_event_queue); +#include "setup.h" static void machine_check_process_queued_event(struct irq_work *work); static void machine_check_ue_irq_work(struct irq_work *work); @@ -103,8 +94,8 @@ void save_mce_event(struct pt_regs *regs, long handled, struct mce_error_info *mce_err, uint64_t nip, uint64_t addr, uint64_t phys_addr) { - int index = __this_cpu_inc_return(mce_nest_count) - 1; - struct machine_check_event *mce = this_cpu_ptr(_event[index]); + int index = get_mce_info()->mce_nest_count++; + struct machine_check_event *mce = _mce_info()->mce_event[index]; /* * Return if we don't have enough space to log mce event. @@ -191,7 +182,7 @@ void save_mce_event(struct pt_regs *regs, long handled, */ int get_mce_event(struct machine_check_event *mce, bool release) { - int index = __this_cpu_read(mce_nest_count) - 1; + int index = get_mce_info()->mce_nest_count - 1; struct machine_check_event *mc_evt; int ret = 0; @@ -201,7 +192,7 @@ int get_mce_event(struct machine_check_event *mce, bool release) /* Check if we have MCE info to process. */ if (index < MAX_MC_EVT) { - mc_evt = this_cpu_ptr(_event[index]); + mc_evt = _mce_info()->mce_event[ind
[PATCH v2] powerpc/mce: Remove per cpu variables from MCE handlers
Access to per-cpu variables requires translation to be enabled on pseries machine running in hash mmu mode, Since part of MCE handler runs in realmode and part of MCE handling code is shared between ppc architectures pseries and powernv, it becomes difficult to manage these variables differently on different architectures, So have these variables in paca instead of having them as per-cpu variables to avoid complications. Maximum recursive depth of MCE is 4, Considering the maximum depth allowed reduce the size of event to 10 from 100. Signed-off-by: Ganesh Goudar --- v2: Dynamically allocate memory for machine check event info --- arch/powerpc/include/asm/mce.h | 21 +++- arch/powerpc/include/asm/paca.h| 4 ++ arch/powerpc/kernel/mce.c | 86 ++ arch/powerpc/kernel/setup-common.c | 2 +- 4 files changed, 78 insertions(+), 35 deletions(-) diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h index e6c27ae843dc..8d6e3a7a9f37 100644 --- a/arch/powerpc/include/asm/mce.h +++ b/arch/powerpc/include/asm/mce.h @@ -204,7 +204,18 @@ struct mce_error_info { boolignore_event; }; -#define MAX_MC_EVT 100 +#define MAX_MC_EVT 10 + +struct mce_info { + int mce_nest_count; + struct machine_check_event mce_event[MAX_MC_EVT]; + /* Queue for delayed MCE events. */ + int mce_queue_count; + struct machine_check_event mce_event_queue[MAX_MC_EVT]; + /* Queue for delayed MCE UE events. 
*/ + int mce_ue_count; + struct machine_check_event mce_ue_event_queue[MAX_MC_EVT]; +}; /* Release flags for get_mce_event() */ #define MCE_EVENT_RELEASE true @@ -233,5 +244,13 @@ long __machine_check_early_realmode_p7(struct pt_regs *regs); long __machine_check_early_realmode_p8(struct pt_regs *regs); long __machine_check_early_realmode_p9(struct pt_regs *regs); long __machine_check_early_realmode_p10(struct pt_regs *regs); +#define get_mce_info() local_paca->mce_info +#endif /* CONFIG_PPC_BOOK3S_64 */ + +#ifdef CONFIG_PPC_BOOK3S_64 +void mce_init(void); +#else +static inline void mce_init(void) { }; #endif /* CONFIG_PPC_BOOK3S_64 */ + #endif /* __ASM_PPC64_MCE_H__ */ diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h index 9454d29ff4b4..38e0c55e845d 100644 --- a/arch/powerpc/include/asm/paca.h +++ b/arch/powerpc/include/asm/paca.h @@ -29,6 +29,7 @@ #include #include #include +#include #include @@ -273,6 +274,9 @@ struct paca_struct { #ifdef CONFIG_MMIOWB struct mmiowb_state mmiowb_state; #endif +#ifdef CONFIG_PPC_BOOK3S_64 + struct mce_info *mce_info; +#endif /* CONFIG_PPC_BOOK3S_64 */ } cacheline_aligned; extern void copy_mm_to_paca(struct mm_struct *mm); diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c index 9f3e133b57b7..14142ddbedf2 100644 --- a/arch/powerpc/kernel/mce.c +++ b/arch/powerpc/kernel/mce.c @@ -17,23 +17,12 @@ #include #include #include +#include #include #include #include -static DEFINE_PER_CPU(int, mce_nest_count); -static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], mce_event); - -/* Queue for delayed MCE events. */ -static DEFINE_PER_CPU(int, mce_queue_count); -static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], mce_event_queue); - -/* Queue for delayed MCE UE events. 
*/ -static DEFINE_PER_CPU(int, mce_ue_count); -static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], - mce_ue_event_queue); - static void machine_check_process_queued_event(struct irq_work *work); static void machine_check_ue_irq_work(struct irq_work *work); static void machine_check_ue_event(struct machine_check_event *evt); @@ -103,8 +92,8 @@ void save_mce_event(struct pt_regs *regs, long handled, struct mce_error_info *mce_err, uint64_t nip, uint64_t addr, uint64_t phys_addr) { - int index = __this_cpu_inc_return(mce_nest_count) - 1; - struct machine_check_event *mce = this_cpu_ptr(_event[index]); + int index = get_mce_info()->mce_nest_count++; + struct machine_check_event *mce = _mce_info()->mce_event[index]; /* * Return if we don't have enough space to log mce event. @@ -191,7 +180,7 @@ void save_mce_event(struct pt_regs *regs, long handled, */ int get_mce_event(struct machine_check_event *mce, bool release) { - int index = __this_cpu_read(mce_nest_count) - 1; + int index = get_mce_info()->mce_nest_count - 1; struct machine_check_event *mc_evt; int ret = 0; @@ -201,7 +190,7 @@ int get_mce_event(struct machine_check_event *mce, bool release) /* Check if we have MCE info to process. */ if (index < MAX_MC_EVT) { - mc_evt = this_cpu_ptr(_event[index]); + mc_evt = _mce_info()->mce_event[index]; /* Copy the event str
Re: [PATCH] powerpc/mce: Remove per cpu variables from MCE handlers
On 12/8/20 4:01 PM, Michael Ellerman wrote: Ganesh Goudar writes: diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h index 9454d29ff4b4..4769954efa7d 100644 --- a/arch/powerpc/include/asm/paca.h +++ b/arch/powerpc/include/asm/paca.h @@ -273,6 +274,17 @@ struct paca_struct { #ifdef CONFIG_MMIOWB struct mmiowb_state mmiowb_state; #endif +#ifdef CONFIG_PPC_BOOK3S_64 + int mce_nest_count; + struct machine_check_event mce_event[MAX_MC_EVT]; + /* Queue for delayed MCE events. */ + int mce_queue_count; + struct machine_check_event mce_event_queue[MAX_MC_EVT]; + + /* Queue for delayed MCE UE events. */ + int mce_ue_count; + struct machine_check_event mce_ue_event_queue[MAX_MC_EVT]; +#endif /* CONFIG_PPC_BOOK3S_64 */ } cacheline_aligned; How much does this expand the paca by? Size of paca is 4480 bytes, these add up another 2160 bytes, so expands it by 48%.
[PATCH] powerpc/mce: Remove per cpu variables from MCE handlers
Access to per-cpu variables requires translation to be enabled on pseries machine running in hash mmu mode, Since part of MCE handler runs in realmode and part of MCE handling code is shared between ppc architectures pseries and powernv, it becomes difficult to manage these variables differently on different architectures, So have these variables in paca instead of having them as per-cpu variables to avoid complications. Maximum recursive depth of MCE is 4, Considering the maximum depth allowed reduce the size of event to 10 from 100. Signed-off-by: Ganesh Goudar --- arch/powerpc/include/asm/mce.h | 2 +- arch/powerpc/include/asm/paca.h | 12 arch/powerpc/kernel/mce.c | 54 + 3 files changed, 34 insertions(+), 34 deletions(-) diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h index 89aa8248a57d..feef45f2b51b 100644 --- a/arch/powerpc/include/asm/mce.h +++ b/arch/powerpc/include/asm/mce.h @@ -204,7 +204,7 @@ struct mce_error_info { boolignore_event; }; -#define MAX_MC_EVT 100 +#define MAX_MC_EVT 10 /* Release flags for get_mce_event() */ #define MCE_EVENT_RELEASE true diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h index 9454d29ff4b4..4769954efa7d 100644 --- a/arch/powerpc/include/asm/paca.h +++ b/arch/powerpc/include/asm/paca.h @@ -29,6 +29,7 @@ #include #include #include +#include #include @@ -273,6 +274,17 @@ struct paca_struct { #ifdef CONFIG_MMIOWB struct mmiowb_state mmiowb_state; #endif +#ifdef CONFIG_PPC_BOOK3S_64 + int mce_nest_count; + struct machine_check_event mce_event[MAX_MC_EVT]; + /* Queue for delayed MCE events. */ + int mce_queue_count; + struct machine_check_event mce_event_queue[MAX_MC_EVT]; + + /* Queue for delayed MCE UE events. 
*/ + int mce_ue_count; + struct machine_check_event mce_ue_event_queue[MAX_MC_EVT]; +#endif /* CONFIG_PPC_BOOK3S_64 */ } cacheline_aligned; extern void copy_mm_to_paca(struct mm_struct *mm); diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c index 63702c0badb9..5f53d02d6cbb 100644 --- a/arch/powerpc/kernel/mce.c +++ b/arch/powerpc/kernel/mce.c @@ -22,18 +22,6 @@ #include #include -static DEFINE_PER_CPU(int, mce_nest_count); -static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], mce_event); - -/* Queue for delayed MCE events. */ -static DEFINE_PER_CPU(int, mce_queue_count); -static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], mce_event_queue); - -/* Queue for delayed MCE UE events. */ -static DEFINE_PER_CPU(int, mce_ue_count); -static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], - mce_ue_event_queue); - static void machine_check_process_queued_event(struct irq_work *work); static void machine_check_ue_irq_work(struct irq_work *work); static void machine_check_ue_event(struct machine_check_event *evt); @@ -103,8 +91,8 @@ void save_mce_event(struct pt_regs *regs, long handled, struct mce_error_info *mce_err, uint64_t nip, uint64_t addr, uint64_t phys_addr) { - int index = __this_cpu_inc_return(mce_nest_count) - 1; - struct machine_check_event *mce = this_cpu_ptr(_event[index]); + int index = get_paca()->mce_nest_count++; + struct machine_check_event *mce = _paca()->mce_event[index]; /* * Return if we don't have enough space to log mce event. @@ -191,7 +179,7 @@ void save_mce_event(struct pt_regs *regs, long handled, */ int get_mce_event(struct machine_check_event *mce, bool release) { - int index = __this_cpu_read(mce_nest_count) - 1; + int index = get_paca()->mce_nest_count - 1; struct machine_check_event *mc_evt; int ret = 0; @@ -201,7 +189,7 @@ int get_mce_event(struct machine_check_event *mce, bool release) /* Check if we have MCE info to process. 
*/ if (index < MAX_MC_EVT) { - mc_evt = this_cpu_ptr(_event[index]); + mc_evt = _paca()->mce_event[index]; /* Copy the event structure and release the original */ if (mce) *mce = *mc_evt; @@ -211,7 +199,7 @@ int get_mce_event(struct machine_check_event *mce, bool release) } /* Decrement the count to free the slot. */ if (release) - __this_cpu_dec(mce_nest_count); + get_paca()->mce_nest_count--; return ret; } @@ -233,13 +221,13 @@ static void machine_check_ue_event(struct machine_check_event *evt) { int index; - index = __this_cpu_inc_return(mce_ue_count) - 1; + index = get_paca()->mce_ue_count++; /* If queue is full, just return for now. */ if (index >= MAX_MC_EVT) { - __this_cpu_dec(mce_ue_count); +
[PATCH v5] lkdtm/powerpc: Add SLB multihit test
To check machine check handling, add support to inject slb multihit errors. Cc: Kees Cook Cc: Michal Suchánek Co-developed-by: Mahesh Salgaonkar Signed-off-by: Mahesh Salgaonkar Signed-off-by: Ganesh Goudar --- v5: - Insert entries at SLB_NUM_BOLTED and SLB_NUM_BOLTED +1, remove index allocating helper function. - Move mk_esid_data and mk_vsid_data helpers to asm/book3s/64/mmu-hash.h. - Use mmu_linear_psize and mmu_vmalloc_psize to get page size. - Use !radix_enabled() to check if we are in HASH mode. - And other minor improvements. v1-v4: - No major changes here for this patch, This patch was initially posted along with the other patch which got accepted. https://git.kernel.org/powerpc/c/8d0e2101274358d9b6b1f27232b40253ca48bab5 --- arch/powerpc/include/asm/book3s/64/mmu-hash.h | 28 +++- arch/powerpc/mm/book3s64/hash_utils.c | 1 + arch/powerpc/mm/book3s64/slb.c| 27 drivers/misc/lkdtm/Makefile | 1 + drivers/misc/lkdtm/core.c | 3 + drivers/misc/lkdtm/lkdtm.h| 3 + drivers/misc/lkdtm/powerpc.c | 120 ++ tools/testing/selftests/lkdtm/tests.txt | 1 + 8 files changed, 156 insertions(+), 28 deletions(-) create mode 100644 drivers/misc/lkdtm/powerpc.c diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h b/arch/powerpc/include/asm/book3s/64/mmu-hash.h index 683a9c7d1b03..8b9f07900395 100644 --- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h +++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h @@ -842,6 +842,32 @@ static inline unsigned long get_kernel_vsid(unsigned long ea, int ssize) unsigned htab_shift_for_mem_size(unsigned long mem_size); -#endif /* __ASSEMBLY__ */ +enum slb_index { + LINEAR_INDEX= 0, /* Kernel linear map (0xc000) */ + KSTACK_INDEX= 1, /* Kernel stack map */ +}; +#define slb_esid_mask(ssize) \ + (((ssize) == MMU_SEGSIZE_256M) ? 
ESID_MASK : ESID_MASK_1T) + +static inline unsigned long mk_esid_data(unsigned long ea, int ssize, +enum slb_index index) +{ + return (ea & slb_esid_mask(ssize)) | SLB_ESID_V | index; +} + +static inline unsigned long __mk_vsid_data(unsigned long vsid, int ssize, + unsigned long flags) +{ + return (vsid << slb_vsid_shift(ssize)) | flags | + ((unsigned long)ssize << SLB_VSID_SSIZE_SHIFT); +} + +static inline unsigned long mk_vsid_data(unsigned long ea, int ssize, +unsigned long flags) +{ + return __mk_vsid_data(get_kernel_vsid(ea, ssize), ssize, flags); +} + +#endif /* __ASSEMBLY__ */ #endif /* _ASM_POWERPC_BOOK3S_64_MMU_HASH_H_ */ diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c index 24702c0a92e0..38076a998850 100644 --- a/arch/powerpc/mm/book3s64/hash_utils.c +++ b/arch/powerpc/mm/book3s64/hash_utils.c @@ -112,6 +112,7 @@ int mmu_linear_psize = MMU_PAGE_4K; EXPORT_SYMBOL_GPL(mmu_linear_psize); int mmu_virtual_psize = MMU_PAGE_4K; int mmu_vmalloc_psize = MMU_PAGE_4K; +EXPORT_SYMBOL_GPL(mmu_vmalloc_psize); #ifdef CONFIG_SPARSEMEM_VMEMMAP int mmu_vmemmap_psize = MMU_PAGE_4K; #endif diff --git a/arch/powerpc/mm/book3s64/slb.c b/arch/powerpc/mm/book3s64/slb.c index c30fcbfa0e32..985706acb0e5 100644 --- a/arch/powerpc/mm/book3s64/slb.c +++ b/arch/powerpc/mm/book3s64/slb.c @@ -28,35 +28,8 @@ #include "internal.h" -enum slb_index { - LINEAR_INDEX= 0, /* Kernel linear map (0xc000) */ - KSTACK_INDEX= 1, /* Kernel stack map */ -}; - static long slb_allocate_user(struct mm_struct *mm, unsigned long ea); -#define slb_esid_mask(ssize) \ - (((ssize) == MMU_SEGSIZE_256M)? 
ESID_MASK: ESID_MASK_1T) - -static inline unsigned long mk_esid_data(unsigned long ea, int ssize, -enum slb_index index) -{ - return (ea & slb_esid_mask(ssize)) | SLB_ESID_V | index; -} - -static inline unsigned long __mk_vsid_data(unsigned long vsid, int ssize, -unsigned long flags) -{ - return (vsid << slb_vsid_shift(ssize)) | flags | - ((unsigned long) ssize << SLB_VSID_SSIZE_SHIFT); -} - -static inline unsigned long mk_vsid_data(unsigned long ea, int ssize, -unsigned long flags) -{ - return __mk_vsid_data(get_kernel_vsid(ea, ssize), ssize, flags); -} - bool stress_slb_enabled __initdata; static int __init parse_stress_slb(char *p) diff --git a/drivers/misc/lkdtm/Makefile b/drivers/misc/lkdtm/Makefile index c70b3822013f..f37ecfb0a707 100644 --- a/drivers/misc/lkdtm/Makefile +++ b/drivers/misc/lkdtm/Makefile @@ -10,6 +10,7 @@ lkdtm-$(CONFIG_LKDTM) += rodata_objcopy.o lkdtm-$(CONFIG_LKDTM) += usercopy.o lkdtm-$(CONFIG_LKDTM)
Re: [PATCH v4 2/2] lkdtm/powerpc: Add SLB multihit test
On 10/19/20 6:45 PM, Michal Suchánek wrote: On Mon, Oct 19, 2020 at 09:59:57PM +1100, Michael Ellerman wrote: Hi Ganesh, Some comments below ... Ganesh Goudar writes: To check machine check handling, add support to inject slb multihit errors. Cc: Kees Cook Reviewed-by: Michal Suchánek Co-developed-by: Mahesh Salgaonkar Signed-off-by: Mahesh Salgaonkar Signed-off-by: Ganesh Goudar --- drivers/misc/lkdtm/Makefile | 1 + drivers/misc/lkdtm/core.c | 3 + drivers/misc/lkdtm/lkdtm.h | 3 + drivers/misc/lkdtm/powerpc.c| 156 tools/testing/selftests/lkdtm/tests.txt | 1 + 5 files changed, 164 insertions(+) create mode 100644 drivers/misc/lkdtm/powerpc.c .. diff --git a/drivers/misc/lkdtm/powerpc.c b/drivers/misc/lkdtm/powerpc.c new file mode 100644 index ..f388b53dccba --- /dev/null +++ b/drivers/misc/lkdtm/powerpc.c @@ -0,0 +1,156 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "lkdtm.h" +#include +#include Usual style is to include the linux headers first and then the local header. ok + +/* Gets index for new slb entry */ +static inline unsigned long get_slb_index(void) +{ + unsigned long index; + + index = get_paca()->stab_rr; + + /* +* simple round-robin replacement of slb starting at SLB_NUM_BOLTED. +*/ + if (index < (mmu_slb_size - 1)) + index++; + else + index = SLB_NUM_BOLTED; + get_paca()->stab_rr = index; + return index; +} I'm not sure we need that really? We can just always insert at SLB_MUM_BOLTED and SLB_NUM_BOLTED + 1. Or we could allocate from the top down using mmu_slb_size - 1, and mmu_slb_size - 2. Ok, We can do that. +#define slb_esid_mask(ssize) \ + (((ssize) == MMU_SEGSIZE_256M) ? ESID_MASK : ESID_MASK_1T) + +/* Form the operand for slbmte */ +static inline unsigned long mk_esid_data(unsigned long ea, int ssize, +unsigned long slot) +{ + return (ea & slb_esid_mask(ssize)) | SLB_ESID_V | slot; +} + +#define slb_vsid_shift(ssize) \ + ((ssize) == MMU_SEGSIZE_256M ? 
SLB_VSID_SHIFT : SLB_VSID_SHIFT_1T) + +/* Form the operand for slbmte */ +static inline unsigned long mk_vsid_data(unsigned long ea, int ssize, +unsigned long flags) +{ + return (get_kernel_vsid(ea, ssize) << slb_vsid_shift(ssize)) | flags | + ((unsigned long)ssize << SLB_VSID_SSIZE_SHIFT); +} I realise it's not much code, but I'd rather those were in a header, rather than copied from slb.c. That way they can never skew vs the versions in slb.c Best place I think would be arch/powerpc/include/asm/book3s/64/mmu-hash.h Ok, ill move them. + +/* Inserts new slb entry */ It inserts two. Right. +static void insert_slb_entry(char *p, int ssize) +{ + unsigned long flags, entry; + + flags = SLB_VSID_KERNEL | mmu_psize_defs[MMU_PAGE_64K].sllp; That won't work if the kernel is built for 4K pages. Or at least it won't work the way we want it to. You should use mmu_linear_psize. But for vmalloc you should use mmu_vmalloc_psize, so it will need to be a parameter. Sure, Thanks + preempt_disable(); + + entry = get_slb_index(); + asm volatile("slbmte %0,%1" : + : "r" (mk_vsid_data((unsigned long)p, ssize, flags)), + "r" (mk_esid_data((unsigned long)p, ssize, entry)) + : "memory"); + + entry = get_slb_index(); + asm volatile("slbmte %0,%1" : + : "r" (mk_vsid_data((unsigned long)p, ssize, flags)), + "r" (mk_esid_data((unsigned long)p, ssize, entry)) + : "memory"); + preempt_enable(); + /* +* This triggers exception, If handled correctly we must recover +* from this error. +*/ + p[0] = '!'; That doesn't belong in here, it should be done by the caller. That would also mean p could be unsigned long in here, so you wouldn't have to cast it four times. Sure, ill change it. +} + +/* Inject slb multihit on vmalloc-ed address i.e 0xD00... */ +static void inject_vmalloc_slb_multihit(void) +{ + char *p; + + p = vmalloc(2048); vmalloc() allocates whole pages, so it may as well be vmalloc(PAGE_SIZE). 
ok + if (!p) + return; That's unlikely, but it should be an error that's propagated up to the caller. ok + + insert_slb_entry(p, MMU_SEGSIZE_1T); + vfree(p); +} + +/* Inject slb multihit on kmalloc-ed address i.e 0xC00... */ +static void inject_kmalloc_slb_multihit(void) +{ + char *p; + + p = kmalloc(2048, GFP_KERNEL); + if (!p) + return; + + insert_slb_entry(p, MMU_SEGSIZE_1T); +
Re: [PATCH v4] powerpc/pseries: Avoid using addr_to_pfn in real mode
On 7/24/20 12:09 PM, Ganesh Goudar wrote: When an UE or memory error exception is encountered the MCE handler tries to find the pfn using addr_to_pfn() which takes effective address as an argument, later pfn is used to poison the page where memory error occurred, recent rework in this area made addr_to_pfn to run in real mode, which can be fatal as it may try to access memory outside RMO region. Have two helper functions to separate things to be done in real mode and virtual mode without changing any functionality. This also fixes the following error as the use of addr_to_pfn is now moved to virtual mode. Without this change following kernel crash is seen on hitting UE. [ 485.128036] Oops: Kernel access of bad area, sig: 11 [#1] [ 485.128040] LE SMP NR_CPUS=2048 NUMA pSeries [ 485.128047] Modules linked in: [ 485.128067] CPU: 15 PID: 6536 Comm: insmod Kdump: loaded Tainted: G OE 5.7.0 #22 [ 485.128074] NIP: c009b24c LR: c00398d8 CTR: c0cd57c0 [ 485.128078] REGS: c3f1f970 TRAP: 0300 Tainted: G OE (5.7.0) [ 485.128082] MSR: 80001003 CR: 28008284 XER: 0001 [ 485.128088] CFAR: c009b190 DAR: c001fab0 DSISR: 4000 IRQMASK: 1 [ 485.128088] GPR00: 0001 c3f1fbf0 c1634300 b0fa0100 [ 485.128088] GPR04: d222 fab0 0022 [ 485.128088] GPR08: c001fab0 c001fab0 c3f1fc14 [ 485.128088] GPR12: 0008 c3ff5880 d218 [ 485.128088] GPR16: ff20 fff1 fff2 d21a1100 [ 485.128088] GPR20: d220 c0015c893c50 c0d49b28 c0015c893c50 [ 485.128088] GPR24: d21a0d08 c14e5da8 d21a0818 000a [ 485.128088] GPR28: 0008 000a c17e2970 000a [ 485.128125] NIP [c009b24c] __find_linux_pte+0x11c/0x310 [ 485.128130] LR [c00398d8] addr_to_pfn+0x138/0x170 [ 485.128133] Call Trace: [ 485.128135] Instruction dump: [ 485.128138] 3929 7d4a3378 7c883c36 7d2907b4 794a1564 7d294038 794af082 3900 [ 485.128144] 79291f24 790af00e 78e70020 7d095214 <7c69502a> 2fa3 419e011c 70690040 [ 485.128152] ---[ end trace d34b27e29ae0e340 ]--- Fixes: 9ca766f9891d ("powerpc/64s/pseries: machine check convert to use common event code") 
Signed-off-by: Ganesh Goudar --- V2: Leave bare metal code and save_mce_event as is. V3: Have separate functions for realmode and virtual mode handling. V4: Fix build warning, rephrase commit message. --- arch/powerpc/platforms/pseries/ras.c | 118 --- 1 file changed, 69 insertions(+), 49 deletions(-) diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c index f3736fcd98fc..c509e43bac23 100644 --- a/arch/powerpc/platforms/pseries/ras.c +++ b/arch/powerpc/platforms/pseries/ras.c @@ -522,18 +522,55 @@ int pSeries_system_reset_exception(struct pt_regs *regs) return 0; /* need to perform reset */ } +static int mce_handle_err_realmode(int disposition, u8 error_type) +{ +#ifdef CONFIG_PPC_BOOK3S_64 + if (disposition == RTAS_DISP_NOT_RECOVERED) { + switch (error_type) { + caseMC_ERROR_TYPE_SLB: + caseMC_ERROR_TYPE_ERAT: + /* +* Store the old slb content in paca before flushing. +* Print this when we go to virtual mode. +* There are chances that we may hit MCE again if there +* is a parity error on the SLB entry we trying to read +* for saving. Hence limit the slb saving to single +* level of recursion. +*/ + if (local_paca->in_mce == 1) + slb_save_contents(local_paca->mce_faulty_slbs); + flush_and_reload_slb(); + disposition = RTAS_DISP_FULLY_RECOVERED; + break; + default: + break; + } + } else if (disposition == RTAS_DISP_LIMITED_RECOVERY) { + /* Platform corrected itself but could be degraded */ + pr_err("MCE: limited recovery, system may be degraded\n"); + disposition = RTAS_DISP_FULLY_RECOVERED; + } +#endif + return disposition; +} -static int mce_handle_error(struct pt_regs *regs, struct rtas_error_log *errp) +static int mce_handle_err_virtmode(struct pt_regs *regs, + struct rtas_error_log *errp, + struct pseries_mc_errorlog *mce_log, + int disposition) { struct mce_error_info mce_err = { 0 }; - un
Re: [PATCH v4 0/2] powerpc/mce: Fix mce handler and add selftest
On 10/16/20 5:02 PM, Michael Ellerman wrote: On Fri, 9 Oct 2020 12:10:03 +0530, Ganesh Goudar wrote: This patch series fixes mce handling for pseries, adds an LKDTM test for SLB multihit recovery and enables a selftest for the same, basically to test MCE handling on pseries/powernv machines running in hash mmu mode. v4: * Use radix_enabled() to check if it's in Hash or Radix mode. * Use FW_FEATURE_LPAR instead of machine_is_pseries(). [...] Patch 1 applied to powerpc/fixes. [1/2] powerpc/mce: Avoid nmi_enter/exit in real mode on pseries hash https://git.kernel.org/powerpc/c/8d0e2101274358d9b6b1f27232b40253ca48bab5 cheers Thank you. Any comments on patch 2?
[PATCH v4 2/2] lkdtm/powerpc: Add SLB multihit test
To check machine check handling, add support to inject slb multihit errors. Cc: Kees Cook Reviewed-by: Michal Suchánek Co-developed-by: Mahesh Salgaonkar Signed-off-by: Mahesh Salgaonkar Signed-off-by: Ganesh Goudar --- drivers/misc/lkdtm/Makefile | 1 + drivers/misc/lkdtm/core.c | 3 + drivers/misc/lkdtm/lkdtm.h | 3 + drivers/misc/lkdtm/powerpc.c| 156 tools/testing/selftests/lkdtm/tests.txt | 1 + 5 files changed, 164 insertions(+) create mode 100644 drivers/misc/lkdtm/powerpc.c diff --git a/drivers/misc/lkdtm/Makefile b/drivers/misc/lkdtm/Makefile index c70b3822013f..f37ecfb0a707 100644 --- a/drivers/misc/lkdtm/Makefile +++ b/drivers/misc/lkdtm/Makefile @@ -10,6 +10,7 @@ lkdtm-$(CONFIG_LKDTM) += rodata_objcopy.o lkdtm-$(CONFIG_LKDTM) += usercopy.o lkdtm-$(CONFIG_LKDTM) += stackleak.o lkdtm-$(CONFIG_LKDTM) += cfi.o +lkdtm-$(CONFIG_PPC64) += powerpc.o KASAN_SANITIZE_stackleak.o := n KCOV_INSTRUMENT_rodata.o := n diff --git a/drivers/misc/lkdtm/core.c b/drivers/misc/lkdtm/core.c index a5e344df9166..8d5db42baa90 100644 --- a/drivers/misc/lkdtm/core.c +++ b/drivers/misc/lkdtm/core.c @@ -178,6 +178,9 @@ static const struct crashtype crashtypes[] = { #ifdef CONFIG_X86_32 CRASHTYPE(DOUBLE_FAULT), #endif +#ifdef CONFIG_PPC64 + CRASHTYPE(PPC_SLB_MULTIHIT), +#endif }; diff --git a/drivers/misc/lkdtm/lkdtm.h b/drivers/misc/lkdtm/lkdtm.h index 8878538b2c13..b305bd511ee5 100644 --- a/drivers/misc/lkdtm/lkdtm.h +++ b/drivers/misc/lkdtm/lkdtm.h @@ -104,4 +104,7 @@ void lkdtm_STACKLEAK_ERASING(void); /* cfi.c */ void lkdtm_CFI_FORWARD_PROTO(void); +/* powerpc.c */ +void lkdtm_PPC_SLB_MULTIHIT(void); + #endif diff --git a/drivers/misc/lkdtm/powerpc.c b/drivers/misc/lkdtm/powerpc.c new file mode 100644 index ..f388b53dccba --- /dev/null +++ b/drivers/misc/lkdtm/powerpc.c @@ -0,0 +1,156 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "lkdtm.h" +#include +#include + +/* Gets index for new slb entry */ +static inline unsigned long get_slb_index(void) +{ + unsigned long index; + + 
index = get_paca()->stab_rr; + + /* +* simple round-robin replacement of slb starting at SLB_NUM_BOLTED. +*/ + if (index < (mmu_slb_size - 1)) + index++; + else + index = SLB_NUM_BOLTED; + get_paca()->stab_rr = index; + return index; +} + +#define slb_esid_mask(ssize) \ + (((ssize) == MMU_SEGSIZE_256M) ? ESID_MASK : ESID_MASK_1T) + +/* Form the operand for slbmte */ +static inline unsigned long mk_esid_data(unsigned long ea, int ssize, +unsigned long slot) +{ + return (ea & slb_esid_mask(ssize)) | SLB_ESID_V | slot; +} + +#define slb_vsid_shift(ssize) \ + ((ssize) == MMU_SEGSIZE_256M ? SLB_VSID_SHIFT : SLB_VSID_SHIFT_1T) + +/* Form the operand for slbmte */ +static inline unsigned long mk_vsid_data(unsigned long ea, int ssize, +unsigned long flags) +{ + return (get_kernel_vsid(ea, ssize) << slb_vsid_shift(ssize)) | flags | + ((unsigned long)ssize << SLB_VSID_SSIZE_SHIFT); +} + +/* Inserts new slb entry */ +static void insert_slb_entry(char *p, int ssize) +{ + unsigned long flags, entry; + + flags = SLB_VSID_KERNEL | mmu_psize_defs[MMU_PAGE_64K].sllp; + preempt_disable(); + + entry = get_slb_index(); + asm volatile("slbmte %0,%1" : + : "r" (mk_vsid_data((unsigned long)p, ssize, flags)), + "r" (mk_esid_data((unsigned long)p, ssize, entry)) + : "memory"); + + entry = get_slb_index(); + asm volatile("slbmte %0,%1" : + : "r" (mk_vsid_data((unsigned long)p, ssize, flags)), + "r" (mk_esid_data((unsigned long)p, ssize, entry)) + : "memory"); + preempt_enable(); + /* +* This triggers exception, If handled correctly we must recover +* from this error. +*/ + p[0] = '!'; +} + +/* Inject slb multihit on vmalloc-ed address i.e 0xD00... */ +static void inject_vmalloc_slb_multihit(void) +{ + char *p; + + p = vmalloc(2048); + if (!p) + return; + + insert_slb_entry(p, MMU_SEGSIZE_1T); + vfree(p); +} + +/* Inject slb multihit on kmalloc-ed address i.e 0xC00... 
*/ +static void inject_kmalloc_slb_multihit(void) +{ + char *p; + + p = kmalloc(2048, GFP_KERNEL); + if (!p) + return; + + insert_slb_entry(p, MMU_SEGSIZE_1T); + kfree(p); +} + +/* + * Few initial SLB entries are bolted. Add a test to inject + * multihit in bolted entry 0. + */ +static void inse
[PATCH v4 1/2] powerpc/mce: remove nmi_enter/exit from real mode handler
Use of nmi_enter/exit in the real mode handler causes the kernel to panic and reboot on injecting an slb multihit on a pseries machine running in hash mmu mode, as these calls try to access memory outside the RMO region in the real mode handler where translation is disabled. Add a check to not use these calls on a pseries machine running in hash mmu mode. Fixes: 116ac378bb3f ("powerpc/64s: machine check interrupt update NMI accounting") Signed-off-by: Ganesh Goudar --- arch/powerpc/kernel/mce.c | 7 +++ 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c index ada59f6c4298..63702c0badb9 100644 --- a/arch/powerpc/kernel/mce.c +++ b/arch/powerpc/kernel/mce.c @@ -591,12 +591,11 @@ EXPORT_SYMBOL_GPL(machine_check_print_event_info); long notrace machine_check_early(struct pt_regs *regs) { long handled = 0; - bool nested = in_nmi(); u8 ftrace_enabled = this_cpu_get_ftrace_enabled(); this_cpu_set_ftrace_enabled(0); - - if (!nested) + /* Do not use nmi_enter/exit for pseries hpte guest */ + if (radix_enabled() || !firmware_has_feature(FW_FEATURE_LPAR)) nmi_enter(); hv_nmi_check_nonrecoverable(regs); @@ -607,7 +606,7 @@ long notrace machine_check_early(struct pt_regs *regs) if (ppc_md.machine_check_early) handled = ppc_md.machine_check_early(regs); - if (!nested) + if (radix_enabled() || !firmware_has_feature(FW_FEATURE_LPAR)) nmi_exit(); this_cpu_set_ftrace_enabled(ftrace_enabled); -- 2.26.2
[PATCH v4 0/2] powerpc/mce: Fix mce handler and add selftest
This patch series fixes mce handling for pseries, adds an LKDTM test for SLB multihit recovery and enables a selftest for the same, basically to test MCE handling on pseries/powernv machines running in hash mmu mode. v4: * Use radix_enabled() to check if it's in Hash or Radix mode. * Use FW_FEATURE_LPAR instead of machine_is_pseries(). v3: * Merge selftest changes with patch 2/2, instead of having a separate patch. * Minor improvements like adding enough comments, Makefile changes, including header file and adding some prints. v2: * Remove in_nmi check before calling nmi_enter/exit, as nesting is supported. * Fix build errors and remove unused variables. * Integrate error injection code into LKDTM. * Add support to inject multihit in paca. Ganesh Goudar (2): powerpc/mce: remove nmi_enter/exit from real mode handler lkdtm/powerpc: Add SLB multihit test arch/powerpc/kernel/mce.c | 7 +- drivers/misc/lkdtm/Makefile | 1 + drivers/misc/lkdtm/core.c | 3 + drivers/misc/lkdtm/lkdtm.h | 3 + drivers/misc/lkdtm/powerpc.c| 156 tools/testing/selftests/lkdtm/tests.txt | 1 + 6 files changed, 167 insertions(+), 4 deletions(-) create mode 100644 drivers/misc/lkdtm/powerpc.c -- 2.26.2
Re: [PATCH v3 1/2] powerpc/mce: remove nmi_enter/exit from real mode handler
On 10/1/20 11:21 PM, Ganesh Goudar wrote: Use of nmi_enter/exit in the real mode handler causes the kernel to panic and reboot on injecting an slb multihit on a pseries machine running in hash mmu mode, as these calls try to access memory outside the RMO region in the real mode handler where translation is disabled. Add a check to not use these calls on a pseries machine running in hash mmu mode. Fixes: 116ac378bb3f ("powerpc/64s: machine check interrupt update NMI accounting") Signed-off-by: Ganesh Goudar --- arch/powerpc/kernel/mce.c | 10 ++ 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c index ada59f6c4298..3bf39dd5dd43 100644 --- a/arch/powerpc/kernel/mce.c +++ b/arch/powerpc/kernel/mce.c @@ -591,12 +591,14 @@ EXPORT_SYMBOL_GPL(machine_check_print_event_info); long notrace machine_check_early(struct pt_regs *regs) { long handled = 0; - bool nested = in_nmi(); + bool is_pseries_hpt_guest; u8 ftrace_enabled = this_cpu_get_ftrace_enabled(); this_cpu_set_ftrace_enabled(0); - - if (!nested) + is_pseries_hpt_guest = machine_is(pseries) && + mmu_has_feature(MMU_FTR_HPTE_TABLE); + /* Do not use nmi_enter/exit for pseries hpte guest */ + if (!is_pseries_hpt_guest) In an offline discussion mpe suggested to use radix_enabled() to check if it is radix or hash, as MMU_FTR_HPTE_TABLE may be true on radix machines also and use of FW_FEATURE_LPAR is better than machine_is(pseries), sending v4 with these changes. nmi_enter(); hv_nmi_check_nonrecoverable(regs); @@ -607,7 +609,7 @@ long notrace machine_check_early(struct pt_regs *regs) if (ppc_md.machine_check_early) handled = ppc_md.machine_check_early(regs); - if (!nested) + if (!is_pseries_hpt_guest) nmi_exit(); this_cpu_set_ftrace_enabled(ftrace_enabled);