[PATCH v3] powerpc/eeh: avoid possible crash when edev->pdev changes
If a PCI device is removed during eeh_pe_report_edev(), edev->pdev will change and can cause a crash, hold the PCI rescan/remove lock while taking a copy of edev->pdev->bus. Signed-off-by: Ganesh Goudar --- v2: Hold rescan lock till we get the bus address. v3: Now that we are taking copy of bus, holding the lock, update the commit message accordingly. --- arch/powerpc/kernel/eeh_pe.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c index d1030bc52564..d283d281d28e 100644 --- a/arch/powerpc/kernel/eeh_pe.c +++ b/arch/powerpc/kernel/eeh_pe.c @@ -849,6 +849,7 @@ struct pci_bus *eeh_pe_bus_get(struct eeh_pe *pe) { struct eeh_dev *edev; struct pci_dev *pdev; + struct pci_bus *bus = NULL; if (pe->type & EEH_PE_PHB) return pe->phb->bus; @@ -859,9 +860,11 @@ struct pci_bus *eeh_pe_bus_get(struct eeh_pe *pe) /* Retrieve the parent PCI bus of first (top) PCI device */ edev = list_first_entry_or_null(&pe->edevs, struct eeh_dev, entry); + pci_lock_rescan_remove(); pdev = eeh_dev_to_pci_dev(edev); if (pdev) - return pdev->bus; + bus = pdev->bus; + pci_unlock_rescan_remove(); - return NULL; + return bus; } -- 2.44.0
[PATCH v2 0/1] Parallel EEH recovery between PHBs
This change is based on Sam Bobroff's patches which aimed to allow recovery to happen in parallel between PHBs and PEs, Due to various reasons the patches did not get in. But having parallel recovery between PHBs is fairly simple and gives significant improvement on powervm, Since powervm maintains flat hierarchy for PCI devices. This patch enables PHBs to have separate event queues and shorten the time taken for EEH recovery by making the recovery to run in parallel between PHBs. On powervm with 64 VFs from same PHB, I see approximately 48% reduction in time taken in EEH recovery. On powernv the improvement is not so significant. Ganesh Goudar (1): powerpc/eeh: Enable PHBs to recovery in parallel arch/powerpc/include/asm/eeh_event.h | 7 arch/powerpc/include/asm/pci-bridge.h | 4 ++ arch/powerpc/kernel/eeh_driver.c | 27 +++- arch/powerpc/kernel/eeh_event.c | 59 +-- arch/powerpc/kernel/eeh_pe.c | 4 ++ 5 files changed, 78 insertions(+), 23 deletions(-) -- 2.44.0
[PATCH v2 1/1] powerpc/eeh: Enable PHBs to recovery in parallel
Currnetly, with a single event queue EEH recovery is entirely serialized and takes place within a single kernel thread. This can cause recovery to take a long time when there are many devices. Have the recovery event queue per PHB and allow the recovery to happen in parallel for all the PHBs. Signed-off-by: Ganesh Goudar --- v2: Include missing hunk, which modifies __eeh_send_failure_event. --- arch/powerpc/include/asm/eeh_event.h | 7 arch/powerpc/include/asm/pci-bridge.h | 4 ++ arch/powerpc/kernel/eeh_driver.c | 27 +++- arch/powerpc/kernel/eeh_event.c | 59 +-- arch/powerpc/kernel/eeh_pe.c | 4 ++ 5 files changed, 78 insertions(+), 23 deletions(-) diff --git a/arch/powerpc/include/asm/eeh_event.h b/arch/powerpc/include/asm/eeh_event.h index dadde7d52f46..6af1b5bb6103 100644 --- a/arch/powerpc/include/asm/eeh_event.h +++ b/arch/powerpc/include/asm/eeh_event.h @@ -8,6 +8,8 @@ #define ASM_POWERPC_EEH_EVENT_H #ifdef __KERNEL__ +#include + /* * structure holding pci controller data that describes a * change in the isolation status of a PCI slot. A pointer @@ -15,15 +17,20 @@ * callback. 
*/ struct eeh_event { + struct work_struct work; struct list_headlist; /* to form event queue */ struct eeh_pe *pe;/* EEH PE */ }; +extern spinlock_t eeh_eventlist_lock; + int eeh_event_init(void); +int eeh_phb_event(struct eeh_pe *pe); int eeh_send_failure_event(struct eeh_pe *pe); int __eeh_send_failure_event(struct eeh_pe *pe); void eeh_remove_event(struct eeh_pe *pe, bool force); void eeh_handle_normal_event(struct eeh_pe *pe); +void eeh_handle_normal_event_work(struct work_struct *work); void eeh_handle_special_event(void); #endif /* __KERNEL__ */ diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h index 2aa3a091ef20..61884d9398bf 100644 --- a/arch/powerpc/include/asm/pci-bridge.h +++ b/arch/powerpc/include/asm/pci-bridge.h @@ -138,6 +138,10 @@ struct pci_controller { /* iommu_ops support */ struct iommu_device iommu; + + bool eeh_in_progress; + struct list_head eeh_eventlist; + spinlock_t eeh_eventlist_lock; }; /* These are used for config access before all the PCI probing diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c index 7efe04c68f0f..4cf5fd409369 100644 --- a/arch/powerpc/kernel/eeh_driver.c +++ b/arch/powerpc/kernel/eeh_driver.c @@ -1116,6 +1116,30 @@ void eeh_handle_normal_event(struct eeh_pe *pe) eeh_pe_state_clear(pe, EEH_PE_RECOVERING, true); } +void eeh_handle_normal_event_work(struct work_struct *work) +{ + unsigned long flags; + struct eeh_event *event = container_of(work, struct eeh_event, work); + struct pci_controller *phb = event->pe->phb; + + eeh_handle_normal_event(event->pe); + + kfree(event); + spin_lock_irqsave(>eeh_eventlist_lock, flags); + WARN_ON_ONCE(!phb->eeh_in_progress); + if (list_empty(>eeh_eventlist)) { + phb->eeh_in_progress = false; + pr_debug("EEH: No more work to do\n"); + } else { + pr_warn("EEH: More work to do\n"); + event = list_entry(phb->eeh_eventlist.next, + struct eeh_event, list); + list_del(>list); + queue_work(system_unbound_wq, >work); + } 
+ spin_unlock_irqrestore(>eeh_eventlist_lock, flags); +} + /** * eeh_handle_special_event - Handle EEH events without a specific failing PE * @@ -1185,8 +1209,7 @@ void eeh_handle_special_event(void) */ if (rc == EEH_NEXT_ERR_FROZEN_PE || rc == EEH_NEXT_ERR_FENCED_PHB) { - eeh_pe_state_mark(pe, EEH_PE_RECOVERING); - eeh_handle_normal_event(pe); + eeh_phb_event(pe); } else { eeh_for_each_pe(pe, tmp_pe) eeh_pe_for_each_dev(tmp_pe, edev, tmp_edev) diff --git a/arch/powerpc/kernel/eeh_event.c b/arch/powerpc/kernel/eeh_event.c index c23a454af08a..8a9d6358d39f 100644 --- a/arch/powerpc/kernel/eeh_event.c +++ b/arch/powerpc/kernel/eeh_event.c @@ -22,7 +22,7 @@ * work-queue, where a worker thread can drive recovery. */ -static DEFINE_SPINLOCK(eeh_eventlist_lock); +DEFINE_SPINLOCK(eeh_eventlist_lock); static DECLARE_COMPLETION(eeh_eventlist_event); static LIST_HEAD(eeh_eventlist); @@ -91,6 +91,42 @@ int eeh_event_init(void) return 0; } +int eeh_phb_event(struct eeh_pe *pe) +{ + struct eeh_event *event; + unsigned long flags; + struct pci_controller *phb; + + event = kzalloc(sizeof(*event), GFP_ATOMIC); + if (!event) + return -ENOMEM; + + if (pe) { +
[PATCH v2] powerpc/eeh: avoid possible crash when edev->pdev changes
If a PCI device is removed during eeh_pe_report_edev(), edev->pdev will change and can cause a crash, hold the PCI rescan/remove lock while taking a copy of edev->pdev. Signed-off-by: Ganesh Goudar --- v2: Hold rescan lock till we get the bus address. --- arch/powerpc/kernel/eeh_pe.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c index d1030bc52564..d283d281d28e 100644 --- a/arch/powerpc/kernel/eeh_pe.c +++ b/arch/powerpc/kernel/eeh_pe.c @@ -849,6 +849,7 @@ struct pci_bus *eeh_pe_bus_get(struct eeh_pe *pe) { struct eeh_dev *edev; struct pci_dev *pdev; + struct pci_bus *bus = NULL; if (pe->type & EEH_PE_PHB) return pe->phb->bus; @@ -859,9 +860,11 @@ struct pci_bus *eeh_pe_bus_get(struct eeh_pe *pe) /* Retrieve the parent PCI bus of first (top) PCI device */ edev = list_first_entry_or_null(&pe->edevs, struct eeh_dev, entry); + pci_lock_rescan_remove(); pdev = eeh_dev_to_pci_dev(edev); if (pdev) - return pdev->bus; + bus = pdev->bus; + pci_unlock_rescan_remove(); - return NULL; + return bus; } -- 2.44.0
Re: [PATCH] powerpc/eeh: avoid possible crash when edev->pdev changes
On 6/11/24 8:18 AM, Michael Ellerman wrote: Hi Ganesh, Ganesh Goudar writes: If a PCI device is removed during eeh_pe_report_edev(), edev->pdev will change and can cause a crash, hold the PCI rescan/remove lock while taking a copy of edev->pdev. Signed-off-by: Ganesh Goudar --- arch/powerpc/kernel/eeh_pe.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c index d1030bc52564..49f968733912 100644 --- a/arch/powerpc/kernel/eeh_pe.c +++ b/arch/powerpc/kernel/eeh_pe.c @@ -859,7 +859,9 @@ struct pci_bus *eeh_pe_bus_get(struct eeh_pe *pe) /* Retrieve the parent PCI bus of first (top) PCI device */ edev = list_first_entry_or_null(&pe->edevs, struct eeh_dev, entry); + pci_lock_rescan_remove(); pdev = eeh_dev_to_pci_dev(edev); + pci_unlock_rescan_remove(); if (pdev) return pdev->bus; What prevents pdev being freed/reused immediately after you drop the rescan/remove lock? Yeah, I should have released the lock after getting bus address, I will send v2. AFAICS eeh_dev_to_pci_dev() doesn't take an additional reference to the pdev or anything. Yes, I think we have to evaluate the possible eventualities of not taking the reference in all the cases. But we need this lock here because, if the PCI error is encountered in the hotplug remove path, we need the pci rescan lock to avoid race between hotplug remove path and the bottom half of EEH recovery, this lets the hotplug remove to complete since it is already holding the lock and drop the recovery process as the device is no longer present.
[PATCH] powerpc/eeh: avoid possible crash when edev->pdev changes
If a PCI device is removed during eeh_pe_report_edev(), edev->pdev will change and can cause a crash, hold the PCI rescan/remove lock while taking a copy of edev->pdev. Signed-off-by: Ganesh Goudar --- arch/powerpc/kernel/eeh_pe.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c index d1030bc52564..49f968733912 100644 --- a/arch/powerpc/kernel/eeh_pe.c +++ b/arch/powerpc/kernel/eeh_pe.c @@ -859,7 +859,9 @@ struct pci_bus *eeh_pe_bus_get(struct eeh_pe *pe) /* Retrieve the parent PCI bus of first (top) PCI device */ edev = list_first_entry_or_null(&pe->edevs, struct eeh_dev, entry); + pci_lock_rescan_remove(); pdev = eeh_dev_to_pci_dev(edev); + pci_unlock_rescan_remove(); if (pdev) return pdev->bus; -- 2.44.0
[PATCH v2] powerpc/eeh: Permanently disable the removed device
When a device is hot removed on powernv, the hotplug driver clears the device's state. However, on pseries, if a device is removed by phyp after reaching the error threshold, the kernel remains unaware, leading to the device not being torn down. This prevents necessary remediation actions like failover. Permanently disable the device if the presence check fails. Also, in eeh_dev_check_failure in we may consider the error as false positive if the device is hotpluged out as the get_state call returns EEH_STATE_NOT_SUPPORT and we may end up not clearing the device state, so log the event if the state is not moved to permanent failure state. Signed-off-by: Ganesh Goudar --- V2: * Elobrate the commit message. * Fix formatting issues in commit message and comments. --- arch/powerpc/kernel/eeh.c| 11 ++- arch/powerpc/kernel/eeh_driver.c | 13 +++-- 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index ab316e155ea9..6670063a7a6c 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -506,9 +506,18 @@ int eeh_dev_check_failure(struct eeh_dev *edev) * We will punt with the following conditions: Failure to get * PE's state, EEH not support and Permanently unavailable * state, PE is in good state. +* +* On the pSeries, after reaching the threshold, get_state might +* return EEH_STATE_NOT_SUPPORT. However, it's possible that the +* device state remains uncleared if the device is not marked +* pci_channel_io_perm_failure. Therefore, consider logging the +* event to let device removal happen. 
+* */ if ((ret < 0) || - (ret == EEH_STATE_NOT_SUPPORT) || eeh_state_active(ret)) { + (ret == EEH_STATE_NOT_SUPPORT && +dev->error_state == pci_channel_io_perm_failure) || + eeh_state_active(ret)) { eeh_stats.false_positives++; pe->false_positives++; rc = 0; diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c index 48773d2d9be3..7efe04c68f0f 100644 --- a/arch/powerpc/kernel/eeh_driver.c +++ b/arch/powerpc/kernel/eeh_driver.c @@ -865,9 +865,18 @@ void eeh_handle_normal_event(struct eeh_pe *pe) devices++; if (!devices) { - pr_debug("EEH: Frozen PHB#%x-PE#%x is empty!\n", + pr_warn("EEH: Frozen PHB#%x-PE#%x is empty!\n", pe->phb->global_number, pe->addr); - goto out; /* nothing to recover */ + /* +* The device is removed, tear down its state, on powernv +* hotplug driver would take care of it but not on pseries, +* permanently disable the card as it is hot removed. +* +* In the case of powernv, note that the removal of device +* is covered by pci rescan lock, so no problem even if hotplug +* driver attempts to remove the device. +*/ + goto recover_failed; } /* Log the event */ -- 2.44.0
Re: [PATCH] powerpc/eeh: Permanently disable the removed device
On 4/9/24 14:37, Michael Ellerman wrote: Hi Ganesh, Ganesh Goudar writes: When a device is hot removed on powernv, the hotplug driver clears the device's state. However, on pseries, if a device is removed by phyp after reaching the error threshold, the kernel remains unaware, leading to the device not being torn down. This prevents necessary remediation actions like failover. Permanently disable the device if the presence check fails. You can wrap your changelogs a bit wider, 70 or 80 columns is fine. ok diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index ab316e155ea9..8d1606406d3f 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -508,7 +508,9 @@ int eeh_dev_check_failure(struct eeh_dev *edev) * state, PE is in good state. */ if ((ret < 0) || - (ret == EEH_STATE_NOT_SUPPORT) || eeh_state_active(ret)) { + (ret == EEH_STATE_NOT_SUPPORT && +dev->error_state == pci_channel_io_perm_failure) || + eeh_state_active(ret)) { eeh_stats.false_positives++; pe->false_positives++; rc = 0; How does this hunk relate the changelog? This is adding an extra condition to the false positive check, so there's a risk this causes devices to go into failure when previously they didn't, right? So please explain why it's a good change. The comment above the if needs updating too. We need this change to log the event and get the device removed, I will explain this in commit message. diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c index 48773d2d9be3..10317badf471 100644 --- a/arch/powerpc/kernel/eeh_driver.c +++ b/arch/powerpc/kernel/eeh_driver.c @@ -867,7 +867,13 @@ void eeh_handle_normal_event(struct eeh_pe *pe) if (!devices) { pr_debug("EEH: Frozen PHB#%x-PE#%x is empty!\n", pe->phb->global_number, pe->addr); - goto out; /* nothing to recover */ The other cases that go to recover_failed usually print something at warn level, so this probably should too. 
So either make the above a pr_warn(), or change it to a warn with a more helpful message. ok + /* +* The device is removed, Tear down its state, +* On powernv hotplug driver would take care of +* it but not on pseries, Permanently disable the +* card as it is hot removed. +*/ Formatting and punctuation is weird. It can be wider, and capital letter is only required after a full stop, not a comma. ok, i will take care of it. Also you say that the powernv hotplug driver "would" take care of it, that's past tense, is that what you mean? Does the powernv hotplug driver still take care of it after this change? And (how) does that driver cope with it happening here also? Yes, hotplug driver can still remove the device and the removal of device is covered by pci rescan lock. + goto recover_failed; } cheers
[PATCH] powerpc/eeh: Permanently disable the removed device
When a device is hot removed on powernv, the hotplug driver clears the device's state. However, on pseries, if a device is removed by phyp after reaching the error threshold, the kernel remains unaware, leading to the device not being torn down. This prevents necessary remediation actions like failover. Permanently disable the device if the presence check fails. Signed-off-by: Ganesh Goudar --- arch/powerpc/kernel/eeh.c| 4 +++- arch/powerpc/kernel/eeh_driver.c | 8 +++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index ab316e155ea9..8d1606406d3f 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -508,7 +508,9 @@ int eeh_dev_check_failure(struct eeh_dev *edev) * state, PE is in good state. */ if ((ret < 0) || - (ret == EEH_STATE_NOT_SUPPORT) || eeh_state_active(ret)) { + (ret == EEH_STATE_NOT_SUPPORT && +dev->error_state == pci_channel_io_perm_failure) || + eeh_state_active(ret)) { eeh_stats.false_positives++; pe->false_positives++; rc = 0; diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c index 48773d2d9be3..10317badf471 100644 --- a/arch/powerpc/kernel/eeh_driver.c +++ b/arch/powerpc/kernel/eeh_driver.c @@ -867,7 +867,13 @@ void eeh_handle_normal_event(struct eeh_pe *pe) if (!devices) { pr_debug("EEH: Frozen PHB#%x-PE#%x is empty!\n", pe->phb->global_number, pe->addr); - goto out; /* nothing to recover */ + /* +* The device is removed, Tear down its state, +* On powernv hotplug driver would take care of +* it but not on pseries, Permanently disable the +* card as it is hot removed. +*/ + goto recover_failed; } /* Log the event */ -- 2.44.0
[PATCH 1/1] powerpc/eeh: Enable PHBs to recovery in parallel
Currnetly, With a single event queue EEH recovery is entirely serialized and takes place within a single kernel thread. This can cause recovery to take a long time when there are many devices. Have the recovery event queue per PHB and allow the recovery to happen independently from other PHBs. Signed-off-by: Ganesh Goudar --- arch/powerpc/include/asm/eeh_event.h | 7 + arch/powerpc/include/asm/pci-bridge.h | 4 +++ arch/powerpc/kernel/eeh_driver.c | 27 +-- arch/powerpc/kernel/eeh_event.c | 38 ++- arch/powerpc/kernel/eeh_pe.c | 4 +++ 5 files changed, 77 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/include/asm/eeh_event.h b/arch/powerpc/include/asm/eeh_event.h index dadde7d52f46..6af1b5bb6103 100644 --- a/arch/powerpc/include/asm/eeh_event.h +++ b/arch/powerpc/include/asm/eeh_event.h @@ -8,6 +8,8 @@ #define ASM_POWERPC_EEH_EVENT_H #ifdef __KERNEL__ +#include + /* * structure holding pci controller data that describes a * change in the isolation status of a PCI slot. A pointer @@ -15,15 +17,20 @@ * callback. 
*/ struct eeh_event { + struct work_struct work; struct list_headlist; /* to form event queue */ struct eeh_pe *pe;/* EEH PE */ }; +extern spinlock_t eeh_eventlist_lock; + int eeh_event_init(void); +int eeh_phb_event(struct eeh_pe *pe); int eeh_send_failure_event(struct eeh_pe *pe); int __eeh_send_failure_event(struct eeh_pe *pe); void eeh_remove_event(struct eeh_pe *pe, bool force); void eeh_handle_normal_event(struct eeh_pe *pe); +void eeh_handle_normal_event_work(struct work_struct *work); void eeh_handle_special_event(void); #endif /* __KERNEL__ */ diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h index 2aa3a091ef20..61884d9398bf 100644 --- a/arch/powerpc/include/asm/pci-bridge.h +++ b/arch/powerpc/include/asm/pci-bridge.h @@ -138,6 +138,10 @@ struct pci_controller { /* iommu_ops support */ struct iommu_device iommu; + + bool eeh_in_progress; + struct list_head eeh_eventlist; + spinlock_t eeh_eventlist_lock; }; /* These are used for config access before all the PCI probing diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c index 48773d2d9be3..d5612303766e 100644 --- a/arch/powerpc/kernel/eeh_driver.c +++ b/arch/powerpc/kernel/eeh_driver.c @@ -1107,6 +1107,30 @@ void eeh_handle_normal_event(struct eeh_pe *pe) eeh_pe_state_clear(pe, EEH_PE_RECOVERING, true); } +void eeh_handle_normal_event_work(struct work_struct *work) +{ + unsigned long flags; + struct eeh_event *event = container_of(work, struct eeh_event, work); + struct pci_controller *phb = event->pe->phb; + + eeh_handle_normal_event(event->pe); + + kfree(event); + spin_lock_irqsave(>eeh_eventlist_lock, flags); + WARN_ON_ONCE(!phb->eeh_in_progress); + if (list_empty(>eeh_eventlist)) { + phb->eeh_in_progress = false; + pr_debug("EEH: No more work to do\n"); + } else { + pr_warn("EEH: More work to do\n"); + event = list_entry(phb->eeh_eventlist.next, + struct eeh_event, list); + list_del(>list); + queue_work(system_unbound_wq, >work); + } 
+ spin_unlock_irqrestore(>eeh_eventlist_lock, flags); +} + /** * eeh_handle_special_event - Handle EEH events without a specific failing PE * @@ -1176,8 +1200,7 @@ void eeh_handle_special_event(void) */ if (rc == EEH_NEXT_ERR_FROZEN_PE || rc == EEH_NEXT_ERR_FENCED_PHB) { - eeh_pe_state_mark(pe, EEH_PE_RECOVERING); - eeh_handle_normal_event(pe); + eeh_phb_event(pe); } else { eeh_for_each_pe(pe, tmp_pe) eeh_pe_for_each_dev(tmp_pe, edev, tmp_edev) diff --git a/arch/powerpc/kernel/eeh_event.c b/arch/powerpc/kernel/eeh_event.c index c23a454af08a..86c0a988389e 100644 --- a/arch/powerpc/kernel/eeh_event.c +++ b/arch/powerpc/kernel/eeh_event.c @@ -22,7 +22,7 @@ * work-queue, where a worker thread can drive recovery. */ -static DEFINE_SPINLOCK(eeh_eventlist_lock); +DEFINE_SPINLOCK(eeh_eventlist_lock); static DECLARE_COMPLETION(eeh_eventlist_event); static LIST_HEAD(eeh_eventlist); @@ -91,6 +91,42 @@ int eeh_event_init(void) return 0; } +int eeh_phb_event(struct eeh_pe *pe) +{ + struct eeh_event *event; + unsigned long flags; + struct pci_controller *phb; + + event = kzalloc(sizeof(*event), GFP_ATOMIC); + if (!event) + return -ENOMEM; + + if (pe) { + phb = pe->phb; + event->pe = pe; +
[PATCH 0/1] Parallel EEH recovery between PHBs
This change is based on Sam Bobroff's patches which aimed to allow recovery to happen in parallel between PHBs and PEs, Due to various reasons the patches did not get in. But having parallel recovery between PHBs is fairly simple and gives significant improvement on powervm, Since powervm maintains flat hierarchy for PCI devices. This patch enables PHBs to have separate event queues and shorten the time taken for EEH recovery by making the recovery to run in parallel between PHBs. On powervm with 64 VFs from same PHB, I see approximately 48% reduction in time taken in EEH recovery. On powernv the improvement is not so significant. Ganesh Goudar (1): powerpc/eeh: Enable PHBs to recovery in parallel arch/powerpc/include/asm/eeh_event.h | 7 + arch/powerpc/include/asm/pci-bridge.h | 4 +++ arch/powerpc/kernel/eeh_driver.c | 27 +-- arch/powerpc/kernel/eeh_event.c | 38 ++- arch/powerpc/kernel/eeh_pe.c | 4 +++ 5 files changed, 77 insertions(+), 3 deletions(-) -- 2.43.2
[RFC PATCH v2 3/3] powerpc/eeh: Asynchronous recovery
Based on the original work from Sam Bobroff. Currently, EEH recovery is entirely serialized and takes place within a single kernel thread. This can cause recovery to take a long time when there are many devices. To shorten recovery time, this change allows recovery to proceed in parallel in two ways: - Each PHB is given it's own recovery event queue and can be recovered independently from other PHBs. - Driver handlers are called in parallel, but with the constraint that handlers higher up (closer to the PHB) in the PE hierarchy must be called before those lower down. To maintain the constraint, above, the driver handlers are called by traversing the tree of affected PEs from the top, stopping to call handlers (in parallel) when a PE with devices is discovered. When the calls for that PE are complete, traversal continues at each child PE. Signed-off-by: Ganesh Goudar --- arch/powerpc/include/asm/eeh.h| 1 + arch/powerpc/include/asm/eeh_event.h | 7 + arch/powerpc/include/asm/pci-bridge.h | 4 + arch/powerpc/kernel/eeh.c | 5 - arch/powerpc/kernel/eeh_driver.c | 323 +++--- arch/powerpc/kernel/eeh_event.c | 69 +++--- arch/powerpc/kernel/eeh_pe.c | 4 + 7 files changed, 294 insertions(+), 119 deletions(-) diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h index 95708c801f27..a99472635350 100644 --- a/arch/powerpc/include/asm/eeh.h +++ b/arch/powerpc/include/asm/eeh.h @@ -128,6 +128,7 @@ static inline bool eeh_pe_passed(struct eeh_pe *pe) #define EEH_DEV_NO_HANDLER (1 << 8)/* No error handler */ #define EEH_DEV_SYSFS (1 << 9)/* Sysfs created*/ #define EEH_DEV_REMOVED(1 << 10) /* Removed permanently */ +#define EEH_DEV_RECOVERING (1 << 11) /* Recovering */ struct eeh_dev { int mode; /* EEH mode */ diff --git a/arch/powerpc/include/asm/eeh_event.h b/arch/powerpc/include/asm/eeh_event.h index a1fe736bc4cf..b21f49e87b7b 100644 --- a/arch/powerpc/include/asm/eeh_event.h +++ b/arch/powerpc/include/asm/eeh_event.h @@ -8,6 +8,8 @@ #define 
ASM_POWERPC_EEH_EVENT_H #ifdef __KERNEL__ +#include + /* * structure holding pci controller data that describes a * change in the isolation status of a PCI slot. A pointer @@ -15,16 +17,21 @@ * callback. */ struct eeh_event { + struct work_struct work; struct list_headlist; /* to form event queue */ struct eeh_pe *pe;/* EEH PE */ unsigned intid; /* Event ID */ }; +extern spinlock_t eeh_eventlist_lock; + int eeh_event_init(void); +int eeh_phb_event(struct eeh_pe *pe); int eeh_send_failure_event(struct eeh_pe *pe); int __eeh_send_failure_event(struct eeh_pe *pe); void eeh_remove_event(struct eeh_pe *pe, bool force); void eeh_handle_normal_event(unsigned int event_id, struct eeh_pe *pe); +void eeh_handle_normal_event_work(struct work_struct *work); void eeh_handle_special_event(void); #endif /* __KERNEL__ */ diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h index 2aa3a091ef20..61884d9398bf 100644 --- a/arch/powerpc/include/asm/pci-bridge.h +++ b/arch/powerpc/include/asm/pci-bridge.h @@ -138,6 +138,10 @@ struct pci_controller { /* iommu_ops support */ struct iommu_device iommu; + + bool eeh_in_progress; + struct list_head eeh_eventlist; + spinlock_t eeh_eventlist_lock; }; /* These are used for config access before all the PCI probing diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index 733fb290f4b7..12536d892826 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -579,11 +579,6 @@ int eeh_dev_check_failure(struct eeh_dev *edev) * bridges. */ eeh_pe_mark_isolated(pe); - - /* Most EEH events are due to device driver bugs. Having -* a stack trace will help the device-driver authors figure -* out what happened. So print that out. 
-*/ pr_debug("EEH: %s: Frozen PHB#%x-PE#%x detected\n", __func__, pe->phb->global_number, pe->addr); eeh_send_failure_event(pe); diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c index cdf2de0eba57..49f8b99dfb25 100644 --- a/arch/powerpc/kernel/eeh_driver.c +++ b/arch/powerpc/kernel/eeh_driver.c @@ -12,12 +12,17 @@ #include #include #include +#include +#include +#include #include #include #include #include #include +static atomic_t eeh_wu_id = ATOMIC_INIT(0); + struct eeh_rmv_data { struct list_head removed_vf_list; int removed_dev_count; @@ -248,73 +253,59 @@ static void eeh_set_irq_state(struct eeh_pe *root, bool enable) } typedef enum pci_ers_re
[RFC PATCH v2 2/3] powerpc/eeh: Provide a unique ID for each EEH recovery
Based on the original work from Sam Bobroff. Give a unique ID to each recovery event, to ease log parsing and prepare for parallel recovery. Also add some new messages with a very simple format that may be useful to log-parsers. Signed-off-by: Ganesh Goudar --- arch/powerpc/include/asm/eeh_event.h | 3 +- arch/powerpc/include/asm/ppc-pci.h | 2 +- arch/powerpc/kernel/eeh.c| 42 +++--- arch/powerpc/kernel/eeh_driver.c | 189 +++ arch/powerpc/kernel/eeh_event.c | 12 +- include/linux/mmzone.h | 2 +- 6 files changed, 147 insertions(+), 103 deletions(-) diff --git a/arch/powerpc/include/asm/eeh_event.h b/arch/powerpc/include/asm/eeh_event.h index dadde7d52f46..a1fe736bc4cf 100644 --- a/arch/powerpc/include/asm/eeh_event.h +++ b/arch/powerpc/include/asm/eeh_event.h @@ -17,13 +17,14 @@ struct eeh_event { struct list_headlist; /* to form event queue */ struct eeh_pe *pe;/* EEH PE */ + unsigned intid; /* Event ID */ }; int eeh_event_init(void); int eeh_send_failure_event(struct eeh_pe *pe); int __eeh_send_failure_event(struct eeh_pe *pe); void eeh_remove_event(struct eeh_pe *pe, bool force); -void eeh_handle_normal_event(struct eeh_pe *pe); +void eeh_handle_normal_event(unsigned int event_id, struct eeh_pe *pe); void eeh_handle_special_event(void); #endif /* __KERNEL__ */ diff --git a/arch/powerpc/include/asm/ppc-pci.h b/arch/powerpc/include/asm/ppc-pci.h index d9fcff575027..5b82e76dbd19 100644 --- a/arch/powerpc/include/asm/ppc-pci.h +++ b/arch/powerpc/include/asm/ppc-pci.h @@ -40,7 +40,7 @@ extern int rtas_setup_phb(struct pci_controller *phb); void eeh_addr_cache_insert_dev(struct pci_dev *dev); void eeh_addr_cache_rmv_dev(struct pci_dev *dev); struct eeh_dev *eeh_addr_cache_get_dev(unsigned long addr); -void eeh_slot_error_detail(struct eeh_pe *pe, int severity); +void eeh_slot_error_detail(unsigned int event_id, struct eeh_pe *pe, int severity); int eeh_pci_enable(struct eeh_pe *pe, int function); int eeh_pe_reset_full(struct eeh_pe *pe, bool include_passed); void 
eeh_save_bars(struct eeh_dev *edev); diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index 6907722c6c1e..733fb290f4b7 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -194,7 +194,8 @@ EXPORT_SYMBOL_GPL(eeh_recovery_must_be_locked); * for the indicated PCI device, and puts them into a buffer * for RTAS error logging. */ -static size_t eeh_dump_dev_log(struct eeh_dev *edev, char *buf, size_t len) +static size_t eeh_dump_dev_log(unsigned int event_id, struct eeh_dev *edev, + char *buf, size_t len) { u32 cfg; int cap, i; @@ -204,27 +205,29 @@ static size_t eeh_dump_dev_log(struct eeh_dev *edev, char *buf, size_t len) n += scnprintf(buf+n, len-n, "%04x:%02x:%02x.%01x\n", edev->pe->phb->global_number, edev->bdfn >> 8, PCI_SLOT(edev->bdfn), PCI_FUNC(edev->bdfn)); - pr_warn("EEH: of node=%04x:%02x:%02x.%01x\n", + pr_warn("EEH(%u): of node=%04x:%02x:%02x.%01x\n", + event_id, edev->pe->phb->global_number, edev->bdfn >> 8, PCI_SLOT(edev->bdfn), PCI_FUNC(edev->bdfn)); eeh_ops->read_config(edev, PCI_VENDOR_ID, 4, ); n += scnprintf(buf+n, len-n, "dev/vend:%08x\n", cfg); - pr_warn("EEH: PCI device/vendor: %08x\n", cfg); + pr_warn("EEH(%u): PCI device/vendor: %08x\n",event_id, cfg); eeh_ops->read_config(edev, PCI_COMMAND, 4, ); n += scnprintf(buf+n, len-n, "cmd/stat:%x\n", cfg); - pr_warn("EEH: PCI cmd/status register: %08x\n", cfg); + pr_warn("EEH(%u): PCI cmd/status register: %08x\n", event_id, cfg); /* Gather bridge-specific registers */ if (edev->mode & EEH_DEV_BRIDGE) { eeh_ops->read_config(edev, PCI_SEC_STATUS, 2, ); n += scnprintf(buf+n, len-n, "sec stat:%x\n", cfg); - pr_warn("EEH: Bridge secondary status: %04x\n", cfg); + pr_warn("EEH(%u): Bridge secondary status: %04x\n", + event_id, cfg); eeh_ops->read_config(edev, PCI_BRIDGE_CONTROL, 2, ); n += scnprintf(buf+n, len-n, "brdg ctl:%x\n", cfg); - pr_warn("EEH: Bridge control: %04x\n", cfg); + pr_warn("EEH(%u): Bridge control: %04x\n", event_id, cfg); } /* Dump out the PCI-X 
command and status regs */ @@ -232,18 +235,19 @@ static size_t eeh_dump_dev_log(struct eeh_dev *edev, char *buf, size_t len) if (cap) { eeh_ops->
[RFC PATCH v2 1/3] powerpc/eeh: Synchronization for safety
Based on the original work from Sam Bobroff. There is currently little synchronization between EEH error detection (eeh_dev_check_failure()), EEH error recovery (eeh_handle_{normal,special}_event()) and the PCI subsystem (device addition and removal), and so there are race conditions that lead to crashes (often access to free'd memory or LIST_POISON). However, a solution must consider: - EEH error detection can occur in interrupt context, which prevents the use of a mutex. - EEH recovery may need to sleep, which prevents the use of a spinlock. - EEH recovery uses PCI operations that may require the PCI rescan/remove lock and/or device lock to be held - PCI operations may hold the rescan/remove and/or device lock when calling into EEH functions. - Device driver callbacks may perform arbitrary PCI operations during recovery, including device removal. In this patch the existing mutex and spinlock are combined with the EEH_PE_RECOVERING flag to provide some assurances that are then used to reduce the race conditions. The fields to be protected are the ones that provide the structure of the trees of struct eeh_pe that are held for each PHB: the parent pointer and child lists and the list of struct eeh_dev, as well as the pe and pdev pointers within struct eeh_dev. The existing way of using EEH_PE_RECOVERING is kept and slightly extended: No struct eeh_pe will be removed while it has the flag set on it. Additionally, when adding new PEs, they are marked EEH_PE_RECOVERING if their parent PE is marked: this allows the recovery thread to assume that all PEs underneath the one it's processing will continue to exist during recovery. Both the mutex and spinlock are held while any protected field is changed or a PE is deleted, so holding either of them (elsewhere) will keep them stable and safe to access. Additionally, if EEH_PE_RECOVERING is set on a PE then the locks can be released and re-acquired safely, as long as the protected fields aren't used while no locks are held. 
This is used during recovery to release locks for long sleeps (i.e. during eeh_wait_state() when we may sleep up to 5 minutes), or to maintain lock ordering. The spinlock is used in error detection (which cannot use a mutex, see above) and also where it's possible that the mutex is already held. The mutex is used in areas that don't have that restriction, and where blocking may be required. Care must be taken when ordering these locks against the PCI rescan/remove lock and the device locks to avoid deadlocking. Signed-off-by: Ganesh Goudar --- arch/powerpc/include/asm/eeh.h | 12 +- arch/powerpc/kernel/eeh.c| 112 ++-- arch/powerpc/kernel/eeh_driver.c | 288 ++- arch/powerpc/kernel/eeh_pe.c | 30 +- arch/powerpc/platforms/powernv/eeh-powernv.c | 12 +- arch/powerpc/platforms/pseries/eeh_pseries.c | 5 +- arch/powerpc/platforms/pseries/pci_dlpar.c | 5 +- drivers/pci/hotplug/pnv_php.c| 5 +- drivers/pci/hotplug/rpadlpar_core.c | 2 + drivers/vfio/vfio_iommu_spapr_tce.c | 10 +- 10 files changed, 365 insertions(+), 116 deletions(-) diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h index 514dd056c2c8..95708c801f27 100644 --- a/arch/powerpc/include/asm/eeh.h +++ b/arch/powerpc/include/asm/eeh.h @@ -233,7 +233,7 @@ extern int eeh_subsystem_flags; extern u32 eeh_max_freezes; extern bool eeh_debugfs_no_recover; extern struct eeh_ops *eeh_ops; -extern raw_spinlock_t confirm_error_lock; +extern raw_spinlock_t eeh_pe_tree_spinlock; static inline void eeh_add_flag(int flag) { @@ -257,12 +257,12 @@ static inline bool eeh_enabled(void) static inline void eeh_serialize_lock(unsigned long *flags) { - raw_spin_lock_irqsave(_error_lock, *flags); + raw_spin_lock_irqsave(_pe_tree_spinlock, *flags); } static inline void eeh_serialize_unlock(unsigned long flags) { - raw_spin_unlock_irqrestore(_error_lock, flags); + raw_spin_unlock_irqrestore(_pe_tree_spinlock, flags); } static inline bool eeh_state_active(int state) @@ -271,11 +271,15 @@ static inline bool 
eeh_state_active(int state) == (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE); } +void eeh_recovery_lock(void); +void eeh_recovery_unlock(void); +void eeh_recovery_must_be_locked(void); + typedef void (*eeh_edev_traverse_func)(struct eeh_dev *edev, void *flag); typedef void *(*eeh_pe_traverse_func)(struct eeh_pe *pe, void *flag); void eeh_set_pe_aux_size(int size); int eeh_phb_pe_create(struct pci_controller *phb); -int eeh_wait_state(struct eeh_pe *pe, int max_wait); +int eeh_wait_state(struct eeh_pe *pe, int max_wait, bool unlock); struct eeh_pe *eeh_phb_pe_get(struct pci_controller *phb); struct eeh_pe *eeh_pe_next(struct eeh_pe *pe, struct eeh_pe *root); struct eeh_pe *eeh_pe_get(struct pci_controller *phb, int pe_no); diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
[RFC PATCH v2 0/3] Asynchronous EEH recovery
Hi, EEH recovery is currently serialized, and these patches shorten the time taken for EEH recovery by making the recovery run in parallel. The original author of these patches is Sam Bobroff; I have rebased and tested them. On powervm, with 64 VFs from the same PHB, I see approximately a 48% reduction in the time taken for EEH recovery. On powernv, with 9 network cards, where 2 cards are installed on one PHB and 1 card on each of the remaining PHBs, providing 20 PFs in total, I see approximately a 33% reduction in the time taken for EEH recovery. These patches were originally posted as separate RFCs by Sam, and I rebased and posted them almost a year back. I stopped pursuing these patches as I was not able to test them on powernv, due to issues in the drivers of the cards I was testing on, which are now resolved. Since I am re-posting this after a long time, I am posting it as a fresh RFC; please comment. Thanks. V2: * Since we now have an event list per PHB, have a per-PHB event list lock. * Appropriate names given to the locks. * Remove stale comments (a few more to be removed). * Initialize event_id to 0 instead of 1. * And some cosmetic changes. Ganesh Goudar (3): powerpc/eeh: Synchronization for safety powerpc/eeh: Provide a unique ID for each EEH recovery powerpc/eeh: Asynchronous recovery arch/powerpc/include/asm/eeh.h | 13 +- arch/powerpc/include/asm/eeh_event.h | 10 +- arch/powerpc/include/asm/pci-bridge.h| 4 + arch/powerpc/include/asm/ppc-pci.h | 2 +- arch/powerpc/kernel/eeh.c| 159 +++-- arch/powerpc/kernel/eeh_driver.c | 580 +++ arch/powerpc/kernel/eeh_event.c | 75 ++- arch/powerpc/kernel/eeh_pe.c | 34 +- arch/powerpc/platforms/powernv/eeh-powernv.c | 12 +- arch/powerpc/platforms/pseries/eeh_pseries.c | 5 +- arch/powerpc/platforms/pseries/pci_dlpar.c | 5 +- drivers/pci/hotplug/pnv_php.c| 5 +- drivers/pci/hotplug/rpadlpar_core.c | 2 + drivers/vfio/vfio_iommu_spapr_tce.c | 10 +- include/linux/mmzone.h | 2 +- 15 files changed, 693 insertions(+), 225 deletions(-) -- 2.40.1
Re: [RFC 0/3] Asynchronous EEH recovery
On 6/13/23 8:06 AM, Oliver O'Halloran wrote: On Tue, Jun 13, 2023 at 11:44 AM Ganesh Goudar wrote: Hi, EEH recovery is currently serialized and these patches shorten the time taken for EEH recovery by making the recovery to run in parallel. The original author of these patches is Sam Bobroff, I have rebased and tested these patches. On powervm with 64 VFs from same PHB, I see approximately 48% reduction in time taken in EEH recovery. On powernv with 9 network cards, Where 2 cards installed on one PHB and 1 card on each of the rest of the PHBs, Providing 20 PFs in total. I see approximately 33% reduction in time taken in EEH recovery. These patches were originally posted as separate RFCs by Sam, And I rebased and posted these patches almost a year back, I stopped pursuing these patches as I was not able test this on powernv, Due to the issues in drivers of cards I was testing this on, Which are now resolved. Since I am re-posting this after long time, Posting this as a fresh RFC, Please comment. What changes have you made since the last time you posted this series? If the patches are the same then the comments I posted last time still apply. Hi Oliver, You asked about the way we are testing this on powervm, expressed concerns about having this on powernv, suggested having this feature just for powervm for now, and also expressed concerns about having two locks. On powervm, using a two-port card, we are instantiating 64 VFs for an LPAR and injecting the error on the bus from PHYP to observe the behavior. I was able to test this on powernv with 16 PFs from 8 cards installed on separate PHBs, where I saw considerable performance improvement. Regarding the two-locks idea, I may not have tested it for all scenarios, but so far I have not faced any issue. Are you suggesting a different approach? Thanks
[RFC 2/3] powerpc/eeh: Provide a unique ID for each EEH recovery
Based on the original work from Sam Bobroff. Give a unique ID to each recovery event, to ease log parsing and prepare for parallel recovery. Also add some new messages with a very simple format that may be useful to log-parsers. Signed-off-by: Ganesh Goudar --- arch/powerpc/include/asm/eeh_event.h | 3 +- arch/powerpc/include/asm/ppc-pci.h | 2 +- arch/powerpc/kernel/eeh.c| 42 +++--- arch/powerpc/kernel/eeh_driver.c | 189 +++ arch/powerpc/kernel/eeh_event.c | 12 +- include/linux/mmzone.h | 2 +- 6 files changed, 147 insertions(+), 103 deletions(-) diff --git a/arch/powerpc/include/asm/eeh_event.h b/arch/powerpc/include/asm/eeh_event.h index dadde7d52f46..a1fe736bc4cf 100644 --- a/arch/powerpc/include/asm/eeh_event.h +++ b/arch/powerpc/include/asm/eeh_event.h @@ -17,13 +17,14 @@ struct eeh_event { struct list_headlist; /* to form event queue */ struct eeh_pe *pe;/* EEH PE */ + unsigned intid; /* Event ID */ }; int eeh_event_init(void); int eeh_send_failure_event(struct eeh_pe *pe); int __eeh_send_failure_event(struct eeh_pe *pe); void eeh_remove_event(struct eeh_pe *pe, bool force); -void eeh_handle_normal_event(struct eeh_pe *pe); +void eeh_handle_normal_event(unsigned int event_id, struct eeh_pe *pe); void eeh_handle_special_event(void); #endif /* __KERNEL__ */ diff --git a/arch/powerpc/include/asm/ppc-pci.h b/arch/powerpc/include/asm/ppc-pci.h index d9fcff575027..5b82e76dbd19 100644 --- a/arch/powerpc/include/asm/ppc-pci.h +++ b/arch/powerpc/include/asm/ppc-pci.h @@ -40,7 +40,7 @@ extern int rtas_setup_phb(struct pci_controller *phb); void eeh_addr_cache_insert_dev(struct pci_dev *dev); void eeh_addr_cache_rmv_dev(struct pci_dev *dev); struct eeh_dev *eeh_addr_cache_get_dev(unsigned long addr); -void eeh_slot_error_detail(struct eeh_pe *pe, int severity); +void eeh_slot_error_detail(unsigned int event_id, struct eeh_pe *pe, int severity); int eeh_pci_enable(struct eeh_pe *pe, int function); int eeh_pe_reset_full(struct eeh_pe *pe, bool include_passed); void 
eeh_save_bars(struct eeh_dev *edev); diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index 2c90c37524ed..148d5df0e606 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -200,7 +200,8 @@ EXPORT_SYMBOL_GPL(eeh_recovery_must_be_locked); * for the indicated PCI device, and puts them into a buffer * for RTAS error logging. */ -static size_t eeh_dump_dev_log(struct eeh_dev *edev, char *buf, size_t len) +static size_t eeh_dump_dev_log(unsigned int event_id, struct eeh_dev *edev, + char *buf, size_t len) { u32 cfg; int cap, i; @@ -210,27 +211,29 @@ static size_t eeh_dump_dev_log(struct eeh_dev *edev, char *buf, size_t len) n += scnprintf(buf+n, len-n, "%04x:%02x:%02x.%01x\n", edev->pe->phb->global_number, edev->bdfn >> 8, PCI_SLOT(edev->bdfn), PCI_FUNC(edev->bdfn)); - pr_warn("EEH: of node=%04x:%02x:%02x.%01x\n", + pr_warn("EEH(%u): of node=%04x:%02x:%02x.%01x\n", + event_id, edev->pe->phb->global_number, edev->bdfn >> 8, PCI_SLOT(edev->bdfn), PCI_FUNC(edev->bdfn)); eeh_ops->read_config(edev, PCI_VENDOR_ID, 4, ); n += scnprintf(buf+n, len-n, "dev/vend:%08x\n", cfg); - pr_warn("EEH: PCI device/vendor: %08x\n", cfg); + pr_warn("EEH(%u): PCI device/vendor: %08x\n",event_id, cfg); eeh_ops->read_config(edev, PCI_COMMAND, 4, ); n += scnprintf(buf+n, len-n, "cmd/stat:%x\n", cfg); - pr_warn("EEH: PCI cmd/status register: %08x\n", cfg); + pr_warn("EEH(%u): PCI cmd/status register: %08x\n", event_id, cfg); /* Gather bridge-specific registers */ if (edev->mode & EEH_DEV_BRIDGE) { eeh_ops->read_config(edev, PCI_SEC_STATUS, 2, ); n += scnprintf(buf+n, len-n, "sec stat:%x\n", cfg); - pr_warn("EEH: Bridge secondary status: %04x\n", cfg); + pr_warn("EEH(%u): Bridge secondary status: %04x\n", + event_id, cfg); eeh_ops->read_config(edev, PCI_BRIDGE_CONTROL, 2, ); n += scnprintf(buf+n, len-n, "brdg ctl:%x\n", cfg); - pr_warn("EEH: Bridge control: %04x\n", cfg); + pr_warn("EEH(%u): Bridge control: %04x\n", event_id, cfg); } /* Dump out the PCI-X 
command and status regs */ @@ -238,18 +241,19 @@ static size_t eeh_dump_dev_log(struct eeh_dev *edev, char *buf, size_t len) if (cap) { eeh_ops->
[RFC 3/3] powerpc/eeh: Asynchronous recovery
Based on the original work from Sam Bobroff. Currently, EEH recovery is entirely serialized and takes place within a single kernel thread. This can cause recovery to take a long time when there are many devices. To shorten recovery time, this change allows recovery to proceed in parallel in two ways: - Each PHB is given it's own recovery event queue and can be recovered independently from other PHBs. - Driver handlers are called in parallel, but with the constraint that handlers higher up (closer to the PHB) in the PE hierarchy must be called before those lower down. To maintain the constraint, above, the driver handlers are called by traversing the tree of affected PEs from the top, stopping to call handlers (in parallel) when a PE with devices is discovered. When the calls for that PE are complete, traversal continues at each child PE. Signed-off-by: Ganesh Goudar --- arch/powerpc/include/asm/eeh.h| 1 + arch/powerpc/include/asm/eeh_event.h | 7 + arch/powerpc/include/asm/pci-bridge.h | 3 + arch/powerpc/kernel/eeh_driver.c | 323 +++--- arch/powerpc/kernel/eeh_event.c | 65 +++--- arch/powerpc/kernel/eeh_pe.c | 3 + 6 files changed, 288 insertions(+), 114 deletions(-) diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h index d0f09e691498..06d7dabdccfe 100644 --- a/arch/powerpc/include/asm/eeh.h +++ b/arch/powerpc/include/asm/eeh.h @@ -128,6 +128,7 @@ static inline bool eeh_pe_passed(struct eeh_pe *pe) #define EEH_DEV_NO_HANDLER (1 << 8)/* No error handler */ #define EEH_DEV_SYSFS (1 << 9)/* Sysfs created*/ #define EEH_DEV_REMOVED(1 << 10) /* Removed permanently */ +#define EEH_DEV_RECOVERING (1 << 11) /* Recovering */ struct eeh_dev { int mode; /* EEH mode */ diff --git a/arch/powerpc/include/asm/eeh_event.h b/arch/powerpc/include/asm/eeh_event.h index a1fe736bc4cf..b21f49e87b7b 100644 --- a/arch/powerpc/include/asm/eeh_event.h +++ b/arch/powerpc/include/asm/eeh_event.h @@ -8,6 +8,8 @@ #define ASM_POWERPC_EEH_EVENT_H #ifdef __KERNEL__ 
+#include + /* * structure holding pci controller data that describes a * change in the isolation status of a PCI slot. A pointer @@ -15,16 +17,21 @@ * callback. */ struct eeh_event { + struct work_struct work; struct list_headlist; /* to form event queue */ struct eeh_pe *pe;/* EEH PE */ unsigned intid; /* Event ID */ }; +extern spinlock_t eeh_eventlist_lock; + int eeh_event_init(void); +int eeh_phb_event(struct eeh_pe *pe); int eeh_send_failure_event(struct eeh_pe *pe); int __eeh_send_failure_event(struct eeh_pe *pe); void eeh_remove_event(struct eeh_pe *pe, bool force); void eeh_handle_normal_event(unsigned int event_id, struct eeh_pe *pe); +void eeh_handle_normal_event_work(struct work_struct *work); void eeh_handle_special_event(void); #endif /* __KERNEL__ */ diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h index 2aa3a091ef20..55a5ff9ae30b 100644 --- a/arch/powerpc/include/asm/pci-bridge.h +++ b/arch/powerpc/include/asm/pci-bridge.h @@ -138,6 +138,9 @@ struct pci_controller { /* iommu_ops support */ struct iommu_device iommu; + + bool eeh_in_progress; + struct list_head eeh_eventlist; }; /* These are used for config access before all the PCI probing diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c index cdf2de0eba57..a484d6ef33a1 100644 --- a/arch/powerpc/kernel/eeh_driver.c +++ b/arch/powerpc/kernel/eeh_driver.c @@ -12,12 +12,17 @@ #include #include #include +#include +#include +#include #include #include #include #include #include +static atomic_t eeh_wu_id = ATOMIC_INIT(0); + struct eeh_rmv_data { struct list_head removed_vf_list; int removed_dev_count; @@ -248,73 +253,59 @@ static void eeh_set_irq_state(struct eeh_pe *root, bool enable) } typedef enum pci_ers_result (*eeh_report_fn)(unsigned int event_id, +unsigned int id, struct pci_dev *, struct pci_driver *); static void eeh_pe_report_pdev(unsigned int event_id, - struct pci_dev *pdev, eeh_report_fn fn, + unsigned int id, + struct 
pci_dev *pdev, + const char *fn_name, eeh_report_fn fn, enum pci_ers_result *result, - const char *handler_name) + bool late, bool removed, bool passed) { - struct eeh_dev *edev; struct pci_driv
[RFC 1/3] powerpc/eeh: Synchronization for safety
Based on the original work from Sam Bobroff. There is currently little synchronization between EEH error detection (eeh_dev_check_failure()), EEH error recovery (eeh_handle_{normal,special}_event()) and the PCI subsystem (device addition and removal), and so there are race conditions that lead to crashes (often access to free'd memory or LIST_POISON). However, a solution must consider: - EEH error detection can occur in interrupt context, which prevents the use of a mutex. - EEH recovery may need to sleep, which prevents the use of a spinlock. - EEH recovery uses PCI operations that may require the PCI rescan/remove lock and/or device lock to be held - PCI operations may hold the rescan/remove and/or device lock when calling into EEH functions. - Device driver callbacks may perform arbitrary PCI operations during recovery, including device removal. In this patch the existing mutex and spinlock are combined with the EEH_PE_RECOVERING flag to provide some assurances that are then used to reduce the race conditions. The fields to be protected are the ones that provide the structure of the trees of struct eeh_pe that are held for each PHB: the parent pointer and child lists and the list of struct eeh_dev, as well as the pe and pdev pointers within struct eeh_dev. The existing way of using EEH_PE_RECOVERING is kept and slightly extended: No struct eeh_pe will be removed while it has the flag set on it. Additionally, when adding new PEs, they are marked EEH_PE_RECOVERING if their parent PE is marked: this allows the recovery thread to assume that all PEs underneath the one it's processing will continue to exist during recovery. Both the mutex and spinlock are held while any protected field is changed or a PE is deleted, so holding either of them (elsewhere) will keep them stable and safe to access. Additionally, if EEH_PE_RECOVERING is set on a PE then the locks can be released and re-acquired safely, as long as the protected fields aren't used while no locks are held. 
This is used during recovery to release locks for long sleeps (i.e. during eeh_wait_state() when we may sleep up to 5 minutes), or to maintain lock ordering. The spinlock is used in error detection (which cannot use a mutex, see above) and also where it's possible that the mutex is already held. The mutex is used in areas that don't have that restriction, and where blocking may be required. Care must be taken when ordering these locks against the PCI rescan/remove lock and the device locks to avoid deadlocking. Signed-off-by: Ganesh Goudar --- arch/powerpc/include/asm/eeh.h | 6 +- arch/powerpc/kernel/eeh.c| 112 ++-- arch/powerpc/kernel/eeh_driver.c | 288 ++- arch/powerpc/kernel/eeh_pe.c | 30 +- arch/powerpc/platforms/powernv/eeh-powernv.c | 12 +- arch/powerpc/platforms/pseries/eeh_pseries.c | 5 +- arch/powerpc/platforms/pseries/pci_dlpar.c | 5 +- drivers/pci/hotplug/pnv_php.c| 5 +- drivers/pci/hotplug/rpadlpar_core.c | 2 + drivers/vfio/vfio_iommu_spapr_tce.c | 10 +- 10 files changed, 365 insertions(+), 110 deletions(-) diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h index 514dd056c2c8..d0f09e691498 100644 --- a/arch/powerpc/include/asm/eeh.h +++ b/arch/powerpc/include/asm/eeh.h @@ -271,11 +271,15 @@ static inline bool eeh_state_active(int state) == (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE); } +void eeh_recovery_lock(void); +void eeh_recovery_unlock(void); +void eeh_recovery_must_be_locked(void); + typedef void (*eeh_edev_traverse_func)(struct eeh_dev *edev, void *flag); typedef void *(*eeh_pe_traverse_func)(struct eeh_pe *pe, void *flag); void eeh_set_pe_aux_size(int size); int eeh_phb_pe_create(struct pci_controller *phb); -int eeh_wait_state(struct eeh_pe *pe, int max_wait); +int eeh_wait_state(struct eeh_pe *pe, int max_wait, bool unlock); struct eeh_pe *eeh_phb_pe_get(struct pci_controller *phb); struct eeh_pe *eeh_pe_next(struct eeh_pe *pe, struct eeh_pe *root); struct eeh_pe *eeh_pe_get(struct pci_controller *phb, int pe_no); 
diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index ab316e155ea9..2c90c37524ed 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -108,7 +108,25 @@ bool eeh_debugfs_no_recover; /* Platform dependent EEH operations */ struct eeh_ops *eeh_ops = NULL; -/* Lock to avoid races due to multiple reports of an error */ +/* + * confirm_error_lock and eeh_dev_mutex are used together to provide + * safety during EEH operations. + * + * Generally, the spinlock is used in error detection where it's not possible + * to use a mutex or where there is potential to deadlock with the mutex, and + * the mutex is used during recovery and other PCI related operations. One must + * be held when reading and both must be held when making changes to the + * protected fields: eeh_pe.parent
[RFC 0/3] Asynchronous EEH recovery
Hi, EEH recovery is currently serialized, and these patches shorten the time taken for EEH recovery by making the recovery run in parallel. The original author of these patches is Sam Bobroff; I have rebased and tested them. On powervm, with 64 VFs from the same PHB, I see approximately a 48% reduction in the time taken for EEH recovery. On powernv, with 9 network cards, where 2 cards are installed on one PHB and 1 card on each of the remaining PHBs, providing 20 PFs in total, I see approximately a 33% reduction in the time taken for EEH recovery. These patches were originally posted as separate RFCs by Sam, and I rebased and posted them almost a year back. I stopped pursuing these patches as I was not able to test them on powernv, due to issues in the drivers of the cards I was testing on, which are now resolved. Since I am re-posting this after a long time, I am posting it as a fresh RFC; please comment. Thanks. Ganesh Goudar (3): powerpc/eeh: Synchronization for safety powerpc/eeh: Provide a unique ID for each EEH recovery powerpc/eeh: Asynchronous recovery arch/powerpc/include/asm/eeh.h | 7 +- arch/powerpc/include/asm/eeh_event.h | 10 +- arch/powerpc/include/asm/pci-bridge.h| 3 + arch/powerpc/include/asm/ppc-pci.h | 2 +- arch/powerpc/kernel/eeh.c| 154 +++-- arch/powerpc/kernel/eeh_driver.c | 580 +++ arch/powerpc/kernel/eeh_event.c | 71 ++- arch/powerpc/kernel/eeh_pe.c | 33 +- arch/powerpc/platforms/powernv/eeh-powernv.c | 12 +- arch/powerpc/platforms/pseries/eeh_pseries.c | 5 +- arch/powerpc/platforms/pseries/pci_dlpar.c | 5 +- drivers/pci/hotplug/pnv_php.c| 5 +- drivers/pci/hotplug/rpadlpar_core.c | 2 + drivers/vfio/vfio_iommu_spapr_tce.c | 10 +- include/linux/mmzone.h | 2 +- 15 files changed, 687 insertions(+), 214 deletions(-) -- 2.40.1
[PATCH] powerpc/eeh: Set channel state after notifying the drivers
When a PCI error is encountered 6th time in an hour we set the channel state to perm_failure and notify the driver about the permanent failure. However, after upstream commit 38ddc011478e ("powerpc/eeh: Make permanently failed devices non-actionable"), EEH handler stops calling any routine once the device is marked as permanent failure. This issue can lead to fatal consequences like kernel hang with certain PCI devices. Following log is observed with lpfc driver, with and without this change, Without this change kernel hangs, If PCI error is encountered 6 times for a device in an hour. Without the change EEH: Beginning: 'error_detected(permanent failure)' PCI 0132:60:00.0#60: EEH: not actionable (1,1,1) PCI 0132:60:00.1#60: EEH: not actionable (1,1,1) EEH: Finished:'error_detected(permanent failure)' With the change EEH: Beginning: 'error_detected(permanent failure)' EEH: Invoking lpfc->error_detected(permanent failure) EEH: lpfc driver reports: 'disconnect' EEH: Invoking lpfc->error_detected(permanent failure) EEH: lpfc driver reports: 'disconnect' EEH: Finished:'error_detected(permanent failure)' To fix the issue, set channel state to permanent failure after notifying the drivers. Fixes: 38ddc011478e ("powerpc/eeh: Make permanently failed devices non-actionable") Suggested-by: Mahesh Salgaonkar Signed-off-by: Ganesh Goudar --- arch/powerpc/kernel/eeh_driver.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c index f279295179bd..438568a472d0 100644 --- a/arch/powerpc/kernel/eeh_driver.c +++ b/arch/powerpc/kernel/eeh_driver.c @@ -1065,10 +1065,10 @@ void eeh_handle_normal_event(struct eeh_pe *pe) eeh_slot_error_detail(pe, EEH_LOG_PERM); /* Notify all devices that they're about to go down. 
*/ - eeh_set_channel_state(pe, pci_channel_io_perm_failure); eeh_set_irq_state(pe, false); eeh_pe_report("error_detected(permanent failure)", pe, eeh_report_failure, NULL); + eeh_set_channel_state(pe, pci_channel_io_perm_failure); /* Mark the PE to be removed permanently */ eeh_pe_state_mark(pe, EEH_PE_REMOVED); @@ -1185,10 +1185,10 @@ void eeh_handle_special_event(void) /* Notify all devices to be down */ eeh_pe_state_clear(pe, EEH_PE_PRI_BUS, true); - eeh_set_channel_state(pe, pci_channel_io_perm_failure); eeh_pe_report( "error_detected(permanent failure)", pe, eeh_report_failure, NULL); + eeh_set_channel_state(pe, pci_channel_io_perm_failure); pci_lock_rescan_remove(); list_for_each_entry(hose, _list, list_node) { -- 2.39.1
Re: [PATCH v2] powerpc/mce: log the error for all unrecoverable errors
On 1/31/23 4:59 PM, Michael Ellerman wrote: Ganesh Goudar writes: For all unrecoverable errors we are missing to log the error, Since machine_check_log_err() is not getting called for unrecoverable errors. Raise irq work in save_mce_event() for unrecoverable errors, So that we log the error from MCE event handling block in timer handler. But the patch also removes the irq work raise from machine_check_ue_event(). That's currently done unconditionally, regardless of the disposition. So doesn't this change also drop logging of recoverable UEs? Maybe that's OK, but the change log should explain it. Yes, its ok, exception vector code will do that for recoverable errors, ill explain this in commit message. Log without this change MCE: CPU27: machine check (Severe) Real address Load/Store (foreign/control memory) [Not recovered] MCE: CPU27: PID: 10580 Comm: inject-ra-err NIP: [1df4] MCE: CPU27: Initiator CPU MCE: CPU27: Unknown Log with this change MCE: CPU24: machine check (Severe) Real address Load/Store (foreign/control memory) [Not recovered] MCE: CPU24: PID: 1589811 Comm: inject-ra-err NIP: [1e48] MCE: CPU24: Initiator CPU MCE: CPU24: Unknown RTAS: event: 5, Type: Platform Error (224), Severity: 3 Signed-off-by: Ganesh Goudar Reviewed-by: Mahesh Salgaonkar --- V2: Rephrasing the commit message. --- arch/powerpc/kernel/mce.c | 8 +++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c index 6c5d30fba766..a1cb2172eb7b 100644 --- a/arch/powerpc/kernel/mce.c +++ b/arch/powerpc/kernel/mce.c @@ -131,6 +131,13 @@ void save_mce_event(struct pt_regs *regs, long handled, if (mce->error_type == MCE_ERROR_TYPE_UE) mce->u.ue_error.ignore_event = mce_err->ignore_event; + /* +* Raise irq work, So that we don't miss to log the error for +* unrecoverable errors. 
+*/ + if (mce->disposition == MCE_DISPOSITION_NOT_RECOVERED) + mce_irq_work_queue(); + if (!addr) return; @@ -235,7 +242,6 @@ static void machine_check_ue_event(struct machine_check_event *evt) evt, sizeof(*evt)); /* Queue work to process this event later. */ This comment is meaningless without the function call it's commenting about, ie. the comment should be removed too. ok. Thanks.
[PATCH v3] powerpc/mce: log the error for all unrecoverable errors
For all unrecoverable errors we are missing to log the error, Since machine_check_log_err() is not getting called for unrecoverable errors. machine_check_log_err() is called from deferred handler, To run deferred handlers we have to do irq work raise from the exception handler. For recoverable errors exception vector code takes care of running deferred handlers. For unrecoverable errors raise irq work in save_mce_event(), So that we log the error from MCE deferred handler. Log without this change MCE: CPU27: machine check (Severe) Real address Load/Store (foreign/control memory) [Not recovered] MCE: CPU27: PID: 10580 Comm: inject-ra-err NIP: [1df4] MCE: CPU27: Initiator CPU MCE: CPU27: Unknown Log with this change MCE: CPU24: machine check (Severe) Real address Load/Store (foreign/control memory) [Not recovered] MCE: CPU24: PID: 1589811 Comm: inject-ra-err NIP: [1e48] MCE: CPU24: Initiator CPU MCE: CPU24: Unknown RTAS: event: 5, Type: Platform Error (224), Severity: 3 Signed-off-by: Ganesh Goudar Reviewed-by: Mahesh Salgaonkar --- V3: Rephrasing the commit message. --- arch/powerpc/kernel/mce.c | 10 +++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c index 6c5d30fba766..219f28637a3e 100644 --- a/arch/powerpc/kernel/mce.c +++ b/arch/powerpc/kernel/mce.c @@ -131,6 +131,13 @@ void save_mce_event(struct pt_regs *regs, long handled, if (mce->error_type == MCE_ERROR_TYPE_UE) mce->u.ue_error.ignore_event = mce_err->ignore_event; + /* +* Raise irq work, So that we don't miss to log the error for +* unrecoverable errors. +*/ + if (mce->disposition == MCE_DISPOSITION_NOT_RECOVERED) + mce_irq_work_queue(); + if (!addr) return; @@ -233,9 +240,6 @@ static void machine_check_ue_event(struct machine_check_event *evt) } memcpy(_paca->mce_info->mce_ue_event_queue[index], evt, sizeof(*evt)); - - /* Queue work to process this event later. */ - mce_irq_work_queue(); } /* -- 2.39.1
[PATCH v2] powerpc/mce: log the error for all unrecoverable errors
For all unrecoverable errors we are missing to log the error, Since machine_check_log_err() is not getting called for unrecoverable errors. Raise irq work in save_mce_event() for unrecoverable errors, So that we log the error from MCE event handling block in timer handler. Log without this change MCE: CPU27: machine check (Severe) Real address Load/Store (foreign/control memory) [Not recovered] MCE: CPU27: PID: 10580 Comm: inject-ra-err NIP: [1df4] MCE: CPU27: Initiator CPU MCE: CPU27: Unknown Log with this change MCE: CPU24: machine check (Severe) Real address Load/Store (foreign/control memory) [Not recovered] MCE: CPU24: PID: 1589811 Comm: inject-ra-err NIP: [1e48] MCE: CPU24: Initiator CPU MCE: CPU24: Unknown RTAS: event: 5, Type: Platform Error (224), Severity: 3 Signed-off-by: Ganesh Goudar Reviewed-by: Mahesh Salgaonkar --- V2: Rephrasing the commit message. --- arch/powerpc/kernel/mce.c | 8 +++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c index 6c5d30fba766..a1cb2172eb7b 100644 --- a/arch/powerpc/kernel/mce.c +++ b/arch/powerpc/kernel/mce.c @@ -131,6 +131,13 @@ void save_mce_event(struct pt_regs *regs, long handled, if (mce->error_type == MCE_ERROR_TYPE_UE) mce->u.ue_error.ignore_event = mce_err->ignore_event; + /* +* Raise irq work, So that we don't miss to log the error for +* unrecoverable errors. +*/ + if (mce->disposition == MCE_DISPOSITION_NOT_RECOVERED) + mce_irq_work_queue(); + if (!addr) return; @@ -235,7 +242,6 @@ static void machine_check_ue_event(struct machine_check_event *evt) evt, sizeof(*evt)); /* Queue work to process this event later. */ - mce_irq_work_queue(); } /* -- 2.38.1
[PATCH] powerpc/mce: log the error for all unrecoverable errors
machine_check_log_err() is not getting called for all unrecoverable errors, And we are missing to log the error. Raise irq work in save_mce_event() for unrecoverable errors, So that we log the error from MCE event handling block in timer handler. Signed-off-by: Ganesh Goudar --- arch/powerpc/kernel/mce.c | 8 +++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c index 6c5d30fba766..a1cb2172eb7b 100644 --- a/arch/powerpc/kernel/mce.c +++ b/arch/powerpc/kernel/mce.c @@ -131,6 +131,13 @@ void save_mce_event(struct pt_regs *regs, long handled, if (mce->error_type == MCE_ERROR_TYPE_UE) mce->u.ue_error.ignore_event = mce_err->ignore_event; + /* +* Raise irq work, So that we don't miss to log the error for +* unrecoverable errors. +*/ + if (mce->disposition == MCE_DISPOSITION_NOT_RECOVERED) + mce_irq_work_queue(); + if (!addr) return; @@ -235,7 +242,6 @@ static void machine_check_ue_event(struct machine_check_event *evt) evt, sizeof(*evt)); /* Queue work to process this event later. */ - mce_irq_work_queue(); } /* -- 2.37.1
[PATCH v3] powerpc/pseries/mce: Avoid instrumentation in realmode
Part of machine check error handling is done in realmode, As of now instrumentation is not possible for any code that runs in realmode. When MCE is injected on KASAN enabled kernel, crash is observed, Hence force inline or mark no instrumentation for functions which can run in realmode, to avoid KASAN instrumentation. Signed-off-by: Ganesh Goudar --- v2: Force inline few more functions. v3: Adding noinstr to few functions instead of __always_inline. --- arch/powerpc/include/asm/hw_irq.h| 8 arch/powerpc/include/asm/interrupt.h | 2 +- arch/powerpc/include/asm/rtas.h | 4 ++-- arch/powerpc/kernel/rtas.c | 4 ++-- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/arch/powerpc/include/asm/hw_irq.h b/arch/powerpc/include/asm/hw_irq.h index 983551859891..c4d542b4a623 100644 --- a/arch/powerpc/include/asm/hw_irq.h +++ b/arch/powerpc/include/asm/hw_irq.h @@ -111,7 +111,7 @@ static inline void __hard_RI_enable(void) #ifdef CONFIG_PPC64 #include -static inline notrace unsigned long irq_soft_mask_return(void) +noinstr static unsigned long irq_soft_mask_return(void) { unsigned long flags; @@ -128,7 +128,7 @@ static inline notrace unsigned long irq_soft_mask_return(void) * for the critical section and as a clobber because * we changed paca->irq_soft_mask */ -static inline notrace void irq_soft_mask_set(unsigned long mask) +noinstr static void irq_soft_mask_set(unsigned long mask) { /* * The irq mask must always include the STD bit if any are set. 
@@ -155,7 +155,7 @@ static inline notrace void irq_soft_mask_set(unsigned long mask) : "memory"); } -static inline notrace unsigned long irq_soft_mask_set_return(unsigned long mask) +noinstr static unsigned long irq_soft_mask_set_return(unsigned long mask) { unsigned long flags; @@ -191,7 +191,7 @@ static inline notrace unsigned long irq_soft_mask_or_return(unsigned long mask) return flags; } -static inline unsigned long arch_local_save_flags(void) +static __always_inline unsigned long arch_local_save_flags(void) { return irq_soft_mask_return(); } diff --git a/arch/powerpc/include/asm/interrupt.h b/arch/powerpc/include/asm/interrupt.h index 8069dbc4b8d1..090895051712 100644 --- a/arch/powerpc/include/asm/interrupt.h +++ b/arch/powerpc/include/asm/interrupt.h @@ -92,7 +92,7 @@ static inline bool is_implicit_soft_masked(struct pt_regs *regs) return search_kernel_soft_mask_table(regs->nip); } -static inline void srr_regs_clobbered(void) +static __always_inline void srr_regs_clobbered(void) { local_paca->srr_valid = 0; local_paca->hsrr_valid = 0; diff --git a/arch/powerpc/include/asm/rtas.h b/arch/powerpc/include/asm/rtas.h index 00531af17ce0..52d29d664fdf 100644 --- a/arch/powerpc/include/asm/rtas.h +++ b/arch/powerpc/include/asm/rtas.h @@ -201,13 +201,13 @@ inline uint32_t rtas_ext_event_company_id(struct rtas_ext_event_log_v6 *ext_log) #define PSERIES_ELOG_SECT_ID_MCE (('M' << 8) | 'C') static -inline uint16_t pseries_errorlog_id(struct pseries_errorlog *sect) +__always_inline uint16_t pseries_errorlog_id(struct pseries_errorlog *sect) { return be16_to_cpu(sect->id); } static -inline uint16_t pseries_errorlog_length(struct pseries_errorlog *sect) +__always_inline uint16_t pseries_errorlog_length(struct pseries_errorlog *sect) { return be16_to_cpu(sect->length); } diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c index 693133972294..f9d78245c0e8 100644 --- a/arch/powerpc/kernel/rtas.c +++ b/arch/powerpc/kernel/rtas.c @@ -48,7 +48,7 @@ /* This is 
here deliberately so it's only used in this file */ void enter_rtas(unsigned long); -static inline void do_enter_rtas(unsigned long args) +static __always_inline void do_enter_rtas(unsigned long args) { unsigned long msr; @@ -435,7 +435,7 @@ static char *__fetch_rtas_last_error(char *altbuf) #endif -static void +noinstr static void va_rtas_call_unlocked(struct rtas_args *args, int token, int nargs, int nret, va_list list) { -- 2.37.1
Re: [PACTH v2] powerpc/pseries/mce: Avoid instrumentation in realmode
On 9/7/22 09:49, Nicholas Piggin wrote: On Mon Sep 5, 2022 at 4:38 PM AEST, Ganesh Goudar wrote: Part of machine check error handling is done in realmode, As of now instrumentation is not possible for any code that runs in realmode. When MCE is injected on KASAN enabled kernel, crash is observed, Hence force inline or mark no instrumentation for functions which can run in realmode, to avoid KASAN instrumentation. Signed-off-by: Ganesh Goudar --- v2: Force inline few more functions. --- arch/powerpc/include/asm/hw_irq.h| 8 arch/powerpc/include/asm/interrupt.h | 2 +- arch/powerpc/include/asm/rtas.h | 4 ++-- arch/powerpc/kernel/rtas.c | 4 ++-- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/arch/powerpc/include/asm/hw_irq.h b/arch/powerpc/include/asm/hw_irq.h index 26ede09c521d..3264991fe524 100644 --- a/arch/powerpc/include/asm/hw_irq.h +++ b/arch/powerpc/include/asm/hw_irq.h @@ -111,7 +111,7 @@ static inline void __hard_RI_enable(void) #ifdef CONFIG_PPC64 #include -static inline notrace unsigned long irq_soft_mask_return(void) +static __always_inline notrace unsigned long irq_soft_mask_return(void) { return READ_ONCE(local_paca->irq_soft_mask); } @@ -121,7 +121,7 @@ static inline notrace unsigned long irq_soft_mask_return(void) * for the critical section and as a clobber because * we changed paca->irq_soft_mask */ -static inline notrace void irq_soft_mask_set(unsigned long mask) +static __always_inline notrace void irq_soft_mask_set(unsigned long mask) { /* * The irq mask must always include the STD bit if any are set. This doesn't give a reason why it's __always_inline, and having the notrace attribute makes it possibly confusing. I think it would be easy for someone to break without realising. Could you add a noinstr to these instead / as well? Yeah we can add noinstr. Missed to see your comment, Sorry for the delayed reply What about adding a 'realmode' function annotation that includes noinstr? You mean to define a new function annotation?
Re: [RFC 0/3] Asynchronous EEH recovery
On 9/2/22 05:49, Jason Gunthorpe wrote: On Tue, Aug 16, 2022 at 08:57:13AM +0530, Ganesh Goudar wrote: Hi, EEH recovery is currently serialized and these patches shorten the time taken for EEH recovery by making the recovery run in parallel. The original author of these patches is Sam Bobroff, I have rebased and tested these patches. How did you test this? This is tested on SRIOV VFs. I understand that VFIO on 6.0 does not work at all on power? I am waiting for power maintainers to pick up this series to fix it: https://lore.kernel.org/kvm/20220714081822.3717693-1-...@ozlabs.ru/ Jason
[PACTH v2] powerpc/pseries/mce: Avoid instrumentation in realmode
Part of machine check error handling is done in realmode, As of now instrumentation is not possible for any code that runs in realmode. When MCE is injected on KASAN enabled kernel, crash is observed, Hence force inline or mark no instrumentation for functions which can run in realmode, to avoid KASAN instrumentation. Signed-off-by: Ganesh Goudar --- v2: Force inline few more functions. --- arch/powerpc/include/asm/hw_irq.h| 8 arch/powerpc/include/asm/interrupt.h | 2 +- arch/powerpc/include/asm/rtas.h | 4 ++-- arch/powerpc/kernel/rtas.c | 4 ++-- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/arch/powerpc/include/asm/hw_irq.h b/arch/powerpc/include/asm/hw_irq.h index 26ede09c521d..3264991fe524 100644 --- a/arch/powerpc/include/asm/hw_irq.h +++ b/arch/powerpc/include/asm/hw_irq.h @@ -111,7 +111,7 @@ static inline void __hard_RI_enable(void) #ifdef CONFIG_PPC64 #include -static inline notrace unsigned long irq_soft_mask_return(void) +static __always_inline notrace unsigned long irq_soft_mask_return(void) { return READ_ONCE(local_paca->irq_soft_mask); } @@ -121,7 +121,7 @@ static inline notrace unsigned long irq_soft_mask_return(void) * for the critical section and as a clobber because * we changed paca->irq_soft_mask */ -static inline notrace void irq_soft_mask_set(unsigned long mask) +static __always_inline notrace void irq_soft_mask_set(unsigned long mask) { /* * The irq mask must always include the STD bit if any are set. 
@@ -144,7 +144,7 @@ static inline notrace void irq_soft_mask_set(unsigned long mask) barrier(); } -static inline notrace unsigned long irq_soft_mask_set_return(unsigned long mask) +static __always_inline notrace unsigned long irq_soft_mask_set_return(unsigned long mask) { unsigned long flags = irq_soft_mask_return(); @@ -162,7 +162,7 @@ static inline notrace unsigned long irq_soft_mask_or_return(unsigned long mask) return flags; } -static inline unsigned long arch_local_save_flags(void) +static __always_inline unsigned long arch_local_save_flags(void) { return irq_soft_mask_return(); } diff --git a/arch/powerpc/include/asm/interrupt.h b/arch/powerpc/include/asm/interrupt.h index 8069dbc4b8d1..090895051712 100644 --- a/arch/powerpc/include/asm/interrupt.h +++ b/arch/powerpc/include/asm/interrupt.h @@ -92,7 +92,7 @@ static inline bool is_implicit_soft_masked(struct pt_regs *regs) return search_kernel_soft_mask_table(regs->nip); } -static inline void srr_regs_clobbered(void) +static __always_inline void srr_regs_clobbered(void) { local_paca->srr_valid = 0; local_paca->hsrr_valid = 0; diff --git a/arch/powerpc/include/asm/rtas.h b/arch/powerpc/include/asm/rtas.h index 00531af17ce0..52d29d664fdf 100644 --- a/arch/powerpc/include/asm/rtas.h +++ b/arch/powerpc/include/asm/rtas.h @@ -201,13 +201,13 @@ inline uint32_t rtas_ext_event_company_id(struct rtas_ext_event_log_v6 *ext_log) #define PSERIES_ELOG_SECT_ID_MCE (('M' << 8) | 'C') static -inline uint16_t pseries_errorlog_id(struct pseries_errorlog *sect) +__always_inline uint16_t pseries_errorlog_id(struct pseries_errorlog *sect) { return be16_to_cpu(sect->id); } static -inline uint16_t pseries_errorlog_length(struct pseries_errorlog *sect) +__always_inline uint16_t pseries_errorlog_length(struct pseries_errorlog *sect) { return be16_to_cpu(sect->length); } diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c index 693133972294..f9d78245c0e8 100644 --- a/arch/powerpc/kernel/rtas.c +++ 
b/arch/powerpc/kernel/rtas.c @@ -48,7 +48,7 @@ /* This is here deliberately so it's only used in this file */ void enter_rtas(unsigned long); -static inline void do_enter_rtas(unsigned long args) +static __always_inline void do_enter_rtas(unsigned long args) { unsigned long msr; @@ -435,7 +435,7 @@ static char *__fetch_rtas_last_error(char *altbuf) #endif -static void +noinstr static void va_rtas_call_unlocked(struct rtas_args *args, int token, int nargs, int nret, va_list list) { -- 2.37.1
[PATCH] powerpc/pseries/mce: Avoid instrumentation in realmode
Part of machine check error handling is done in realmode, As of now instrumentation is not possible for any code that runs in realmode. When MCE is injected on KASAN enabled kernel, crash is observed, Hence force inline or mark no instrumentation for functions which can run in realmode to avoid KASAN instrumentation. Signed-off-by: Ganesh Goudar --- arch/powerpc/include/asm/interrupt.h | 2 +- arch/powerpc/include/asm/rtas.h | 4 ++-- arch/powerpc/kernel/rtas.c | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/include/asm/interrupt.h b/arch/powerpc/include/asm/interrupt.h index 8069dbc4b8d1..090895051712 100644 --- a/arch/powerpc/include/asm/interrupt.h +++ b/arch/powerpc/include/asm/interrupt.h @@ -92,7 +92,7 @@ static inline bool is_implicit_soft_masked(struct pt_regs *regs) return search_kernel_soft_mask_table(regs->nip); } -static inline void srr_regs_clobbered(void) +static __always_inline void srr_regs_clobbered(void) { local_paca->srr_valid = 0; local_paca->hsrr_valid = 0; diff --git a/arch/powerpc/include/asm/rtas.h b/arch/powerpc/include/asm/rtas.h index 00531af17ce0..52d29d664fdf 100644 --- a/arch/powerpc/include/asm/rtas.h +++ b/arch/powerpc/include/asm/rtas.h @@ -201,13 +201,13 @@ inline uint32_t rtas_ext_event_company_id(struct rtas_ext_event_log_v6 *ext_log) #define PSERIES_ELOG_SECT_ID_MCE (('M' << 8) | 'C') static -inline uint16_t pseries_errorlog_id(struct pseries_errorlog *sect) +__always_inline uint16_t pseries_errorlog_id(struct pseries_errorlog *sect) { return be16_to_cpu(sect->id); } static -inline uint16_t pseries_errorlog_length(struct pseries_errorlog *sect) +__always_inline uint16_t pseries_errorlog_length(struct pseries_errorlog *sect) { return be16_to_cpu(sect->length); } diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c index 693133972294..f9d78245c0e8 100644 --- a/arch/powerpc/kernel/rtas.c +++ b/arch/powerpc/kernel/rtas.c @@ -48,7 +48,7 @@ /* This is here deliberately so it's only 
used in this file */ void enter_rtas(unsigned long); -static inline void do_enter_rtas(unsigned long args) +static __always_inline void do_enter_rtas(unsigned long args) { unsigned long msr; @@ -435,7 +435,7 @@ static char *__fetch_rtas_last_error(char *altbuf) #endif -static void +noinstr static void va_rtas_call_unlocked(struct rtas_args *args, int token, int nargs, int nret, va_list list) { -- 2.37.1
Re: [6.0-rc1] Kernel crash while running MCE tests
On 8/22/22 11:01, Sachin Sant wrote: On 19-Aug-2022, at 10:12 AM, Ganesh wrote We'll have to make sure everything get_pseries_errorlog() is either forced inline, or marked noinstr. Making the following functions always_inline and noinstr is fixing the issue. __always_inline pseries_errorlog_id() __always_inline pseries_errorlog_length() __always_inline do_enter_rtas() __always_inline srr_regs_clobbered() noinstr va_rtas_call_unlocked() Shall I post the patch? Yes, thanks. I can help with testing. Sure, thanks.
Re: [6.0-rc1] Kernel crash while running MCE tests
On 8/22/22 11:19, Michael Ellerman wrote: So I guess the compiler has decided not to inline it (why?!), and it is not marked noinstr, so it gets KASAN instrumentation which crashes in real mode. We'll have to make sure everything get_pseries_errorlog() is either forced inline, or marked noinstr. Making the following functions always_inline and noinstr is fixing the issue. __always_inline pseries_errorlog_id() __always_inline pseries_errorlog_length() __always_inline do_enter_rtas() __always_inline srr_regs_clobbered() noinstr va_rtas_call_unlocked() Why do we need it? Because of fwnmi_release_errinfo()? Yes. Shall I post the patch? Yeah. cheers
Re: [6.0-rc1] Kernel crash while running MCE tests
On 8/17/22 11:28, Michael Ellerman wrote: Sachin Sant writes: Following crash is seen while running powerpc/mce subtest on a Power10 LPAR. 1..1 # selftests: powerpc/mce: inject-ra-err [ 155.240591] BUG: Unable to handle kernel data access on read at 0xc00e00022d55b503 [ 155.240618] Faulting instruction address: 0xc06f1f0c [ 155.240627] Oops: Kernel access of bad area, sig: 11 [#1] [ 155.240633] LE PAGE_SIZE=64K MMU=Radix SMP NR_CPUS=2048 NUMA pSeries [ 155.240642] Modules linked in: dm_mod mptcp_diag xsk_diag tcp_diag udp_diag raw_diag inet_diag unix_diag af_packet_diag netlink_diag nft_fib_inet nft_fib_ipv4 nft_fib_ipv6 nft_fib nft_reject_inet nf_reject_ipv4 nf_reject_ipv6 nft_reject nft_ct nft_chain_nat nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 bonding rfkill tls ip_set nf_tables nfnetlink sunrpc binfmt_misc pseries_rng drm drm_panel_orientation_quirks xfs libcrc32c sd_mod t10_pi sr_mod crc64_rocksoft_generic cdrom crc64_rocksoft crc64 sg ibmvscsi ibmveth scsi_transport_srp xts vmx_crypto fuse [ 155.240750] CPU: 4 PID: 3645 Comm: inject-ra-err Not tainted 6.0.0-rc1 #2 [ 155.240761] NIP: c06f1f0c LR: c00630d0 CTR: [ 155.240768] REGS: c000ff887890 TRAP: 0300 Not tainted (6.0.0-rc1) [ 155.240776] MSR: 80001003 CR: 48002828 XER: ^ MMU is off, aka. real mode. 
[ 155.240792] CFAR: c00630cc DAR: c00e00022d55b503 DSISR: 4000 IRQMASK: 3 [ 155.240792] GPR00: c00630d0 c000ff887b30 c44afe00 c0116aada818 [ 155.240792] GPR04: 4d43 0008 c00630d0 004d4249 [ 155.240792] GPR08: 0001 18022d55b503 a80e 0348 [ 155.240792] GPR12: c000b700 [ 155.240792] GPR16: [ 155.240792] GPR20: 1b30 [ 155.240792] GPR24: 7fff8dad 7fff8dacf6d8 7fffd1551e98 1001fce8 [ 155.240792] GPR28: c0116aada888 c0116aada800 4d43 c0116aada818 [ 155.240885] NIP [c06f1f0c] __asan_load2+0x5c/0xe0 [ 155.240898] LR [c00630d0] pseries_errorlog_id+0x20/0x40 [ 155.240910] Call Trace: [ 155.240914] [c000ff887b50] [c00630d0] pseries_errorlog_id+0x20/0x40 [ 155.240925] [c000ff887b80] [c15595c8] get_pseries_errorlog+0xa8/0x110 get_pseries_errorlog() is marked noinstr. And pseries_errorlog_id() is: static inline uint16_t pseries_errorlog_id(struct pseries_errorlog *sect) { return be16_to_cpu(sect->id); } So I guess the compiler has decided not to inline it (why?!), and it is not marked noinstr, so it gets KASAN instrumentation which crashes in real mode. We'll have to make sure everything get_pseries_errorlog() is either forced inline, or marked noinstr. Making the following functions always_inline and noinstr is fixing the issue. __always_inline pseries_errorlog_id() __always_inline pseries_errorlog_length() __always_inline do_enter_rtas() __always_inline srr_regs_clobbered() noinstr va_rtas_call_unlocked() Shall I post the patch?
[RFC 3/3] powerpc/eeh: Asynchronous recovery
Based on the original work from Sam Bobroff. Currently, EEH recovery is entirely serialized and takes place within a single kernel thread. This can cause recovery to take a long time when there are many devices. To shorten recovery time, this change allows recovery to proceed in parallel in two ways: - Each PHB is given it's own recovery event queue and can be recovered independently from other PHBs. - Driver handlers are called in parallel, but with the constraint that handlers higher up (closer to the PHB) in the PE hierarchy must be called before those lower down. To maintain the constraint, above, the driver handlers are called by traversing the tree of affected PEs from the top, stopping to call handlers (in parallel) when a PE with devices is discovered. When the calls for that PE are complete, traversal continues at each child PE. Signed-off-by: Ganesh Goudar --- arch/powerpc/include/asm/eeh.h| 1 + arch/powerpc/include/asm/eeh_event.h | 7 + arch/powerpc/include/asm/pci-bridge.h | 3 + arch/powerpc/kernel/eeh_driver.c | 323 +++--- arch/powerpc/kernel/eeh_event.c | 65 +++--- arch/powerpc/kernel/eeh_pe.c | 3 + 6 files changed, 288 insertions(+), 114 deletions(-) diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h index f659c0433de5..2728aee5cb0b 100644 --- a/arch/powerpc/include/asm/eeh.h +++ b/arch/powerpc/include/asm/eeh.h @@ -128,6 +128,7 @@ static inline bool eeh_pe_passed(struct eeh_pe *pe) #define EEH_DEV_NO_HANDLER (1 << 8)/* No error handler */ #define EEH_DEV_SYSFS (1 << 9)/* Sysfs created*/ #define EEH_DEV_REMOVED(1 << 10) /* Removed permanently */ +#define EEH_DEV_RECOVERING (1 << 11) /* Recovering */ struct eeh_dev { int mode; /* EEH mode */ diff --git a/arch/powerpc/include/asm/eeh_event.h b/arch/powerpc/include/asm/eeh_event.h index a1fe736bc4cf..b21f49e87b7b 100644 --- a/arch/powerpc/include/asm/eeh_event.h +++ b/arch/powerpc/include/asm/eeh_event.h @@ -8,6 +8,8 @@ #define ASM_POWERPC_EEH_EVENT_H #ifdef __KERNEL__ 
+#include + /* * structure holding pci controller data that describes a * change in the isolation status of a PCI slot. A pointer @@ -15,16 +17,21 @@ * callback. */ struct eeh_event { + struct work_struct work; struct list_headlist; /* to form event queue */ struct eeh_pe *pe;/* EEH PE */ unsigned intid; /* Event ID */ }; +extern spinlock_t eeh_eventlist_lock; + int eeh_event_init(void); +int eeh_phb_event(struct eeh_pe *pe); int eeh_send_failure_event(struct eeh_pe *pe); int __eeh_send_failure_event(struct eeh_pe *pe); void eeh_remove_event(struct eeh_pe *pe, bool force); void eeh_handle_normal_event(unsigned int event_id, struct eeh_pe *pe); +void eeh_handle_normal_event_work(struct work_struct *work); void eeh_handle_special_event(void); #endif /* __KERNEL__ */ diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h index c85f901227c9..74806009f50a 100644 --- a/arch/powerpc/include/asm/pci-bridge.h +++ b/arch/powerpc/include/asm/pci-bridge.h @@ -131,6 +131,9 @@ struct pci_controller { struct irq_domain *dev_domain; struct irq_domain *msi_domain; struct fwnode_handle*fwnode; + + bool eeh_in_progress; + struct list_head eeh_eventlist; }; /* These are used for config access before all the PCI probing diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c index 894326cc4dfa..3abd5f2d146c 100644 --- a/arch/powerpc/kernel/eeh_driver.c +++ b/arch/powerpc/kernel/eeh_driver.c @@ -12,12 +12,17 @@ #include #include #include +#include +#include +#include #include #include #include #include #include +static atomic_t eeh_wu_id = ATOMIC_INIT(0); + struct eeh_rmv_data { struct list_head removed_vf_list; int removed_dev_count; @@ -248,73 +253,59 @@ static void eeh_set_irq_state(struct eeh_pe *root, bool enable) } typedef enum pci_ers_result (*eeh_report_fn)(unsigned int event_id, +unsigned int id, struct pci_dev *, struct pci_driver *); static void eeh_pe_report_pdev(unsigned int event_id, - struct pci_dev *pdev, 
eeh_report_fn fn, + unsigned int id, + struct pci_dev *pdev, + const char *fn_name, eeh_report_fn fn, enum pci_ers_result *result, - const char *handler_name) + bool late, bool removed, bool passed) { -
[RFC 1/3] powerpc/eeh: Synchronization for safety
Based on the original work from Sam Bobroff. There is currently little synchronization between EEH error detection (eeh_dev_check_failure()), EEH error recovery (eeh_handle_{normal,special}_event()) and the PCI subsystem (device addition and removal), and so there are race conditions that lead to crashes (often access to free'd memory or LIST_POISON). However, a solution must consider: - EEH error detection can occur in interrupt context, which prevents the use of a mutex. - EEH recovery may need to sleep, which prevents the use of a spinlock. - EEH recovery uses PCI operations that may require the PCI rescan/remove lock and/or device lock to be held - PCI operations may hold the rescan/remove and/or device lock when calling into EEH functions. - Device driver callbacks may perform arbitrary PCI operations during recovery, including device removal. In this patch the existing mutex and spinlock are combined with the EEH_PE_RECOVERING flag to provide some assurances that are then used to reduce the race conditions. The fields to be protected are the ones that provide the structure of the trees of struct eeh_pe that are held for each PHB: the parent pointer and child lists and the list of struct eeh_dev, as well as the pe and pdev pointers within struct eeh_dev. The existing way of using EEH_PE_RECOVERING is kept and slightly extended: No struct eeh_pe will be removed while it has the flag set on it. Additionally, when adding new PEs, they are marked EEH_PE_RECOVERING if their parent PE is marked: this allows the recovery thread to assume that all PEs underneath the one it's processing will continue to exist during recovery. Both the mutex and spinlock are held while any protected field is changed or a PE is deleted, so holding either of them (elsewhere) will keep them stable and safe to access. Additionally, if EEH_PE_RECOVERING is set on a PE then the locks can be released and re-acquired safely, as long as the protected fields aren't used while no locks are held. 
This is used during recovery to release locks for long sleeps (i.e. during eeh_wait_state() when we may sleep up to 5 minutes), or to maintain lock ordering. The spinlock is used in error detection (which cannot use a mutex, see above) and also where it's possible that the mutex is already held. The mutex is used in areas that don't have that restriction, and where blocking may be required. Care must be taken when ordering these locks against the PCI rescan/remove lock and the device locks to avoid deadlocking. Signed-off-by: Ganesh Goudar --- arch/powerpc/include/asm/eeh.h | 6 +- arch/powerpc/kernel/eeh.c| 112 ++-- arch/powerpc/kernel/eeh_driver.c | 287 ++- arch/powerpc/kernel/eeh_pe.c | 30 +- arch/powerpc/platforms/powernv/eeh-powernv.c | 12 +- arch/powerpc/platforms/pseries/eeh_pseries.c | 5 +- arch/powerpc/platforms/pseries/pci_dlpar.c | 5 +- drivers/pci/hotplug/pnv_php.c| 5 +- drivers/pci/hotplug/rpadlpar_core.c | 2 + drivers/vfio/vfio_spapr_eeh.c| 10 +- 10 files changed, 364 insertions(+), 110 deletions(-) diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h index 514dd056c2c8..f659c0433de5 100644 --- a/arch/powerpc/include/asm/eeh.h +++ b/arch/powerpc/include/asm/eeh.h @@ -271,11 +271,15 @@ static inline bool eeh_state_active(int state) == (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE); } +void eeh_recovery_lock(void); +void eeh_recovery_unlock(void); +void eeh_recovery_must_be_locked(void); + typedef void (*eeh_edev_traverse_func)(struct eeh_dev *edev, void *flag); typedef void *(*eeh_pe_traverse_func)(struct eeh_pe *pe, void *flag); void eeh_set_pe_aux_size(int size); int eeh_phb_pe_create(struct pci_controller *phb); -int eeh_wait_state(struct eeh_pe *pe, int max_wait); +int eeh_wait_state(struct eeh_pe *pe, int max_waiti, bool unlock); struct eeh_pe *eeh_phb_pe_get(struct pci_controller *phb); struct eeh_pe *eeh_pe_next(struct eeh_pe *pe, struct eeh_pe *root); struct eeh_pe *eeh_pe_get(struct pci_controller *phb, int pe_no); diff 
--git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index ab316e155ea9..2c90c37524ed 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -108,7 +108,25 @@ bool eeh_debugfs_no_recover; /* Platform dependent EEH operations */ struct eeh_ops *eeh_ops = NULL; -/* Lock to avoid races due to multiple reports of an error */ +/* + * confirm_error_lock and eeh_dev_mutex are used together to provide + * safety during EEH operations. + * + * Generally, the spinlock is used in error detection where it's not possible + * to use a mutex or where there is potential to deadlock with the mutex, and + * the mutex is used during recovery and other PCI related operations. One must + * be held when reading and both must be held when making changes to the + * protected fields: eeh_pe.parent
[RFC 0/3] Asynchronous EEH recovery
Hi, EEH recovery is currently serialized and these patches shorten the time taken for EEH recovery by making the recovery run in parallel. The original author of these patches is Sam Bobroff, I have rebased and tested these patches. On powervm with 64 VFs I see approximately 48% reduction in time taken in EEH recovery; yet to be tested on powernv. These patches were originally posted as separate RFCs; I think posting them as a single series would be more helpful. I know the patches are too big, I will try to divide them logically in the next iterations. Thanks Ganesh Goudar (3): powerpc/eeh: Synchronization for safety powerpc/eeh: Provide a unique ID for each EEH recovery powerpc/eeh: Asynchronous recovery arch/powerpc/include/asm/eeh.h | 7 +- arch/powerpc/include/asm/eeh_event.h | 10 +- arch/powerpc/include/asm/pci-bridge.h| 3 + arch/powerpc/include/asm/ppc-pci.h | 2 +- arch/powerpc/kernel/eeh.c| 154 +++-- arch/powerpc/kernel/eeh_driver.c | 578 +++ arch/powerpc/kernel/eeh_event.c | 71 ++- arch/powerpc/kernel/eeh_pe.c | 33 +- arch/powerpc/platforms/powernv/eeh-powernv.c | 12 +- arch/powerpc/platforms/pseries/eeh_pseries.c | 5 +- arch/powerpc/platforms/pseries/pci_dlpar.c | 5 +- drivers/pci/hotplug/pnv_php.c| 5 +- drivers/pci/hotplug/rpadlpar_core.c | 2 + drivers/vfio/vfio_spapr_eeh.c| 10 +- 14 files changed, 685 insertions(+), 212 deletions(-) -- 2.37.1
[RFC 2/3] powerpc/eeh: Provide a unique ID for each EEH recovery
Based on the original work from Sam Bobroff. Give a unique ID to each recovery event, to ease log parsing and prepare for parallel recovery. Also add some new messages with a very simple format that may be useful to log-parsers. Signed-off-by: Ganesh Goudar --- arch/powerpc/include/asm/eeh_event.h | 3 +- arch/powerpc/include/asm/ppc-pci.h | 2 +- arch/powerpc/kernel/eeh.c| 42 +++--- arch/powerpc/kernel/eeh_driver.c | 188 --- arch/powerpc/kernel/eeh_event.c | 12 +- 5 files changed, 146 insertions(+), 101 deletions(-) diff --git a/arch/powerpc/include/asm/eeh_event.h b/arch/powerpc/include/asm/eeh_event.h index dadde7d52f46..a1fe736bc4cf 100644 --- a/arch/powerpc/include/asm/eeh_event.h +++ b/arch/powerpc/include/asm/eeh_event.h @@ -17,13 +17,14 @@ struct eeh_event { struct list_headlist; /* to form event queue */ struct eeh_pe *pe;/* EEH PE */ + unsigned intid; /* Event ID */ }; int eeh_event_init(void); int eeh_send_failure_event(struct eeh_pe *pe); int __eeh_send_failure_event(struct eeh_pe *pe); void eeh_remove_event(struct eeh_pe *pe, bool force); -void eeh_handle_normal_event(struct eeh_pe *pe); +void eeh_handle_normal_event(unsigned int event_id, struct eeh_pe *pe); void eeh_handle_special_event(void); #endif /* __KERNEL__ */ diff --git a/arch/powerpc/include/asm/ppc-pci.h b/arch/powerpc/include/asm/ppc-pci.h index f6cf0159024e..42d175af33cb 100644 --- a/arch/powerpc/include/asm/ppc-pci.h +++ b/arch/powerpc/include/asm/ppc-pci.h @@ -40,7 +40,7 @@ extern int rtas_setup_phb(struct pci_controller *phb); void eeh_addr_cache_insert_dev(struct pci_dev *dev); void eeh_addr_cache_rmv_dev(struct pci_dev *dev); struct eeh_dev *eeh_addr_cache_get_dev(unsigned long addr); -void eeh_slot_error_detail(struct eeh_pe *pe, int severity); +void eeh_slot_error_detail(unsigned int event_id, struct eeh_pe *pe, int severity); int eeh_pci_enable(struct eeh_pe *pe, int function); int eeh_pe_reset_full(struct eeh_pe *pe, bool include_passed); void eeh_save_bars(struct eeh_dev *edev); 
diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index 2c90c37524ed..148d5df0e606 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -200,7 +200,8 @@ EXPORT_SYMBOL_GPL(eeh_recovery_must_be_locked); * for the indicated PCI device, and puts them into a buffer * for RTAS error logging. */ -static size_t eeh_dump_dev_log(struct eeh_dev *edev, char *buf, size_t len) +static size_t eeh_dump_dev_log(unsigned int event_id, struct eeh_dev *edev, + char *buf, size_t len) { u32 cfg; int cap, i; @@ -210,27 +211,29 @@ static size_t eeh_dump_dev_log(struct eeh_dev *edev, char *buf, size_t len) n += scnprintf(buf+n, len-n, "%04x:%02x:%02x.%01x\n", edev->pe->phb->global_number, edev->bdfn >> 8, PCI_SLOT(edev->bdfn), PCI_FUNC(edev->bdfn)); - pr_warn("EEH: of node=%04x:%02x:%02x.%01x\n", + pr_warn("EEH(%u): of node=%04x:%02x:%02x.%01x\n", + event_id, edev->pe->phb->global_number, edev->bdfn >> 8, PCI_SLOT(edev->bdfn), PCI_FUNC(edev->bdfn)); eeh_ops->read_config(edev, PCI_VENDOR_ID, 4, ); n += scnprintf(buf+n, len-n, "dev/vend:%08x\n", cfg); - pr_warn("EEH: PCI device/vendor: %08x\n", cfg); + pr_warn("EEH(%u): PCI device/vendor: %08x\n",event_id, cfg); eeh_ops->read_config(edev, PCI_COMMAND, 4, ); n += scnprintf(buf+n, len-n, "cmd/stat:%x\n", cfg); - pr_warn("EEH: PCI cmd/status register: %08x\n", cfg); + pr_warn("EEH(%u): PCI cmd/status register: %08x\n", event_id, cfg); /* Gather bridge-specific registers */ if (edev->mode & EEH_DEV_BRIDGE) { eeh_ops->read_config(edev, PCI_SEC_STATUS, 2, ); n += scnprintf(buf+n, len-n, "sec stat:%x\n", cfg); - pr_warn("EEH: Bridge secondary status: %04x\n", cfg); + pr_warn("EEH(%u): Bridge secondary status: %04x\n", + event_id, cfg); eeh_ops->read_config(edev, PCI_BRIDGE_CONTROL, 2, ); n += scnprintf(buf+n, len-n, "brdg ctl:%x\n", cfg); - pr_warn("EEH: Bridge control: %04x\n", cfg); + pr_warn("EEH(%u): Bridge control: %04x\n", event_id, cfg); } /* Dump out the PCI-X command and status regs */ @@ -238,18 
+241,19 @@ static size_t eeh_dump_dev_log(struct eeh_dev *edev, char *buf, size_t len) if (cap) { eeh_ops->read_config(edev, cap, 4, ); n += scnpri
Re: [PATCH v3 RESEND 1/3] powerpc/pseries: Parse control memory access error
On 1/7/22 19:44, Ganesh Goudar wrote: Add support to parse and log control memory access error for pseries. These changes are made according to PAPR v2.11 10.3.2.2.12. Signed-off-by: Ganesh Goudar --- arch/powerpc/platforms/pseries/ras.c | 36 1 file changed, 32 insertions(+), 4 deletions(-) mce_err.error_type = MCE_ERROR_TYPE_UNKNOWN; Hi mpe, Any comments on this patch series?
[PATCH v5] powerpc/mce: Avoid using irq_work_queue() in realmode
In realmode mce handler we use irq_work_queue() to defer the processing of mce events, irq_work_queue() can only be called when translation is enabled because it touches memory outside RMA, hence we enable translation before calling irq_work_queue and disable on return, though it is not safe to do in realmode. To avoid this, program the decrementer and call the event processing functions from timer handler. Signed-off-by: Ganesh Goudar --- V2: * Use arch_irq_work_raise to raise decrementer interrupt. * Avoid having atomic variable. V3: * Fix build error. Reported by kernel test bot. V4: * Rename some functions and variables * Remove mces_to_process counter and add a flag to indicate there is a mce info to process. V5: * Fix the build warning, reported by kernel test robot. --- arch/powerpc/include/asm/machdep.h | 2 + arch/powerpc/include/asm/mce.h | 13 + arch/powerpc/include/asm/paca.h | 1 + arch/powerpc/kernel/mce.c| 60 +--- arch/powerpc/kernel/time.c | 2 + arch/powerpc/platforms/pseries/pseries.h | 1 + arch/powerpc/platforms/pseries/ras.c | 32 + arch/powerpc/platforms/pseries/setup.c | 1 + 8 files changed, 53 insertions(+), 59 deletions(-) diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h index e821037f74f0..36d2f34aa352 100644 --- a/arch/powerpc/include/asm/machdep.h +++ b/arch/powerpc/include/asm/machdep.h @@ -99,6 +99,8 @@ struct machdep_calls { /* Called during machine check exception to retrive fixup address. */ bool(*mce_check_early_recovery)(struct pt_regs *regs); + void(*machine_check_log_err)(void); + /* Motherboard/chipset features. This is a kind of general purpose * hook used to control some machine specific features (like reset * lines, chip power control, etc...). 
diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h index 331d944280b8..c9f0936bd3c9 100644 --- a/arch/powerpc/include/asm/mce.h +++ b/arch/powerpc/include/asm/mce.h @@ -235,8 +235,21 @@ extern void machine_check_print_event_info(struct machine_check_event *evt, unsigned long addr_to_pfn(struct pt_regs *regs, unsigned long addr); extern void mce_common_process_ue(struct pt_regs *regs, struct mce_error_info *mce_err); +void mce_irq_work_queue(void); int mce_register_notifier(struct notifier_block *nb); int mce_unregister_notifier(struct notifier_block *nb); + +#ifdef CONFIG_PPC_BOOK3S_64 +void mce_run_irq_context_handlers(void); +#else +static inline void mce_run_irq_context_handlers(void) { }; +#endif /* CONFIG_PPC_BOOK3S_64 */ + +#ifdef CONFIG_PPC_BOOK3S_64 +void set_mce_pending_irq_work(void); +void clear_mce_pending_irq_work(void); +#endif /* CONFIG_PPC_BOOK3S_64 */ + #ifdef CONFIG_PPC_BOOK3S_64 void flush_and_reload_slb(void); void flush_erat(void); diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h index 295573a82c66..8330968ca346 100644 --- a/arch/powerpc/include/asm/paca.h +++ b/arch/powerpc/include/asm/paca.h @@ -288,6 +288,7 @@ struct paca_struct { #endif #ifdef CONFIG_PPC_BOOK3S_64 struct mce_info *mce_info; + u8 mce_pending_irq_work; #endif /* CONFIG_PPC_BOOK3S_64 */ } cacheline_aligned; diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c index 2503dd4713b9..6cd4b1409874 100644 --- a/arch/powerpc/kernel/mce.c +++ b/arch/powerpc/kernel/mce.c @@ -28,19 +28,9 @@ #include "setup.h" -static void machine_check_process_queued_event(struct irq_work *work); -static void machine_check_ue_irq_work(struct irq_work *work); static void machine_check_ue_event(struct machine_check_event *evt); static void machine_process_ue_event(struct work_struct *work); -static struct irq_work mce_event_process_work = { -.func = machine_check_process_queued_event, -}; - -static struct irq_work mce_ue_event_irq_work = 
{ - .func = machine_check_ue_irq_work, -}; - static DECLARE_WORK(mce_ue_event_work, machine_process_ue_event); static BLOCKING_NOTIFIER_HEAD(mce_notifier_list); @@ -89,6 +79,13 @@ static void mce_set_error_info(struct machine_check_event *mce, } } +void mce_irq_work_queue(void) +{ + /* Raise decrementer interrupt */ + arch_irq_work_raise(); + set_mce_pending_irq_work(); +} + /* * Decode and save high level MCE information into per cpu buffer which * is an array of machine_check_event structure. @@ -217,7 +214,7 @@ void release_mce_event(void) get_mce_event(NULL, true); } -static void machine_check_ue_irq_work(struct irq_work *work) +static void machine_check_ue_work(void) { schedule_work(&mce_ue_event_work); } @@ -239,7 +236,7 @@ static void machine_check_ue_event(struct machine_check_event *evt) evt, sizeof(*evt)); /* Queue work to process this e
Re: [PATCH v3 2/2] pseries/mce: Refactor the pseries mce handling code
On 11/24/21 18:40, Nicholas Piggin wrote: Excerpts from Ganesh Goudar's message of November 24, 2021 7:55 pm: Now that we are no longer switching on the mmu in realmode mce handler, Revert the commit 4ff753feab02("powerpc/pseries: Avoid using addr_to_pfn in real mode") partially, which introduced functions mce_handle_err_virtmode/realmode() to separate mce handler code which needed translation to enabled. Signed-off-by: Ganesh Goudar --- arch/powerpc/platforms/pseries/ras.c | 122 +++ 1 file changed, 49 insertions(+), 73 deletions(-) diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c index 8613f9cc5798..62e1519b8355 100644 --- a/arch/powerpc/platforms/pseries/ras.c +++ b/arch/powerpc/platforms/pseries/ras.c @@ -511,58 +511,17 @@ int pSeries_system_reset_exception(struct pt_regs *regs) return 0; /* need to perform reset */ } -static int mce_handle_err_realmode(int disposition, u8 error_type) -{ -#ifdef CONFIG_PPC_BOOK3S_64 - if (disposition == RTAS_DISP_NOT_RECOVERED) { - switch (error_type) { - caseMC_ERROR_TYPE_ERAT: - flush_erat(); - disposition = RTAS_DISP_FULLY_RECOVERED; - break; - caseMC_ERROR_TYPE_SLB: - /* -* Store the old slb content in paca before flushing. -* Print this when we go to virtual mode. -* There are chances that we may hit MCE again if there -* is a parity error on the SLB entry we trying to read -* for saving. Hence limit the slb saving to single -* level of recursion. 
-*/ - if (local_paca->in_mce == 1) - slb_save_contents(local_paca->mce_faulty_slbs); - flush_and_reload_slb(); - disposition = RTAS_DISP_FULLY_RECOVERED; - break; - default: - break; - } - } else if (disposition == RTAS_DISP_LIMITED_RECOVERY) { - /* Platform corrected itself but could be degraded */ - pr_err("MCE: limited recovery, system may be degraded\n"); - disposition = RTAS_DISP_FULLY_RECOVERED; - } -#endif - return disposition; -} - -static int mce_handle_err_virtmode(struct pt_regs *regs, - struct rtas_error_log *errp, - struct pseries_mc_errorlog *mce_log, - int disposition) +static int mce_handle_error(struct pt_regs *regs, struct rtas_error_log *errp) { struct mce_error_info mce_err = { 0 }; + unsigned long eaddr = 0, paddr = 0; + struct pseries_errorlog *pseries_log; + struct pseries_mc_errorlog *mce_log; + int disposition = rtas_error_disposition(errp); int initiator = rtas_error_initiator(errp); int severity = rtas_error_severity(errp); - unsigned long eaddr = 0, paddr = 0; u8 error_type, err_sub_type; - if (!mce_log) - goto out; - - error_type = mce_log->error_type; - err_sub_type = rtas_mc_error_sub_type(mce_log); - if (initiator == RTAS_INITIATOR_UNKNOWN) mce_err.initiator = MCE_INITIATOR_UNKNOWN; else if (initiator == RTAS_INITIATOR_CPU) @@ -588,6 +547,8 @@ static int mce_handle_err_virtmode(struct pt_regs *regs, mce_err.severity = MCE_SEV_SEVERE; else if (severity == RTAS_SEVERITY_ERROR) mce_err.severity = MCE_SEV_SEVERE; + else if (severity == RTAS_SEVERITY_FATAL) + mce_err.severity = MCE_SEV_FATAL; else mce_err.severity = MCE_SEV_FATAL; What's this hunk for? 
@@ -599,7 +560,18 @@ static int mce_handle_err_virtmode(struct pt_regs *regs, mce_err.error_type = MCE_ERROR_TYPE_UNKNOWN; mce_err.error_class = MCE_ECLASS_UNKNOWN; - switch (error_type) { + if (!rtas_error_extended(errp)) + goto out; + + pseries_log = get_pseries_errorlog(errp, PSERIES_ELOG_SECT_ID_MCE); + if (!pseries_log) + goto out; + + mce_log = (struct pseries_mc_errorlog *)pseries_log->data; + error_type = mce_log->error_type; + err_sub_type = rtas_mc_error_sub_type(mce_log); + + switch (mce_log->error_type) { case MC_ERROR_TYPE_UE: mce_err.error_type = MCE_ERROR_TYPE_UE; mce_common_process_ue(regs, _err); @@ -692,41 +664,45 @@ static int mce_handle_err_virtmode(struct pt_regs *regs, mce_err.error_type = MCE_ERROR_TYPE_DCACHE; break; case MC_ERROR_TYPE_I_CACHE: - mce_
Re: [PATCH v3 1/2] powerpc/mce: Avoid using irq_work_queue() in realmode
On 11/24/21 18:33, Nicholas Piggin wrote: Excerpts from Ganesh Goudar's message of November 24, 2021 7:54 pm: In realmode mce handler we use irq_work_queue() to defer the processing of mce events, irq_work_queue() can only be called when translation is enabled because it touches memory outside RMA, hence we enable translation before calling irq_work_queue and disable on return, though it is not safe to do in realmode. To avoid this, program the decrementer and call the event processing functions from timer handler. Signed-off-by: Ganesh Goudar --- V2: * Use arch_irq_work_raise to raise decrementer interrupt. * Avoid having atomic variable. V3: * Fix build error. Reported by kernel test bot. --- arch/powerpc/include/asm/machdep.h | 2 + arch/powerpc/include/asm/mce.h | 2 + arch/powerpc/include/asm/paca.h | 1 + arch/powerpc/kernel/mce.c| 51 +++- arch/powerpc/kernel/time.c | 3 ++ arch/powerpc/platforms/pseries/pseries.h | 1 + arch/powerpc/platforms/pseries/ras.c | 31 +- arch/powerpc/platforms/pseries/setup.c | 1 + 8 files changed, 34 insertions(+), 58 deletions(-) diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h index 9c3c9f04129f..d22b222ba471 100644 --- a/arch/powerpc/include/asm/machdep.h +++ b/arch/powerpc/include/asm/machdep.h @@ -99,6 +99,8 @@ struct machdep_calls { /* Called during machine check exception to retrive fixup address. */ bool(*mce_check_early_recovery)(struct pt_regs *regs); + void(*machine_check_log_err)(void); + /* Motherboard/chipset features. This is a kind of general purpose * hook used to control some machine specific features (like reset * lines, chip power control, etc...). 
diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h index 331d944280b8..6e306aaf58aa 100644 --- a/arch/powerpc/include/asm/mce.h +++ b/arch/powerpc/include/asm/mce.h @@ -235,8 +235,10 @@ extern void machine_check_print_event_info(struct machine_check_event *evt, unsigned long addr_to_pfn(struct pt_regs *regs, unsigned long addr); extern void mce_common_process_ue(struct pt_regs *regs, struct mce_error_info *mce_err); +void machine_check_raise_dec_intr(void); int mce_register_notifier(struct notifier_block *nb); int mce_unregister_notifier(struct notifier_block *nb); +void mce_run_late_handlers(void); #ifdef CONFIG_PPC_BOOK3S_64 void flush_and_reload_slb(void); void flush_erat(void); diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h index dc05a862e72a..d463c796f7fa 100644 --- a/arch/powerpc/include/asm/paca.h +++ b/arch/powerpc/include/asm/paca.h @@ -280,6 +280,7 @@ struct paca_struct { #endif #ifdef CONFIG_PPC_BOOK3S_64 struct mce_info *mce_info; + u32 mces_to_process; #endif /* CONFIG_PPC_BOOK3S_64 */ } cacheline_aligned; diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c index fd829f7f25a4..8e17f29472a0 100644 --- a/arch/powerpc/kernel/mce.c +++ b/arch/powerpc/kernel/mce.c @@ -28,19 +28,9 @@ #include "setup.h" -static void machine_check_process_queued_event(struct irq_work *work); -static void machine_check_ue_irq_work(struct irq_work *work); static void machine_check_ue_event(struct machine_check_event *evt); static void machine_process_ue_event(struct work_struct *work); -static struct irq_work mce_event_process_work = { -.func = machine_check_process_queued_event, -}; - -static struct irq_work mce_ue_event_irq_work = { - .func = machine_check_ue_irq_work, -}; - static DECLARE_WORK(mce_ue_event_work, machine_process_ue_event); static BLOCKING_NOTIFIER_HEAD(mce_notifier_list); @@ -89,6 +79,12 @@ static void mce_set_error_info(struct machine_check_event *mce, } } +/* Raise decrementer interrupt 
*/ +void machine_check_raise_dec_intr(void) +{ + arch_irq_work_raise(); +} It would be better if the name specifically related to irq work, which is more than just dec interrupt. It might be good to set mces_to_process here as well. Sure I would name it something like mce_irq_work_queue, and the paca variable to mce_pending_irq_work... Ok +void mce_run_late_handlers(void) +{ + if (unlikely(local_paca->mces_to_process)) { + if (ppc_md.machine_check_log_err) + ppc_md.machine_check_log_err(); + machine_check_process_queued_event(); + machine_check_ue_work(); + local_paca->mces_to_process--; + } +} The problem with a counter is that you're clearing the irq work pending in the timer interrupt, so you'll never call in here again to clear that (until something else sets irq work). But as far as I can see it does not need to be a counter, just a flag. The machine check ca
[PATCH v4] powerpc/mce: Avoid using irq_work_queue() in realmode
In realmode mce handler we use irq_work_queue() to defer the processing of mce events, irq_work_queue() can only be called when translation is enabled because it touches memory outside RMA, hence we enable translation before calling irq_work_queue and disable on return, though it is not safe to do in realmode. To avoid this, program the decrementer and call the event processing functions from timer handler. Signed-off-by: Ganesh Goudar --- V2: * Use arch_irq_work_raise to raise decrementer interrupt. * Avoid having atomic variable. V3: * Fix build error. Reported by kernel test bot. V4: * Rename some functions and variables * Remove mces_to_process counter and add a flag to indicate there is a mce info to process. --- arch/powerpc/include/asm/machdep.h | 2 + arch/powerpc/include/asm/mce.h | 13 + arch/powerpc/include/asm/paca.h | 1 + arch/powerpc/kernel/mce.c| 60 +--- arch/powerpc/kernel/time.c | 2 + arch/powerpc/platforms/pseries/pseries.h | 1 + arch/powerpc/platforms/pseries/ras.c | 31 +--- arch/powerpc/platforms/pseries/setup.c | 1 + 8 files changed, 53 insertions(+), 58 deletions(-) diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h index 9c3c9f04129f..d22b222ba471 100644 --- a/arch/powerpc/include/asm/machdep.h +++ b/arch/powerpc/include/asm/machdep.h @@ -99,6 +99,8 @@ struct machdep_calls { /* Called during machine check exception to retrive fixup address. */ bool(*mce_check_early_recovery)(struct pt_regs *regs); + void(*machine_check_log_err)(void); + /* Motherboard/chipset features. This is a kind of general purpose * hook used to control some machine specific features (like reset * lines, chip power control, etc...). 
diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h index 331d944280b8..c9f0936bd3c9 100644 --- a/arch/powerpc/include/asm/mce.h +++ b/arch/powerpc/include/asm/mce.h @@ -235,8 +235,21 @@ extern void machine_check_print_event_info(struct machine_check_event *evt, unsigned long addr_to_pfn(struct pt_regs *regs, unsigned long addr); extern void mce_common_process_ue(struct pt_regs *regs, struct mce_error_info *mce_err); +void mce_irq_work_queue(void); int mce_register_notifier(struct notifier_block *nb); int mce_unregister_notifier(struct notifier_block *nb); + +#ifdef CONFIG_PPC_BOOK3S_64 +void mce_run_irq_context_handlers(void); +#else +static inline void mce_run_irq_context_handlers(void) { }; +#endif /* CONFIG_PPC_BOOK3S_64 */ + +#ifdef CONFIG_PPC_BOOK3S_64 +void set_mce_pending_irq_work(void); +void clear_mce_pending_irq_work(void); +#endif /* CONFIG_PPC_BOOK3S_64 */ + #ifdef CONFIG_PPC_BOOK3S_64 void flush_and_reload_slb(void); void flush_erat(void); diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h index dc05a862e72a..963030689cfa 100644 --- a/arch/powerpc/include/asm/paca.h +++ b/arch/powerpc/include/asm/paca.h @@ -280,6 +280,7 @@ struct paca_struct { #endif #ifdef CONFIG_PPC_BOOK3S_64 struct mce_info *mce_info; + u8 mce_pending_irq_work; #endif /* CONFIG_PPC_BOOK3S_64 */ } cacheline_aligned; diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c index fd829f7f25a4..6af798803ece 100644 --- a/arch/powerpc/kernel/mce.c +++ b/arch/powerpc/kernel/mce.c @@ -28,19 +28,9 @@ #include "setup.h" -static void machine_check_process_queued_event(struct irq_work *work); -static void machine_check_ue_irq_work(struct irq_work *work); static void machine_check_ue_event(struct machine_check_event *evt); static void machine_process_ue_event(struct work_struct *work); -static struct irq_work mce_event_process_work = { -.func = machine_check_process_queued_event, -}; - -static struct irq_work mce_ue_event_irq_work = 
{ - .func = machine_check_ue_irq_work, -}; - static DECLARE_WORK(mce_ue_event_work, machine_process_ue_event); static BLOCKING_NOTIFIER_HEAD(mce_notifier_list); @@ -89,6 +79,13 @@ static void mce_set_error_info(struct machine_check_event *mce, } } +void mce_irq_work_queue(void) +{ + /* Raise decrementer interrupt */ + arch_irq_work_raise(); + set_mce_pending_irq_work(); +} + /* * Decode and save high level MCE information into per cpu buffer which * is an array of machine_check_event structure. @@ -217,7 +214,7 @@ void release_mce_event(void) get_mce_event(NULL, true); } -static void machine_check_ue_irq_work(struct irq_work *work) +static void machine_check_ue_work(void) { schedule_work(&mce_ue_event_work); } @@ -239,7 +236,7 @@ static void machine_check_ue_event(struct machine_check_event *evt) evt, sizeof(*evt)); /* Queue work to process this event later. */ - irq_work_queue(&mce_ue_event_irq_work); + mce_irq_
[PATCH v3 RESEND 3/3] powerpc/mce: Modify the real address error logging messages
To avoid ambiguity, modify the strings in real address error logging messages to "foreign/control memory" from "foreign", since the error descriptions in P9 user manual and P10 user manual are different for same type of errors. P9 User Manual for MCE: DSISR:59 Host real address to foreign space during translation. DSISR:60 Host real address to foreign space on a load or store access. P10 User Manual for MCE: DSISR:59 D-side tablewalk used a host real address in the control memory address range. DSISR:60 D-side operand access to control memory address space. Signed-off-by: Ganesh Goudar --- arch/powerpc/kernel/mce.c | 8 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c index fd829f7f25a4..55ccc651d1b0 100644 --- a/arch/powerpc/kernel/mce.c +++ b/arch/powerpc/kernel/mce.c @@ -401,14 +401,14 @@ void machine_check_print_event_info(struct machine_check_event *evt, static const char *mc_ra_types[] = { "Indeterminate", "Instruction fetch (bad)", - "Instruction fetch (foreign)", + "Instruction fetch (foreign/control memory)", "Page table walk ifetch (bad)", - "Page table walk ifetch (foreign)", + "Page table walk ifetch (foreign/control memory)", "Load (bad)", "Store (bad)", "Page table walk Load/Store (bad)", - "Page table walk Load/Store (foreign)", - "Load/Store (foreign)", + "Page table walk Load/Store (foreign/control memory)", + "Load/Store (foreign/control memory)", }; static const char *mc_link_types[] = { "Indeterminate", -- 2.31.1
[PATCH v3 RESEND 1/3] powerpc/pseries: Parse control memory access error
Add support to parse and log control memory access error for pseries. These changes are made according to PAPR v2.11 10.3.2.2.12. Signed-off-by: Ganesh Goudar --- arch/powerpc/platforms/pseries/ras.c | 36 1 file changed, 32 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c index 56092dccfdb8..e62a0ca2611a 100644 --- a/arch/powerpc/platforms/pseries/ras.c +++ b/arch/powerpc/platforms/pseries/ras.c @@ -60,11 +60,17 @@ struct pseries_mc_errorlog { * XX 2: Reserved. *XXX 3: Type of UE error. * -* For error_type != MC_ERROR_TYPE_UE +* For error_type == MC_ERROR_TYPE_SLB/ERAT/TLB * * X 1: Effective address provided. *X 5: Reserved. * XX 2: Type of SLB/ERAT/TLB error. +* +* For error_type == MC_ERROR_TYPE_CTRL_MEM_ACCESS +* +* X 1: Error causing address provided. +*XXX 3: Type of error. +* 4: Reserved. */ u8 sub_err_type; u8 reserved_1[6]; @@ -80,6 +86,7 @@ struct pseries_mc_errorlog { #define MC_ERROR_TYPE_TLB 0x04 #define MC_ERROR_TYPE_D_CACHE 0x05 #define MC_ERROR_TYPE_I_CACHE 0x07 +#define MC_ERROR_TYPE_CTRL_MEM_ACCESS 0x08 /* RTAS pseries MCE error sub types */ #define MC_ERROR_UE_INDETERMINATE 0 @@ -90,6 +97,7 @@ struct pseries_mc_errorlog { #define UE_EFFECTIVE_ADDR_PROVIDED 0x40 #define UE_LOGICAL_ADDR_PROVIDED 0x20 +#define MC_EFFECTIVE_ADDR_PROVIDED 0x80 #define MC_ERROR_SLB_PARITY0 #define MC_ERROR_SLB_MULTIHIT 1 @@ -103,6 +111,9 @@ struct pseries_mc_errorlog { #define MC_ERROR_TLB_MULTIHIT 2 #define MC_ERROR_TLB_INDETERMINATE 3 +#define MC_ERROR_CTRL_MEM_ACCESS_PTABLE_WALK 0 +#define MC_ERROR_CTRL_MEM_ACCESS_OP_ACCESS 1 + static inline u8 rtas_mc_error_sub_type(const struct pseries_mc_errorlog *mlog) { switch (mlog->error_type) { @@ -112,6 +123,8 @@ static inline u8 rtas_mc_error_sub_type(const struct pseries_mc_errorlog *mlog) caseMC_ERROR_TYPE_ERAT: caseMC_ERROR_TYPE_TLB: return (mlog->sub_err_type & 0x03); + caseMC_ERROR_TYPE_CTRL_MEM_ACCESS: + return (mlog->sub_err_type & 0x70) >> 4; 
default: return 0; } @@ -656,7 +669,7 @@ static int mce_handle_err_virtmode(struct pt_regs *regs, mce_err.u.slb_error_type = MCE_SLB_ERROR_INDETERMINATE; break; } - if (mce_log->sub_err_type & 0x80) + if (mce_log->sub_err_type & MC_EFFECTIVE_ADDR_PROVIDED) eaddr = be64_to_cpu(mce_log->effective_address); break; case MC_ERROR_TYPE_ERAT: @@ -673,7 +686,7 @@ static int mce_handle_err_virtmode(struct pt_regs *regs, mce_err.u.erat_error_type = MCE_ERAT_ERROR_INDETERMINATE; break; } - if (mce_log->sub_err_type & 0x80) + if (mce_log->sub_err_type & MC_EFFECTIVE_ADDR_PROVIDED) eaddr = be64_to_cpu(mce_log->effective_address); break; case MC_ERROR_TYPE_TLB: @@ -690,7 +703,7 @@ static int mce_handle_err_virtmode(struct pt_regs *regs, mce_err.u.tlb_error_type = MCE_TLB_ERROR_INDETERMINATE; break; } - if (mce_log->sub_err_type & 0x80) + if (mce_log->sub_err_type & MC_EFFECTIVE_ADDR_PROVIDED) eaddr = be64_to_cpu(mce_log->effective_address); break; case MC_ERROR_TYPE_D_CACHE: @@ -699,6 +712,21 @@ static int mce_handle_err_virtmode(struct pt_regs *regs, case MC_ERROR_TYPE_I_CACHE: mce_err.error_type = MCE_ERROR_TYPE_ICACHE; break; + case MC_ERROR_TYPE_CTRL_MEM_ACCESS: + mce_err.error_type = MCE_ERROR_TYPE_RA; + switch (err_sub_type) { + case MC_ERROR_CTRL_MEM_ACCESS_PTABLE_WALK: + mce_err.u.ra_error_type = + MCE_RA_ERROR_PAGE_TABLE_WALK_LOAD_STORE_FOREIGN; + break; + case MC_ERROR_CTRL_MEM_ACCESS_OP_ACCESS: + mce_err.u.ra_error_type = + MCE_RA_ERROR_LOAD_STORE_FOREIGN; + break; + } + if (mce_log->sub_err_type & MC_EFFECTIVE_ADDR_PROVIDED) + eaddr = be64_to_cpu(mce_log->effective_address); + break;
[PATCH v3 RESEND 2/3] selftests/powerpc: Add test for real address error handling
Add test for real address or control memory address access error handling, using NX-GZIP engine. The error is injected by accessing the control memory address using illegal instruction, on successful handling the process attempting to access control memory address using illegal instruction receives SIGBUS. Signed-off-by: Ganesh Goudar --- tools/testing/selftests/powerpc/Makefile | 3 +- tools/testing/selftests/powerpc/mce/Makefile | 7 ++ .../selftests/powerpc/mce/inject-ra-err.c | 65 +++ tools/testing/selftests/powerpc/mce/vas-api.h | 1 + 4 files changed, 75 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/powerpc/mce/Makefile create mode 100644 tools/testing/selftests/powerpc/mce/inject-ra-err.c create mode 12 tools/testing/selftests/powerpc/mce/vas-api.h diff --git a/tools/testing/selftests/powerpc/Makefile b/tools/testing/selftests/powerpc/Makefile index 0830e63818c1..4830372d7416 100644 --- a/tools/testing/selftests/powerpc/Makefile +++ b/tools/testing/selftests/powerpc/Makefile @@ -31,7 +31,8 @@ SUB_DIRS = alignment \ vphn \ math \ ptrace \ - security + security \ + mce endif diff --git a/tools/testing/selftests/powerpc/mce/Makefile b/tools/testing/selftests/powerpc/mce/Makefile new file mode 100644 index ..2424513982d9 --- /dev/null +++ b/tools/testing/selftests/powerpc/mce/Makefile @@ -0,0 +1,7 @@ +#SPDX-License-Identifier: GPL-2.0-or-later + +TEST_GEN_PROGS := inject-ra-err + +include ../../lib.mk + +$(TEST_GEN_PROGS): ../harness.c diff --git a/tools/testing/selftests/powerpc/mce/inject-ra-err.c b/tools/testing/selftests/powerpc/mce/inject-ra-err.c new file mode 100644 index ..94323c34d9a6 --- /dev/null +++ b/tools/testing/selftests/powerpc/mce/inject-ra-err.c @@ -0,0 +1,65 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "vas-api.h" +#include "utils.h" + +static bool faulted; + +static void sigbus_handler(int n, siginfo_t 
*info, void *ctxt_v) +{ + ucontext_t *ctxt = (ucontext_t *)ctxt_v; + struct pt_regs *regs = ctxt->uc_mcontext.regs; + + faulted = true; + regs->nip += 4; +} + +static int test_ra_error(void) +{ + struct vas_tx_win_open_attr attr; + int fd, *paste_addr; + char *devname = "/dev/crypto/nx-gzip"; + struct sigaction act = { + .sa_sigaction = sigbus_handler, + .sa_flags = SA_SIGINFO, + }; + + memset(&attr, 0, sizeof(attr)); + attr.version = 1; + attr.vas_id = 0; + + SKIP_IF(access(devname, F_OK)); + + fd = open(devname, O_RDWR); + FAIL_IF(fd < 0); + FAIL_IF(ioctl(fd, VAS_TX_WIN_OPEN, &attr) < 0); + FAIL_IF(sigaction(SIGBUS, &act, NULL) != 0); + + paste_addr = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0ULL); + + /* The following assignment triggers exception */ + mb(); + *paste_addr = 1; + mb(); + + FAIL_IF(!faulted); + + return 0; +} + +int main(void) +{ + return test_harness(test_ra_error, "inject-ra-err"); +} + diff --git a/tools/testing/selftests/powerpc/mce/vas-api.h b/tools/testing/selftests/powerpc/mce/vas-api.h new file mode 120000 index ..1455c1bcd351 --- /dev/null +++ b/tools/testing/selftests/powerpc/mce/vas-api.h @@ -0,0 +1 @@ +../../../../../arch/powerpc/include/uapi/asm/vas-api.h \ No newline at end of file -- 2.31.1
[PATCH v3 1/2] powerpc/mce: Avoid using irq_work_queue() in realmode
In realmode mce handler we use irq_work_queue() to defer the processing of mce events, irq_work_queue() can only be called when translation is enabled because it touches memory outside RMA, hence we enable translation before calling irq_work_queue and disable on return, though it is not safe to do in realmode. To avoid this, program the decrementer and call the event processing functions from timer handler. Signed-off-by: Ganesh Goudar --- V2: * Use arch_irq_work_raise to raise decrementer interrupt. * Avoid having atomic variable. V3: * Fix build error. Reported by kernel test bot. --- arch/powerpc/include/asm/machdep.h | 2 + arch/powerpc/include/asm/mce.h | 2 + arch/powerpc/include/asm/paca.h | 1 + arch/powerpc/kernel/mce.c| 51 +++- arch/powerpc/kernel/time.c | 3 ++ arch/powerpc/platforms/pseries/pseries.h | 1 + arch/powerpc/platforms/pseries/ras.c | 31 +- arch/powerpc/platforms/pseries/setup.c | 1 + 8 files changed, 34 insertions(+), 58 deletions(-) diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h index 9c3c9f04129f..d22b222ba471 100644 --- a/arch/powerpc/include/asm/machdep.h +++ b/arch/powerpc/include/asm/machdep.h @@ -99,6 +99,8 @@ struct machdep_calls { /* Called during machine check exception to retrive fixup address. */ bool(*mce_check_early_recovery)(struct pt_regs *regs); + void(*machine_check_log_err)(void); + /* Motherboard/chipset features. This is a kind of general purpose * hook used to control some machine specific features (like reset * lines, chip power control, etc...). 
diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h index 331d944280b8..6e306aaf58aa 100644 --- a/arch/powerpc/include/asm/mce.h +++ b/arch/powerpc/include/asm/mce.h @@ -235,8 +235,10 @@ extern void machine_check_print_event_info(struct machine_check_event *evt, unsigned long addr_to_pfn(struct pt_regs *regs, unsigned long addr); extern void mce_common_process_ue(struct pt_regs *regs, struct mce_error_info *mce_err); +void machine_check_raise_dec_intr(void); int mce_register_notifier(struct notifier_block *nb); int mce_unregister_notifier(struct notifier_block *nb); +void mce_run_late_handlers(void); #ifdef CONFIG_PPC_BOOK3S_64 void flush_and_reload_slb(void); void flush_erat(void); diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h index dc05a862e72a..d463c796f7fa 100644 --- a/arch/powerpc/include/asm/paca.h +++ b/arch/powerpc/include/asm/paca.h @@ -280,6 +280,7 @@ struct paca_struct { #endif #ifdef CONFIG_PPC_BOOK3S_64 struct mce_info *mce_info; + u32 mces_to_process; #endif /* CONFIG_PPC_BOOK3S_64 */ } cacheline_aligned; diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c index fd829f7f25a4..8e17f29472a0 100644 --- a/arch/powerpc/kernel/mce.c +++ b/arch/powerpc/kernel/mce.c @@ -28,19 +28,9 @@ #include "setup.h" -static void machine_check_process_queued_event(struct irq_work *work); -static void machine_check_ue_irq_work(struct irq_work *work); static void machine_check_ue_event(struct machine_check_event *evt); static void machine_process_ue_event(struct work_struct *work); -static struct irq_work mce_event_process_work = { -.func = machine_check_process_queued_event, -}; - -static struct irq_work mce_ue_event_irq_work = { - .func = machine_check_ue_irq_work, -}; - static DECLARE_WORK(mce_ue_event_work, machine_process_ue_event); static BLOCKING_NOTIFIER_HEAD(mce_notifier_list); @@ -89,6 +79,12 @@ static void mce_set_error_info(struct machine_check_event *mce, } } +/* Raise decrementer interrupt 
*/ +void machine_check_raise_dec_intr(void) +{ + arch_irq_work_raise(); +} + /* * Decode and save high level MCE information into per cpu buffer which * is an array of machine_check_event structure. @@ -135,6 +131,8 @@ void save_mce_event(struct pt_regs *regs, long handled, if (mce->error_type == MCE_ERROR_TYPE_UE) mce->u.ue_error.ignore_event = mce_err->ignore_event; + local_paca->mces_to_process++; + if (!addr) return; @@ -217,7 +215,7 @@ void release_mce_event(void) get_mce_event(NULL, true); } -static void machine_check_ue_irq_work(struct irq_work *work) +static void machine_check_ue_work(void) { schedule_work(&mce_ue_event_work); } @@ -239,7 +237,7 @@ static void machine_check_ue_event(struct machine_check_event *evt) evt, sizeof(*evt)); /* Queue work to process this event later. */ - irq_work_queue(&mce_ue_event_irq_work); + machine_check_raise_dec_intr(); } /* @@ -249,7 +247,6 @@ void machine_check_queue_event(void) { int index; struct machine_check_event evt; -
[PATCH v3 2/2] pseries/mce: Refactor the pseries mce handling code
Now that we are no longer switching on the mmu in realmode mce handler, Revert the commit 4ff753feab02("powerpc/pseries: Avoid using addr_to_pfn in real mode") partially, which introduced functions mce_handle_err_virtmode/realmode() to separate mce handler code which needed translation to enabled. Signed-off-by: Ganesh Goudar --- arch/powerpc/platforms/pseries/ras.c | 122 +++ 1 file changed, 49 insertions(+), 73 deletions(-) diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c index 8613f9cc5798..62e1519b8355 100644 --- a/arch/powerpc/platforms/pseries/ras.c +++ b/arch/powerpc/platforms/pseries/ras.c @@ -511,58 +511,17 @@ int pSeries_system_reset_exception(struct pt_regs *regs) return 0; /* need to perform reset */ } -static int mce_handle_err_realmode(int disposition, u8 error_type) -{ -#ifdef CONFIG_PPC_BOOK3S_64 - if (disposition == RTAS_DISP_NOT_RECOVERED) { - switch (error_type) { - caseMC_ERROR_TYPE_ERAT: - flush_erat(); - disposition = RTAS_DISP_FULLY_RECOVERED; - break; - caseMC_ERROR_TYPE_SLB: - /* -* Store the old slb content in paca before flushing. -* Print this when we go to virtual mode. -* There are chances that we may hit MCE again if there -* is a parity error on the SLB entry we trying to read -* for saving. Hence limit the slb saving to single -* level of recursion. 
-*/ - if (local_paca->in_mce == 1) - slb_save_contents(local_paca->mce_faulty_slbs); - flush_and_reload_slb(); - disposition = RTAS_DISP_FULLY_RECOVERED; - break; - default: - break; - } - } else if (disposition == RTAS_DISP_LIMITED_RECOVERY) { - /* Platform corrected itself but could be degraded */ - pr_err("MCE: limited recovery, system may be degraded\n"); - disposition = RTAS_DISP_FULLY_RECOVERED; - } -#endif - return disposition; -} - -static int mce_handle_err_virtmode(struct pt_regs *regs, - struct rtas_error_log *errp, - struct pseries_mc_errorlog *mce_log, - int disposition) +static int mce_handle_error(struct pt_regs *regs, struct rtas_error_log *errp) { struct mce_error_info mce_err = { 0 }; + unsigned long eaddr = 0, paddr = 0; + struct pseries_errorlog *pseries_log; + struct pseries_mc_errorlog *mce_log; + int disposition = rtas_error_disposition(errp); int initiator = rtas_error_initiator(errp); int severity = rtas_error_severity(errp); - unsigned long eaddr = 0, paddr = 0; u8 error_type, err_sub_type; - if (!mce_log) - goto out; - - error_type = mce_log->error_type; - err_sub_type = rtas_mc_error_sub_type(mce_log); - if (initiator == RTAS_INITIATOR_UNKNOWN) mce_err.initiator = MCE_INITIATOR_UNKNOWN; else if (initiator == RTAS_INITIATOR_CPU) @@ -588,6 +547,8 @@ static int mce_handle_err_virtmode(struct pt_regs *regs, mce_err.severity = MCE_SEV_SEVERE; else if (severity == RTAS_SEVERITY_ERROR) mce_err.severity = MCE_SEV_SEVERE; + else if (severity == RTAS_SEVERITY_FATAL) + mce_err.severity = MCE_SEV_FATAL; else mce_err.severity = MCE_SEV_FATAL; @@ -599,7 +560,18 @@ static int mce_handle_err_virtmode(struct pt_regs *regs, mce_err.error_type = MCE_ERROR_TYPE_UNKNOWN; mce_err.error_class = MCE_ECLASS_UNKNOWN; - switch (error_type) { + if (!rtas_error_extended(errp)) + goto out; + + pseries_log = get_pseries_errorlog(errp, PSERIES_ELOG_SECT_ID_MCE); + if (!pseries_log) + goto out; + + mce_log = (struct pseries_mc_errorlog *)pseries_log->data; + error_type 
= mce_log->error_type; + err_sub_type = rtas_mc_error_sub_type(mce_log); + + switch (mce_log->error_type) { case MC_ERROR_TYPE_UE: mce_err.error_type = MCE_ERROR_TYPE_UE; mce_common_process_ue(regs, _err); @@ -692,41 +664,45 @@ static int mce_handle_err_virtmode(struct pt_regs *regs, mce_err.error_type = MCE_ERROR_TYPE_DCACHE; break; case MC_ERROR_TYPE_I_CACHE: - mce_err.error_type = MCE_ERROR_TYPE_ICACHE; + mce_err.error_type = MCE_ERROR_TYPE_DCACHE; break; case
[PATCH v2 2/2] pseries/mce: Refactor the pseries mce handling code
Now that we are no longer switching on the mmu in realmode mce handler, Revert the commit 4ff753feab02("powerpc/pseries: Avoid using addr_to_pfn in real mode") partially, which introduced functions mce_handle_err_virtmode/realmode() to separate mce handler code which needed translation to enabled. Signed-off-by: Ganesh Goudar --- arch/powerpc/platforms/pseries/ras.c | 122 +++ 1 file changed, 49 insertions(+), 73 deletions(-) diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c index 8613f9cc5798..62e1519b8355 100644 --- a/arch/powerpc/platforms/pseries/ras.c +++ b/arch/powerpc/platforms/pseries/ras.c @@ -511,58 +511,17 @@ int pSeries_system_reset_exception(struct pt_regs *regs) return 0; /* need to perform reset */ } -static int mce_handle_err_realmode(int disposition, u8 error_type) -{ -#ifdef CONFIG_PPC_BOOK3S_64 - if (disposition == RTAS_DISP_NOT_RECOVERED) { - switch (error_type) { - caseMC_ERROR_TYPE_ERAT: - flush_erat(); - disposition = RTAS_DISP_FULLY_RECOVERED; - break; - caseMC_ERROR_TYPE_SLB: - /* -* Store the old slb content in paca before flushing. -* Print this when we go to virtual mode. -* There are chances that we may hit MCE again if there -* is a parity error on the SLB entry we trying to read -* for saving. Hence limit the slb saving to single -* level of recursion. 
-*/ - if (local_paca->in_mce == 1) - slb_save_contents(local_paca->mce_faulty_slbs); - flush_and_reload_slb(); - disposition = RTAS_DISP_FULLY_RECOVERED; - break; - default: - break; - } - } else if (disposition == RTAS_DISP_LIMITED_RECOVERY) { - /* Platform corrected itself but could be degraded */ - pr_err("MCE: limited recovery, system may be degraded\n"); - disposition = RTAS_DISP_FULLY_RECOVERED; - } -#endif - return disposition; -} - -static int mce_handle_err_virtmode(struct pt_regs *regs, - struct rtas_error_log *errp, - struct pseries_mc_errorlog *mce_log, - int disposition) +static int mce_handle_error(struct pt_regs *regs, struct rtas_error_log *errp) { struct mce_error_info mce_err = { 0 }; + unsigned long eaddr = 0, paddr = 0; + struct pseries_errorlog *pseries_log; + struct pseries_mc_errorlog *mce_log; + int disposition = rtas_error_disposition(errp); int initiator = rtas_error_initiator(errp); int severity = rtas_error_severity(errp); - unsigned long eaddr = 0, paddr = 0; u8 error_type, err_sub_type; - if (!mce_log) - goto out; - - error_type = mce_log->error_type; - err_sub_type = rtas_mc_error_sub_type(mce_log); - if (initiator == RTAS_INITIATOR_UNKNOWN) mce_err.initiator = MCE_INITIATOR_UNKNOWN; else if (initiator == RTAS_INITIATOR_CPU) @@ -588,6 +547,8 @@ static int mce_handle_err_virtmode(struct pt_regs *regs, mce_err.severity = MCE_SEV_SEVERE; else if (severity == RTAS_SEVERITY_ERROR) mce_err.severity = MCE_SEV_SEVERE; + else if (severity == RTAS_SEVERITY_FATAL) + mce_err.severity = MCE_SEV_FATAL; else mce_err.severity = MCE_SEV_FATAL; @@ -599,7 +560,18 @@ static int mce_handle_err_virtmode(struct pt_regs *regs, mce_err.error_type = MCE_ERROR_TYPE_UNKNOWN; mce_err.error_class = MCE_ECLASS_UNKNOWN; - switch (error_type) { + if (!rtas_error_extended(errp)) + goto out; + + pseries_log = get_pseries_errorlog(errp, PSERIES_ELOG_SECT_ID_MCE); + if (!pseries_log) + goto out; + + mce_log = (struct pseries_mc_errorlog *)pseries_log->data; + error_type 
= mce_log->error_type; + err_sub_type = rtas_mc_error_sub_type(mce_log); + + switch (mce_log->error_type) { case MC_ERROR_TYPE_UE: mce_err.error_type = MCE_ERROR_TYPE_UE; mce_common_process_ue(regs, _err); @@ -692,41 +664,45 @@ static int mce_handle_err_virtmode(struct pt_regs *regs, mce_err.error_type = MCE_ERROR_TYPE_DCACHE; break; case MC_ERROR_TYPE_I_CACHE: - mce_err.error_type = MCE_ERROR_TYPE_ICACHE; + mce_err.error_type = MCE_ERROR_TYPE_DCACHE; break; case
[PATCH v2 1/2] powerpc/mce: Avoid using irq_work_queue() in realmode
In realmode mce handler we use irq_work_queue() to defer the processing of mce events, irq_work_queue() can only be called when translation is enabled because it touches memory outside RMA, hence we enable translation before calling irq_work_queue and disable on return, though it is not safe to do in realmode. To avoid this, program the decrementer and call the event processing functions from timer handler. Signed-off-by: Ganesh Goudar --- V2: * Use arch_irq_work_raise to raise decrementer interrupt. * Avoid having atomic variable. --- arch/powerpc/include/asm/machdep.h | 2 + arch/powerpc/include/asm/mce.h | 2 + arch/powerpc/include/asm/paca.h | 1 + arch/powerpc/kernel/mce.c| 51 +++- arch/powerpc/kernel/time.c | 2 + arch/powerpc/platforms/pseries/pseries.h | 1 + arch/powerpc/platforms/pseries/ras.c | 31 +- arch/powerpc/platforms/pseries/setup.c | 1 + 8 files changed, 33 insertions(+), 58 deletions(-) diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h index 9c3c9f04129f..d22b222ba471 100644 --- a/arch/powerpc/include/asm/machdep.h +++ b/arch/powerpc/include/asm/machdep.h @@ -99,6 +99,8 @@ struct machdep_calls { /* Called during machine check exception to retrive fixup address. */ bool(*mce_check_early_recovery)(struct pt_regs *regs); + void(*machine_check_log_err)(void); + /* Motherboard/chipset features. This is a kind of general purpose * hook used to control some machine specific features (like reset * lines, chip power control, etc...). 
diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h index 331d944280b8..6e306aaf58aa 100644 --- a/arch/powerpc/include/asm/mce.h +++ b/arch/powerpc/include/asm/mce.h @@ -235,8 +235,10 @@ extern void machine_check_print_event_info(struct machine_check_event *evt, unsigned long addr_to_pfn(struct pt_regs *regs, unsigned long addr); extern void mce_common_process_ue(struct pt_regs *regs, struct mce_error_info *mce_err); +void machine_check_raise_dec_intr(void); int mce_register_notifier(struct notifier_block *nb); int mce_unregister_notifier(struct notifier_block *nb); +void mce_run_late_handlers(void); #ifdef CONFIG_PPC_BOOK3S_64 void flush_and_reload_slb(void); void flush_erat(void); diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h index dc05a862e72a..d463c796f7fa 100644 --- a/arch/powerpc/include/asm/paca.h +++ b/arch/powerpc/include/asm/paca.h @@ -280,6 +280,7 @@ struct paca_struct { #endif #ifdef CONFIG_PPC_BOOK3S_64 struct mce_info *mce_info; + u32 mces_to_process; #endif /* CONFIG_PPC_BOOK3S_64 */ } cacheline_aligned; diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c index fd829f7f25a4..8e17f29472a0 100644 --- a/arch/powerpc/kernel/mce.c +++ b/arch/powerpc/kernel/mce.c @@ -28,19 +28,9 @@ #include "setup.h" -static void machine_check_process_queued_event(struct irq_work *work); -static void machine_check_ue_irq_work(struct irq_work *work); static void machine_check_ue_event(struct machine_check_event *evt); static void machine_process_ue_event(struct work_struct *work); -static struct irq_work mce_event_process_work = { -.func = machine_check_process_queued_event, -}; - -static struct irq_work mce_ue_event_irq_work = { - .func = machine_check_ue_irq_work, -}; - static DECLARE_WORK(mce_ue_event_work, machine_process_ue_event); static BLOCKING_NOTIFIER_HEAD(mce_notifier_list); @@ -89,6 +79,12 @@ static void mce_set_error_info(struct machine_check_event *mce, } } +/* Raise decrementer interrupt 
*/ +void machine_check_raise_dec_intr(void) +{ + arch_irq_work_raise(); +} + /* * Decode and save high level MCE information into per cpu buffer which * is an array of machine_check_event structure. @@ -135,6 +131,8 @@ void save_mce_event(struct pt_regs *regs, long handled, if (mce->error_type == MCE_ERROR_TYPE_UE) mce->u.ue_error.ignore_event = mce_err->ignore_event; + local_paca->mces_to_process++; + if (!addr) return; @@ -217,7 +215,7 @@ void release_mce_event(void) get_mce_event(NULL, true); } -static void machine_check_ue_irq_work(struct irq_work *work) +static void machine_check_ue_work(void) { schedule_work(_ue_event_work); } @@ -239,7 +237,7 @@ static void machine_check_ue_event(struct machine_check_event *evt) evt, sizeof(*evt)); /* Queue work to process this event later. */ - irq_work_queue(_ue_event_irq_work); + machine_check_raise_dec_intr(); } /* @@ -249,7 +247,6 @@ void machine_check_queue_event(void) { int index; struct machine_check_event evt; - unsigned long msr; if (!get_mce_eve
Re: [PATCH 1/2] powerpc/mce: Avoid using irq_work_queue() in realmode
On 11/8/21 19:49, Nicholas Piggin wrote: Excerpts from Ganesh Goudar's message of November 8, 2021 6:38 pm: In realmode mce handler we use irq_work_queue() to defer the processing of mce events, irq_work_queue() can only be called when translation is enabled because it touches memory outside RMA, hence we enable translation before calling irq_work_queue and disable on return, though it is not safe to do in realmode. To avoid this, program the decrementer and call the event processing functions from timer handler. Signed-off-by: Ganesh Goudar --- arch/powerpc/include/asm/machdep.h | 2 + arch/powerpc/include/asm/mce.h | 2 + arch/powerpc/include/asm/paca.h | 1 + arch/powerpc/kernel/mce.c| 51 +++- arch/powerpc/kernel/time.c | 3 ++ arch/powerpc/platforms/pseries/pseries.h | 1 + arch/powerpc/platforms/pseries/ras.c | 31 +- arch/powerpc/platforms/pseries/setup.c | 1 + 8 files changed, 34 insertions(+), 58 deletions(-) diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h index 764f2732a821..c89cc03c0f97 100644 --- a/arch/powerpc/include/asm/machdep.h +++ b/arch/powerpc/include/asm/machdep.h @@ -103,6 +103,8 @@ struct machdep_calls { /* Called during machine check exception to retrive fixup address. */ bool(*mce_check_early_recovery)(struct pt_regs *regs); + void(*machine_check_log_err)(void); + /* Motherboard/chipset features. This is a kind of general purpose * hook used to control some machine specific features (like reset * lines, chip power control, etc...). 
diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h index 331d944280b8..187810f13669 100644 --- a/arch/powerpc/include/asm/mce.h +++ b/arch/powerpc/include/asm/mce.h @@ -235,8 +235,10 @@ extern void machine_check_print_event_info(struct machine_check_event *evt, unsigned long addr_to_pfn(struct pt_regs *regs, unsigned long addr); extern void mce_common_process_ue(struct pt_regs *regs, struct mce_error_info *mce_err); +extern void machine_check_raise_dec_intr(void); No new externs on function declarations, they tell me. ok. int mce_register_notifier(struct notifier_block *nb); int mce_unregister_notifier(struct notifier_block *nb); +void mce_run_late_handlers(void); #ifdef CONFIG_PPC_BOOK3S_64 void flush_and_reload_slb(void); void flush_erat(void); diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h index dc05a862e72a..f49180f8c9be 100644 --- a/arch/powerpc/include/asm/paca.h +++ b/arch/powerpc/include/asm/paca.h @@ -280,6 +280,7 @@ struct paca_struct { #endif #ifdef CONFIG_PPC_BOOK3S_64 struct mce_info *mce_info; + atomic_t mces_to_process; #endif /* CONFIG_PPC_BOOK3S_64 */ } cacheline_aligned; diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c index fd829f7f25a4..45baa062ebc0 100644 --- a/arch/powerpc/kernel/mce.c +++ b/arch/powerpc/kernel/mce.c @@ -28,19 +28,9 @@ #include "setup.h" -static void machine_check_process_queued_event(struct irq_work *work); -static void machine_check_ue_irq_work(struct irq_work *work); static void machine_check_ue_event(struct machine_check_event *evt); static void machine_process_ue_event(struct work_struct *work); -static struct irq_work mce_event_process_work = { -.func = machine_check_process_queued_event, -}; - -static struct irq_work mce_ue_event_irq_work = { - .func = machine_check_ue_irq_work, -}; - static DECLARE_WORK(mce_ue_event_work, machine_process_ue_event); static BLOCKING_NOTIFIER_HEAD(mce_notifier_list); @@ -89,6 +79,12 @@ static void 
mce_set_error_info(struct machine_check_event *mce, } } +/* Raise decrementer interrupt */ +void machine_check_raise_dec_intr(void) +{ + set_dec(1); +} The problem here is a timer can be scheduled (e.g., by an external interrupt if it gets taken before the decrementer, then uses a timer) and that set decr > 1. See logic in decrementer_set_next_event. I _think_ the way to get around this would be to have the machine check just use arch_irq_work_raise. Then you could also only call the mce handler inside the test_irq_work_pending() check and avoid the added function call on every timer. That test should also be marked unlikely come to think of it, but that's a side patchlet. Sure, I will use arch_irq_work_raise() and test_irq_work_pending(). + /* * Decode and save high level MCE information into per cpu buffer which * is an array of machine_check_event structure. @@ -135,6 +131,8 @@ void save_mce_event(struct pt_regs *regs, long handled, if (mce->error_type == MCE_ERROR_TYPE_UE) mce->u.ue_error.ignore_event = mce_err->ignore_event; + atomic_inc(_paca->mces_to_proce
Re: [PATCH 1/2] powerpc/mce: Avoid using irq_work_queue() in realmode
ount. So, consider the following sequence of events: 1. Take MCE 1. Save to queue, increment mce_queue_count, increment mces_to_process, set decrementer to fire. 2. Decrementer fires. mce_run_late_handlers is called. 3. mces_to_process = 1, so we call machine_check_log_err(), which prints (on pseries) the info for MCE 1. 4. Take MCE 2. This is saved to the queue, mce_queue_count is incremented, mces_to_process is incremented, and the decrementer is armed again. 5. We then leave the MCE interrupt context and return to the decrementer handling context. The next thing we do is we call m_c_e_process_queued_event(), which clears the entire queue (that is, MCEs 1 and 2): while (local_paca->mce_info->mce_queue_count > 0) { index = local_paca->mce_info->mce_queue_count - 1; evt = _paca->mce_info->mce_event_queue[index]; if (evt->error_type == MCE_ERROR_TYPE_UE && evt->u.ue_error.ignore_event) { local_paca->mce_info->mce_queue_count--; continue; } machine_check_print_event_info(evt, false, false); local_paca->mce_info->mce_queue_count--; } 6. We finish mce_run_late_handlers() and decrement mces_to_process, so it's now 1. 7. The decrementer fires again, mces_to_process is 1, so we start processing again. 8. We call machine_check_log_err again, it will now call the FWNMI code again and possibly print error 2. 9. process_queued_event will be called again but mce_queue_count will be 0 so it it will bail out early. I _think_ the worst that can happen - at least so long as pseries is the only implementaion of machine_check_log_err - is that we will handle MCE 2 before we query the firmware about it. That's probably benign, but I am still concerned with the overall interaction around nested interrupts. The only problem we have here is overwriting mce_data_buf in case of nested mce, and about "handle MCE 2 before we query the firmware about it" It is not possible, isn't it? 
Assume we take MCE 2 while we are in the middle of mce_run_late_handlers(), before the MCE handler relinquishes the CPU to timer handler, we will have everything in place, right? or am I missing something obvious. void machine_check_print_event_info(struct machine_check_event *evt, bool user_mode, bool in_guest) { diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 934d8ae66cc6..2dc09d75d77c 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -597,6 +597,9 @@ DEFINE_INTERRUPT_HANDLER_ASYNC(timer_interrupt) irq_work_run(); } +#ifdef CONFIG_PPC_BOOK3S_64 + mce_run_late_handlers(); +#endif So we're now branching to a function in a different file and doing an atomic read in every timer interrupt. Is this a hot path? Is there any speed implication to doing this? Nick has suggested me to use test_irq_work_pending() and I will remove the atomic read, with v2 we may not have any serious time implications. now = get_tb(); if (now >= *next_tb) { *next_tb = ~(u64)0; @@ -729,40 +724,16 @@ static int mce_handle_error(struct pt_regs *regs, struct rtas_error_log *errp) error_type = mce_log->error_type; disposition = mce_handle_err_realmode(disposition, error_type); - - /* -* Enable translation as we will be accessing per-cpu variables -* in save_mce_event() which may fall outside RMO region, also -* leave it enabled because subsequently we will be queuing work -* to workqueues where again per-cpu variables accessed, besides -* fwnmi_release_errinfo() crashes when called in realmode on -* pseries. -* Note: All the realmode handling like flushing SLB entries for -* SLB multihit is done by now. -*/ out: - msr = mfmsr(); - mtmsr(msr | MSR_IR | MSR_DR); - disposition = mce_handle_err_virtmode(regs, errp, mce_log, disposition); Now you are not in virtual mode/translations on when you are calling mce_handle_err_virtmode(). From the name, I thought that mce_handle_err_virtmode() would assume that you are in virtual mode? 
Does the function assume that? If so is it safe to call it in real mode? If not, should we rename it as part of this patch? patch 2/2, refactors this. - - /* -* Queue irq work to log this rtas event later. -* irq_work_queue uses per-cpu variables, so do this in virt -* mode as well. -*/ - irq_work_queue(_errlog_process_work); - - mtmsr(msr); - return disposition; } Thanks for the review :) . Ganesh
Re: [PATCH v3 1/3] powerpc/pseries: Parse control memory access error
On 9/6/21 14:13, Ganesh Goudar wrote: Add support to parse and log control memory access errors for pseries. These changes are made according to PAPR v2.11 10.3.2.2.12. Signed-off-by: Ganesh Goudar --- v3: Modify the commit log to mention the document according to which changes are made. Define and use a macro to check if the effective address is provided. v2: No changes. --- arch/powerpc/platforms/pseries/ras.c | 36 1 file changed, 32 insertions(+), 4 deletions(-) Hi mpe, Any comments on this patch series?
[PATCH 2/2] pseries/mce: Refactor the pseries mce handling code
Now that we are no longer switching on the mmu in realmode mce handler, Revert the commit 4ff753feab02("powerpc/pseries: Avoid using addr_to_pfn in real mode") partially, which introduced functions mce_handle_err_virtmode/realmode() to separate mce handler code which needed translation to enabled. Signed-off-by: Ganesh Goudar --- arch/powerpc/platforms/pseries/ras.c | 122 +++ 1 file changed, 49 insertions(+), 73 deletions(-) diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c index 8613f9cc5798..62e1519b8355 100644 --- a/arch/powerpc/platforms/pseries/ras.c +++ b/arch/powerpc/platforms/pseries/ras.c @@ -511,58 +511,17 @@ int pSeries_system_reset_exception(struct pt_regs *regs) return 0; /* need to perform reset */ } -static int mce_handle_err_realmode(int disposition, u8 error_type) -{ -#ifdef CONFIG_PPC_BOOK3S_64 - if (disposition == RTAS_DISP_NOT_RECOVERED) { - switch (error_type) { - caseMC_ERROR_TYPE_ERAT: - flush_erat(); - disposition = RTAS_DISP_FULLY_RECOVERED; - break; - caseMC_ERROR_TYPE_SLB: - /* -* Store the old slb content in paca before flushing. -* Print this when we go to virtual mode. -* There are chances that we may hit MCE again if there -* is a parity error on the SLB entry we trying to read -* for saving. Hence limit the slb saving to single -* level of recursion. 
-*/ - if (local_paca->in_mce == 1) - slb_save_contents(local_paca->mce_faulty_slbs); - flush_and_reload_slb(); - disposition = RTAS_DISP_FULLY_RECOVERED; - break; - default: - break; - } - } else if (disposition == RTAS_DISP_LIMITED_RECOVERY) { - /* Platform corrected itself but could be degraded */ - pr_err("MCE: limited recovery, system may be degraded\n"); - disposition = RTAS_DISP_FULLY_RECOVERED; - } -#endif - return disposition; -} - -static int mce_handle_err_virtmode(struct pt_regs *regs, - struct rtas_error_log *errp, - struct pseries_mc_errorlog *mce_log, - int disposition) +static int mce_handle_error(struct pt_regs *regs, struct rtas_error_log *errp) { struct mce_error_info mce_err = { 0 }; + unsigned long eaddr = 0, paddr = 0; + struct pseries_errorlog *pseries_log; + struct pseries_mc_errorlog *mce_log; + int disposition = rtas_error_disposition(errp); int initiator = rtas_error_initiator(errp); int severity = rtas_error_severity(errp); - unsigned long eaddr = 0, paddr = 0; u8 error_type, err_sub_type; - if (!mce_log) - goto out; - - error_type = mce_log->error_type; - err_sub_type = rtas_mc_error_sub_type(mce_log); - if (initiator == RTAS_INITIATOR_UNKNOWN) mce_err.initiator = MCE_INITIATOR_UNKNOWN; else if (initiator == RTAS_INITIATOR_CPU) @@ -588,6 +547,8 @@ static int mce_handle_err_virtmode(struct pt_regs *regs, mce_err.severity = MCE_SEV_SEVERE; else if (severity == RTAS_SEVERITY_ERROR) mce_err.severity = MCE_SEV_SEVERE; + else if (severity == RTAS_SEVERITY_FATAL) + mce_err.severity = MCE_SEV_FATAL; else mce_err.severity = MCE_SEV_FATAL; @@ -599,7 +560,18 @@ static int mce_handle_err_virtmode(struct pt_regs *regs, mce_err.error_type = MCE_ERROR_TYPE_UNKNOWN; mce_err.error_class = MCE_ECLASS_UNKNOWN; - switch (error_type) { + if (!rtas_error_extended(errp)) + goto out; + + pseries_log = get_pseries_errorlog(errp, PSERIES_ELOG_SECT_ID_MCE); + if (!pseries_log) + goto out; + + mce_log = (struct pseries_mc_errorlog *)pseries_log->data; + error_type 
= mce_log->error_type; + err_sub_type = rtas_mc_error_sub_type(mce_log); + + switch (mce_log->error_type) { case MC_ERROR_TYPE_UE: mce_err.error_type = MCE_ERROR_TYPE_UE; mce_common_process_ue(regs, _err); @@ -692,41 +664,45 @@ static int mce_handle_err_virtmode(struct pt_regs *regs, mce_err.error_type = MCE_ERROR_TYPE_DCACHE; break; case MC_ERROR_TYPE_I_CACHE: - mce_err.error_type = MCE_ERROR_TYPE_ICACHE; + mce_err.error_type = MCE_ERROR_TYPE_DCACHE; break; case
[PATCH 1/2] powerpc/mce: Avoid using irq_work_queue() in realmode
In realmode mce handler we use irq_work_queue() to defer the processing of mce events, irq_work_queue() can only be called when translation is enabled because it touches memory outside RMA, hence we enable translation before calling irq_work_queue and disable on return, though it is not safe to do in realmode. To avoid this, program the decrementer and call the event processing functions from timer handler. Signed-off-by: Ganesh Goudar --- arch/powerpc/include/asm/machdep.h | 2 + arch/powerpc/include/asm/mce.h | 2 + arch/powerpc/include/asm/paca.h | 1 + arch/powerpc/kernel/mce.c| 51 +++- arch/powerpc/kernel/time.c | 3 ++ arch/powerpc/platforms/pseries/pseries.h | 1 + arch/powerpc/platforms/pseries/ras.c | 31 +- arch/powerpc/platforms/pseries/setup.c | 1 + 8 files changed, 34 insertions(+), 58 deletions(-) diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h index 764f2732a821..c89cc03c0f97 100644 --- a/arch/powerpc/include/asm/machdep.h +++ b/arch/powerpc/include/asm/machdep.h @@ -103,6 +103,8 @@ struct machdep_calls { /* Called during machine check exception to retrive fixup address. */ bool(*mce_check_early_recovery)(struct pt_regs *regs); + void(*machine_check_log_err)(void); + /* Motherboard/chipset features. This is a kind of general purpose * hook used to control some machine specific features (like reset * lines, chip power control, etc...). 
diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h index 331d944280b8..187810f13669 100644 --- a/arch/powerpc/include/asm/mce.h +++ b/arch/powerpc/include/asm/mce.h @@ -235,8 +235,10 @@ extern void machine_check_print_event_info(struct machine_check_event *evt, unsigned long addr_to_pfn(struct pt_regs *regs, unsigned long addr); extern void mce_common_process_ue(struct pt_regs *regs, struct mce_error_info *mce_err); +extern void machine_check_raise_dec_intr(void); int mce_register_notifier(struct notifier_block *nb); int mce_unregister_notifier(struct notifier_block *nb); +void mce_run_late_handlers(void); #ifdef CONFIG_PPC_BOOK3S_64 void flush_and_reload_slb(void); void flush_erat(void); diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h index dc05a862e72a..f49180f8c9be 100644 --- a/arch/powerpc/include/asm/paca.h +++ b/arch/powerpc/include/asm/paca.h @@ -280,6 +280,7 @@ struct paca_struct { #endif #ifdef CONFIG_PPC_BOOK3S_64 struct mce_info *mce_info; + atomic_t mces_to_process; #endif /* CONFIG_PPC_BOOK3S_64 */ } cacheline_aligned; diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c index fd829f7f25a4..45baa062ebc0 100644 --- a/arch/powerpc/kernel/mce.c +++ b/arch/powerpc/kernel/mce.c @@ -28,19 +28,9 @@ #include "setup.h" -static void machine_check_process_queued_event(struct irq_work *work); -static void machine_check_ue_irq_work(struct irq_work *work); static void machine_check_ue_event(struct machine_check_event *evt); static void machine_process_ue_event(struct work_struct *work); -static struct irq_work mce_event_process_work = { -.func = machine_check_process_queued_event, -}; - -static struct irq_work mce_ue_event_irq_work = { - .func = machine_check_ue_irq_work, -}; - static DECLARE_WORK(mce_ue_event_work, machine_process_ue_event); static BLOCKING_NOTIFIER_HEAD(mce_notifier_list); @@ -89,6 +79,12 @@ static void mce_set_error_info(struct machine_check_event *mce, } } +/* Raise 
decrementer interrupt */ +void machine_check_raise_dec_intr(void) +{ + set_dec(1); +} + /* * Decode and save high level MCE information into per cpu buffer which * is an array of machine_check_event structure. @@ -135,6 +131,8 @@ void save_mce_event(struct pt_regs *regs, long handled, if (mce->error_type == MCE_ERROR_TYPE_UE) mce->u.ue_error.ignore_event = mce_err->ignore_event; + atomic_inc(_paca->mces_to_process); + if (!addr) return; @@ -217,7 +215,7 @@ void release_mce_event(void) get_mce_event(NULL, true); } -static void machine_check_ue_irq_work(struct irq_work *work) +static void machine_check_ue_work(void) { schedule_work(_ue_event_work); } @@ -239,7 +237,7 @@ static void machine_check_ue_event(struct machine_check_event *evt) evt, sizeof(*evt)); /* Queue work to process this event later. */ - irq_work_queue(_ue_event_irq_work); + machine_check_raise_dec_intr(); } /* @@ -249,7 +247,6 @@ void machine_check_queue_event(void) { int index; struct machine_check_event evt; - unsigned long msr; if (!get_mce_event(, MCE_EVENT_RELEASE)) return; @@ -263,20 +260,7 @@ void machine_check_queue_even
Re: [PATCH v1] powerpc/64s: Fix unrecoverable MCE crash
On 9/22/21 7:32 AM, Nicholas Piggin wrote: The machine check handler is not considered NMI on 64s. The early handler is the true NMI handler, and then it schedules the machine_check_exception handler to run when interrupts are enabled. This works fine except the case of an unrecoverable MCE, where the true NMI is taken when MSR[RI] is clear, it can not recover to schedule the next handler, so it calls machine_check_exception directly so something might be done about it. Calling an async handler from NMI context can result in irq state and other things getting corrupted. This can also trigger the BUG at arch/powerpc/include/asm/interrupt.h:168. Fix this by just making the 64s machine_check_exception handler an NMI like it is on other subarchs. Signed-off-by: Nicholas Piggin --- Hi Nick, If I inject control memory access error in LPAR on top of this patch https://patchwork.ozlabs.org/project/linuxppc-dev/patch/20210906084303.183921-1-ganes...@linux.ibm.com/ I see the following warning trace WARNING: CPU: 130 PID: 7122 at arch/powerpc/include/asm/interrupt.h:319 machine_check_exception+0x310/0x340 Modules linked in: CPU: 130 PID: 7122 Comm: inj_access_err Kdump: loaded Tainted: G M 5.15.0-rc2-cma-00054-g4a0d59fbaf71-dirty #22 NIP: c002f980 LR: c002f7e8 CTR: c0a31860 REGS: c039fe51bb20 TRAP: 0700 Tainted: G M (5.15.0-rc2-cma-00054-g4a0d59fbaf71-dirty) MSR: 80029033 CR: 88000222 XER: 2004 CFAR: c002f844 IRQMASK: 0 GPR00: c002f798 c039fe51bdc0 c20d 0001 GPR04: 4002 4000 19af GPR08: 0077e5ad c077ee16c700 0080 GPR12: 88000222 c077ee16c700 GPR16: GPR20: GPR24: c20fecd8 GPR28: 0001 0001 c039fe51be80 NIP [c002f980] machine_check_exception+0x310/0x340 LR [c002f7e8] machine_check_exception+0x178/0x340 Call Trace: [c039fe51bdc0] [c002f798] machine_check_exception+0x128/0x340 (unreliable) [c039fe51be10] [c00086ec] machine_check_common+0x1ac/0x1b0 --- interrupt: 200 at 0x1968 NIP: 1968 LR: 1958 CTR: REGS: c039fe51be80 TRAP: 0200 Tainted: G M 
(5.15.0-rc2-cma-00054-g4a0d59fbaf71-dirty) MSR: 82a0f033 CR: 22000824 XER: CFAR: 021c DAR: 7fffb00c DSISR: 0208 IRQMASK: 0 GPR00: 22000824 7fffc9647770 10027f00 7fffb00c GPR04: GPR08: 7fffb00c 0001 GPR12: 7fffb015a330 GPR16: GPR20: GPR24: 185c GPR28: 7fffc9647d18 0001 19b0 7fffc9647770 NIP [1968] 0x1968 LR [1958] 0x1958 --- interrupt: 200
Re: [PATCH] powerpc/mce: check if event info is valid
On 8/6/21 6:53 PM, Ganesh Goudar wrote: Check if the event info is valid before printing the event information. When a fwnmi-enabled nested kvm guest hits a machine check exception, L0 and L2 would generate machine check event info, but L1 would not generate any machine check event info as it won't go through the 0x200 vector, and it prints some unwanted message. To fix this, the 'in_use' variable in the machine check event info, which is no longer in use, is renamed to 'valid', and the event information is checked for validity before it is logged. Without this patch L1 would print the following message for exceptions encountered in L2, as the event structure will be empty in L1: "Machine Check Exception, Unknown event version 0". Signed-off-by: Ganesh Goudar --- Hi mpe, Any comments on this patch?
Re: [PATCH v2] powerpc/mce: Fix access error in mce handler
On 9/17/21 12:09 PM, Daniel Axtens wrote: Hi Ganesh, We queue an irq work for deferred processing of mce event in realmode mce handler, where translation is disabled. Queuing of the work may result in accessing memory outside RMO region, such access needs the translation to be enabled for an LPAR running with hash mmu else the kernel crashes. After enabling translation in mce_handle_error() we used to leave it enabled to avoid crashing here, but now with the commit 74c3354bc1d89 ("powerpc/pseries/mce: restore msr before returning from handler") we are restoring the MSR to disable translation. Hence to fix this enable the translation before queuing the work. [snip] Fixes: 74c3354bc1d89 ("powerpc/pseries/mce: restore msr before returning from handler") That patch changes arch/powerpc/powerpc/platforms/pseries/ras.c just below this comment: /* * Enable translation as we will be accessing per-cpu variables * in save_mce_event() which may fall outside RMO region, also * leave it enabled because subsequently we will be queuing work * to workqueues where again per-cpu variables accessed, besides * fwnmi_release_errinfo() crashes when called in realmode on * pseries. * Note: All the realmode handling like flushing SLB entries for * SLB multihit is done by now. */ That suggests per-cpu variables need protection. In your patch, you enable translations just around irq_work_queue: The comment is bit old, most of it doesn't make any sense now, yes per-cpu variables cannot be accessed in realmode, but with commit 923b3cf00b3f ("powerpc/mce: Remove per cpu variables from MCE handlers") we moved all of them to paca. + /* Queue irq work to process this event later. Before +* queuing the work enable translation for non radix LPAR, +* as irq_work_queue may try to access memory outside RMO +* region. 
+*/ + if (!radix_enabled() && firmware_has_feature(FW_FEATURE_LPAR)) { + msr = mfmsr(); + mtmsr(msr | MSR_IR | MSR_DR); + irq_work_queue(&mce_event_process_work); + mtmsr(msr); + } else { + irq_work_queue(&mce_event_process_work); + } However, just before that in the function, there are a few things that access per-cpu variables via the local_paca, e.g.: memcpy(&local_paca->mce_info->mce_event_queue[index], &evt, sizeof(evt)); Do we need to widen the window where translations are enabled in order to protect accesses to local_paca? paca will be within Real Mode Area, so it can be accessed with translate off.
[PATCH v2] powerpc/mce: Fix access error in mce handler
We queue an irq work for deferred processing of mce event in realmode mce handler, where translation is disabled. Queuing of the work may result in accessing memory outside RMO region, such access needs the translation to be enabled for an LPAR running with hash mmu else the kernel crashes. After enabling translation in mce_handle_error() we used to leave it enabled to avoid crashing here, but now with the commit 74c3354bc1d89 ("powerpc/pseries/mce: restore msr before returning from handler") we are restoring the MSR to disable translation. Hence to fix this enable the translation before queuing the work. Without this change following trace is seen on injecting SLB multihit in an LPAR running with hash mmu. Oops: Kernel access of bad area, sig: 11 [#1] LE PAGE_SIZE=64K MMU=Hash SMP NR_CPUS=2048 NUMA pSeries CPU: 5 PID: 1883 Comm: insmod Tainted: GOE 5.14.0-mce+ #137 NIP: c0735d60 LR: c0318640 CTR: REGS: c0001ebff9a0 TRAP: 0300 Tainted: G OE (5.14.0-mce+) MSR: 80001003 CR: 28008228 XER: 0001 CFAR: c031863c DAR: c0027fa8fe08 DSISR: 4000 IRQMASK: 0 GPR00: c03186d0 c0001ebffc40 c1b0df00 c16337e8 GPR04: c16337e8 c0027fa8fe08 0023 c16337f0 GPR08: 0023 c12ffe08 c00801460240 GPR12: c0001ec9a900 c0002ac4bd00 GPR16: 05a0 c008006b c008006b05a0 c0ff3068 GPR20: c0002ac4bbc0 0001 c0002ac4bbc0 c00801490298 GPR24: c00801490108 c1636198 c00801470090 c00801470058 GPR28: 0510 c0080100 c0080819 0019 NIP [c0735d60] llist_add_batch+0x0/0x40 LR [c0318640] __irq_work_queue_local+0x70/0xc0 Call Trace: [c0001ebffc40] [c0001ebffc0c] 0xc0001ebffc0c (unreliable) [c0001ebffc60] [c03186d0] irq_work_queue+0x40/0x70 [c0001ebffc80] [c004425c] machine_check_queue_event+0xbc/0xd0 [c0001ebffcf0] [c000838c] machine_check_early_common+0x16c/0x1f4 Fixes: 74c3354bc1d89 ("powerpc/pseries/mce: restore msr before returning from handler") Signed-off-by: Ganesh Goudar --- v2: Change in commit message. 
--- arch/powerpc/kernel/mce.c | 16 ++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c index 47a683cd00d2..9d1e39d42e3e 100644 --- a/arch/powerpc/kernel/mce.c +++ b/arch/powerpc/kernel/mce.c @@ -249,6 +249,7 @@ void machine_check_queue_event(void) { int index; struct machine_check_event evt; + unsigned long msr; if (!get_mce_event(, MCE_EVENT_RELEASE)) return; @@ -262,8 +263,19 @@ void machine_check_queue_event(void) memcpy(_paca->mce_info->mce_event_queue[index], , sizeof(evt)); - /* Queue irq work to process this event later. */ - irq_work_queue(_event_process_work); + /* Queue irq work to process this event later. Before +* queuing the work enable translation for non radix LPAR, +* as irq_work_queue may try to access memory outside RMO +* region. +*/ + if (!radix_enabled() && firmware_has_feature(FW_FEATURE_LPAR)) { + msr = mfmsr(); + mtmsr(msr | MSR_IR | MSR_DR); + irq_work_queue(_event_process_work); + mtmsr(msr); + } else { + irq_work_queue(_event_process_work); + } } void mce_common_process_ue(struct pt_regs *regs, -- 2.31.1
Re: [PATCH] powerpc/mce: Fix access error in mce handler
On 9/8/21 11:10 AM, Michael Ellerman wrote: Ganesh writes: On 9/6/21 6:03 PM, Michael Ellerman wrote: Ganesh Goudar writes Oops: Kernel access of bad area, sig: 11 [#1] LE PAGE_SIZE=64K MMU=Hash SMP NR_CPUS=2048 NUMA pSeries CPU: 5 PID: 1883 Comm: insmod Tainted: GOE 5.14.0-mce+ #137 NIP: c0735d60 LR: c0318640 CTR: REGS: c0001ebff9a0 TRAP: 0300 Tainted: G OE (5.14.0-mce+) MSR: 80001003 CR: 28008228 XER: 0001 CFAR: c031863c DAR: c0027fa8fe08 DSISR: 4000 IRQMASK: 0 GPR00: c03186d0 c0001ebffc40 c1b0df00 c16337e8 GPR04: c16337e8 c0027fa8fe08 0023 c16337f0 GPR08: 0023 c12ffe08 c00801460240 GPR12: c0001ec9a900 c0002ac4bd00 GPR16: 05a0 c008006b c008006b05a0 c0ff3068 GPR20: c0002ac4bbc0 0001 c0002ac4bbc0 c00801490298 GPR24: c00801490108 c1636198 c00801470090 c00801470058 GPR28: 0510 c0080100 c0080819 0019 NIP [c0735d60] llist_add_batch+0x0/0x40 LR [c0318640] __irq_work_queue_local+0x70/0xc0 Call Trace: [c0001ebffc40] [c0001ebffc0c] 0xc0001ebffc0c (unreliable) [c0001ebffc60] [c03186d0] irq_work_queue+0x40/0x70 [c0001ebffc80] [c004425c] machine_check_queue_event+0xbc/0xd0 [c0001ebffcf0] [c000838c] machine_check_early_common+0x16c/0x1f4 Fixes: 74c3354bc1d89 ("powerpc/pseries/mce: restore msr before returning from handler") Please explain in more detail why that commit caused this breakage. After enabling translation in mce_handle_error() we used to leave it enabled to avoid crashing here, but now with this commit we are restoring the MSR to disable translation. Are you sure we left the MMU enabled to avoid crashing there, or we just left it enabled by accident? No, I think we left it enabled intentionally, I mentioned about leaving it enabled in my comment and commit message of a95a0a1654 "powerpc/pseries: Fix MCE handling on pseries". But yeah, previously the MMU was enabled when we got here whereas now it's not, because of that change. Missed to mention it in commit log, I will add it. Thanks. 
diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c index 47a683cd00d2..9d1e39d42e3e 100644 --- a/arch/powerpc/kernel/mce.c +++ b/arch/powerpc/kernel/mce.c @@ -249,6 +249,7 @@ void machine_check_queue_event(void) { int index; struct machine_check_event evt; + unsigned long msr; if (!get_mce_event(, MCE_EVENT_RELEASE)) return; @@ -262,8 +263,19 @@ void machine_check_queue_event(void) memcpy(_paca->mce_info->mce_event_queue[index], , sizeof(evt)); - /* Queue irq work to process this event later. */ - irq_work_queue(_event_process_work); + /* Queue irq work to process this event later. Before +* queuing the work enable translation for non radix LPAR, +* as irq_work_queue may try to access memory outside RMO +* region. +*/ + if (!radix_enabled() && firmware_has_feature(FW_FEATURE_LPAR)) { + msr = mfmsr(); + mtmsr(msr | MSR_IR | MSR_DR); + irq_work_queue(_event_process_work); + mtmsr(msr); + } else { + irq_work_queue(_event_process_work); + } } We already went to virtual mode and queued (different) irq work in arch/powerpc/platforms/pseries/ras.c:mce_handle_error() We also called save_mce_event() which also might have queued irq work, via machine_check_ue_event(). So it really feels like something about the design is wrong if we have to go to virtual mode again and queue more irq work here. I guess we can probably merge this as a backportable fix, doing anything else would be a bigger change. I agree. Looking at ras.c there's the comment: * Enable translation as we will be accessing per-cpu variables * in save_mce_event() which may fall outside RMO region, also But AFAICS it's only irq_work_queue() that touches anything percpu? Yeah, we left the comment unchanged after doing some modifications around it, It needs to be updated, ill send a separate patch for it. Thanks. I see some other comments that look out of date, ie. the one above machine_check_process_queued_event() mentions syscall exit, which is no longer true. ill take care of it. 
There are also comments in pseries/ras.c about fwnmi_release_errinfo() crashing in real mode, but we call it in real mode now so that must be fixed? Yes, it is fixed now. So maybe we should just not be using irq_work_queue(). It's a pretty thin wrapper around set_dec(1), perhaps we just need to hand-roll some real-mode friendly way of doing that. You mean, have a separate queue and run the work from the timer handler?
Re: [PATCH] powerpc/mce: Fix access error in mce handler
On 9/6/21 6:03 PM, Michael Ellerman wrote: Ganesh Goudar writes: We queue an irq work for deferred processing of mce event in realmode mce handler, where translation is disabled. Queuing of the work may result in accessing memory outside RMO region, such access needs the translation to be enabled for an LPAR running with hash mmu else the kernel crashes. So enable the translation before queuing the work. Without this change following trace is seen on injecting machine check error in an LPAR running with hash mmu. What type of error are you injecting? SLB multihit in kernel mode. Oops: Kernel access of bad area, sig: 11 [#1] LE PAGE_SIZE=64K MMU=Hash SMP NR_CPUS=2048 NUMA pSeries CPU: 5 PID: 1883 Comm: insmod Tainted: GOE 5.14.0-mce+ #137 NIP: c0735d60 LR: c0318640 CTR: REGS: c0001ebff9a0 TRAP: 0300 Tainted: G OE (5.14.0-mce+) MSR: 80001003 CR: 28008228 XER: 0001 CFAR: c031863c DAR: c0027fa8fe08 DSISR: 4000 IRQMASK: 0 GPR00: c03186d0 c0001ebffc40 c1b0df00 c16337e8 GPR04: c16337e8 c0027fa8fe08 0023 c16337f0 GPR08: 0023 c12ffe08 c00801460240 GPR12: c0001ec9a900 c0002ac4bd00 GPR16: 05a0 c008006b c008006b05a0 c0ff3068 GPR20: c0002ac4bbc0 0001 c0002ac4bbc0 c00801490298 GPR24: c00801490108 c1636198 c00801470090 c00801470058 GPR28: 0510 c0080100 c0080819 0019 NIP [c0735d60] llist_add_batch+0x0/0x40 LR [c0318640] __irq_work_queue_local+0x70/0xc0 Call Trace: [c0001ebffc40] [c0001ebffc0c] 0xc0001ebffc0c (unreliable) [c0001ebffc60] [c03186d0] irq_work_queue+0x40/0x70 [c0001ebffc80] [c004425c] machine_check_queue_event+0xbc/0xd0 [c0001ebffcf0] [c000838c] machine_check_early_common+0x16c/0x1f4 Fixes: 74c3354bc1d89 ("powerpc/pseries/mce: restore msr before returning from handler") Please explain in more detail why that commit caused this breakage. After enabling translation in mce_handle_error() we used to leave it enabled to avoid crashing here, but now with this commit we are restoring the MSR to disable translation. Missed to mention it in commit log, I will add it. 
diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c index 47a683cd00d2..9d1e39d42e3e 100644 --- a/arch/powerpc/kernel/mce.c +++ b/arch/powerpc/kernel/mce.c @@ -249,6 +249,7 @@ void machine_check_queue_event(void) { int index; struct machine_check_event evt; + unsigned long msr; if (!get_mce_event(, MCE_EVENT_RELEASE)) return; @@ -262,8 +263,19 @@ void machine_check_queue_event(void) memcpy(_paca->mce_info->mce_event_queue[index], , sizeof(evt)); - /* Queue irq work to process this event later. */ - irq_work_queue(_event_process_work); + /* Queue irq work to process this event later. Before +* queuing the work enable translation for non radix LPAR, +* as irq_work_queue may try to access memory outside RMO +* region. +*/ + if (!radix_enabled() && firmware_has_feature(FW_FEATURE_LPAR)) { + msr = mfmsr(); + mtmsr(msr | MSR_IR | MSR_DR); + irq_work_queue(_event_process_work); + mtmsr(msr); + } else { + irq_work_queue(_event_process_work); + } } We already went to virtual mode and queued (different) irq work in arch/powerpc/platforms/pseries/ras.c:mce_handle_error() We also called save_mce_event() which also might have queued irq work, via machine_check_ue_event(). So it really feels like something about the design is wrong if we have to go to virtual mode again and queue more irq work here. I guess we can probably merge this as a backportable fix, doing anything else would be a bigger change. I agree. Looking at ras.c there's the comment: * Enable translation as we will be accessing per-cpu variables * in save_mce_event() which may fall outside RMO region, also But AFAICS it's only irq_work_queue() that touches anything percpu? Yeah, we left the comment unchanged after doing some modifications around it, It needs to be updated, ill send a separate patch for it. So maybe we should just not be using irq_work_queue(). It's a pretty thin wrapper around set_dec(1), perhaps we just need to hand-roll some real-mode friendly way of doing that. 
You mean, have a separate queue and run the work from the timer handler? cheers
[PATCH v3 3/3] powerpc/mce: Modify the real address error logging messages
To avoid ambiguity, modify the strings in real address error logging messages to "foreign/control memory" from "foreign", since the error descriptions in P9 user manual and P10 user manual are different for same type of errors. P9 User Manual for MCE: DSISR:59 Host real address to foreign space during translation. DSISR:60 Host real address to foreign space on a load or store access. P10 User Manual for MCE: DSISR:59 D-side tablewalk used a host real address in the control memory address range. DSISR:60 D-side operand access to control memory address space. Signed-off-by: Ganesh Goudar --- v3: No changes. v2: No changes. --- arch/powerpc/kernel/mce.c | 8 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c index 9d1e39d42e3e..5baf69503349 100644 --- a/arch/powerpc/kernel/mce.c +++ b/arch/powerpc/kernel/mce.c @@ -400,14 +400,14 @@ void machine_check_print_event_info(struct machine_check_event *evt, static const char *mc_ra_types[] = { "Indeterminate", "Instruction fetch (bad)", - "Instruction fetch (foreign)", + "Instruction fetch (foreign/control memory)", "Page table walk ifetch (bad)", - "Page table walk ifetch (foreign)", + "Page table walk ifetch (foreign/control memory)", "Load (bad)", "Store (bad)", "Page table walk Load/Store (bad)", - "Page table walk Load/Store (foreign)", - "Load/Store (foreign)", + "Page table walk Load/Store (foreign/control memory)", + "Load/Store (foreign/control memory)", }; static const char *mc_link_types[] = { "Indeterminate", -- 2.31.1
[PATCH v3 2/3] selftests/powerpc: Add test for real address error handling
Add test for real address or control memory address access error handling, using NX-GZIP engine. The error is injected by accessing the control memory address using illegal instruction, on successful handling the process attempting to access control memory address using illegal instruction receives SIGBUS. Signed-off-by: Ganesh Goudar --- v3: Avoid using shell script to inject error. v2: Fix build error. --- tools/testing/selftests/powerpc/Makefile | 3 +- tools/testing/selftests/powerpc/mce/Makefile | 7 ++ .../selftests/powerpc/mce/inject-ra-err.c | 65 +++ tools/testing/selftests/powerpc/mce/vas-api.h | 1 + 4 files changed, 75 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/powerpc/mce/Makefile create mode 100644 tools/testing/selftests/powerpc/mce/inject-ra-err.c create mode 12 tools/testing/selftests/powerpc/mce/vas-api.h diff --git a/tools/testing/selftests/powerpc/Makefile b/tools/testing/selftests/powerpc/Makefile index 0830e63818c1..4830372d7416 100644 --- a/tools/testing/selftests/powerpc/Makefile +++ b/tools/testing/selftests/powerpc/Makefile @@ -31,7 +31,8 @@ SUB_DIRS = alignment \ vphn \ math \ ptrace \ - security + security \ + mce endif diff --git a/tools/testing/selftests/powerpc/mce/Makefile b/tools/testing/selftests/powerpc/mce/Makefile new file mode 100644 index ..2424513982d9 --- /dev/null +++ b/tools/testing/selftests/powerpc/mce/Makefile @@ -0,0 +1,7 @@ +#SPDX-License-Identifier: GPL-2.0-or-later + +TEST_GEN_PROGS := inject-ra-err + +include ../../lib.mk + +$(TEST_GEN_PROGS): ../harness.c diff --git a/tools/testing/selftests/powerpc/mce/inject-ra-err.c b/tools/testing/selftests/powerpc/mce/inject-ra-err.c new file mode 100644 index ..94323c34d9a6 --- /dev/null +++ b/tools/testing/selftests/powerpc/mce/inject-ra-err.c @@ -0,0 +1,65 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "vas-api.h" +#include "utils.h" 
+ +static bool faulted; + +static void sigbus_handler(int n, siginfo_t *info, void *ctxt_v) +{ + ucontext_t *ctxt = (ucontext_t *)ctxt_v; + struct pt_regs *regs = ctxt->uc_mcontext.regs; + + faulted = true; + regs->nip += 4; +} + +static int test_ra_error(void) +{ + struct vas_tx_win_open_attr attr; + int fd, *paste_addr; + char *devname = "/dev/crypto/nx-gzip"; + struct sigaction act = { + .sa_sigaction = sigbus_handler, + .sa_flags = SA_SIGINFO, + }; + + memset(, 0, sizeof(attr)); + attr.version = 1; + attr.vas_id = 0; + + SKIP_IF(access(devname, F_OK)); + + fd = open(devname, O_RDWR); + FAIL_IF(fd < 0); + FAIL_IF(ioctl(fd, VAS_TX_WIN_OPEN, ) < 0); + FAIL_IF(sigaction(SIGBUS, , NULL) != 0); + + paste_addr = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0ULL); + + /* The following assignment triggers exception */ + mb(); + *paste_addr = 1; + mb(); + + FAIL_IF(!faulted); + + return 0; +} + +int main(void) +{ + return test_harness(test_ra_error, "inject-ra-err"); +} + diff --git a/tools/testing/selftests/powerpc/mce/vas-api.h b/tools/testing/selftests/powerpc/mce/vas-api.h new file mode 12 index ..1455c1bcd351 --- /dev/null +++ b/tools/testing/selftests/powerpc/mce/vas-api.h @@ -0,0 +1 @@ +../../../../../arch/powerpc/include/uapi/asm/vas-api.h \ No newline at end of file -- 2.31.1
[PATCH v3 1/3] powerpc/pseries: Parse control memory access error
Add support to parse and log control memory access error for pseries. These changes are made according to PAPR v2.11 10.3.2.2.12. Signed-off-by: Ganesh Goudar --- v3: Modify the commit log to mention the document according to which changes are made. Define and use a macro to check if the effective address is provided. v2: No changes. --- arch/powerpc/platforms/pseries/ras.c | 36 1 file changed, 32 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c index 56092dccfdb8..e62a0ca2611a 100644 --- a/arch/powerpc/platforms/pseries/ras.c +++ b/arch/powerpc/platforms/pseries/ras.c @@ -60,11 +60,17 @@ struct pseries_mc_errorlog { * XX 2: Reserved. *XXX 3: Type of UE error. * -* For error_type != MC_ERROR_TYPE_UE +* For error_type == MC_ERROR_TYPE_SLB/ERAT/TLB * * X 1: Effective address provided. *X 5: Reserved. * XX 2: Type of SLB/ERAT/TLB error. +* +* For error_type == MC_ERROR_TYPE_CTRL_MEM_ACCESS +* +* X 1: Error causing address provided. +*XXX 3: Type of error. +* 4: Reserved. 
*/ u8 sub_err_type; u8 reserved_1[6]; @@ -80,6 +86,7 @@ struct pseries_mc_errorlog { #define MC_ERROR_TYPE_TLB 0x04 #define MC_ERROR_TYPE_D_CACHE 0x05 #define MC_ERROR_TYPE_I_CACHE 0x07 +#define MC_ERROR_TYPE_CTRL_MEM_ACCESS 0x08 /* RTAS pseries MCE error sub types */ #define MC_ERROR_UE_INDETERMINATE 0 @@ -90,6 +97,7 @@ struct pseries_mc_errorlog { #define UE_EFFECTIVE_ADDR_PROVIDED 0x40 #define UE_LOGICAL_ADDR_PROVIDED 0x20 +#define MC_EFFECTIVE_ADDR_PROVIDED 0x80 #define MC_ERROR_SLB_PARITY0 #define MC_ERROR_SLB_MULTIHIT 1 @@ -103,6 +111,9 @@ struct pseries_mc_errorlog { #define MC_ERROR_TLB_MULTIHIT 2 #define MC_ERROR_TLB_INDETERMINATE 3 +#define MC_ERROR_CTRL_MEM_ACCESS_PTABLE_WALK 0 +#define MC_ERROR_CTRL_MEM_ACCESS_OP_ACCESS 1 + static inline u8 rtas_mc_error_sub_type(const struct pseries_mc_errorlog *mlog) { switch (mlog->error_type) { @@ -112,6 +123,8 @@ static inline u8 rtas_mc_error_sub_type(const struct pseries_mc_errorlog *mlog) caseMC_ERROR_TYPE_ERAT: caseMC_ERROR_TYPE_TLB: return (mlog->sub_err_type & 0x03); + caseMC_ERROR_TYPE_CTRL_MEM_ACCESS: + return (mlog->sub_err_type & 0x70) >> 4; default: return 0; } @@ -656,7 +669,7 @@ static int mce_handle_err_virtmode(struct pt_regs *regs, mce_err.u.slb_error_type = MCE_SLB_ERROR_INDETERMINATE; break; } - if (mce_log->sub_err_type & 0x80) + if (mce_log->sub_err_type & MC_EFFECTIVE_ADDR_PROVIDED) eaddr = be64_to_cpu(mce_log->effective_address); break; case MC_ERROR_TYPE_ERAT: @@ -673,7 +686,7 @@ static int mce_handle_err_virtmode(struct pt_regs *regs, mce_err.u.erat_error_type = MCE_ERAT_ERROR_INDETERMINATE; break; } - if (mce_log->sub_err_type & 0x80) + if (mce_log->sub_err_type & MC_EFFECTIVE_ADDR_PROVIDED) eaddr = be64_to_cpu(mce_log->effective_address); break; case MC_ERROR_TYPE_TLB: @@ -690,7 +703,7 @@ static int mce_handle_err_virtmode(struct pt_regs *regs, mce_err.u.tlb_error_type = MCE_TLB_ERROR_INDETERMINATE; break; } - if (mce_log->sub_err_type & 0x80) + if (mce_log->sub_err_type & 
MC_EFFECTIVE_ADDR_PROVIDED) eaddr = be64_to_cpu(mce_log->effective_address); break; case MC_ERROR_TYPE_D_CACHE: @@ -699,6 +712,21 @@ static int mce_handle_err_virtmode(struct pt_regs *regs, case MC_ERROR_TYPE_I_CACHE: mce_err.error_type = MCE_ERROR_TYPE_ICACHE; break; + case MC_ERROR_TYPE_CTRL_MEM_ACCESS: + mce_err.error_type = MCE_ERROR_TYPE_RA; + switch (err_sub_type) { + case MC_ERROR_CTRL_MEM_ACCESS_PTABLE_WALK: + mce_err.u.ra_error_type = + MCE_RA_ERROR_PAGE_TABLE_WALK_LOAD_STORE_FOREIGN; + break; + case MC_ERROR_CTRL_MEM_ACCESS_OP_ACCESS: + mce_err.u.ra_error_type = + MCE_RA_ERROR_LOAD_STORE_FOREIGN; + break; +
[PATCH] powerpc/mce: Fix access error in mce handler
We queue an irq work for deferred processing of mce event in realmode mce handler, where translation is disabled. Queuing of the work may result in accessing memory outside RMO region, such access needs the translation to be enabled for an LPAR running with hash mmu else the kernel crashes. So enable the translation before queuing the work. Without this change following trace is seen on injecting machine check error in an LPAR running with hash mmu. Oops: Kernel access of bad area, sig: 11 [#1] LE PAGE_SIZE=64K MMU=Hash SMP NR_CPUS=2048 NUMA pSeries CPU: 5 PID: 1883 Comm: insmod Tainted: GOE 5.14.0-mce+ #137 NIP: c0735d60 LR: c0318640 CTR: REGS: c0001ebff9a0 TRAP: 0300 Tainted: G OE (5.14.0-mce+) MSR: 80001003 CR: 28008228 XER: 0001 CFAR: c031863c DAR: c0027fa8fe08 DSISR: 4000 IRQMASK: 0 GPR00: c03186d0 c0001ebffc40 c1b0df00 c16337e8 GPR04: c16337e8 c0027fa8fe08 0023 c16337f0 GPR08: 0023 c12ffe08 c00801460240 GPR12: c0001ec9a900 c0002ac4bd00 GPR16: 05a0 c008006b c008006b05a0 c0ff3068 GPR20: c0002ac4bbc0 0001 c0002ac4bbc0 c00801490298 GPR24: c00801490108 c1636198 c00801470090 c00801470058 GPR28: 0510 c0080100 c0080819 0019 NIP [c0735d60] llist_add_batch+0x0/0x40 LR [c0318640] __irq_work_queue_local+0x70/0xc0 Call Trace: [c0001ebffc40] [c0001ebffc0c] 0xc0001ebffc0c (unreliable) [c0001ebffc60] [c03186d0] irq_work_queue+0x40/0x70 [c0001ebffc80] [c004425c] machine_check_queue_event+0xbc/0xd0 [c0001ebffcf0] [c000838c] machine_check_early_common+0x16c/0x1f4 Fixes: 74c3354bc1d89 ("powerpc/pseries/mce: restore msr before returning from handler") Signed-off-by: Ganesh Goudar --- arch/powerpc/kernel/mce.c | 16 ++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c index 47a683cd00d2..9d1e39d42e3e 100644 --- a/arch/powerpc/kernel/mce.c +++ b/arch/powerpc/kernel/mce.c @@ -249,6 +249,7 @@ void machine_check_queue_event(void) { int index; struct machine_check_event evt; + unsigned long msr; if (!get_mce_event(, 
MCE_EVENT_RELEASE)) return; @@ -262,8 +263,19 @@ void machine_check_queue_event(void) memcpy(_paca->mce_info->mce_event_queue[index], , sizeof(evt)); - /* Queue irq work to process this event later. */ - irq_work_queue(_event_process_work); + /* Queue irq work to process this event later. Before +* queuing the work enable translation for non radix LPAR, +* as irq_work_queue may try to access memory outside RMO +* region. +*/ + if (!radix_enabled() && firmware_has_feature(FW_FEATURE_LPAR)) { + msr = mfmsr(); + mtmsr(msr | MSR_IR | MSR_DR); + irq_work_queue(_event_process_work); + mtmsr(msr); + } else { + irq_work_queue(_event_process_work); + } } void mce_common_process_ue(struct pt_regs *regs, -- 2.31.1
Re: [PATCH v2 2/3] selftests/powerpc: Add test for real address error handling
On 8/26/21 8:57 AM, Michael Ellerman wrote: Ganesh writes: On 8/24/21 6:18 PM, Michael Ellerman wrote: Ganesh Goudar writes: Add test for real address or control memory address access error handling, using NX-GZIP engine. The error is injected by accessing the control memory address using illegal instruction, on successful handling the process attempting to access control memory address using illegal instruction receives SIGBUS. ... diff --git a/tools/testing/selftests/powerpc/mce/inject-ra-err.sh b/tools/testing/selftests/powerpc/mce/inject-ra-err.sh new file mode 100755 index ..3633cdc651a1 --- /dev/null +++ b/tools/testing/selftests/powerpc/mce/inject-ra-err.sh @@ -0,0 +1,18 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0-or-later + +if [[ ! -w /dev/crypto/nx-gzip ]]; then + echo "WARN: Can't access /dev/crypto/nx-gzip, skipping" + exit 0 +fi + +timeout 5 ./inject-ra-err + +# 128 + 7 (SIGBUS) = 135, 128 is a exit code with special meaning. +if [ $? -ne 135 ]; then + echo "FAILED: Real address or Control memory access error not handled" + exit $? +fi + +echo "OK: Real address or Control memory access error is handled" +exit 0 I don't think we really need the shell script, we should be able to do all that in the C code. Can you try this? it works!, We need to set timeout, with 120 sec timeout we may flood the dmesg. Hmm. Does it keep faulting? The regs->nip += 4 is meant to avoid that. Yes, it keeps faulting, if we fail to handle and not send SIGBUS to the process. cheers
Re: [PATCH v2 1/3] powerpc/pseries: Parse control memory access error
On 8/25/21 2:54 AM, Segher Boessenkool wrote: On Tue, Aug 24, 2021 at 04:39:57PM +1000, Michael Ellerman wrote: + case MC_ERROR_CTRL_MEM_ACCESS_PTABLE_WALK: + mce_err.u.ra_error_type = + MCE_RA_ERROR_PAGE_TABLE_WALK_LOAD_STORE_FOREIGN; That name is ridiculously long, but I guess that's not your fault :) We can fix it up in a later patch. It also has surprisingly little information content for the 47 chars length it has :-) What does this even mean?! It means control memory access error/real address error is detected during page table walk. Segher
Re: [PATCH v2 2/3] selftests/powerpc: Add test for real address error handling
On 8/24/21 6:18 PM, Michael Ellerman wrote: Ganesh Goudar writes: Add test for real address or control memory address access error handling, using NX-GZIP engine. The error is injected by accessing the control memory address using illegal instruction, on successful handling the process attempting to access control memory address using illegal instruction receives SIGBUS. ... diff --git a/tools/testing/selftests/powerpc/mce/inject-ra-err.sh b/tools/testing/selftests/powerpc/mce/inject-ra-err.sh new file mode 100755 index ..3633cdc651a1 --- /dev/null +++ b/tools/testing/selftests/powerpc/mce/inject-ra-err.sh @@ -0,0 +1,18 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0-or-later + +if [[ ! -w /dev/crypto/nx-gzip ]]; then + echo "WARN: Can't access /dev/crypto/nx-gzip, skipping" + exit 0 +fi + +timeout 5 ./inject-ra-err + +# 128 + 7 (SIGBUS) = 135, 128 is a exit code with special meaning. +if [ $? -ne 135 ]; then + echo "FAILED: Real address or Control memory access error not handled" + exit $? +fi + +echo "OK: Real address or Control memory access error is handled" +exit 0 I don't think we really need the shell script, we should be able to do all that in the C code. Can you try this? it works!, We need to set timeout, with 120 sec timeout we may flood the dmesg. Thanks. 
cheers diff --git a/tools/testing/selftests/powerpc/mce/Makefile b/tools/testing/selftests/powerpc/mce/Makefile new file mode 100644 index ..2424513982d9 --- /dev/null +++ b/tools/testing/selftests/powerpc/mce/Makefile @@ -0,0 +1,7 @@ +#SPDX-License-Identifier: GPL-2.0-or-later + +TEST_GEN_PROGS := inject-ra-err + +include ../../lib.mk + +$(TEST_GEN_PROGS): ../harness.c diff --git a/tools/testing/selftests/powerpc/mce/inject-ra-err.c b/tools/testing/selftests/powerpc/mce/inject-ra-err.c new file mode 100644 index ..ba0f9c28f786 --- /dev/null +++ b/tools/testing/selftests/powerpc/mce/inject-ra-err.c @@ -0,0 +1,64 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "vas-api.h" +#include "utils.h" + +static bool faulted; + +static void sigbus_handler(int n, siginfo_t *info, void *ctxt_v) +{ + ucontext_t *ctxt = (ucontext_t *)ctxt_v; + struct pt_regs *regs = ctxt->uc_mcontext.regs; + + faulted = true; + regs->nip += 4; +} + +static int test_ra_error(void) +{ + struct vas_tx_win_open_attr attr; + int fd, *paste_addr; + char *devname = "/dev/crypto/nx-gzip"; + struct sigaction act = { + .sa_sigaction = sigbus_handler, + .sa_flags = SA_SIGINFO, + }; + + memset(, 0, sizeof(attr)); + attr.version = 1; + attr.vas_id = 0; + + SKIP_IF(!access(devname, F_OK)); + + fd = open(devname, O_RDWR); + FAIL_IF(fd < 0); + FAIL_IF(ioctl(fd, VAS_TX_WIN_OPEN, ) < 0); + FAIL_IF(sigaction(SIGBUS, , NULL) != 0); + + paste_addr = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0ULL); + + /* The following assignment triggers exception */ + mb(); + *paste_addr = 1; + mb(); + + FAIL_IF(!faulted); + + return 0; +} + +int main(void) +{ + return test_harness(test_ra_error, "inject-ra-err"); +}
Re: [PATCH v2 1/3] powerpc/pseries: Parse control memory access error
On 8/24/21 12:09 PM, Michael Ellerman wrote: Hi Ganesh, Some comments below ... Ganesh Goudar writes: Add support to parse and log control memory access error for pseries. Signed-off-by: Ganesh Goudar --- v2: No changes in this patch. --- arch/powerpc/platforms/pseries/ras.c | 21 + 1 file changed, 21 insertions(+) diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c index 167f2e1b8d39..608c35cad0c3 100644 --- a/arch/powerpc/platforms/pseries/ras.c +++ b/arch/powerpc/platforms/pseries/ras.c @@ -80,6 +80,7 @@ struct pseries_mc_errorlog { #define MC_ERROR_TYPE_TLB 0x04 #define MC_ERROR_TYPE_D_CACHE 0x05 #define MC_ERROR_TYPE_I_CACHE 0x07 +#define MC_ERROR_TYPE_CTRL_MEM_ACCESS 0x08 ... +#define MC_ERROR_CTRL_MEM_ACCESS_PTABLE_WALK 0 +#define MC_ERROR_CTRL_MEM_ACCESS_OP_ACCESS 1 Where do the above values come from? It is from latest PAPR that added support for control memory error. + static inline u8 rtas_mc_error_sub_type(const struct pseries_mc_errorlog *mlog) { switch (mlog->error_type) { @@ -112,6 +116,8 @@ static inline u8 rtas_mc_error_sub_type(const struct pseries_mc_errorlog *mlog) caseMC_ERROR_TYPE_ERAT: caseMC_ERROR_TYPE_TLB: return (mlog->sub_err_type & 0x03); + caseMC_ERROR_TYPE_CTRL_MEM_ACCESS: + return (mlog->sub_err_type & 0x70) >> 4; Can you add to the comment above sub_err_type explaining what these bits are. Sure, for other errors it is explained in pseries_mc_errorlog definition, ill add it there. default: return 0; } @@ -699,6 +705,21 @@ static int mce_handle_err_virtmode(struct pt_regs *regs, case MC_ERROR_TYPE_I_CACHE: mce_err.error_type = MCE_ERROR_TYPE_ICACHE; break; + case MC_ERROR_TYPE_CTRL_MEM_ACCESS: + mce_err.error_type = MCE_ERROR_TYPE_RA; + if (mce_log->sub_err_type & 0x80) This appears many times in the file. Can we add eg. MC_EFFECTIVE_ADDR_PROVIDED? ok, thanks. 
+ eaddr = be64_to_cpu(mce_log->effective_address); + switch (err_sub_type) { + case MC_ERROR_CTRL_MEM_ACCESS_PTABLE_WALK: + mce_err.u.ra_error_type = + MCE_RA_ERROR_PAGE_TABLE_WALK_LOAD_STORE_FOREIGN; That name is ridiculously long, but I guess that's not your fault :) We can fix it up in a later patch. + break; + case MC_ERROR_CTRL_MEM_ACCESS_OP_ACCESS: + mce_err.u.ra_error_type = + MCE_RA_ERROR_LOAD_STORE_FOREIGN; + break; + } + break; cheers
Re: [PATCH v2 1/3] powerpc/pseries: Parse control memory access error
Hi mpe, Any comments on this patchset? On 8/5/21 2:50 PM, Ganesh Goudar wrote: Add support to parse and log control memory access error for pseries. Signed-off-by: Ganesh Goudar --- v2: No changes in this patch. --- arch/powerpc/platforms/pseries/ras.c | 21 + 1 file changed, 21 insertions(+) diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c index 167f2e1b8d39..608c35cad0c3 100644 --- a/arch/powerpc/platforms/pseries/ras.c +++ b/arch/powerpc/platforms/pseries/ras.c @@ -80,6 +80,7 @@ struct pseries_mc_errorlog { #define MC_ERROR_TYPE_TLB 0x04 #define MC_ERROR_TYPE_D_CACHE 0x05 #define MC_ERROR_TYPE_I_CACHE 0x07 +#define MC_ERROR_TYPE_CTRL_MEM_ACCESS 0x08 /* RTAS pseries MCE error sub types */ #define MC_ERROR_UE_INDETERMINATE 0 @@ -103,6 +104,9 @@ struct pseries_mc_errorlog { #define MC_ERROR_TLB_MULTIHIT 2 #define MC_ERROR_TLB_INDETERMINATE3 +#define MC_ERROR_CTRL_MEM_ACCESS_PTABLE_WALK 0 +#define MC_ERROR_CTRL_MEM_ACCESS_OP_ACCESS 1 + static inline u8 rtas_mc_error_sub_type(const struct pseries_mc_errorlog *mlog) { switch (mlog->error_type) { @@ -112,6 +116,8 @@ static inline u8 rtas_mc_error_sub_type(const struct pseries_mc_errorlog *mlog) caseMC_ERROR_TYPE_ERAT: caseMC_ERROR_TYPE_TLB: return (mlog->sub_err_type & 0x03); + caseMC_ERROR_TYPE_CTRL_MEM_ACCESS: + return (mlog->sub_err_type & 0x70) >> 4; default: return 0; } @@ -699,6 +705,21 @@ static int mce_handle_err_virtmode(struct pt_regs *regs, case MC_ERROR_TYPE_I_CACHE: mce_err.error_type = MCE_ERROR_TYPE_ICACHE; break; + case MC_ERROR_TYPE_CTRL_MEM_ACCESS: + mce_err.error_type = MCE_ERROR_TYPE_RA; + if (mce_log->sub_err_type & 0x80) + eaddr = be64_to_cpu(mce_log->effective_address); + switch (err_sub_type) { + case MC_ERROR_CTRL_MEM_ACCESS_PTABLE_WALK: + mce_err.u.ra_error_type = + MCE_RA_ERROR_PAGE_TABLE_WALK_LOAD_STORE_FOREIGN; + break; + case MC_ERROR_CTRL_MEM_ACCESS_OP_ACCESS: + mce_err.u.ra_error_type = + MCE_RA_ERROR_LOAD_STORE_FOREIGN; + break; + } + break; case 
MC_ERROR_TYPE_UNKNOWN: default: mce_err.error_type = MCE_ERROR_TYPE_UNKNOWN;
[PATCH] powerpc/mce: check if event info is valid
Check if the event info is valid before printing the event information. When a fwnmi enabled nested kvm guest hits a machine check exception, L0 and L2 would generate machine check event info, but L1 would not generate any machine check event info as it won't go through the 0x200 vector, and it prints an unwanted message. To fix this, since the 'in_use' variable in the machine check event info is no longer an apt name, rename it to 'valid' and check if the event information is valid before logging the event information. Without this patch L1 would print the following message for exceptions encountered in L2, as the event structure will be empty in L1. "Machine Check Exception, Unknown event version 0". Signed-off-by: Ganesh Goudar --- arch/powerpc/include/asm/mce.h | 2 +- arch/powerpc/kernel/mce.c | 7 +-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h index 331d944280b8..3646f53f228f 100644 --- a/arch/powerpc/include/asm/mce.h +++ b/arch/powerpc/include/asm/mce.h @@ -113,7 +113,7 @@ enum MCE_LinkErrorType { struct machine_check_event { enum MCE_Versionversion:8; - u8 in_use; + u8 valid; enum MCE_Severity severity:8; enum MCE_Initiator initiator:8; enum MCE_ErrorType error_type:8; diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c index 47a683cd00d2..b778394a06b5 100644 --- a/arch/powerpc/kernel/mce.c +++ b/arch/powerpc/kernel/mce.c @@ -114,7 +114,7 @@ void save_mce_event(struct pt_regs *regs, long handled, mce->srr0 = nip; mce->srr1 = regs->msr; mce->gpr3 = regs->gpr[3]; - mce->in_use = 1; + mce->valid = 1; mce->cpu = get_paca()->paca_index; /* Mark it recovered if we have handled it and MSR(RI=1). */ @@ -202,7 +202,7 @@ int get_mce_event(struct machine_check_event *mce, bool release) if (mce) *mce = *mc_evt; if (release) - mc_evt->in_use = 0; + mc_evt->valid = 0; ret = 1; } /* Decrement the count to free the slot.
*/ @@ -413,6 +413,9 @@ void machine_check_print_event_info(struct machine_check_event *evt, "Probable Software error (some chance of hardware cause)", }; + if (!evt->valid) + return; + /* Print things out */ if (evt->version != MCE_V1) { pr_err("Machine Check Exception, Unknown event version %d !\n", -- 2.31.1
[PATCH v2 3/3] powerpc/mce: Modify the real address error logging messages
To avoid ambiguity, modify the strings in real address error logging messages to "foreign/control memory" from "foreign", since the error descriptions in the P9 user manual and P10 user manual are different for the same type of errors. P9 User Manual for MCE: DSISR:59 Host real address to foreign space during translation. DSISR:60 Host real address to foreign space on a load or store access. P10 User Manual for MCE: DSISR:59 D-side tablewalk used a host real address in the control memory address range. DSISR:60 D-side operand access to control memory address space. Signed-off-by: Ganesh Goudar --- v2: No changes in this patch. --- arch/powerpc/kernel/mce.c | 8 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c index 47a683cd00d2..f3ef480bb739 100644 --- a/arch/powerpc/kernel/mce.c +++ b/arch/powerpc/kernel/mce.c @@ -388,14 +388,14 @@ void machine_check_print_event_info(struct machine_check_event *evt, static const char *mc_ra_types[] = { "Indeterminate", "Instruction fetch (bad)", - "Instruction fetch (foreign)", + "Instruction fetch (foreign/control memory)", "Page table walk ifetch (bad)", - "Page table walk ifetch (foreign)", + "Page table walk ifetch (foreign/control memory)", "Load (bad)", "Store (bad)", "Page table walk Load/Store (bad)", - "Page table walk Load/Store (foreign)", - "Load/Store (foreign)", + "Page table walk Load/Store (foreign/control memory)", + "Load/Store (foreign/control memory)", }; static const char *mc_link_types[] = { "Indeterminate", -- 2.31.1
[PATCH v2 2/3] selftests/powerpc: Add test for real address error handling
Add test for real address or control memory address access error handling, using NX-GZIP engine. The error is injected by accessing the control memory address using illegal instruction, on successful handling the process attempting to access control memory address using illegal instruction receives SIGBUS. Signed-off-by: Ganesh Goudar --- v2: Fix build error. --- tools/testing/selftests/powerpc/Makefile | 3 +- tools/testing/selftests/powerpc/mce/Makefile | 6 +++ .../selftests/powerpc/mce/inject-ra-err.c | 42 +++ .../selftests/powerpc/mce/inject-ra-err.sh| 18 tools/testing/selftests/powerpc/mce/vas-api.h | 1 + 5 files changed, 69 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/powerpc/mce/Makefile create mode 100644 tools/testing/selftests/powerpc/mce/inject-ra-err.c create mode 100755 tools/testing/selftests/powerpc/mce/inject-ra-err.sh create mode 12 tools/testing/selftests/powerpc/mce/vas-api.h diff --git a/tools/testing/selftests/powerpc/Makefile b/tools/testing/selftests/powerpc/Makefile index 0830e63818c1..4830372d7416 100644 --- a/tools/testing/selftests/powerpc/Makefile +++ b/tools/testing/selftests/powerpc/Makefile @@ -31,7 +31,8 @@ SUB_DIRS = alignment \ vphn \ math \ ptrace \ - security + security \ + mce endif diff --git a/tools/testing/selftests/powerpc/mce/Makefile b/tools/testing/selftests/powerpc/mce/Makefile new file mode 100644 index ..0f537ce86370 --- /dev/null +++ b/tools/testing/selftests/powerpc/mce/Makefile @@ -0,0 +1,6 @@ +#SPDX-License-Identifier: GPL-2.0-or-later + +TEST_PROGS := inject-ra-err.sh +TEST_GEN_FILES := inject-ra-err + +include ../../lib.mk diff --git a/tools/testing/selftests/powerpc/mce/inject-ra-err.c b/tools/testing/selftests/powerpc/mce/inject-ra-err.c new file mode 100644 index ..05ab11cec3da --- /dev/null +++ b/tools/testing/selftests/powerpc/mce/inject-ra-err.c @@ -0,0 +1,42 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +#include +#include +#include +#include +#include +#include +#include 
+#include +#include +#include "vas-api.h" + +int main(void) +{ + int fd, ret; + int *paste_addr; + struct vas_tx_win_open_attr attr; + char *devname = "/dev/crypto/nx-gzip"; + + memset(, 0, sizeof(attr)); + attr.version = 1; + attr.vas_id = 0; + + fd = open(devname, O_RDWR); + if (fd < 0) { + fprintf(stderr, "Failed to open device %s\n", devname); + return -errno; + } + ret = ioctl(fd, VAS_TX_WIN_OPEN, ); + if (ret < 0) { + fprintf(stderr, "ioctl() n %d, error %d\n", ret, errno); + ret = -errno; + goto out; + } + paste_addr = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0ULL); + /* The following assignment triggers exception */ + *paste_addr = 1; + ret = 0; +out: + close(fd); + return ret; +} diff --git a/tools/testing/selftests/powerpc/mce/inject-ra-err.sh b/tools/testing/selftests/powerpc/mce/inject-ra-err.sh new file mode 100755 index ..3633cdc651a1 --- /dev/null +++ b/tools/testing/selftests/powerpc/mce/inject-ra-err.sh @@ -0,0 +1,18 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0-or-later + +if [[ ! -w /dev/crypto/nx-gzip ]]; then + echo "WARN: Can't access /dev/crypto/nx-gzip, skipping" + exit 0 +fi + +timeout 5 ./inject-ra-err + +# 128 + 7 (SIGBUS) = 135, 128 is a exit code with special meaning. +if [ $? -ne 135 ]; then + echo "FAILED: Real address or Control memory access error not handled" + exit $? +fi + +echo "OK: Real address or Control memory access error is handled" +exit 0 diff --git a/tools/testing/selftests/powerpc/mce/vas-api.h b/tools/testing/selftests/powerpc/mce/vas-api.h new file mode 12 index ..1455c1bcd351 --- /dev/null +++ b/tools/testing/selftests/powerpc/mce/vas-api.h @@ -0,0 +1 @@ +../../../../../arch/powerpc/include/uapi/asm/vas-api.h \ No newline at end of file -- 2.31.1
[PATCH v2 1/3] powerpc/pseries: Parse control memory access error
Add support to parse and log control memory access error for pseries. Signed-off-by: Ganesh Goudar --- v2: No changes in this patch. --- arch/powerpc/platforms/pseries/ras.c | 21 + 1 file changed, 21 insertions(+) diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c index 167f2e1b8d39..608c35cad0c3 100644 --- a/arch/powerpc/platforms/pseries/ras.c +++ b/arch/powerpc/platforms/pseries/ras.c @@ -80,6 +80,7 @@ struct pseries_mc_errorlog { #define MC_ERROR_TYPE_TLB 0x04 #define MC_ERROR_TYPE_D_CACHE 0x05 #define MC_ERROR_TYPE_I_CACHE 0x07 +#define MC_ERROR_TYPE_CTRL_MEM_ACCESS 0x08 /* RTAS pseries MCE error sub types */ #define MC_ERROR_UE_INDETERMINATE 0 @@ -103,6 +104,9 @@ struct pseries_mc_errorlog { #define MC_ERROR_TLB_MULTIHIT 2 #define MC_ERROR_TLB_INDETERMINATE 3 +#define MC_ERROR_CTRL_MEM_ACCESS_PTABLE_WALK 0 +#define MC_ERROR_CTRL_MEM_ACCESS_OP_ACCESS 1 + static inline u8 rtas_mc_error_sub_type(const struct pseries_mc_errorlog *mlog) { switch (mlog->error_type) { @@ -112,6 +116,8 @@ static inline u8 rtas_mc_error_sub_type(const struct pseries_mc_errorlog *mlog) caseMC_ERROR_TYPE_ERAT: caseMC_ERROR_TYPE_TLB: return (mlog->sub_err_type & 0x03); + caseMC_ERROR_TYPE_CTRL_MEM_ACCESS: + return (mlog->sub_err_type & 0x70) >> 4; default: return 0; } @@ -699,6 +705,21 @@ static int mce_handle_err_virtmode(struct pt_regs *regs, case MC_ERROR_TYPE_I_CACHE: mce_err.error_type = MCE_ERROR_TYPE_ICACHE; break; + case MC_ERROR_TYPE_CTRL_MEM_ACCESS: + mce_err.error_type = MCE_ERROR_TYPE_RA; + if (mce_log->sub_err_type & 0x80) + eaddr = be64_to_cpu(mce_log->effective_address); + switch (err_sub_type) { + case MC_ERROR_CTRL_MEM_ACCESS_PTABLE_WALK: + mce_err.u.ra_error_type = + MCE_RA_ERROR_PAGE_TABLE_WALK_LOAD_STORE_FOREIGN; + break; + case MC_ERROR_CTRL_MEM_ACCESS_OP_ACCESS: + mce_err.u.ra_error_type = + MCE_RA_ERROR_LOAD_STORE_FOREIGN; + break; + } + break; case MC_ERROR_TYPE_UNKNOWN: default: mce_err.error_type = MCE_ERROR_TYPE_UNKNOWN; -- 
2.31.1
[PATCH 1/3] powerpc/pseries: Parse control memory access error
Add support to parse and log control memory access error for pseries. Signed-off-by: Ganesh Goudar --- arch/powerpc/platforms/pseries/ras.c | 21 + 1 file changed, 21 insertions(+) diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c index 167f2e1b8d39..608c35cad0c3 100644 --- a/arch/powerpc/platforms/pseries/ras.c +++ b/arch/powerpc/platforms/pseries/ras.c @@ -80,6 +80,7 @@ struct pseries_mc_errorlog { #define MC_ERROR_TYPE_TLB 0x04 #define MC_ERROR_TYPE_D_CACHE 0x05 #define MC_ERROR_TYPE_I_CACHE 0x07 +#define MC_ERROR_TYPE_CTRL_MEM_ACCESS 0x08 /* RTAS pseries MCE error sub types */ #define MC_ERROR_UE_INDETERMINATE 0 @@ -103,6 +104,9 @@ struct pseries_mc_errorlog { #define MC_ERROR_TLB_MULTIHIT 2 #define MC_ERROR_TLB_INDETERMINATE 3 +#define MC_ERROR_CTRL_MEM_ACCESS_PTABLE_WALK 0 +#define MC_ERROR_CTRL_MEM_ACCESS_OP_ACCESS 1 + static inline u8 rtas_mc_error_sub_type(const struct pseries_mc_errorlog *mlog) { switch (mlog->error_type) { @@ -112,6 +116,8 @@ static inline u8 rtas_mc_error_sub_type(const struct pseries_mc_errorlog *mlog) caseMC_ERROR_TYPE_ERAT: caseMC_ERROR_TYPE_TLB: return (mlog->sub_err_type & 0x03); + caseMC_ERROR_TYPE_CTRL_MEM_ACCESS: + return (mlog->sub_err_type & 0x70) >> 4; default: return 0; } @@ -699,6 +705,21 @@ static int mce_handle_err_virtmode(struct pt_regs *regs, case MC_ERROR_TYPE_I_CACHE: mce_err.error_type = MCE_ERROR_TYPE_ICACHE; break; + case MC_ERROR_TYPE_CTRL_MEM_ACCESS: + mce_err.error_type = MCE_ERROR_TYPE_RA; + if (mce_log->sub_err_type & 0x80) + eaddr = be64_to_cpu(mce_log->effective_address); + switch (err_sub_type) { + case MC_ERROR_CTRL_MEM_ACCESS_PTABLE_WALK: + mce_err.u.ra_error_type = + MCE_RA_ERROR_PAGE_TABLE_WALK_LOAD_STORE_FOREIGN; + break; + case MC_ERROR_CTRL_MEM_ACCESS_OP_ACCESS: + mce_err.u.ra_error_type = + MCE_RA_ERROR_LOAD_STORE_FOREIGN; + break; + } + break; case MC_ERROR_TYPE_UNKNOWN: default: mce_err.error_type = MCE_ERROR_TYPE_UNKNOWN; -- 2.31.1
[PATCH 2/3] selftests/powerpc: Add test for real address error handling
Add test for real address or control memory address access error handling, using NX-GZIP engine. The error is injected by accessing the control memory address using illegal instruction, on successful handling the process attempting to access control memory address using illegal instruction receives SIGBUS. Signed-off-by: Ganesh Goudar --- tools/testing/selftests/powerpc/Makefile | 3 +- tools/testing/selftests/powerpc/mce/Makefile | 6 +++ .../selftests/powerpc/mce/inject-ra-err.c | 42 +++ .../selftests/powerpc/mce/inject-ra-err.sh| 19 + 4 files changed, 69 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/powerpc/mce/Makefile create mode 100644 tools/testing/selftests/powerpc/mce/inject-ra-err.c create mode 100755 tools/testing/selftests/powerpc/mce/inject-ra-err.sh diff --git a/tools/testing/selftests/powerpc/Makefile b/tools/testing/selftests/powerpc/Makefile index 0830e63818c1..4830372d7416 100644 --- a/tools/testing/selftests/powerpc/Makefile +++ b/tools/testing/selftests/powerpc/Makefile @@ -31,7 +31,8 @@ SUB_DIRS = alignment \ vphn \ math \ ptrace \ - security + security \ + mce endif diff --git a/tools/testing/selftests/powerpc/mce/Makefile b/tools/testing/selftests/powerpc/mce/Makefile new file mode 100644 index ..0f537ce86370 --- /dev/null +++ b/tools/testing/selftests/powerpc/mce/Makefile @@ -0,0 +1,6 @@ +#SPDX-License-Identifier: GPL-2.0-or-later + +TEST_PROGS := inject-ra-err.sh +TEST_GEN_FILES := inject-ra-err + +include ../../lib.mk diff --git a/tools/testing/selftests/powerpc/mce/inject-ra-err.c b/tools/testing/selftests/powerpc/mce/inject-ra-err.c new file mode 100644 index ..58374bc92e90 --- /dev/null +++ b/tools/testing/selftests/powerpc/mce/inject-ra-err.c @@ -0,0 +1,42 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +int main(void) +{ + int fd, ret; + int *paste_addr; + struct vas_tx_win_open_attr attr; + char *devname = 
"/dev/crypto/nx-gzip"; + + memset(, 0, sizeof(attr)); + attr.version = 1; + attr.vas_id = 0; + + fd = open(devname, O_RDWR); + if (fd < 0) { + fprintf(stderr, "Failed to open device %s\n", devname); + return -errno; + } + ret = ioctl(fd, VAS_TX_WIN_OPEN, ); + if (ret < 0) { + fprintf(stderr, "ioctl() n %d, error %d\n", ret, errno); + ret = -errno; + goto out; + } + paste_addr = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0ULL); + /* The following assignment triggers exception */ + *paste_addr = 1; + ret = 0; +out: + close(fd); + return ret; +} diff --git a/tools/testing/selftests/powerpc/mce/inject-ra-err.sh b/tools/testing/selftests/powerpc/mce/inject-ra-err.sh new file mode 100755 index ..0e9c8ae6ad78 --- /dev/null +++ b/tools/testing/selftests/powerpc/mce/inject-ra-err.sh @@ -0,0 +1,19 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0-or-later + +if [[ ! -w /dev/crypto/nx-gzip ]]; then + echo "WARN: Can't access /dev/crypto/nx-gzip, skipping" + exit 0 +fi + +# Timeout in 5 seconds, If not handled it may run indefinitely. +timeout 5 ./inject-ra-err + +# 128 + 7 (SIGBUS) = 135, 128 is a exit Code With Special Meaning. +if [ $? -ne 135 ]; then + echo "FAILED: Control memory access error not handled" + exit $? +fi + +echo "OK: Control memory access error is handled" +exit 0 -- 2.31.1
[PATCH 3/3] powerpc/mce: Modify the real address error logging messages
To avoid ambiguity, modify the strings in real address error logging messages to "foreign/control memory" from "foreign", since the error descriptions in the P9 user manual and P10 user manual are different for the same type of errors. P9 User Manual for MCE: DSISR:59 Host real address to foreign space during translation. DSISR:60 Host real address to foreign space on a load or store access. P10 User Manual for MCE: DSISR:59 D-side tablewalk used a host real address in the control memory address range. DSISR:60 D-side operand access to control memory address space. Signed-off-by: Ganesh Goudar --- arch/powerpc/kernel/mce.c | 8 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c index 47a683cd00d2..f3ef480bb739 100644 --- a/arch/powerpc/kernel/mce.c +++ b/arch/powerpc/kernel/mce.c @@ -388,14 +388,14 @@ void machine_check_print_event_info(struct machine_check_event *evt, static const char *mc_ra_types[] = { "Indeterminate", "Instruction fetch (bad)", - "Instruction fetch (foreign)", + "Instruction fetch (foreign/control memory)", "Page table walk ifetch (bad)", - "Page table walk ifetch (foreign)", + "Page table walk ifetch (foreign/control memory)", "Load (bad)", "Store (bad)", "Page table walk Load/Store (bad)", - "Page table walk Load/Store (foreign)", - "Load/Store (foreign)", + "Page table walk Load/Store (foreign/control memory)", + "Load/Store (foreign/control memory)", }; static const char *mc_link_types[] = { "Indeterminate", -- 2.31.1
Re: [PATCH] powerpc/mce: save ignore_event flag unconditionally for UE
On 4/22/21 11:31 AM, Ganesh wrote: On 4/7/21 10:28 AM, Ganesh Goudar wrote: When we hit an UE while using machine check safe copy routines, ignore_event flag is set and the event is ignored by mce handler, And the flag is also saved for defered handling and printing of mce event information, But as of now saving of this flag is done on checking if the effective address is provided or physical address is calculated, which is not right. Save ignore_event flag regardless of whether the effective address is provided or physical address is calculated. Without this change following log is seen, when the event is to be ignored. [ 512.971365] MCE: CPU1: machine check (Severe) UE Load/Store [Recovered] [ 512.971509] MCE: CPU1: NIP: [c00b67c0] memcpy+0x40/0x90 [ 512.971655] MCE: CPU1: Initiator CPU [ 512.971739] MCE: CPU1: Unknown [ 512.972209] MCE: CPU1: machine check (Severe) UE Load/Store [Recovered] [ 512.972334] MCE: CPU1: NIP: [c00b6808] memcpy+0x88/0x90 [ 512.972456] MCE: CPU1: Initiator CPU [ 512.972534] MCE: CPU1: Unknown Signed-off-by: Ganesh Goudar --- arch/powerpc/kernel/mce.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) Hi mpe, Any comments on this patch? Please ignore, I see its applied.
Re: [PATCH] powerpc/mce: save ignore_event flag unconditionally for UE
On 4/7/21 10:28 AM, Ganesh Goudar wrote: When we hit an UE while using machine check safe copy routines, ignore_event flag is set and the event is ignored by mce handler, And the flag is also saved for defered handling and printing of mce event information, But as of now saving of this flag is done on checking if the effective address is provided or physical address is calculated, which is not right. Save ignore_event flag regardless of whether the effective address is provided or physical address is calculated. Without this change following log is seen, when the event is to be ignored. [ 512.971365] MCE: CPU1: machine check (Severe) UE Load/Store [Recovered] [ 512.971509] MCE: CPU1: NIP: [c00b67c0] memcpy+0x40/0x90 [ 512.971655] MCE: CPU1: Initiator CPU [ 512.971739] MCE: CPU1: Unknown [ 512.972209] MCE: CPU1: machine check (Severe) UE Load/Store [Recovered] [ 512.972334] MCE: CPU1: NIP: [c00b6808] memcpy+0x88/0x90 [ 512.972456] MCE: CPU1: Initiator CPU [ 512.972534] MCE: CPU1: Unknown Signed-off-by: Ganesh Goudar --- arch/powerpc/kernel/mce.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) Hi mpe, Any comments on this patch?
Re: [PATCH] powerpc/mce: save ignore_event flag unconditionally for UE
On 4/20/21 12:54 PM, Santosh Sivaraj wrote: Hi Ganesh, Ganesh Goudar writes: When we hit an UE while using machine check safe copy routines, ignore_event flag is set and the event is ignored by mce handler, And the flag is also saved for defered handling and printing of mce event information, But as of now saving of this flag is done on checking if the effective address is provided or physical address is calculated, which is not right. Save ignore_event flag regardless of whether the effective address is provided or physical address is calculated. Without this change following log is seen, when the event is to be ignored. [ 512.971365] MCE: CPU1: machine check (Severe) UE Load/Store [Recovered] [ 512.971509] MCE: CPU1: NIP: [c00b67c0] memcpy+0x40/0x90 [ 512.971655] MCE: CPU1: Initiator CPU [ 512.971739] MCE: CPU1: Unknown [ 512.972209] MCE: CPU1: machine check (Severe) UE Load/Store [Recovered] [ 512.972334] MCE: CPU1: NIP: [c00b6808] memcpy+0x88/0x90 [ 512.972456] MCE: CPU1: Initiator CPU [ 512.972534] MCE: CPU1: Unknown Signed-off-by: Ganesh Goudar --- arch/powerpc/kernel/mce.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c index 11f0cae086ed..db9363e131ce 100644 --- a/arch/powerpc/kernel/mce.c +++ b/arch/powerpc/kernel/mce.c @@ -131,6 +131,8 @@ void save_mce_event(struct pt_regs *regs, long handled, * Populate the mce error_type and type-specific error_type. 
*/ mce_set_error_info(mce, mce_err); + if (mce->error_type == MCE_ERROR_TYPE_UE) + mce->u.ue_error.ignore_event = mce_err->ignore_event; if (!addr) return; @@ -159,7 +161,6 @@ void save_mce_event(struct pt_regs *regs, long handled, if (phys_addr != ULONG_MAX) { mce->u.ue_error.physical_address_provided = true; mce->u.ue_error.physical_address = phys_addr; - mce->u.ue_error.ignore_event = mce_err->ignore_event; machine_check_ue_event(mce); } } Small nit: Setting ignore event can happen before the phys_addr check, under the existing check for MCE_ERROR_TYPE_UE, instead of repeating the same condition again. In some cases we may not get effective address also, so it is placed before effective address check. Except for the above nit Reviewed-by: Santosh Sivaraj Thanks, Santosh -- 2.26.2
Re: [PATCH] powerpc/pseries/mce: Fix a typo in error type assignment
On 4/17/21 6:06 PM, Michael Ellerman wrote: Ganesh Goudar writes: The error type is ICACHE and DCACHE, for case MCE_ERROR_TYPE_ICACHE. Do you mean "is ICACHE not DCACHE" ? Right :), Should I send v2 ? cheers Signed-off-by: Ganesh Goudar --- arch/powerpc/platforms/pseries/ras.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c index f8b390a9d9fb..9d4ef65da7f3 100644 --- a/arch/powerpc/platforms/pseries/ras.c +++ b/arch/powerpc/platforms/pseries/ras.c @@ -699,7 +699,7 @@ static int mce_handle_err_virtmode(struct pt_regs *regs, mce_err.error_type = MCE_ERROR_TYPE_DCACHE; break; case MC_ERROR_TYPE_I_CACHE: - mce_err.error_type = MCE_ERROR_TYPE_DCACHE; + mce_err.error_type = MCE_ERROR_TYPE_ICACHE; break; case MC_ERROR_TYPE_UNKNOWN: default: -- 2.26.2
[PATCH] powerpc/pseries/mce: Fix a typo in error type assignment
The error type is ICACHE not DCACHE, for case MC_ERROR_TYPE_I_CACHE. Signed-off-by: Ganesh Goudar --- arch/powerpc/platforms/pseries/ras.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c index f8b390a9d9fb..9d4ef65da7f3 100644 --- a/arch/powerpc/platforms/pseries/ras.c +++ b/arch/powerpc/platforms/pseries/ras.c @@ -699,7 +699,7 @@ static int mce_handle_err_virtmode(struct pt_regs *regs, mce_err.error_type = MCE_ERROR_TYPE_DCACHE; break; case MC_ERROR_TYPE_I_CACHE: - mce_err.error_type = MCE_ERROR_TYPE_DCACHE; + mce_err.error_type = MCE_ERROR_TYPE_ICACHE; break; case MC_ERROR_TYPE_UNKNOWN: default: -- 2.26.2
[PATCH] powerpc/mce: save ignore_event flag unconditionally for UE
When we hit an UE while using machine check safe copy routines, the ignore_event flag is set and the event is ignored by the mce handler, and the flag is also saved for deferred handling and printing of mce event information. But as of now, saving of this flag is done only after checking if the effective address is provided or the physical address is calculated, which is not right. Save the ignore_event flag regardless of whether the effective address is provided or the physical address is calculated. Without this change the following log is seen, when the event is to be ignored. [ 512.971365] MCE: CPU1: machine check (Severe) UE Load/Store [Recovered] [ 512.971509] MCE: CPU1: NIP: [c00b67c0] memcpy+0x40/0x90 [ 512.971655] MCE: CPU1: Initiator CPU [ 512.971739] MCE: CPU1: Unknown [ 512.972209] MCE: CPU1: machine check (Severe) UE Load/Store [Recovered] [ 512.972334] MCE: CPU1: NIP: [c00b6808] memcpy+0x88/0x90 [ 512.972456] MCE: CPU1: Initiator CPU [ 512.972534] MCE: CPU1: Unknown Signed-off-by: Ganesh Goudar --- arch/powerpc/kernel/mce.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c index 11f0cae086ed..db9363e131ce 100644 --- a/arch/powerpc/kernel/mce.c +++ b/arch/powerpc/kernel/mce.c @@ -131,6 +131,8 @@ void save_mce_event(struct pt_regs *regs, long handled, * Populate the mce error_type and type-specific error_type. */ mce_set_error_info(mce, mce_err); + if (mce->error_type == MCE_ERROR_TYPE_UE) + mce->u.ue_error.ignore_event = mce_err->ignore_event; if (!addr) return; @@ -159,7 +161,6 @@ void save_mce_event(struct pt_regs *regs, long handled, if (phys_addr != ULONG_MAX) { mce->u.ue_error.physical_address_provided = true; mce->u.ue_error.physical_address = phys_addr; - mce->u.ue_error.ignore_event = mce_err->ignore_event; machine_check_ue_event(mce); } } -- 2.26.2
[PATCH v5 2/2] powerpc/mce: Remove per cpu variables from MCE handlers
Access to per-cpu variables requires translation to be enabled on pseries machine running in hash mmu mode, Since part of MCE handler runs in realmode and part of MCE handling code is shared between ppc architectures pseries and powernv, it becomes difficult to manage these variables differently on different architectures, So have these variables in paca instead of having them as per-cpu variables to avoid complications. Signed-off-by: Ganesh Goudar --- v2: Dynamically allocate memory for machine check event info. v3: Remove check for hash mmu lpar, use memblock_alloc_try_nid to allocate memory. v4: Spliting the patch into two. v5: Fix build error for PPC32. --- arch/powerpc/include/asm/mce.h | 18 +++ arch/powerpc/include/asm/paca.h| 4 ++ arch/powerpc/kernel/mce.c | 79 ++ arch/powerpc/kernel/setup-common.c | 2 + 4 files changed, 71 insertions(+), 32 deletions(-) diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h index 7d8b6679ec68..331d944280b8 100644 --- a/arch/powerpc/include/asm/mce.h +++ b/arch/powerpc/include/asm/mce.h @@ -206,6 +206,17 @@ struct mce_error_info { #define MAX_MC_EVT 10 +struct mce_info { + int mce_nest_count; + struct machine_check_event mce_event[MAX_MC_EVT]; + /* Queue for delayed MCE events. */ + int mce_queue_count; + struct machine_check_event mce_event_queue[MAX_MC_EVT]; + /* Queue for delayed MCE UE events. 
*/ + int mce_ue_count; + struct machine_check_event mce_ue_event_queue[MAX_MC_EVT]; +}; + /* Release flags for get_mce_event() */ #define MCE_EVENT_RELEASE true #define MCE_EVENT_DONTRELEASE false @@ -234,4 +245,11 @@ long __machine_check_early_realmode_p8(struct pt_regs *regs); long __machine_check_early_realmode_p9(struct pt_regs *regs); long __machine_check_early_realmode_p10(struct pt_regs *regs); #endif /* CONFIG_PPC_BOOK3S_64 */ + +#ifdef CONFIG_PPC_BOOK3S_64 +void mce_init(void); +#else +static inline void mce_init(void) { }; +#endif /* CONFIG_PPC_BOOK3S_64 */ + #endif /* __ASM_PPC64_MCE_H__ */ diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h index 9454d29ff4b4..38e0c55e845d 100644 --- a/arch/powerpc/include/asm/paca.h +++ b/arch/powerpc/include/asm/paca.h @@ -29,6 +29,7 @@ #include #include #include +#include #include @@ -273,6 +274,9 @@ struct paca_struct { #ifdef CONFIG_MMIOWB struct mmiowb_state mmiowb_state; #endif +#ifdef CONFIG_PPC_BOOK3S_64 + struct mce_info *mce_info; +#endif /* CONFIG_PPC_BOOK3S_64 */ } cacheline_aligned; extern void copy_mm_to_paca(struct mm_struct *mm); diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c index 9f3e133b57b7..6ec5c68997ed 100644 --- a/arch/powerpc/kernel/mce.c +++ b/arch/powerpc/kernel/mce.c @@ -17,22 +17,13 @@ #include #include #include +#include #include #include #include -static DEFINE_PER_CPU(int, mce_nest_count); -static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], mce_event); - -/* Queue for delayed MCE events. */ -static DEFINE_PER_CPU(int, mce_queue_count); -static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], mce_event_queue); - -/* Queue for delayed MCE UE events. 
*/ -static DEFINE_PER_CPU(int, mce_ue_count); -static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], - mce_ue_event_queue); +#include "setup.h" static void machine_check_process_queued_event(struct irq_work *work); static void machine_check_ue_irq_work(struct irq_work *work); @@ -103,9 +94,10 @@ void save_mce_event(struct pt_regs *regs, long handled, struct mce_error_info *mce_err, uint64_t nip, uint64_t addr, uint64_t phys_addr) { - int index = __this_cpu_inc_return(mce_nest_count) - 1; - struct machine_check_event *mce = this_cpu_ptr(_event[index]); + int index = local_paca->mce_info->mce_nest_count++; + struct machine_check_event *mce; + mce = _paca->mce_info->mce_event[index]; /* * Return if we don't have enough space to log mce event. * mce_nest_count may go beyond MAX_MC_EVT but that's ok, @@ -191,7 +183,7 @@ void save_mce_event(struct pt_regs *regs, long handled, */ int get_mce_event(struct machine_check_event *mce, bool release) { - int index = __this_cpu_read(mce_nest_count) - 1; + int index = local_paca->mce_info->mce_nest_count - 1; struct machine_check_event *mc_evt; int ret = 0; @@ -201,7 +193,7 @@ int get_mce_event(struct machine_check_event *mce, bool release) /* Check if we have MCE info to process. */ if (index < MAX_MC_EVT) { - mc_evt = this_cpu_ptr(_event[index]); + mc_evt = _paca->mce_info->mce_event[index]; /* Copy the event structure and release the original */
[PATCH v5 1/2] powerpc/mce: Reduce the size of event arrays
The maximum recursive depth of MCE is 4. Considering the maximum depth allowed, reduce the size of the event arrays to 10 from 100. This saves us ~19kB of memory and has no fatal consequences. Signed-off-by: Ganesh Goudar --- v4: This patch is a fragment of the original patch which is split into two. v5: No changes. --- arch/powerpc/include/asm/mce.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h index e6c27ae843dc..7d8b6679ec68 100644 --- a/arch/powerpc/include/asm/mce.h +++ b/arch/powerpc/include/asm/mce.h @@ -204,7 +204,7 @@ struct mce_error_info { boolignore_event; }; -#define MAX_MC_EVT 100 +#define MAX_MC_EVT 10 /* Release flags for get_mce_event() */ #define MCE_EVENT_RELEASE true -- 2.26.2
Re: [PATCH v4 2/2] powerpc/mce: Remove per cpu variables from MCE handlers
On 1/25/21 2:54 PM, Christophe Leroy wrote: Le 22/01/2021 à 13:32, Ganesh Goudar a écrit : Access to per-cpu variables requires translation to be enabled on pseries machine running in hash mmu mode, Since part of MCE handler runs in realmode and part of MCE handling code is shared between ppc architectures pseries and powernv, it becomes difficult to manage these variables differently on different architectures, So have these variables in paca instead of having them as per-cpu variables to avoid complications. Signed-off-by: Ganesh Goudar --- v2: Dynamically allocate memory for machine check event info v3: Remove check for hash mmu lpar, use memblock_alloc_try_nid to allocate memory. v4: Spliting the patch into two. --- arch/powerpc/include/asm/mce.h | 18 +++ arch/powerpc/include/asm/paca.h | 4 ++ arch/powerpc/kernel/mce.c | 79 ++ arch/powerpc/kernel/setup-common.c | 2 +- 4 files changed, 70 insertions(+), 33 deletions(-) diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index 71f38e9248be..17dc451f0e45 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -916,7 +916,6 @@ void __init setup_arch(char **cmdline_p) /* On BookE, setup per-core TLB data structures. */ setup_tlb_core_data(); #endif - This line removal is really required for this patch ? I will correct it, Thanks for catching. /* Print various info about the machine that has been gathered so far. */ print_system_info(); @@ -938,6 +937,7 @@ void __init setup_arch(char **cmdline_p) exc_lvl_early_init(); emergency_stack_init(); + mce_init(); You have to include mce.h to avoid build failure on PPC32. Sure, thanks smp_release_cpus(); initmem_init();
[PATCH v4 2/2] powerpc/mce: Remove per cpu variables from MCE handlers
Access to per-cpu variables requires translation to be enabled on pseries machine running in hash mmu mode, Since part of MCE handler runs in realmode and part of MCE handling code is shared between ppc architectures pseries and powernv, it becomes difficult to manage these variables differently on different architectures, So have these variables in paca instead of having them as per-cpu variables to avoid complications. Signed-off-by: Ganesh Goudar --- v2: Dynamically allocate memory for machine check event info v3: Remove check for hash mmu lpar, use memblock_alloc_try_nid to allocate memory. v4: Spliting the patch into two. --- arch/powerpc/include/asm/mce.h | 18 +++ arch/powerpc/include/asm/paca.h| 4 ++ arch/powerpc/kernel/mce.c | 79 ++ arch/powerpc/kernel/setup-common.c | 2 +- 4 files changed, 70 insertions(+), 33 deletions(-) diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h index 7d8b6679ec68..331d944280b8 100644 --- a/arch/powerpc/include/asm/mce.h +++ b/arch/powerpc/include/asm/mce.h @@ -206,6 +206,17 @@ struct mce_error_info { #define MAX_MC_EVT 10 +struct mce_info { + int mce_nest_count; + struct machine_check_event mce_event[MAX_MC_EVT]; + /* Queue for delayed MCE events. */ + int mce_queue_count; + struct machine_check_event mce_event_queue[MAX_MC_EVT]; + /* Queue for delayed MCE UE events. 
*/ + int mce_ue_count; + struct machine_check_event mce_ue_event_queue[MAX_MC_EVT]; +}; + /* Release flags for get_mce_event() */ #define MCE_EVENT_RELEASE true #define MCE_EVENT_DONTRELEASE false @@ -234,4 +245,11 @@ long __machine_check_early_realmode_p8(struct pt_regs *regs); long __machine_check_early_realmode_p9(struct pt_regs *regs); long __machine_check_early_realmode_p10(struct pt_regs *regs); #endif /* CONFIG_PPC_BOOK3S_64 */ + +#ifdef CONFIG_PPC_BOOK3S_64 +void mce_init(void); +#else +static inline void mce_init(void) { }; +#endif /* CONFIG_PPC_BOOK3S_64 */ + #endif /* __ASM_PPC64_MCE_H__ */ diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h index 9454d29ff4b4..38e0c55e845d 100644 --- a/arch/powerpc/include/asm/paca.h +++ b/arch/powerpc/include/asm/paca.h @@ -29,6 +29,7 @@ #include #include #include +#include #include @@ -273,6 +274,9 @@ struct paca_struct { #ifdef CONFIG_MMIOWB struct mmiowb_state mmiowb_state; #endif +#ifdef CONFIG_PPC_BOOK3S_64 + struct mce_info *mce_info; +#endif /* CONFIG_PPC_BOOK3S_64 */ } cacheline_aligned; extern void copy_mm_to_paca(struct mm_struct *mm); diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c index 9f3e133b57b7..6ec5c68997ed 100644 --- a/arch/powerpc/kernel/mce.c +++ b/arch/powerpc/kernel/mce.c @@ -17,22 +17,13 @@ #include #include #include +#include #include #include #include -static DEFINE_PER_CPU(int, mce_nest_count); -static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], mce_event); - -/* Queue for delayed MCE events. */ -static DEFINE_PER_CPU(int, mce_queue_count); -static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], mce_event_queue); - -/* Queue for delayed MCE UE events. 
*/ -static DEFINE_PER_CPU(int, mce_ue_count); -static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], - mce_ue_event_queue); +#include "setup.h" static void machine_check_process_queued_event(struct irq_work *work); static void machine_check_ue_irq_work(struct irq_work *work); @@ -103,9 +94,10 @@ void save_mce_event(struct pt_regs *regs, long handled, struct mce_error_info *mce_err, uint64_t nip, uint64_t addr, uint64_t phys_addr) { - int index = __this_cpu_inc_return(mce_nest_count) - 1; - struct machine_check_event *mce = this_cpu_ptr(_event[index]); + int index = local_paca->mce_info->mce_nest_count++; + struct machine_check_event *mce; + mce = _paca->mce_info->mce_event[index]; /* * Return if we don't have enough space to log mce event. * mce_nest_count may go beyond MAX_MC_EVT but that's ok, @@ -191,7 +183,7 @@ void save_mce_event(struct pt_regs *regs, long handled, */ int get_mce_event(struct machine_check_event *mce, bool release) { - int index = __this_cpu_read(mce_nest_count) - 1; + int index = local_paca->mce_info->mce_nest_count - 1; struct machine_check_event *mc_evt; int ret = 0; @@ -201,7 +193,7 @@ int get_mce_event(struct machine_check_event *mce, bool release) /* Check if we have MCE info to process. */ if (index < MAX_MC_EVT) { - mc_evt = this_cpu_ptr(_event[index]); + mc_evt = _paca->mce_info->mce_event[index]; /* Copy the event structure and release the original */ if (mce)
[PATCH v4 1/2] powerpc/mce: Reduce the size of event arrays
Maximum recursive depth of MCE is 4. Considering the maximum depth allowed, reduce the size of event to 10 from 100. This saves us ~19kB of memory and has no fatal consequences. Signed-off-by: Ganesh Goudar --- v4: This patch is a fragment of the original patch which is split into two. --- arch/powerpc/include/asm/mce.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h index e6c27ae843dc..7d8b6679ec68 100644 --- a/arch/powerpc/include/asm/mce.h +++ b/arch/powerpc/include/asm/mce.h @@ -204,7 +204,7 @@ struct mce_error_info { bool ignore_event; }; -#define MAX_MC_EVT 100 +#define MAX_MC_EVT 10 /* Release flags for get_mce_event() */ #define MCE_EVENT_RELEASE true -- 2.26.2
Re: [PATCH v3] powerpc/mce: Remove per cpu variables from MCE handlers
On 1/19/21 9:28 AM, Nicholas Piggin wrote: Excerpts from Ganesh Goudar's message of January 15, 2021 10:58 pm: Access to per-cpu variables requires translation to be enabled on pseries machine running in hash mmu mode, Since part of MCE handler runs in realmode and part of MCE handling code is shared between ppc architectures pseries and powernv, it becomes difficult to manage these variables differently on different architectures, So have these variables in paca instead of having them as per-cpu variables to avoid complications. Seems okay. Maximum recursive depth of MCE is 4, Considering the maximum depth allowed reduce the size of event to 10 from 100. Could you make this a separate patch, with memory saving numbers? "Delayed" MCEs are not necessarily the same as recursive (several sequential MCEs can occur before the first event is processed). But I agree 100 is pretty overboard (as is 4 recursive MCEs really). Sure. Signed-off-by: Ganesh Goudar --- v2: Dynamically allocate memory for machine check event info v3: Remove check for hash mmu lpar, use memblock_alloc_try_nid to allocate memory. --- arch/powerpc/include/asm/mce.h | 21 - arch/powerpc/include/asm/paca.h| 4 ++ arch/powerpc/kernel/mce.c | 76 +- arch/powerpc/kernel/setup-common.c | 2 +- 4 files changed, 69 insertions(+), 34 deletions(-) diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h index e6c27ae843dc..8d6e3a7a9f37 100644 --- a/arch/powerpc/include/asm/mce.h +++ b/arch/powerpc/include/asm/mce.h @@ -204,7 +204,18 @@ struct mce_error_info { boolignore_event; }; -#define MAX_MC_EVT 100 +#define MAX_MC_EVT 10 + +struct mce_info { + int mce_nest_count; + struct machine_check_event mce_event[MAX_MC_EVT]; + /* Queue for delayed MCE events. */ + int mce_queue_count; + struct machine_check_event mce_event_queue[MAX_MC_EVT]; + /* Queue for delayed MCE UE events. 
*/ + int mce_ue_count; + struct machine_check_event mce_ue_event_queue[MAX_MC_EVT]; +}; /* Release flags for get_mce_event() */ #define MCE_EVENT_RELEASE true @@ -233,5 +244,13 @@ long __machine_check_early_realmode_p7(struct pt_regs *regs); long __machine_check_early_realmode_p8(struct pt_regs *regs); long __machine_check_early_realmode_p9(struct pt_regs *regs); long __machine_check_early_realmode_p10(struct pt_regs *regs); +#define get_mce_info() local_paca->mce_info I don't think this adds anything. Could you open code it? ok Thanks, Nick
[PATCH v3] powerpc/mce: Remove per cpu variables from MCE handlers
Access to per-cpu variables requires translation to be enabled on pseries machine running in hash mmu mode, Since part of MCE handler runs in realmode and part of MCE handling code is shared between ppc architectures pseries and powernv, it becomes difficult to manage these variables differently on different architectures, So have these variables in paca instead of having them as per-cpu variables to avoid complications. Maximum recursive depth of MCE is 4, Considering the maximum depth allowed reduce the size of event to 10 from 100. Signed-off-by: Ganesh Goudar --- v2: Dynamically allocate memory for machine check event info v3: Remove check for hash mmu lpar, use memblock_alloc_try_nid to allocate memory. --- arch/powerpc/include/asm/mce.h | 21 - arch/powerpc/include/asm/paca.h| 4 ++ arch/powerpc/kernel/mce.c | 76 +- arch/powerpc/kernel/setup-common.c | 2 +- 4 files changed, 69 insertions(+), 34 deletions(-) diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h index e6c27ae843dc..8d6e3a7a9f37 100644 --- a/arch/powerpc/include/asm/mce.h +++ b/arch/powerpc/include/asm/mce.h @@ -204,7 +204,18 @@ struct mce_error_info { boolignore_event; }; -#define MAX_MC_EVT 100 +#define MAX_MC_EVT 10 + +struct mce_info { + int mce_nest_count; + struct machine_check_event mce_event[MAX_MC_EVT]; + /* Queue for delayed MCE events. */ + int mce_queue_count; + struct machine_check_event mce_event_queue[MAX_MC_EVT]; + /* Queue for delayed MCE UE events. 
*/ + int mce_ue_count; + struct machine_check_event mce_ue_event_queue[MAX_MC_EVT]; +}; /* Release flags for get_mce_event() */ #define MCE_EVENT_RELEASE true @@ -233,5 +244,13 @@ long __machine_check_early_realmode_p7(struct pt_regs *regs); long __machine_check_early_realmode_p8(struct pt_regs *regs); long __machine_check_early_realmode_p9(struct pt_regs *regs); long __machine_check_early_realmode_p10(struct pt_regs *regs); +#define get_mce_info() local_paca->mce_info +#endif /* CONFIG_PPC_BOOK3S_64 */ + +#ifdef CONFIG_PPC_BOOK3S_64 +void mce_init(void); +#else +static inline void mce_init(void) { }; #endif /* CONFIG_PPC_BOOK3S_64 */ + #endif /* __ASM_PPC64_MCE_H__ */ diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h index 9454d29ff4b4..38e0c55e845d 100644 --- a/arch/powerpc/include/asm/paca.h +++ b/arch/powerpc/include/asm/paca.h @@ -29,6 +29,7 @@ #include #include #include +#include #include @@ -273,6 +274,9 @@ struct paca_struct { #ifdef CONFIG_MMIOWB struct mmiowb_state mmiowb_state; #endif +#ifdef CONFIG_PPC_BOOK3S_64 + struct mce_info *mce_info; +#endif /* CONFIG_PPC_BOOK3S_64 */ } cacheline_aligned; extern void copy_mm_to_paca(struct mm_struct *mm); diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c index 9f3e133b57b7..feeb3231b541 100644 --- a/arch/powerpc/kernel/mce.c +++ b/arch/powerpc/kernel/mce.c @@ -17,22 +17,13 @@ #include #include #include +#include #include #include #include -static DEFINE_PER_CPU(int, mce_nest_count); -static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], mce_event); - -/* Queue for delayed MCE events. */ -static DEFINE_PER_CPU(int, mce_queue_count); -static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], mce_event_queue); - -/* Queue for delayed MCE UE events. 
*/ -static DEFINE_PER_CPU(int, mce_ue_count); -static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], - mce_ue_event_queue); +#include "setup.h" static void machine_check_process_queued_event(struct irq_work *work); static void machine_check_ue_irq_work(struct irq_work *work); @@ -103,8 +94,8 @@ void save_mce_event(struct pt_regs *regs, long handled, struct mce_error_info *mce_err, uint64_t nip, uint64_t addr, uint64_t phys_addr) { - int index = __this_cpu_inc_return(mce_nest_count) - 1; - struct machine_check_event *mce = this_cpu_ptr(_event[index]); + int index = get_mce_info()->mce_nest_count++; + struct machine_check_event *mce = _mce_info()->mce_event[index]; /* * Return if we don't have enough space to log mce event. @@ -191,7 +182,7 @@ void save_mce_event(struct pt_regs *regs, long handled, */ int get_mce_event(struct machine_check_event *mce, bool release) { - int index = __this_cpu_read(mce_nest_count) - 1; + int index = get_mce_info()->mce_nest_count - 1; struct machine_check_event *mc_evt; int ret = 0; @@ -201,7 +192,7 @@ int get_mce_event(struct machine_check_event *mce, bool release) /* Check if we have MCE info to process. */ if (index < MAX_MC_EVT) { - mc_evt = this_cpu_ptr(_event[index]); + mc_evt = _mce_info()->mce_event[ind
[PATCH v2] powerpc/mce: Remove per cpu variables from MCE handlers
Access to per-cpu variables requires translation to be enabled on pseries machine running in hash mmu mode, Since part of MCE handler runs in realmode and part of MCE handling code is shared between ppc architectures pseries and powernv, it becomes difficult to manage these variables differently on different architectures, So have these variables in paca instead of having them as per-cpu variables to avoid complications. Maximum recursive depth of MCE is 4, Considering the maximum depth allowed reduce the size of event to 10 from 100. Signed-off-by: Ganesh Goudar --- v2: Dynamically allocate memory for machine check event info --- arch/powerpc/include/asm/mce.h | 21 +++- arch/powerpc/include/asm/paca.h| 4 ++ arch/powerpc/kernel/mce.c | 86 ++ arch/powerpc/kernel/setup-common.c | 2 +- 4 files changed, 78 insertions(+), 35 deletions(-) diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h index e6c27ae843dc..8d6e3a7a9f37 100644 --- a/arch/powerpc/include/asm/mce.h +++ b/arch/powerpc/include/asm/mce.h @@ -204,7 +204,18 @@ struct mce_error_info { boolignore_event; }; -#define MAX_MC_EVT 100 +#define MAX_MC_EVT 10 + +struct mce_info { + int mce_nest_count; + struct machine_check_event mce_event[MAX_MC_EVT]; + /* Queue for delayed MCE events. */ + int mce_queue_count; + struct machine_check_event mce_event_queue[MAX_MC_EVT]; + /* Queue for delayed MCE UE events. 
*/ + int mce_ue_count; + struct machine_check_event mce_ue_event_queue[MAX_MC_EVT]; +}; /* Release flags for get_mce_event() */ #define MCE_EVENT_RELEASE true @@ -233,5 +244,13 @@ long __machine_check_early_realmode_p7(struct pt_regs *regs); long __machine_check_early_realmode_p8(struct pt_regs *regs); long __machine_check_early_realmode_p9(struct pt_regs *regs); long __machine_check_early_realmode_p10(struct pt_regs *regs); +#define get_mce_info() local_paca->mce_info +#endif /* CONFIG_PPC_BOOK3S_64 */ + +#ifdef CONFIG_PPC_BOOK3S_64 +void mce_init(void); +#else +static inline void mce_init(void) { }; #endif /* CONFIG_PPC_BOOK3S_64 */ + #endif /* __ASM_PPC64_MCE_H__ */ diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h index 9454d29ff4b4..38e0c55e845d 100644 --- a/arch/powerpc/include/asm/paca.h +++ b/arch/powerpc/include/asm/paca.h @@ -29,6 +29,7 @@ #include #include #include +#include #include @@ -273,6 +274,9 @@ struct paca_struct { #ifdef CONFIG_MMIOWB struct mmiowb_state mmiowb_state; #endif +#ifdef CONFIG_PPC_BOOK3S_64 + struct mce_info *mce_info; +#endif /* CONFIG_PPC_BOOK3S_64 */ } cacheline_aligned; extern void copy_mm_to_paca(struct mm_struct *mm); diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c index 9f3e133b57b7..14142ddbedf2 100644 --- a/arch/powerpc/kernel/mce.c +++ b/arch/powerpc/kernel/mce.c @@ -17,23 +17,12 @@ #include #include #include +#include #include #include #include -static DEFINE_PER_CPU(int, mce_nest_count); -static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], mce_event); - -/* Queue for delayed MCE events. */ -static DEFINE_PER_CPU(int, mce_queue_count); -static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], mce_event_queue); - -/* Queue for delayed MCE UE events. 
*/ -static DEFINE_PER_CPU(int, mce_ue_count); -static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], - mce_ue_event_queue); - static void machine_check_process_queued_event(struct irq_work *work); static void machine_check_ue_irq_work(struct irq_work *work); static void machine_check_ue_event(struct machine_check_event *evt); @@ -103,8 +92,8 @@ void save_mce_event(struct pt_regs *regs, long handled, struct mce_error_info *mce_err, uint64_t nip, uint64_t addr, uint64_t phys_addr) { - int index = __this_cpu_inc_return(mce_nest_count) - 1; - struct machine_check_event *mce = this_cpu_ptr(_event[index]); + int index = get_mce_info()->mce_nest_count++; + struct machine_check_event *mce = _mce_info()->mce_event[index]; /* * Return if we don't have enough space to log mce event. @@ -191,7 +180,7 @@ void save_mce_event(struct pt_regs *regs, long handled, */ int get_mce_event(struct machine_check_event *mce, bool release) { - int index = __this_cpu_read(mce_nest_count) - 1; + int index = get_mce_info()->mce_nest_count - 1; struct machine_check_event *mc_evt; int ret = 0; @@ -201,7 +190,7 @@ int get_mce_event(struct machine_check_event *mce, bool release) /* Check if we have MCE info to process. */ if (index < MAX_MC_EVT) { - mc_evt = this_cpu_ptr(_event[index]); + mc_evt = _mce_info()->mce_event[index]; /* Copy the event str
Re: [PATCH] powerpc/mce: Remove per cpu variables from MCE handlers
On 12/8/20 4:01 PM, Michael Ellerman wrote: Ganesh Goudar writes: diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h index 9454d29ff4b4..4769954efa7d 100644 --- a/arch/powerpc/include/asm/paca.h +++ b/arch/powerpc/include/asm/paca.h @@ -273,6 +274,17 @@ struct paca_struct { #ifdef CONFIG_MMIOWB struct mmiowb_state mmiowb_state; #endif +#ifdef CONFIG_PPC_BOOK3S_64 + int mce_nest_count; + struct machine_check_event mce_event[MAX_MC_EVT]; + /* Queue for delayed MCE events. */ + int mce_queue_count; + struct machine_check_event mce_event_queue[MAX_MC_EVT]; + + /* Queue for delayed MCE UE events. */ + int mce_ue_count; + struct machine_check_event mce_ue_event_queue[MAX_MC_EVT]; +#endif /* CONFIG_PPC_BOOK3S_64 */ } cacheline_aligned; How much does this expand the paca by? Size of paca is 4480 bytes, these add up another 2160 bytes, so expands it by 48%.
[PATCH] powerpc/mce: Remove per cpu variables from MCE handlers
Access to per-cpu variables requires translation to be enabled on pseries machine running in hash mmu mode, Since part of MCE handler runs in realmode and part of MCE handling code is shared between ppc architectures pseries and powernv, it becomes difficult to manage these variables differently on different architectures, So have these variables in paca instead of having them as per-cpu variables to avoid complications. Maximum recursive depth of MCE is 4, Considering the maximum depth allowed reduce the size of event to 10 from 100. Signed-off-by: Ganesh Goudar --- arch/powerpc/include/asm/mce.h | 2 +- arch/powerpc/include/asm/paca.h | 12 arch/powerpc/kernel/mce.c | 54 + 3 files changed, 34 insertions(+), 34 deletions(-) diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h index 89aa8248a57d..feef45f2b51b 100644 --- a/arch/powerpc/include/asm/mce.h +++ b/arch/powerpc/include/asm/mce.h @@ -204,7 +204,7 @@ struct mce_error_info { boolignore_event; }; -#define MAX_MC_EVT 100 +#define MAX_MC_EVT 10 /* Release flags for get_mce_event() */ #define MCE_EVENT_RELEASE true diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h index 9454d29ff4b4..4769954efa7d 100644 --- a/arch/powerpc/include/asm/paca.h +++ b/arch/powerpc/include/asm/paca.h @@ -29,6 +29,7 @@ #include #include #include +#include #include @@ -273,6 +274,17 @@ struct paca_struct { #ifdef CONFIG_MMIOWB struct mmiowb_state mmiowb_state; #endif +#ifdef CONFIG_PPC_BOOK3S_64 + int mce_nest_count; + struct machine_check_event mce_event[MAX_MC_EVT]; + /* Queue for delayed MCE events. */ + int mce_queue_count; + struct machine_check_event mce_event_queue[MAX_MC_EVT]; + + /* Queue for delayed MCE UE events. 
*/ + int mce_ue_count; + struct machine_check_event mce_ue_event_queue[MAX_MC_EVT]; +#endif /* CONFIG_PPC_BOOK3S_64 */ } cacheline_aligned; extern void copy_mm_to_paca(struct mm_struct *mm); diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c index 63702c0badb9..5f53d02d6cbb 100644 --- a/arch/powerpc/kernel/mce.c +++ b/arch/powerpc/kernel/mce.c @@ -22,18 +22,6 @@ #include #include -static DEFINE_PER_CPU(int, mce_nest_count); -static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], mce_event); - -/* Queue for delayed MCE events. */ -static DEFINE_PER_CPU(int, mce_queue_count); -static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], mce_event_queue); - -/* Queue for delayed MCE UE events. */ -static DEFINE_PER_CPU(int, mce_ue_count); -static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], - mce_ue_event_queue); - static void machine_check_process_queued_event(struct irq_work *work); static void machine_check_ue_irq_work(struct irq_work *work); static void machine_check_ue_event(struct machine_check_event *evt); @@ -103,8 +91,8 @@ void save_mce_event(struct pt_regs *regs, long handled, struct mce_error_info *mce_err, uint64_t nip, uint64_t addr, uint64_t phys_addr) { - int index = __this_cpu_inc_return(mce_nest_count) - 1; - struct machine_check_event *mce = this_cpu_ptr(_event[index]); + int index = get_paca()->mce_nest_count++; + struct machine_check_event *mce = _paca()->mce_event[index]; /* * Return if we don't have enough space to log mce event. @@ -191,7 +179,7 @@ void save_mce_event(struct pt_regs *regs, long handled, */ int get_mce_event(struct machine_check_event *mce, bool release) { - int index = __this_cpu_read(mce_nest_count) - 1; + int index = get_paca()->mce_nest_count - 1; struct machine_check_event *mc_evt; int ret = 0; @@ -201,7 +189,7 @@ int get_mce_event(struct machine_check_event *mce, bool release) /* Check if we have MCE info to process. 
*/ if (index < MAX_MC_EVT) { - mc_evt = this_cpu_ptr(_event[index]); + mc_evt = _paca()->mce_event[index]; /* Copy the event structure and release the original */ if (mce) *mce = *mc_evt; @@ -211,7 +199,7 @@ int get_mce_event(struct machine_check_event *mce, bool release) } /* Decrement the count to free the slot. */ if (release) - __this_cpu_dec(mce_nest_count); + get_paca()->mce_nest_count--; return ret; } @@ -233,13 +221,13 @@ static void machine_check_ue_event(struct machine_check_event *evt) { int index; - index = __this_cpu_inc_return(mce_ue_count) - 1; + index = get_paca()->mce_ue_count++; /* If queue is full, just return for now. */ if (index >= MAX_MC_EVT) { - __this_cpu_dec(mce_ue_count); +
[PATCH v5] lkdtm/powerpc: Add SLB multihit test
To check machine check handling, add support to inject slb multihit errors. Cc: Kees Cook Cc: Michal Suchánek Co-developed-by: Mahesh Salgaonkar Signed-off-by: Mahesh Salgaonkar Signed-off-by: Ganesh Goudar --- v5: - Insert entries at SLB_NUM_BOLTED and SLB_NUM_BOLTED +1, remove index allocating helper function. - Move mk_esid_data and mk_vsid_data helpers to asm/book3s/64/mmu-hash.h. - Use mmu_linear_psize and mmu_vmalloc_psize to get page size. - Use !radix_enabled() to check if we are in HASH mode. - And other minor improvements. v1-v4: - No major changes here for this patch, This patch was initially posted along with the other patch which got accepted. https://git.kernel.org/powerpc/c/8d0e2101274358d9b6b1f27232b40253ca48bab5 --- arch/powerpc/include/asm/book3s/64/mmu-hash.h | 28 +++- arch/powerpc/mm/book3s64/hash_utils.c | 1 + arch/powerpc/mm/book3s64/slb.c| 27 drivers/misc/lkdtm/Makefile | 1 + drivers/misc/lkdtm/core.c | 3 + drivers/misc/lkdtm/lkdtm.h| 3 + drivers/misc/lkdtm/powerpc.c | 120 ++ tools/testing/selftests/lkdtm/tests.txt | 1 + 8 files changed, 156 insertions(+), 28 deletions(-) create mode 100644 drivers/misc/lkdtm/powerpc.c diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h b/arch/powerpc/include/asm/book3s/64/mmu-hash.h index 683a9c7d1b03..8b9f07900395 100644 --- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h +++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h @@ -842,6 +842,32 @@ static inline unsigned long get_kernel_vsid(unsigned long ea, int ssize) unsigned htab_shift_for_mem_size(unsigned long mem_size); -#endif /* __ASSEMBLY__ */ +enum slb_index { + LINEAR_INDEX= 0, /* Kernel linear map (0xc000) */ + KSTACK_INDEX= 1, /* Kernel stack map */ +}; +#define slb_esid_mask(ssize) \ + (((ssize) == MMU_SEGSIZE_256M) ? 
ESID_MASK : ESID_MASK_1T) + +static inline unsigned long mk_esid_data(unsigned long ea, int ssize, +enum slb_index index) +{ + return (ea & slb_esid_mask(ssize)) | SLB_ESID_V | index; +} + +static inline unsigned long __mk_vsid_data(unsigned long vsid, int ssize, + unsigned long flags) +{ + return (vsid << slb_vsid_shift(ssize)) | flags | + ((unsigned long)ssize << SLB_VSID_SSIZE_SHIFT); +} + +static inline unsigned long mk_vsid_data(unsigned long ea, int ssize, +unsigned long flags) +{ + return __mk_vsid_data(get_kernel_vsid(ea, ssize), ssize, flags); +} + +#endif /* __ASSEMBLY__ */ #endif /* _ASM_POWERPC_BOOK3S_64_MMU_HASH_H_ */ diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c index 24702c0a92e0..38076a998850 100644 --- a/arch/powerpc/mm/book3s64/hash_utils.c +++ b/arch/powerpc/mm/book3s64/hash_utils.c @@ -112,6 +112,7 @@ int mmu_linear_psize = MMU_PAGE_4K; EXPORT_SYMBOL_GPL(mmu_linear_psize); int mmu_virtual_psize = MMU_PAGE_4K; int mmu_vmalloc_psize = MMU_PAGE_4K; +EXPORT_SYMBOL_GPL(mmu_vmalloc_psize); #ifdef CONFIG_SPARSEMEM_VMEMMAP int mmu_vmemmap_psize = MMU_PAGE_4K; #endif diff --git a/arch/powerpc/mm/book3s64/slb.c b/arch/powerpc/mm/book3s64/slb.c index c30fcbfa0e32..985706acb0e5 100644 --- a/arch/powerpc/mm/book3s64/slb.c +++ b/arch/powerpc/mm/book3s64/slb.c @@ -28,35 +28,8 @@ #include "internal.h" -enum slb_index { - LINEAR_INDEX= 0, /* Kernel linear map (0xc000) */ - KSTACK_INDEX= 1, /* Kernel stack map */ -}; - static long slb_allocate_user(struct mm_struct *mm, unsigned long ea); -#define slb_esid_mask(ssize) \ - (((ssize) == MMU_SEGSIZE_256M)? 
ESID_MASK: ESID_MASK_1T) - -static inline unsigned long mk_esid_data(unsigned long ea, int ssize, -enum slb_index index) -{ - return (ea & slb_esid_mask(ssize)) | SLB_ESID_V | index; -} - -static inline unsigned long __mk_vsid_data(unsigned long vsid, int ssize, -unsigned long flags) -{ - return (vsid << slb_vsid_shift(ssize)) | flags | - ((unsigned long) ssize << SLB_VSID_SSIZE_SHIFT); -} - -static inline unsigned long mk_vsid_data(unsigned long ea, int ssize, -unsigned long flags) -{ - return __mk_vsid_data(get_kernel_vsid(ea, ssize), ssize, flags); -} - bool stress_slb_enabled __initdata; static int __init parse_stress_slb(char *p) diff --git a/drivers/misc/lkdtm/Makefile b/drivers/misc/lkdtm/Makefile index c70b3822013f..f37ecfb0a707 100644 --- a/drivers/misc/lkdtm/Makefile +++ b/drivers/misc/lkdtm/Makefile @@ -10,6 +10,7 @@ lkdtm-$(CONFIG_LKDTM) += rodata_objcopy.o lkdtm-$(CONFIG_LKDTM) += usercopy.o lkdtm-$(CONFIG_LKDTM)
Re: [PATCH v4 2/2] lkdtm/powerpc: Add SLB multihit test
On 10/19/20 6:45 PM, Michal Suchánek wrote: On Mon, Oct 19, 2020 at 09:59:57PM +1100, Michael Ellerman wrote: Hi Ganesh, Some comments below ... Ganesh Goudar writes: To check machine check handling, add support to inject slb multihit errors. Cc: Kees Cook Reviewed-by: Michal Suchánek Co-developed-by: Mahesh Salgaonkar Signed-off-by: Mahesh Salgaonkar Signed-off-by: Ganesh Goudar --- drivers/misc/lkdtm/Makefile | 1 + drivers/misc/lkdtm/core.c | 3 + drivers/misc/lkdtm/lkdtm.h | 3 + drivers/misc/lkdtm/powerpc.c| 156 tools/testing/selftests/lkdtm/tests.txt | 1 + 5 files changed, 164 insertions(+) create mode 100644 drivers/misc/lkdtm/powerpc.c .. diff --git a/drivers/misc/lkdtm/powerpc.c b/drivers/misc/lkdtm/powerpc.c new file mode 100644 index ..f388b53dccba --- /dev/null +++ b/drivers/misc/lkdtm/powerpc.c @@ -0,0 +1,156 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "lkdtm.h" +#include +#include Usual style is to include the linux headers first and then the local header. ok + +/* Gets index for new slb entry */ +static inline unsigned long get_slb_index(void) +{ + unsigned long index; + + index = get_paca()->stab_rr; + + /* +* simple round-robin replacement of slb starting at SLB_NUM_BOLTED. +*/ + if (index < (mmu_slb_size - 1)) + index++; + else + index = SLB_NUM_BOLTED; + get_paca()->stab_rr = index; + return index; +} I'm not sure we need that really? We can just always insert at SLB_MUM_BOLTED and SLB_NUM_BOLTED + 1. Or we could allocate from the top down using mmu_slb_size - 1, and mmu_slb_size - 2. Ok, We can do that. +#define slb_esid_mask(ssize) \ + (((ssize) == MMU_SEGSIZE_256M) ? ESID_MASK : ESID_MASK_1T) + +/* Form the operand for slbmte */ +static inline unsigned long mk_esid_data(unsigned long ea, int ssize, +unsigned long slot) +{ + return (ea & slb_esid_mask(ssize)) | SLB_ESID_V | slot; +} + +#define slb_vsid_shift(ssize) \ + ((ssize) == MMU_SEGSIZE_256M ? 
SLB_VSID_SHIFT : SLB_VSID_SHIFT_1T) + +/* Form the operand for slbmte */ +static inline unsigned long mk_vsid_data(unsigned long ea, int ssize, +unsigned long flags) +{ + return (get_kernel_vsid(ea, ssize) << slb_vsid_shift(ssize)) | flags | + ((unsigned long)ssize << SLB_VSID_SSIZE_SHIFT); +} I realise it's not much code, but I'd rather those were in a header, rather than copied from slb.c. That way they can never skew vs the versions in slb.c Best place I think would be arch/powerpc/include/asm/book3s/64/mmu-hash.h Ok, ill move them. + +/* Inserts new slb entry */ It inserts two. Right. +static void insert_slb_entry(char *p, int ssize) +{ + unsigned long flags, entry; + + flags = SLB_VSID_KERNEL | mmu_psize_defs[MMU_PAGE_64K].sllp; That won't work if the kernel is built for 4K pages. Or at least it won't work the way we want it to. You should use mmu_linear_psize. But for vmalloc you should use mmu_vmalloc_psize, so it will need to be a parameter. Sure, Thanks + preempt_disable(); + + entry = get_slb_index(); + asm volatile("slbmte %0,%1" : + : "r" (mk_vsid_data((unsigned long)p, ssize, flags)), + "r" (mk_esid_data((unsigned long)p, ssize, entry)) + : "memory"); + + entry = get_slb_index(); + asm volatile("slbmte %0,%1" : + : "r" (mk_vsid_data((unsigned long)p, ssize, flags)), + "r" (mk_esid_data((unsigned long)p, ssize, entry)) + : "memory"); + preempt_enable(); + /* +* This triggers exception, If handled correctly we must recover +* from this error. +*/ + p[0] = '!'; That doesn't belong in here, it should be done by the caller. That would also mean p could be unsigned long in here, so you wouldn't have to cast it four times. Sure, ill change it. +} + +/* Inject slb multihit on vmalloc-ed address i.e 0xD00... */ +static void inject_vmalloc_slb_multihit(void) +{ + char *p; + + p = vmalloc(2048); vmalloc() allocates whole pages, so it may as well be vmalloc(PAGE_SIZE). 
ok + if (!p) + return; That's unlikely, but it should be an error that's propagated up to the caller. ok + + insert_slb_entry(p, MMU_SEGSIZE_1T); + vfree(p); +} + +/* Inject slb multihit on kmalloc-ed address i.e 0xC00... */ +static void inject_kmalloc_slb_multihit(void) +{ + char *p; + + p = kmalloc(2048, GFP_KERNEL); + if (!p) + return; + + insert_slb_entry(p, MMU_SEGSIZE_1T); +
Re: [PATCH v4] powerpc/pseries: Avoid using addr_to_pfn in real mode
On 7/24/20 12:09 PM, Ganesh Goudar wrote: When an UE or memory error exception is encountered the MCE handler tries to find the pfn using addr_to_pfn() which takes effective address as an argument, later pfn is used to poison the page where memory error occurred, recent rework in this area made addr_to_pfn to run in real mode, which can be fatal as it may try to access memory outside RMO region. Have two helper functions to separate things to be done in real mode and virtual mode without changing any functionality. This also fixes the following error as the use of addr_to_pfn is now moved to virtual mode. Without this change following kernel crash is seen on hitting UE. [ 485.128036] Oops: Kernel access of bad area, sig: 11 [#1] [ 485.128040] LE SMP NR_CPUS=2048 NUMA pSeries [ 485.128047] Modules linked in: [ 485.128067] CPU: 15 PID: 6536 Comm: insmod Kdump: loaded Tainted: G OE 5.7.0 #22 [ 485.128074] NIP: c009b24c LR: c00398d8 CTR: c0cd57c0 [ 485.128078] REGS: c3f1f970 TRAP: 0300 Tainted: G OE (5.7.0) [ 485.128082] MSR: 80001003 CR: 28008284 XER: 0001 [ 485.128088] CFAR: c009b190 DAR: c001fab0 DSISR: 4000 IRQMASK: 1 [ 485.128088] GPR00: 0001 c3f1fbf0 c1634300 b0fa0100 [ 485.128088] GPR04: d222 fab0 0022 [ 485.128088] GPR08: c001fab0 c001fab0 c3f1fc14 [ 485.128088] GPR12: 0008 c3ff5880 d218 [ 485.128088] GPR16: ff20 fff1 fff2 d21a1100 [ 485.128088] GPR20: d220 c0015c893c50 c0d49b28 c0015c893c50 [ 485.128088] GPR24: d21a0d08 c14e5da8 d21a0818 000a [ 485.128088] GPR28: 0008 000a c17e2970 000a [ 485.128125] NIP [c009b24c] __find_linux_pte+0x11c/0x310 [ 485.128130] LR [c00398d8] addr_to_pfn+0x138/0x170 [ 485.128133] Call Trace: [ 485.128135] Instruction dump: [ 485.128138] 3929 7d4a3378 7c883c36 7d2907b4 794a1564 7d294038 794af082 3900 [ 485.128144] 79291f24 790af00e 78e70020 7d095214 <7c69502a> 2fa3 419e011c 70690040 [ 485.128152] ---[ end trace d34b27e29ae0e340 ]--- Fixes: 9ca766f9891d ("powerpc/64s/pseries: machine check convert to use common event code") 
Signed-off-by: Ganesh Goudar --- V2: Leave bare metal code and save_mce_event as is. V3: Have separate functions for realmode and virtual mode handling. V4: Fix build warning, rephrase commit message. --- arch/powerpc/platforms/pseries/ras.c | 118 --- 1 file changed, 69 insertions(+), 49 deletions(-) diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c index f3736fcd98fc..c509e43bac23 100644 --- a/arch/powerpc/platforms/pseries/ras.c +++ b/arch/powerpc/platforms/pseries/ras.c @@ -522,18 +522,55 @@ int pSeries_system_reset_exception(struct pt_regs *regs) return 0; /* need to perform reset */ } +static int mce_handle_err_realmode(int disposition, u8 error_type) +{ +#ifdef CONFIG_PPC_BOOK3S_64 + if (disposition == RTAS_DISP_NOT_RECOVERED) { + switch (error_type) { + caseMC_ERROR_TYPE_SLB: + caseMC_ERROR_TYPE_ERAT: + /* +* Store the old slb content in paca before flushing. +* Print this when we go to virtual mode. +* There are chances that we may hit MCE again if there +* is a parity error on the SLB entry we trying to read +* for saving. Hence limit the slb saving to single +* level of recursion. +*/ + if (local_paca->in_mce == 1) + slb_save_contents(local_paca->mce_faulty_slbs); + flush_and_reload_slb(); + disposition = RTAS_DISP_FULLY_RECOVERED; + break; + default: + break; + } + } else if (disposition == RTAS_DISP_LIMITED_RECOVERY) { + /* Platform corrected itself but could be degraded */ + pr_err("MCE: limited recovery, system may be degraded\n"); + disposition = RTAS_DISP_FULLY_RECOVERED; + } +#endif + return disposition; +} -static int mce_handle_error(struct pt_regs *regs, struct rtas_error_log *errp) +static int mce_handle_err_virtmode(struct pt_regs *regs, + struct rtas_error_log *errp, + struct pseries_mc_errorlog *mce_log, + int disposition) { struct mce_error_info mce_err = { 0 }; - un
Re: [PATCH v4 0/2] powerpc/mce: Fix mce handler and add selftest
On 10/16/20 5:02 PM, Michael Ellerman wrote: On Fri, 9 Oct 2020 12:10:03 +0530, Ganesh Goudar wrote: This patch series fixes mce handling for pseries, adds an LKDTM test for SLB multihit recovery and enables a selftest for the same, basically to test MCE handling on pseries/powernv machines running in hash mmu mode. v4: * Use radix_enabled() to check if it's in Hash or Radix mode. * Use FW_FEATURE_LPAR instead of machine_is_pseries(). [...] Patch 1 applied to powerpc/fixes. [1/2] powerpc/mce: Avoid nmi_enter/exit in real mode on pseries hash https://git.kernel.org/powerpc/c/8d0e2101274358d9b6b1f27232b40253ca48bab5 cheers Thank you. Any comments on patch 2?
[PATCH v4 2/2] lkdtm/powerpc: Add SLB multihit test
To check machine check handling, add support to inject slb multihit errors. Cc: Kees Cook Reviewed-by: Michal Suchánek Co-developed-by: Mahesh Salgaonkar Signed-off-by: Mahesh Salgaonkar Signed-off-by: Ganesh Goudar --- drivers/misc/lkdtm/Makefile | 1 + drivers/misc/lkdtm/core.c | 3 + drivers/misc/lkdtm/lkdtm.h | 3 + drivers/misc/lkdtm/powerpc.c| 156 tools/testing/selftests/lkdtm/tests.txt | 1 + 5 files changed, 164 insertions(+) create mode 100644 drivers/misc/lkdtm/powerpc.c diff --git a/drivers/misc/lkdtm/Makefile b/drivers/misc/lkdtm/Makefile index c70b3822013f..f37ecfb0a707 100644 --- a/drivers/misc/lkdtm/Makefile +++ b/drivers/misc/lkdtm/Makefile @@ -10,6 +10,7 @@ lkdtm-$(CONFIG_LKDTM) += rodata_objcopy.o lkdtm-$(CONFIG_LKDTM) += usercopy.o lkdtm-$(CONFIG_LKDTM) += stackleak.o lkdtm-$(CONFIG_LKDTM) += cfi.o +lkdtm-$(CONFIG_PPC64) += powerpc.o KASAN_SANITIZE_stackleak.o := n KCOV_INSTRUMENT_rodata.o := n diff --git a/drivers/misc/lkdtm/core.c b/drivers/misc/lkdtm/core.c index a5e344df9166..8d5db42baa90 100644 --- a/drivers/misc/lkdtm/core.c +++ b/drivers/misc/lkdtm/core.c @@ -178,6 +178,9 @@ static const struct crashtype crashtypes[] = { #ifdef CONFIG_X86_32 CRASHTYPE(DOUBLE_FAULT), #endif +#ifdef CONFIG_PPC64 + CRASHTYPE(PPC_SLB_MULTIHIT), +#endif }; diff --git a/drivers/misc/lkdtm/lkdtm.h b/drivers/misc/lkdtm/lkdtm.h index 8878538b2c13..b305bd511ee5 100644 --- a/drivers/misc/lkdtm/lkdtm.h +++ b/drivers/misc/lkdtm/lkdtm.h @@ -104,4 +104,7 @@ void lkdtm_STACKLEAK_ERASING(void); /* cfi.c */ void lkdtm_CFI_FORWARD_PROTO(void); +/* powerpc.c */ +void lkdtm_PPC_SLB_MULTIHIT(void); + #endif diff --git a/drivers/misc/lkdtm/powerpc.c b/drivers/misc/lkdtm/powerpc.c new file mode 100644 index ..f388b53dccba --- /dev/null +++ b/drivers/misc/lkdtm/powerpc.c @@ -0,0 +1,156 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "lkdtm.h" +#include +#include + +/* Gets index for new slb entry */ +static inline unsigned long get_slb_index(void) +{ + unsigned long index; + + 
index = get_paca()->stab_rr; + + /* +* simple round-robin replacement of slb starting at SLB_NUM_BOLTED. +*/ + if (index < (mmu_slb_size - 1)) + index++; + else + index = SLB_NUM_BOLTED; + get_paca()->stab_rr = index; + return index; +} + +#define slb_esid_mask(ssize) \ + (((ssize) == MMU_SEGSIZE_256M) ? ESID_MASK : ESID_MASK_1T) + +/* Form the operand for slbmte */ +static inline unsigned long mk_esid_data(unsigned long ea, int ssize, +unsigned long slot) +{ + return (ea & slb_esid_mask(ssize)) | SLB_ESID_V | slot; +} + +#define slb_vsid_shift(ssize) \ + ((ssize) == MMU_SEGSIZE_256M ? SLB_VSID_SHIFT : SLB_VSID_SHIFT_1T) + +/* Form the operand for slbmte */ +static inline unsigned long mk_vsid_data(unsigned long ea, int ssize, +unsigned long flags) +{ + return (get_kernel_vsid(ea, ssize) << slb_vsid_shift(ssize)) | flags | + ((unsigned long)ssize << SLB_VSID_SSIZE_SHIFT); +} + +/* Inserts new slb entry */ +static void insert_slb_entry(char *p, int ssize) +{ + unsigned long flags, entry; + + flags = SLB_VSID_KERNEL | mmu_psize_defs[MMU_PAGE_64K].sllp; + preempt_disable(); + + entry = get_slb_index(); + asm volatile("slbmte %0,%1" : + : "r" (mk_vsid_data((unsigned long)p, ssize, flags)), + "r" (mk_esid_data((unsigned long)p, ssize, entry)) + : "memory"); + + entry = get_slb_index(); + asm volatile("slbmte %0,%1" : + : "r" (mk_vsid_data((unsigned long)p, ssize, flags)), + "r" (mk_esid_data((unsigned long)p, ssize, entry)) + : "memory"); + preempt_enable(); + /* +* This triggers exception, If handled correctly we must recover +* from this error. +*/ + p[0] = '!'; +} + +/* Inject slb multihit on vmalloc-ed address i.e 0xD00... */ +static void inject_vmalloc_slb_multihit(void) +{ + char *p; + + p = vmalloc(2048); + if (!p) + return; + + insert_slb_entry(p, MMU_SEGSIZE_1T); + vfree(p); +} + +/* Inject slb multihit on kmalloc-ed address i.e 0xC00... 
*/ +static void inject_kmalloc_slb_multihit(void) +{ + char *p; + + p = kmalloc(2048, GFP_KERNEL); + if (!p) + return; + + insert_slb_entry(p, MMU_SEGSIZE_1T); + kfree(p); +} + +/* + * Few initial SLB entries are bolted. Add a test to inject + * multihit in bolted entry 0. + */ +static void inse
[PATCH v4 1/2] powerpc/mce: remove nmi_enter/exit from real mode handler
Use of nmi_enter/exit in the real mode handler causes the kernel to panic and reboot on injecting an slb multihit on a pseries machine running in hash mmu mode, as these calls try to access memory outside the RMO region in the real mode handler where translation is disabled. Add a check to not use these calls on a pseries machine running in hash mmu mode. Fixes: 116ac378bb3f ("powerpc/64s: machine check interrupt update NMI accounting") Signed-off-by: Ganesh Goudar --- arch/powerpc/kernel/mce.c | 7 +++ 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c index ada59f6c4298..63702c0badb9 100644 --- a/arch/powerpc/kernel/mce.c +++ b/arch/powerpc/kernel/mce.c @@ -591,12 +591,11 @@ EXPORT_SYMBOL_GPL(machine_check_print_event_info); long notrace machine_check_early(struct pt_regs *regs) { long handled = 0; - bool nested = in_nmi(); u8 ftrace_enabled = this_cpu_get_ftrace_enabled(); this_cpu_set_ftrace_enabled(0); - - if (!nested) + /* Do not use nmi_enter/exit for pseries hpte guest */ + if (radix_enabled() || !firmware_has_feature(FW_FEATURE_LPAR)) nmi_enter(); hv_nmi_check_nonrecoverable(regs); @@ -607,7 +606,7 @@ long notrace machine_check_early(struct pt_regs *regs) if (ppc_md.machine_check_early) handled = ppc_md.machine_check_early(regs); - if (!nested) + if (radix_enabled() || !firmware_has_feature(FW_FEATURE_LPAR)) nmi_exit(); this_cpu_set_ftrace_enabled(ftrace_enabled); -- 2.26.2
[PATCH v4 0/2] powerpc/mce: Fix mce handler and add selftest
This patch series fixes mce handling for pseries, adds an LKDTM test for SLB multihit recovery and enables a selftest for the same, basically to test MCE handling on pseries/powernv machines running in hash mmu mode. v4: * Use radix_enabled() to check if it's in Hash or Radix mode. * Use FW_FEATURE_LPAR instead of machine_is_pseries(). v3: * Merge selftest changes with patch 2/2, instead of having a separate patch. * Minor improvements like adding enough comments, Makefile changes, including header file and adding some prints. v2: * Remove in_nmi check before calling nmi_enter/exit, as nesting is supported. * Fix build errors and remove unused variables. * Integrate error injection code into LKDTM. * Add support to inject multihit in paca. Ganesh Goudar (2): powerpc/mce: remove nmi_enter/exit from real mode handler lkdtm/powerpc: Add SLB multihit test arch/powerpc/kernel/mce.c | 7 +- drivers/misc/lkdtm/Makefile | 1 + drivers/misc/lkdtm/core.c | 3 + drivers/misc/lkdtm/lkdtm.h | 3 + drivers/misc/lkdtm/powerpc.c| 156 tools/testing/selftests/lkdtm/tests.txt | 1 + 6 files changed, 167 insertions(+), 4 deletions(-) create mode 100644 drivers/misc/lkdtm/powerpc.c -- 2.26.2
Re: [PATCH v3 1/2] powerpc/mce: remove nmi_enter/exit from real mode handler
On 10/1/20 11:21 PM, Ganesh Goudar wrote: Use of nmi_enter/exit in the real mode handler causes the kernel to panic and reboot on injecting an slb multihit on a pseries machine running in hash mmu mode, as these calls try to access memory outside the RMO region in the real mode handler where translation is disabled. Add a check to not use these calls on a pseries machine running in hash mmu mode. Fixes: 116ac378bb3f ("powerpc/64s: machine check interrupt update NMI accounting") Signed-off-by: Ganesh Goudar --- arch/powerpc/kernel/mce.c | 10 ++ 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c index ada59f6c4298..3bf39dd5dd43 100644 --- a/arch/powerpc/kernel/mce.c +++ b/arch/powerpc/kernel/mce.c @@ -591,12 +591,14 @@ EXPORT_SYMBOL_GPL(machine_check_print_event_info); long notrace machine_check_early(struct pt_regs *regs) { long handled = 0; - bool nested = in_nmi(); + bool is_pseries_hpt_guest; u8 ftrace_enabled = this_cpu_get_ftrace_enabled(); this_cpu_set_ftrace_enabled(0); - - if (!nested) + is_pseries_hpt_guest = machine_is(pseries) && + mmu_has_feature(MMU_FTR_HPTE_TABLE); + /* Do not use nmi_enter/exit for pseries hpte guest */ + if (!is_pseries_hpt_guest) In an offline discussion mpe suggested to use radix_enabled() to check if it is radix or hash, as MMU_FTR_HPTE_TABLE may be true on radix machines also and use of FW_FEATURE_LPAR is better than machine_is(pseries), sending v4 with these changes. nmi_enter(); hv_nmi_check_nonrecoverable(regs); @@ -607,7 +609,7 @@ long notrace machine_check_early(struct pt_regs *regs) if (ppc_md.machine_check_early) handled = ppc_md.machine_check_early(regs); - if (!nested) + if (!is_pseries_hpt_guest) nmi_exit(); this_cpu_set_ftrace_enabled(ftrace_enabled);