Re: [PATCH] ehea: add kdump support
Thomas Klein wrote: This patch adds kdump support to the ehea driver. As the firmware doesn't free resource handles automatically, the driver has to run an as simple as possible free resource function in case of a crash shutdown. The function iterates over two arrays freeing all resource handles which are stored there. The arrays are kept up-to-date during normal runtime. The crash handler fn is triggered by the recently introduced PPC crash shutdown reg/unreg functions. Signed-off-by: Thomas Klein <[EMAIL PROTECTED]> --- drivers/net/ehea/ehea.h | 34 +- drivers/net/ehea/ehea_main.c | 281 ++ 2 files changed, 290 insertions(+), 25 deletions(-) applied -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH] ehea: add kdump support
This patch adds kdump support to the ehea driver. As the firmware doesn't free resource handles automatically, the driver has to run an as simple as possible free resource function in case of a crash shutdown. The function iterates over two arrays freeing all resource handles which are stored there. The arrays are kept up-to-date during normal runtime. The crash handler fn is triggered by the recently introduced PPC crash shutdown reg/unreg functions. Signed-off-by: Thomas Klein <[EMAIL PROTECTED]> --- drivers/net/ehea/ehea.h | 34 +- drivers/net/ehea/ehea_main.c | 281 ++ 2 files changed, 290 insertions(+), 25 deletions(-) diff --git a/drivers/net/ehea/ehea.h b/drivers/net/ehea/ehea.h index 88fb53e..7c4ead3 100644 --- a/drivers/net/ehea/ehea.h +++ b/drivers/net/ehea/ehea.h @@ -40,7 +40,7 @@ #include #define DRV_NAME "ehea" -#define DRV_VERSION"EHEA_0083" +#define DRV_VERSION"EHEA_0087" /* eHEA capability flags */ #define DLPAR_PORT_ADD_REM 1 @@ -386,6 +386,13 @@ struct ehea_port_res { #define EHEA_MAX_PORTS 16 + +#define EHEA_NUM_PORTRES_FW_HANDLES6 /* QP handle, SendCQ handle, +RecvCQ handle, EQ handle, +SendMR handle, RecvMR handle */ +#define EHEA_NUM_PORT_FW_HANDLES 1 /* EQ handle */ +#define EHEA_NUM_ADAPTER_FW_HANDLES2 /* MR handle, NEQ handle */ + struct ehea_adapter { u64 handle; struct of_device *ofdev; @@ -405,6 +412,31 @@ struct ehea_mc_list { u64 macaddr; }; +/* kdump support */ +struct ehea_fw_handle_entry { + u64 adh; /* Adapter Handle */ + u64 fwh; /* Firmware Handle */ +}; + +struct ehea_fw_handle_array { + struct ehea_fw_handle_entry *arr; + int num_entries; + struct semaphore lock; +}; + +struct ehea_bcmc_reg_entry { + u64 adh; /* Adapter Handle */ + u32 port_id; /* Logical Port Id */ + u8 reg_type; /* Registration Type */ + u64 macaddr; +}; + +struct ehea_bcmc_reg_array { + struct ehea_bcmc_reg_entry *arr; + int num_entries; + struct semaphore lock; +}; + #define EHEA_PORT_UP 1 #define EHEA_PORT_DOWN 0 #define EHEA_PHY_LINK_UP 1 diff --git 
a/drivers/net/ehea/ehea_main.c b/drivers/net/ehea/ehea_main.c index c051c7e..21af674 100644 --- a/drivers/net/ehea/ehea_main.c +++ b/drivers/net/ehea/ehea_main.c @@ -35,6 +35,7 @@ #include #include #include +#include #include @@ -98,8 +99,10 @@ static int port_name_cnt; static LIST_HEAD(adapter_list); u64 ehea_driver_flags; struct work_struct ehea_rereg_mr_task; - struct semaphore dlpar_mem_lock; +struct ehea_fw_handle_array ehea_fw_handles; +struct ehea_bcmc_reg_array ehea_bcmc_regs; + static int __devinit ehea_probe_adapter(struct of_device *dev, const struct of_device_id *id); @@ -132,6 +135,160 @@ void ehea_dump(void *adr, int len, char *msg) } } +static void ehea_update_firmware_handles(void) +{ + struct ehea_fw_handle_entry *arr = NULL; + struct ehea_adapter *adapter; + int num_adapters = 0; + int num_ports = 0; + int num_portres = 0; + int i = 0; + int num_fw_handles, k, l; + + /* Determine number of handles */ + list_for_each_entry(adapter, &adapter_list, list) { + num_adapters++; + + for (k = 0; k < EHEA_MAX_PORTS; k++) { + struct ehea_port *port = adapter->port[k]; + + if (!port || (port->state != EHEA_PORT_UP)) + continue; + + num_ports++; + num_portres += port->num_def_qps + port->num_add_tx_qps; + } + } + + num_fw_handles = num_adapters * EHEA_NUM_ADAPTER_FW_HANDLES + +num_ports * EHEA_NUM_PORT_FW_HANDLES + +num_portres * EHEA_NUM_PORTRES_FW_HANDLES; + + if (num_fw_handles) { + arr = kzalloc(num_fw_handles * sizeof(*arr), GFP_KERNEL); + if (!arr) + return; /* Keep the existing array */ + } else + goto out_update; + + list_for_each_entry(adapter, &adapter_list, list) { + for (k = 0; k < EHEA_MAX_PORTS; k++) { + struct ehea_port *port = adapter->port[k]; + + if (!port || (port->state != EHEA_PORT_UP)) + continue; + + for (l = 0; +l < port->num_def_qps + port->num_add_tx_qps; +l++) { + struct ehea_port_res *pr = &port->port_res[l]; + + arr[i].adh = adapter->handle; + arr[i++].fwh = pr->qp->fw_handle; +
Re: [PATCH] ehea: Add kdump support
In message <[EMAIL PROTECTED]> you wrote: > Michael Ellerman wrote on 26.11.2007 09:16:28: > > Solutions that might be better: > > > > a) if there are a finite number of handles and we can predict their > > values, just delete them all in the kdump kernel before the driver > > loads. > > Guessing the values does not work, because of the handle structure > defined by the hypervisor. > > > b) if there are a small & finite number of handles, save their values > > in a device tree property and have the kdump kernel read them and > > delete them before the driver loads. > > 5*16*nr_ports+1+1= >82. a ML16 has 4 adapters with up to 16 ports, so the > number is not small anymore I assume this machine with a huge number of adapters has a huge amount of memory too! :-) > The device tree functions are currently not exported. We can add this. > If you crashdump to a new kernel, will it get the device tree > representation of the crashed kernel or of the initial one of open > firmware? The kexec userspace tools control this. Normally it just takes the current device tree plus some modifications (e.g. initrd location changes). So provided the ehea driver exports this info somewhere, it can be grabbed by the kexec tools and stuffed in the device tree of the new kernel. That being said, the proper place to have this would be the original device tree. > > > c) if neither of those work, provide a minimal routine that _only_ > > deletes the handles in the crashed kernel. > > I would hope this has the highest chance to actually work. > For this we would have to add a proper notifier chain. > Do you agree? > > > d) > > Firmware change? But that's not something you will get very soon. > > Christoph R. > - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] ehea: Add kdump support
Hi, On Mon, Nov 26, 2007 at 01:41:37PM -0200, Luke Browning wrote: > On Mon, 2007-11-26 at 19:16 +1100, Michael Ellerman wrote: > > > For kdump we have to assume that the kernel is fundamentally broken, If I may so humbly suggest: since ehea is a power6 thing only, we should refocus our energies on "hypervisor assisted dump", which solves all of these problems. In short, upon crash, the hypervisor will reset the pci devices into working order, and will then boot a new fresh kernel into a tiny corner of ram. The rest of ram is not cleared, and can be dumped. After the dump, the mem is returned to general use. The key point here, for ehea, is "the hypervisor will reset the device state to something rational". Preliminary patches are at http://patchwork.ozlabs.org/linuxppc/patch?id=14884 and following. --linas - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] ehea: Add kdump support
On Mon, 2007-11-26 at 19:16 +1100, Michael Ellerman wrote: > > Hi Thomas, > > I'm sorry, but this patch is all wrong IMHO. > > For kdump we have to assume that the kernel is fundamentally broken, > we've panicked, so something bad has happened - every line of kernel > code that is run decreases the chance that we'll successfully make it > into the kdump kernel. I agree with Michael. > Solutions that might be better: > > a) if there are a finite number of handles and we can predict their > values, just delete them all in the kdump kernel before the driver > loads. This is a good solution if handles are predefined. > b) if there are a small & finite number of handles, save their values > in a device tree property and have the kdump kernel read them and > delete them before the driver loads. Also good but is more complicated. > c) if neither of those work, provide a minimal routine that _only_ > deletes the handles in the crashed kernel. > d) Can the driver or configuration method for the driver query PHYP to determine if there are any pre-existing mappings... Regards, Luke - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] ehea: Add kdump support
Michael Ellerman wrote on 26.11.2007 09:16:28: > Solutions that might be better: > > a) if there are a finite number of handles and we can predict their > values, just delete them all in the kdump kernel before the driver > loads. Guessing the values does not work, because of the handle structure defined by the hypervisor. > b) if there are a small & finite number of handles, save their values > in a device tree property and have the kdump kernel read them and > delete them before the driver loads. 5*16*nr_ports+1+1= >82. a ML16 has 4 adapters with up to 16 ports, so the number is not small anymore The device tree functions are currently not exported. If you crashdump to a new kernel, will it get the device tree representation of the crashed kernel or of the initial one of open firmware? > c) if neither of those work, provide a minimal routine that _only_ > deletes the handles in the crashed kernel. I would hope this has the highest chance to actually work. For this we would have to add a proper notifier chain. Do you agree? > d) Firmware change? But that's not something you will get very soon. Christoph R. - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] ehea: Add kdump support
On Fri, 2007-11-09 at 14:33 +0100, Thomas Klein wrote: > To support ehea driver reloading in a kdump kernel the driver has to perform > firmware handle deregistrations when the original kernel crashes. As there's > currently no notifier chain for machine crashes this patch enables kdump > support > in the ehea driver by bending the ppc_md.machine_crash_shutdown hook to its > own > machine crash handler. The original machine_crash_shutdown() fn is called > afterwards. This works fine as long as the ehea driver is the only one which > does so. Problems may occur if other drivers do the same and unload regularly. > This patch enables 2.6.24-rc2 to use kdump with ehea and only puts a very > low risk on base kernel. In 2.6.24 we know ehea is the only user of this > mechanism. The next step for 2.6.25 would be to add a proper notifier chain. > The full solution might be that register_reboot_notifier() provides sth > like a SYS_CRASH action. Please apply. > > Signed-off-by: Thomas Klein <[EMAIL PROTECTED]> > > --- > drivers/net/ehea/ehea.h |2 +- > drivers/net/ehea/ehea_main.c | 28 > 2 files changed, 29 insertions(+), 1 deletions(-) > Hi Thomas, I'm sorry, but this patch is all wrong IMHO. For kdump we have to assume that the kernel is fundamentally broken, we've panicked, so something bad has happened - every line of kernel code that is run decreases the chance that we'll successfully make it into the kdump kernel. So just calling unregister_driver() is no good, that's going to call lots of code, try to take lots of locks, rely on lots of data structures being uncorrupted etc. etc. And the hijacking of machine_crash_shutdown() is IMO not an acceptable solution, as you say it only works if EHEA is the only driver to do it. And as soon as EHEA does it other drivers will want to do it. What we need is the _minimal_ set of actions that can happen to make EHEA work in the kdump kernel. 
Solutions that might be better: a) if there are a finite number of handles and we can predict their values, just delete them all in the kdump kernel before the driver loads. b) if there are a small & finite number of handles, save their values in a device tree property and have the kdump kernel read them and delete them before the driver loads. c) if neither of those work, provide a minimal routine that _only_ deletes the handles in the crashed kernel. d) cheers -- Michael Ellerman OzLabs, IBM Australia Development Lab wwweb: http://michael.ellerman.id.au phone: +61 2 6212 1183 (tie line 70 21183) We do not inherit the earth from our ancestors, we borrow it from our children. - S.M.A.R.T Person signature.asc Description: This is a digitally signed message part
Re: [PATCH] ehea: Add kdump support
> To support ehea driver reloading in a kdump kernel the driver has to > perform firmware handle deregistrations when the original kernel > crashes. As there's currently no notifier chain for machine crashes > this patch enables kdump support in the ehea driver by bending the > ppc_md.machine_crash_shutdown hook to its own machine crash > handler. The original machine_crash_shutdown() fn is called > afterwards. This works fine as long as the ehea driver is the only one > which does so. Problems may occur if other drivers do the same and > unload regularly . This patch enables 2.6.24-rc2 to use kdump with > ehea and only puts a very low risk on base kernel. In 2.6.24 we know > ehea is the only user of this mechanism. The next step for 2.6.25 > would be to add a proper notifier chain. The full solution might be > that register_reboot_notifier() provides sth like a SYS_CRASH > action. Please apply. If we are going to do this workaround, I'd prefer the notifier chain be done correctly now. The way it's hacked in here, it's more likely to cause even more issues. Either way, if this is going to go in, it at least needs to be acked by Paulus. 
> > Signed-off-by: Thomas Klein <[EMAIL PROTECTED]> > > --- > drivers/net/ehea/ehea.h |2 +- > drivers/net/ehea/ehea_main.c | 28 > 2 files changed, 29 insertions(+), 1 deletions(-) > > diff --git a/drivers/net/ehea/ehea.h b/drivers/net/ehea/ehea.h > index f78e5bf..5935899 100644 > --- a/drivers/net/ehea/ehea.h > +++ b/drivers/net/ehea/ehea.h > @@ -40,7 +40,7 @@ > #include > > #define DRV_NAME "ehea" > -#define DRV_VERSION "EHEA_0080" > +#define DRV_VERSION "EHEA_0081" > > /* eHEA capability flags */ > #define DLPAR_PORT_ADD_REM 1 > diff --git a/drivers/net/ehea/ehea_main.c b/drivers/net/ehea/ehea_main.c > index f0319f1..40a732e 100644 > --- a/drivers/net/ehea/ehea_main.c > +++ b/drivers/net/ehea/ehea_main.c > @@ -37,6 +37,7 @@ > #include > > #include > +#include > > #include "ehea.h" > #include "ehea_qmr.h" > @@ -98,6 +99,7 @@ static int port_name_cnt = 0; > static LIST_HEAD(adapter_list); > u64 ehea_driver_flags = 0; > struct work_struct ehea_rereg_mr_task; > +static void (*orig_machine_crash_shutdown)(struct pt_regs *regs); > > struct semaphore dlpar_mem_lock; > > @@ -3312,6 +3314,29 @@ static struct notifier_block ehea_reboot_nb = { > .notifier_call = ehea_reboot_notifier, > }; > > +void ehea_crash_notifier(struct pt_regs *regs) > +{ > + ehea_info("Machine crash: freeing all eHEA resources"); > + ibmebus_unregister_driver(&ehea_driver); > + orig_machine_crash_shutdown(regs); > +} > + > +void ehea_register_crash_notifier(void) > +{ > +#ifdef CONFIG_KEXEC > + orig_machine_crash_shutdown = > + (void*)__xchg_u64((unsigned long*)&ppc_md.machine_crash_shutdown, > + (unsigned long)ehea_crash_notifier); > +#endif > +} > + > +void ehea_unregister_crash_notifier(void) > +{ > +#ifdef CONFIG_KEXEC > + ppc_md.machine_crash_shutdown = orig_machine_crash_shutdown; > +#endif > +} > + > static int check_module_parm(void) > { > int ret = 0; > @@ -3369,6 +3394,7 @@ int __init ehea_module_init(void) > goto out; > > register_reboot_notifier(&ehea_reboot_nb); > + 
ehea_register_crash_notifier(); > > ret = ibmebus_register_driver(&ehea_driver); > if (ret) { > @@ -3382,6 +3408,7 @@ int __init ehea_module_init(void) > ehea_error("failed to register capabilities attribute, ret=%d", > ret); > unregister_reboot_notifier(&ehea_reboot_nb); > + ehea_unregister_crash_notifier(); > ibmebus_unregister_driver(&ehea_driver); > goto out; > } > @@ -3396,6 +3423,7 @@ static void __exit ehea_module_exit(void) > driver_remove_file(&ehea_driver.driver, &driver_attr_capabilities); > ibmebus_unregister_driver(&ehea_driver); > unregister_reboot_notifier(&ehea_reboot_nb); > + ehea_unregister_crash_notifier(); > ehea_destroy_busmap(); > } > > -- > 1.5.2 > - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH] ehea: Add kdump support
To support ehea driver reloading in a kdump kernel the driver has to perform firmware handle deregistrations when the original kernel crashes. As there's currently no notifier chain for machine crashes this patch enables kdump support in the ehea driver by bending the ppc_md.machine_crash_shutdown hook to its own machine crash handler. The original machine_crash_shutdown() fn is called afterwards. This works fine as long as the ehea driver is the only one which does so. Problems may occur if other drivers do the same and unload regularly. This patch enables 2.6.24-rc2 to use kdump with ehea and only puts a very low risk on base kernel. In 2.6.24 we know ehea is the only user of this mechanism. The next step for 2.6.25 would be to add a proper notifier chain. The full solution might be that register_reboot_notifier() provides sth like a SYS_CRASH action. Please apply. Signed-off-by: Thomas Klein <[EMAIL PROTECTED]> --- drivers/net/ehea/ehea.h |2 +- drivers/net/ehea/ehea_main.c | 28 2 files changed, 29 insertions(+), 1 deletions(-) diff --git a/drivers/net/ehea/ehea.h b/drivers/net/ehea/ehea.h index f78e5bf..5935899 100644 --- a/drivers/net/ehea/ehea.h +++ b/drivers/net/ehea/ehea.h @@ -40,7 +40,7 @@ #include #define DRV_NAME "ehea" -#define DRV_VERSION"EHEA_0080" +#define DRV_VERSION"EHEA_0081" /* eHEA capability flags */ #define DLPAR_PORT_ADD_REM 1 diff --git a/drivers/net/ehea/ehea_main.c b/drivers/net/ehea/ehea_main.c index f0319f1..40a732e 100644 --- a/drivers/net/ehea/ehea_main.c +++ b/drivers/net/ehea/ehea_main.c @@ -37,6 +37,7 @@ #include #include +#include #include "ehea.h" #include "ehea_qmr.h" @@ -98,6 +99,7 @@ static int port_name_cnt = 0; static LIST_HEAD(adapter_list); u64 ehea_driver_flags = 0; struct work_struct ehea_rereg_mr_task; +static void (*orig_machine_crash_shutdown)(struct pt_regs *regs); struct semaphore dlpar_mem_lock; @@ -3312,6 +3314,29 @@ static struct notifier_block ehea_reboot_nb = { .notifier_call = ehea_reboot_notifier, }; +void 
ehea_crash_notifier(struct pt_regs *regs) +{ + ehea_info("Machine crash: freeing all eHEA resources"); + ibmebus_unregister_driver(&ehea_driver); + orig_machine_crash_shutdown(regs); +} + +void ehea_register_crash_notifier(void) +{ +#ifdef CONFIG_KEXEC + orig_machine_crash_shutdown = + (void*)__xchg_u64((unsigned long*)&ppc_md.machine_crash_shutdown, +(unsigned long)ehea_crash_notifier); +#endif +} + +void ehea_unregister_crash_notifier(void) +{ +#ifdef CONFIG_KEXEC + ppc_md.machine_crash_shutdown = orig_machine_crash_shutdown; +#endif +} + static int check_module_parm(void) { int ret = 0; @@ -3369,6 +3394,7 @@ int __init ehea_module_init(void) goto out; register_reboot_notifier(&ehea_reboot_nb); + ehea_register_crash_notifier(); ret = ibmebus_register_driver(&ehea_driver); if (ret) { @@ -3382,6 +3408,7 @@ int __init ehea_module_init(void) ehea_error("failed to register capabilities attribute, ret=%d", ret); unregister_reboot_notifier(&ehea_reboot_nb); + ehea_unregister_crash_notifier(); ibmebus_unregister_driver(&ehea_driver); goto out; } @@ -3396,6 +3423,7 @@ static void __exit ehea_module_exit(void) driver_remove_file(&ehea_driver.driver, &driver_attr_capabilities); ibmebus_unregister_driver(&ehea_driver); unregister_reboot_notifier(&ehea_reboot_nb); + ehea_unregister_crash_notifier(); ehea_destroy_busmap(); } -- 1.5.2 - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/