Re: [PATCH] ehea: add kdump support

2008-02-20 Thread Jeff Garzik

Thomas Klein wrote:

This patch adds kdump support to the ehea driver. As the firmware doesn't free
resource handles automatically, the driver has to run a free-resource function
that is as simple as possible in case of a crash shutdown. The function iterates over
two arrays freeing all resource handles which are stored there. The arrays are
kept up-to-date during normal runtime. The crash handler fn is triggered by the
recently introduced PPC crash shutdown reg/unreg functions.


Signed-off-by: Thomas Klein <[EMAIL PROTECTED]>

---
 drivers/net/ehea/ehea.h  |   34 +-
 drivers/net/ehea/ehea_main.c |  281 ++
 2 files changed, 290 insertions(+), 25 deletions(-)


applied


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH] ehea: add kdump support

2008-02-13 Thread Thomas Klein
This patch adds kdump support to the ehea driver. As the firmware doesn't free
resource handles automatically, the driver has to run a free-resource function
that is as simple as possible in case of a crash shutdown. The function iterates over
two arrays freeing all resource handles which are stored there. The arrays are
kept up-to-date during normal runtime. The crash handler fn is triggered by the
recently introduced PPC crash shutdown reg/unreg functions.


Signed-off-by: Thomas Klein <[EMAIL PROTECTED]>

---
 drivers/net/ehea/ehea.h  |   34 +-
 drivers/net/ehea/ehea_main.c |  281 ++
 2 files changed, 290 insertions(+), 25 deletions(-)

diff --git a/drivers/net/ehea/ehea.h b/drivers/net/ehea/ehea.h
index 88fb53e..7c4ead3 100644
--- a/drivers/net/ehea/ehea.h
+++ b/drivers/net/ehea/ehea.h
@@ -40,7 +40,7 @@
 #include 

 #define DRV_NAME   "ehea"
-#define DRV_VERSION"EHEA_0083"
+#define DRV_VERSION"EHEA_0087"

 /* eHEA capability flags */
 #define DLPAR_PORT_ADD_REM 1
@@ -386,6 +386,13 @@ struct ehea_port_res {


 #define EHEA_MAX_PORTS 16
+
+#define EHEA_NUM_PORTRES_FW_HANDLES6  /* QP handle, SendCQ handle,
+RecvCQ handle, EQ handle,
+SendMR handle, RecvMR handle */
+#define EHEA_NUM_PORT_FW_HANDLES   1  /* EQ handle */
+#define EHEA_NUM_ADAPTER_FW_HANDLES2  /* MR handle, NEQ handle */
+
 struct ehea_adapter {
u64 handle;
struct of_device *ofdev;
@@ -405,6 +412,31 @@ struct ehea_mc_list {
u64 macaddr;
 };

+/* kdump support */
+struct ehea_fw_handle_entry {
+   u64 adh;   /* Adapter Handle */
+   u64 fwh;   /* Firmware Handle */
+};
+
+struct ehea_fw_handle_array {
+   struct ehea_fw_handle_entry *arr;
+   int num_entries;
+   struct semaphore lock;
+};
+
+struct ehea_bcmc_reg_entry {
+   u64 adh;   /* Adapter Handle */
+   u32 port_id;   /* Logical Port Id */
+   u8 reg_type;   /* Registration Type */
+   u64 macaddr;
+};
+
+struct ehea_bcmc_reg_array {
+   struct ehea_bcmc_reg_entry *arr;
+   int num_entries;
+   struct semaphore lock;
+};
+
 #define EHEA_PORT_UP 1
 #define EHEA_PORT_DOWN 0
 #define EHEA_PHY_LINK_UP 1
diff --git a/drivers/net/ehea/ehea_main.c b/drivers/net/ehea/ehea_main.c
index c051c7e..21af674 100644
--- a/drivers/net/ehea/ehea_main.c
+++ b/drivers/net/ehea/ehea_main.c
@@ -35,6 +35,7 @@
 #include 
 #include 
 #include 
+#include 

 #include 

@@ -98,8 +99,10 @@ static int port_name_cnt;
 static LIST_HEAD(adapter_list);
 u64 ehea_driver_flags;
 struct work_struct ehea_rereg_mr_task;
-
 struct semaphore dlpar_mem_lock;
+struct ehea_fw_handle_array ehea_fw_handles;
+struct ehea_bcmc_reg_array ehea_bcmc_regs;
+

 static int __devinit ehea_probe_adapter(struct of_device *dev,
const struct of_device_id *id);
@@ -132,6 +135,160 @@ void ehea_dump(void *adr, int len, char *msg)
}
 }

+static void ehea_update_firmware_handles(void)
+{
+   struct ehea_fw_handle_entry *arr = NULL;
+   struct ehea_adapter *adapter;
+   int num_adapters = 0;
+   int num_ports = 0;
+   int num_portres = 0;
+   int i = 0;
+   int num_fw_handles, k, l;
+
+   /* Determine number of handles */
+   list_for_each_entry(adapter, &adapter_list, list) {
+   num_adapters++;
+
+   for (k = 0; k < EHEA_MAX_PORTS; k++) {
+   struct ehea_port *port = adapter->port[k];
+
+   if (!port || (port->state != EHEA_PORT_UP))
+   continue;
+
+   num_ports++;
+   num_portres += port->num_def_qps + port->num_add_tx_qps;
+   }
+   }
+
+   num_fw_handles = num_adapters * EHEA_NUM_ADAPTER_FW_HANDLES +
+num_ports * EHEA_NUM_PORT_FW_HANDLES +
+num_portres * EHEA_NUM_PORTRES_FW_HANDLES;
+
+   if (num_fw_handles) {
+   arr = kzalloc(num_fw_handles * sizeof(*arr), GFP_KERNEL);
+   if (!arr)
+   return;  /* Keep the existing array */
+   } else
+   goto out_update;
+
+   list_for_each_entry(adapter, &adapter_list, list) {
+   for (k = 0; k < EHEA_MAX_PORTS; k++) {
+   struct ehea_port *port = adapter->port[k];
+
+   if (!port || (port->state != EHEA_PORT_UP))
+   continue;
+
+   for (l = 0;
+l < port->num_def_qps + port->num_add_tx_qps;
+l++) {
+   struct ehea_port_res *pr = &port->port_res[l];
+
+   arr[i].adh = adapter->handle;
+   arr[i++].fwh = pr->qp->fw_handle;
+  

Re: [PATCH] ehea: Add kdump support

2007-11-26 Thread Michael Neuling
In message <[EMAIL PROTECTED]> you wrote:
> Michael Ellerman wrote on 26.11.2007 09:16:28:
> > Solutions that might be better:
> >
> >  a) if there are a finite number of handles and we can predict their
> > values, just delete them all in the kdump kernel before the driver
> > loads.
> 
> Guessing the values does not work, because of the handle structure
> defined by the hypervisor.
> 
> >  b) if there are a small & finite number of handles, save their values
> > in a device tree property and have the kdump kernel read them and
> > delete them before the driver loads.
> 
> 5*16*nr_ports+1+1=   >82. a ML16 has 4 adapters with up to 16 ports, so the
> number is not small anymore

I assume this machine with a huge number of adapters has a huge amount
of memory too! :-)

> The device tree functions are currently not exported.

We can add this.

> If you crashdump to a new kernel, will it get the device tree
> representation of the crashed kernel or of the initial one of open
> firmware?

The kexec userspace tools control this.  Normally they just take the
current device tree plus some modifications (e.g. initrd location
changes).

So provided the ehea driver exports this info somewhere, it can be
grabbed by the kexec tools and stuffed in the device tree of the new
kernel.  

That being said, the proper place to have this would be the original device
tree.

> 
> >  c) if neither of those work, provide a minimal routine that _only_
> > deletes the handles in the crashed kernel.
> 
> I would hope this has the highest chance to actually work.
> For this we would have to add a proper notifier chain.
> Do you agree?
> 
> >  d) 
> 
> Firmware change? But that's not something you will get very soon.
> 
> Christoph R.
> 
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] ehea: Add kdump support

2007-11-26 Thread Linas Vepstas

Hi,

On Mon, Nov 26, 2007 at 01:41:37PM -0200, Luke Browning wrote:
> On Mon, 2007-11-26 at 19:16 +1100, Michael Ellerman wrote:
> 
> > For kdump we have to assume that the kernel is fundamentally broken,

If I may so humbly suggest: since ehea is a power6 thing only,
we should refocus our energies on "hypervisor assisted dump",
which solves all of these problems. 

In short, upon crash, the hypervisor will reset the 
pci devices into working order, and will then boot
a new fresh kernel into a tiny corner of ram. The rest
of ram is not cleared, and can be dumped. After the 
dump, the mem is returned to general use.

The key point here, for ehea, is "the hypervisor
will reset the device state to something rational".

Preliminary patches are at
http://patchwork.ozlabs.org/linuxppc/patch?id=14884
and following.

--linas
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] ehea: Add kdump support

2007-11-26 Thread Luke Browning
On Mon, 2007-11-26 at 19:16 +1100, Michael Ellerman wrote:

> 
> Hi Thomas,
> 
> I'm sorry, but this patch is all wrong IMHO.
> 
> For kdump we have to assume that the kernel is fundamentally broken,
> we've panicked, so something bad has happened - every line of kernel
> code that is run decreases the chance that we'll successfully make it
> into the kdump kernel.

I agree with Michael.

> Solutions that might be better:
> 
>  a) if there are a finite number of handles and we can predict their 
> values, just delete them all in the kdump kernel before the driver
> loads.

This is a good solution if handles are predefined.

>  b) if there are a small & finite number of handles, save their values 
> in a device tree property and have the kdump kernel read them and 
> delete them before the driver loads.

Also good but is more complicated.  

>  c) if neither of those work, provide a minimal routine that _only_
> deletes the handles in the crashed kernel.
>  d) 

Can the driver or configuration method for the driver query PHYP to
determine if there are any pre-existing mappings... 

Regards, Luke

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] ehea: Add kdump support

2007-11-26 Thread Christoph Raisch
Michael Ellerman wrote on 26.11.2007 09:16:28:
> Solutions that might be better:
>
>  a) if there are a finite number of handles and we can predict their
> values, just delete them all in the kdump kernel before the driver
> loads.

Guessing the values does not work, because of the handle structure
defined by the hypervisor.

>  b) if there are a small & finite number of handles, save their values
> in a device tree property and have the kdump kernel read them and
> delete them before the driver loads.

5*16*nr_ports+1+1=   >82. a ML16 has 4 adapters with up to 16 ports, so the
number is not small anymore.
The device tree functions are currently not exported.

If you crashdump to a new kernel, will it get the device tree
representation of the crashed kernel or of the initial one of open firmware?

>  c) if neither of those work, provide a minimal routine that _only_
> deletes the handles in the crashed kernel.

I would hope this has the highest chance to actually work.
For this we would have to add a proper notifier chain.
Do you agree?

>  d) 

Firmware change? But that's not something you will get very soon.

Christoph R.

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] ehea: Add kdump support

2007-11-26 Thread Michael Ellerman
On Fri, 2007-11-09 at 14:33 +0100, Thomas Klein wrote:
> To support ehea driver reloading in a kdump kernel the driver has to perform
> firmware handle deregistrations when the original kernel crashes. As there's
> currently no notifier chain for machine crashes this patch enables kdump support
> in the ehea driver by bending the ppc_md.machine_crash_shutdown hook to its own
> machine crash handler. The original machine_crash_shutdown() fn is called
> afterwards. This works fine as long as the ehea driver is the only one which
> does so. Problems may occur if other drivers do the same and unload regularly.
> This patch enables 2.6.24-rc2 to use kdump with ehea and only puts a very
> low risk on base kernel. In 2.6.24 we know ehea is the only user of this
> mechanism. The next step for 2.6.25 would be to add a proper notifier chain.
> The full solution might be that register_reboot_notifier() provides sth
> like a SYS_CRASH action. Please apply.
> 
> Signed-off-by: Thomas Klein <[EMAIL PROTECTED]>
> 
> ---
>  drivers/net/ehea/ehea.h  |2 +-
>  drivers/net/ehea/ehea_main.c |   28 
>  2 files changed, 29 insertions(+), 1 deletions(-)
> 

Hi Thomas,

I'm sorry, but this patch is all wrong IMHO.

For kdump we have to assume that the kernel is fundamentally broken,
we've panicked, so something bad has happened - every line of kernel
code that is run decreases the chance that we'll successfully make it
into the kdump kernel.

So just calling unregister_driver() is no good, that's going to call
lots of code, try to take lots of locks, rely on lots of data structures
being uncorrupted etc. etc.

And the hijacking of machine_crash_shutdown() is IMO not an acceptable
solution, as you say it only works if EHEA is the only driver to do it.
And as soon as EHEA does it other drivers will want to do it.

What we need is the _minimal_ set of actions that can happen to make
EHEA work in the kdump kernel.

Solutions that might be better:

 a) if there are a finite number of handles and we can predict their 
values, just delete them all in the kdump kernel before the driver
loads.
 b) if there are a small & finite number of handles, save their values 
in a device tree property and have the kdump kernel read them and 
delete them before the driver loads.
 c) if neither of those work, provide a minimal routine that _only_
deletes the handles in the crashed kernel.
 d) 

cheers

-- 
Michael Ellerman
OzLabs, IBM Australia Development Lab

wwweb: http://michael.ellerman.id.au
phone: +61 2 6212 1183 (tie line 70 21183)

We do not inherit the earth from our ancestors,
we borrow it from our children. - S.M.A.R.T Person


signature.asc
Description: This is a digitally signed message part


Re: [PATCH] ehea: Add kdump support

2007-11-09 Thread Michael Neuling
> To support ehea driver reloading in a kdump kernel the driver has to
> perform firmware handle deregistrations when the original kernel
> crashes. As there's currently no notifier chain for machine crashes
> this patch enables kdump support in the ehea driver by bending the
> ppc_md.machine_crash_shutdown hook to its own machine crash
> handler. The original machine_crash_shutdown() fn is called
> afterwards. This works fine as long as the ehea driver is the only one
> which does so. Problems may occur if other drivers do the same and
> unload regularly.  This patch enables 2.6.24-rc2 to use kdump with
> ehea and only puts a very low risk on base kernel. In 2.6.24 we know
> ehea is the only user of this mechanism. The next step for 2.6.25
> would be to add a proper notifier chain.  The full solution might be
> that register_reboot_notifier() provides sth like a SYS_CRASH
> action. Please apply.

If we are going to do this workaround, I'd prefer the notifier chain be
done correctly now.  The way it's hacked in here, it's more likely to
cause even more issues.  

Either way, if this is going to go in, it at least needs to be acked by
Paulus.

> 
> Signed-off-by: Thomas Klein <[EMAIL PROTECTED]>
> 
> ---
>  drivers/net/ehea/ehea.h  |2 +-
>  drivers/net/ehea/ehea_main.c |   28 
>  2 files changed, 29 insertions(+), 1 deletions(-)
> 
> diff --git a/drivers/net/ehea/ehea.h b/drivers/net/ehea/ehea.h
> index f78e5bf..5935899 100644
> --- a/drivers/net/ehea/ehea.h
> +++ b/drivers/net/ehea/ehea.h
> @@ -40,7 +40,7 @@
>  #include 
>  
>  #define DRV_NAME "ehea"
> -#define DRV_VERSION  "EHEA_0080"
> +#define DRV_VERSION  "EHEA_0081"
>  
>  /* eHEA capability flags */
>  #define DLPAR_PORT_ADD_REM 1
> diff --git a/drivers/net/ehea/ehea_main.c b/drivers/net/ehea/ehea_main.c
> index f0319f1..40a732e 100644
> --- a/drivers/net/ehea/ehea_main.c
> +++ b/drivers/net/ehea/ehea_main.c
> @@ -37,6 +37,7 @@
>  #include 
>  
>  #include 
> +#include 
>  
>  #include "ehea.h"
>  #include "ehea_qmr.h"
> @@ -98,6 +99,7 @@ static int port_name_cnt = 0;
>  static LIST_HEAD(adapter_list);
>  u64 ehea_driver_flags = 0;
>  struct work_struct ehea_rereg_mr_task;
> +static void (*orig_machine_crash_shutdown)(struct pt_regs *regs);
>  
>  struct semaphore dlpar_mem_lock;
>  
> @@ -3312,6 +3314,29 @@ static struct notifier_block ehea_reboot_nb = {
>  .notifier_call = ehea_reboot_notifier,
>  };
>  
> +void ehea_crash_notifier(struct pt_regs *regs)
> +{
> + ehea_info("Machine crash: freeing all eHEA resources");
> + ibmebus_unregister_driver(&ehea_driver);
> + orig_machine_crash_shutdown(regs);
> +}
> +
> +void ehea_register_crash_notifier(void)
> +{
> +#ifdef CONFIG_KEXEC
> + orig_machine_crash_shutdown =
> +   (void*)__xchg_u64((unsigned long*)&ppc_md.machine_crash_shutdown,
> +  (unsigned long)ehea_crash_notifier);
> +#endif
> +}
> +
> +void ehea_unregister_crash_notifier(void)
> +{
> +#ifdef CONFIG_KEXEC
> + ppc_md.machine_crash_shutdown = orig_machine_crash_shutdown;
> +#endif
> +}
> +
>  static int check_module_parm(void)
>  {
>   int ret = 0;
> @@ -3369,6 +3394,7 @@ int __init ehea_module_init(void)
>   goto out;
>  
>   register_reboot_notifier(&ehea_reboot_nb);
> + ehea_register_crash_notifier();
>  
>   ret = ibmebus_register_driver(&ehea_driver);
>   if (ret) {
> @@ -3382,6 +3408,7 @@ int __init ehea_module_init(void)
>   ehea_error("failed to register capabilities attribute, ret=%d",
>  ret);
>   unregister_reboot_notifier(&ehea_reboot_nb);
> + ehea_unregister_crash_notifier();
>   ibmebus_unregister_driver(&ehea_driver);
>   goto out;
>   }
> @@ -3396,6 +3423,7 @@ static void __exit ehea_module_exit(void)
>   driver_remove_file(&ehea_driver.driver, &driver_attr_capabilities);
>   ibmebus_unregister_driver(&ehea_driver);
>   unregister_reboot_notifier(&ehea_reboot_nb);
> + ehea_unregister_crash_notifier();
>   ehea_destroy_busmap();
>  }
>  
> -- 
> 1.5.2
> 
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH] ehea: Add kdump support

2007-11-09 Thread Thomas Klein
To support ehea driver reloading in a kdump kernel the driver has to perform
firmware handle deregistrations when the original kernel crashes. As there's
currently no notifier chain for machine crashes this patch enables kdump support
in the ehea driver by bending the ppc_md.machine_crash_shutdown hook to its own
machine crash handler. The original machine_crash_shutdown() fn is called
afterwards. This works fine as long as the ehea driver is the only one which
does so. Problems may occur if other drivers do the same and unload regularly.
This patch enables 2.6.24-rc2 to use kdump with ehea and only puts a very
low risk on base kernel. In 2.6.24 we know ehea is the only user of this
mechanism. The next step for 2.6.25 would be to add a proper notifier chain.
The full solution might be that register_reboot_notifier() provides sth
like a SYS_CRASH action. Please apply.

Signed-off-by: Thomas Klein <[EMAIL PROTECTED]>

---
 drivers/net/ehea/ehea.h  |2 +-
 drivers/net/ehea/ehea_main.c |   28 
 2 files changed, 29 insertions(+), 1 deletions(-)

diff --git a/drivers/net/ehea/ehea.h b/drivers/net/ehea/ehea.h
index f78e5bf..5935899 100644
--- a/drivers/net/ehea/ehea.h
+++ b/drivers/net/ehea/ehea.h
@@ -40,7 +40,7 @@
 #include 
 
 #define DRV_NAME   "ehea"
-#define DRV_VERSION"EHEA_0080"
+#define DRV_VERSION"EHEA_0081"
 
 /* eHEA capability flags */
 #define DLPAR_PORT_ADD_REM 1
diff --git a/drivers/net/ehea/ehea_main.c b/drivers/net/ehea/ehea_main.c
index f0319f1..40a732e 100644
--- a/drivers/net/ehea/ehea_main.c
+++ b/drivers/net/ehea/ehea_main.c
@@ -37,6 +37,7 @@
 #include 
 
 #include 
+#include 
 
 #include "ehea.h"
 #include "ehea_qmr.h"
@@ -98,6 +99,7 @@ static int port_name_cnt = 0;
 static LIST_HEAD(adapter_list);
 u64 ehea_driver_flags = 0;
 struct work_struct ehea_rereg_mr_task;
+static void (*orig_machine_crash_shutdown)(struct pt_regs *regs);
 
 struct semaphore dlpar_mem_lock;
 
@@ -3312,6 +3314,29 @@ static struct notifier_block ehea_reboot_nb = {
 .notifier_call = ehea_reboot_notifier,
 };
 
+void ehea_crash_notifier(struct pt_regs *regs)
+{
+   ehea_info("Machine crash: freeing all eHEA resources");
+   ibmebus_unregister_driver(&ehea_driver);
+   orig_machine_crash_shutdown(regs);
+}
+
+void ehea_register_crash_notifier(void)
+{
+#ifdef CONFIG_KEXEC
+   orig_machine_crash_shutdown =
+   (void*)__xchg_u64((unsigned long*)&ppc_md.machine_crash_shutdown,
+(unsigned long)ehea_crash_notifier);
+#endif
+}
+
+void ehea_unregister_crash_notifier(void)
+{
+#ifdef CONFIG_KEXEC
+   ppc_md.machine_crash_shutdown = orig_machine_crash_shutdown;
+#endif
+}
+
 static int check_module_parm(void)
 {
int ret = 0;
@@ -3369,6 +3394,7 @@ int __init ehea_module_init(void)
goto out;
 
register_reboot_notifier(&ehea_reboot_nb);
+   ehea_register_crash_notifier();
 
ret = ibmebus_register_driver(&ehea_driver);
if (ret) {
@@ -3382,6 +3408,7 @@ int __init ehea_module_init(void)
ehea_error("failed to register capabilities attribute, ret=%d",
   ret);
unregister_reboot_notifier(&ehea_reboot_nb);
+   ehea_unregister_crash_notifier();
ibmebus_unregister_driver(&ehea_driver);
goto out;
}
@@ -3396,6 +3423,7 @@ static void __exit ehea_module_exit(void)
driver_remove_file(&ehea_driver.driver, &driver_attr_capabilities);
ibmebus_unregister_driver(&ehea_driver);
unregister_reboot_notifier(&ehea_reboot_nb);
+   ehea_unregister_crash_notifier();
ehea_destroy_busmap();
 }
 
-- 
1.5.2
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/