Hi Linas- I like the idea of splitting the recovery stuff into its own driver. A few comments on the last reorg patch:
> Index: linux-2.6.13-rc6-git9/arch/ppc64/kernel/eeh.c ... > +static int > +eeh_slot_availability(struct device_node *dn) ... > +void eeh_restore_bars(struct device_node *dn) Inconsistent spacing in new code... </nit> > --- /dev/null 1970-01-01 00:00:00.000000000 +0000 > +++ linux-2.6.13-rc6-git9/arch/ppc64/kernel/eeh_driver.c 2005-08-23 > 14:34:44.000000000 -0500 > @@ -0,0 +1,361 @@ > +/* > + * PCI Hot Plug Controller Driver for RPA-compliant PPC64 platform. This probably isn't the right header description for this file :) > + > +/** > + * rpaphp_find_slot - find and return the slot holding the device > + * @dev: pci device for which we want the slot structure. > + */ > +static struct slot *rpaphp_find_slot(struct pci_dev *dev) > +{ > + struct list_head *tmp, *n; > + struct slot *slot; > + > + list_for_each_safe(tmp, n, &rpaphp_slot_head) { > + struct pci_bus *bus; > + > + slot = list_entry(tmp, struct slot, rpaphp_slot_list); > + > + /* PHB's don't have bridges. */ > + bus = slot->bus; > + if (bus == NULL) > + continue; > + > + /* The PCI device could be the slot itself. */ > + if (bus->self == dev) > + return slot; > + > + if (pci_search_bus_for_dev (bus, dev)) > + return slot; > + } > + return NULL; > +} This function breaks if rpaphp is compiled as a module. It's probably bad for kernel code to depend on symbols exported from modules. This raises two larger questions: Where should this new driver sit, and should it be possible to compile it as a module as well? > +/* ------------------------------------------------------- */ > +/** > + * handle_eeh_events -- reset a PCI device after hard lockup. > + * ... > +int eeh_reset_device (struct pci_dev *dev, struct device_node *dn, int > reconfig) Header doesn't match the function :) > +{ > + struct hotplug_slot *frozen_slot= NULL; > + struct hotplug_slot_ops *frozen_ops= NULL; > + > + if (!dev) > + return 1; > + > + if (reconfig) { > + frozen_slot = pci_hp_find_slot(dev); > + if (frozen_slot) > + frozen_ops = frozen_slot->ops; > + } > + > + if (frozen_ops) frozen_ops->disable_slot (frozen_slot); > + > + /* Reset the pci controller. (Asserts RST#; resets config space). > + * Reconfigure bridges and devices */ > + rtas_set_slot_reset (dn->child); > + > + /* Walk over all functions on this device */ > + struct device_node *peer = dn->child; > + while (peer) { > + rtas_configure_bridge(peer); > + eeh_restore_bars(peer); > + peer = peer->sibling; > + } > + > + /* Give the system 5 seconds to finish running the user-space > + * hotplug scripts, e.g. ifdown for ethernet. Yes, this is a hack, > + * but if we don't do this, weird things happen. > + */ > + if (frozen_ops) { > + ssleep (5); > + frozen_ops->enable_slot (frozen_slot); > + } > + return 0; > +} This dependence on struct hotplug_slot might be problematic as we restrict the registration of "PCI hotplug" slots to exclude PHBs, VIO, and embedded slots. Noticed your comment to this effect. I can work with you offline on this. > + > +/* The longest amount of time to wait for a pci device > + * to come back on line, in seconds. > + */ > +#define MAX_WAIT_FOR_RECOVERY 15 > + > +int handle_eeh_events (struct eeh_event *event) > +{ > + ... > + frozen_device = pci_bus_to_OF_node(dev->bus); > + if (!frozen_device) > + { > + printk (KERN_ERR "EEH: Cannot find PCI controller for %s\n", > + pci_name(dev)); > + > + return 1; > + } > + BUG_ON (frozen_device->phb==NULL); > + > + /* We get "permanent failure" messages on empty slots. > + * These are false alarms. Empty slots have no child dn. */ > + if ((event->state == pci_channel_io_perm_failure) && (frozen_device == > NULL)) The second part of this conditional will never be true, as this has just been checked above. > + if (frozen_device) > + freeze_count = frozen_device->eeh_freeze_count; This conditional will always be true, as this has also ben checked above. > Index: linux-2.6.13-rc6-git9/include/asm-ppc64/prom.h > =================================================================== > --- linux-2.6.13-rc6-git9.orig/include/asm-ppc64/prom.h 2005-08-19 > 15:11:39.000000000 -0500 > +++ linux-2.6.13-rc6-git9/include/asm-ppc64/prom.h 2005-08-23 > 13:31:52.000000000 -0500 > @@ -135,11 +135,17 @@ > int busno; /* for pci devices */ > int bussubno; /* for pci devices */ > int devfn; /* for pci devices */ > - int eeh_mode; /* See eeh.h for possible EEH_MODEs */ > - int eeh_config_addr; > + int eeh_mode; /* See eeh.h for possible EEH_MODEs */ > + int eeh_config_addr; > + int eeh_check_count; /* number of times device driver ignored > error */ > + int eeh_freeze_count; /* number of times this device froze up. */ > + int eeh_is_bridge; /* device is pci-to-pci bridge */ > + > int pci_ext_config_space; /* for pci devices */ > struct pci_controller *phb; /* for pci devices */ > struct iommu_table *iommu_table; /* for phb's or bridges */ > + struct pci_dev *pcidev; /* back-pointer to the pci device */ > + u32 config_space[16]; /* saved PCI config space */ > > struct property *properties; > struct device_node *parent; How about a pointer to a struct of EEH fields? Folks are touchy about adding anything PCI-specific to device nodes, especially since most DNs aren't PCI at all. Thanks- John - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/