On Sat, 2 Jan 2010, Sean wrote: > Hmm, I applied the changes and I did not see a place where *prev differs > from td->td_hash. I have run memtest86+ on this box and it has passed 16 > times, so I do not suspect a hardware memory error. What do you think? > Attached is the latest dmesg output.
I don't know. The same pattern as before appears here: $ egrep -n '[1b]e(40|5c)' dmesg3.log 167:kobject: 'fs' (c7801e40): kobject_add_internal: parent: '<NULL>', set: '<NULL>' 4990:ohci_hcd 0000:00:0b.0: td alloc for 2 ep85: c6691e40 5023:ohci_hcd 0000:00:0b.0: hash c6691e40 to 57 -> (null) 5181:ohci_hcd 0000:00:0b.0: td alloc for 2 ep85: c676be40 5214:ohci_hcd 0000:00:0b.0: hash c676be40 to 57 -> c6691e40 5271:ohci_hcd 0000:00:0b.0: td free c6691e40 5272:ohci_hcd 0000:00:0b.0: (57 1) c676be5c -> (null) [(null)] 5294:ohci_hcd 0000:00:0b.0: td alloc for 2 ep85: c6691e40 5327:ohci_hcd 0000:00:0b.0: hash c6691e40 to 57 -> c676be40 5533:ohci_hcd 0000:00:0b.0: td free c676be40 5534:ohci_hcd 0000:00:0b.0: (57 1) c6691e5c -> (null) [(null)] 5556:ohci_hcd 0000:00:0b.0: td alloc for 2 ep85: c676be40 5589:ohci_hcd 0000:00:0b.0: hash c676be40 to 57 -> c6691e40 5640:ohci_hcd 0000:00:0b.0: td free c6691e40 5641:ohci_hcd 0000:00:0b.0: (57 1) c676be5c -> c676be40 [c676be40] 5713:ohci_hcd 0000:00:0b.0: td alloc for 2 ep85: c6691e40 5746:ohci_hcd 0000:00:0b.0: hash c6691e40 to 57 -> c676be40 5899:ohci_hcd 0000:00:0b.0: td free c676be40 5900:ohci_hcd 0000:00:0b.0: (57 1) c6691e5c -> c676be40 [c676be40] At line 5641 we see that the td_hash pointer in c676be40 gets corrupted. It is copied from the pointer in c6691e40, which was set to NULL in line 5534, but now it points to c676be40. The question is whether this corruption was caused by a hardware fault or a software bug. We have added debugging printouts to the only two places where the driver assigns anything to td->td_hash, and they don't show anything wrong. This leads me to suspect the hardware, but of course this is still just a guess. Here is a completely new patch. This one is more directed, to catch the sort of errors we now know to look for. Alan Stern Index: usb-2.6/drivers/usb/host/ohci-mem.c =================================================================== --- usb-2.6.orig/drivers/usb/host/ohci-mem.c +++ usb-2.6/drivers/usb/host/ohci-mem.c @@ -98,17 +98,56 @@ td_alloc (struct ohci_hcd *hc, gfp_t mem return td; } +static void td_check(struct ohci_hcd *hc, int hash, char *msg) +{ + struct td *td, *first; + + first = hc->td_hash[hash]; + for (td = first; td; td = td->td_hash) { + if (td->td_hash == first || td->td_hash == td) { + ohci_err(hc, "Circular pointer %s: %d %p %p %p\n", + msg, hash, first, td, td->td_hash); + td->td_hash = NULL; + return; + } + } +} + +static void td_check_all(struct ohci_hcd *hc, char *msg) +{ + int hash; + + for (hash = 0; hash < TD_HASH_SIZE; ++hash) + td_check(hc, hash, msg); +} + static void td_free (struct ohci_hcd *hc, struct td *td) { - struct td **prev = &hc->td_hash [TD_HASH_FUNC (td->td_dma)]; + int hash = TD_HASH_FUNC(td->td_dma); + struct td **prev = &hc->td_hash[hash]; - while (*prev && *prev != td) + td_check(hc, hash, "#1a"); + while (*prev && *prev != td) { + if ((unsigned long) *prev == 0xa7a7a7a7) { + ohci_err(hc, "poisoned hash at %p (%d) %p %p\n", prev, + hash, td, hc->td_hash[hash]); + return; + } prev = &(*prev)->td_hash; - if (*prev) + } + if (*prev) { *prev = td->td_hash; + if (*prev == td) { + ohci_err(hc, "invalid hash at %p (%d) %p %p\n", prev, + hash, td, hc->td_hash[hash]); + *prev = NULL; + } + } else if ((td->hwINFO & cpu_to_hc32(hc, TD_DONE)) != 0) ohci_dbg (hc, "no hash for td %p\n", td); + mb(); + td_check(hc, hash, "#1b"); dma_pool_free (hc->td_cache, td, td->td_dma); } Index: usb-2.6/drivers/usb/host/ohci-q.c =================================================================== --- usb-2.6.orig/drivers/usb/host/ohci-q.c +++ usb-2.6/drivers/usb/host/ohci-q.c @@ -558,12 +558,14 @@ td_fill (struct ohci_hcd *ohci, u32 info /* hash it for later reverse mapping */ hash = TD_HASH_FUNC (td->td_dma); + td_check(ohci, hash, "#2a"); td->td_hash = ohci->td_hash [hash]; ohci->td_hash [hash] = td; /* HC might read the TD (or cachelines) right away ... */ wmb (); td->ed->hwTailP = td->hwNextTD; + td_check(ohci, hash, "#2b"); } /*-------------------------------------------------------------------------*/ @@ -1127,9 +1129,11 @@ dl_done_list (struct ohci_hcd *ohci) { struct td *td = dl_reverse_done_list (ohci); + td_check_all(ohci, "#3a"); while (td) { struct td *td_next = td->next_dl_td; takeback_td(ohci, td); td = td_next; } + td_check_all(ohci, "#3b"); } -- To unsubscribe from this list: send the line "unsubscribe linux-media" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html