Re: Potential re(4) / netbsd-4 / i386 problem?
On 10/28/2010 9:42 AM, Brad du Plessis wrote: Hi, I've been seeing panics on a netbsd-4/i386 machine which appears to be related to the reception of oversized frames: re0: discarding oversize frame (len=8813) I've narrowed down the problem here to a specific change. Basically with netbsd-4 branch I see the failure, but if I revert only the file: ./src/sys/dev/mii/rgephy.c to netbsd-4-0-1-RELEASE the problem goes away. A quick update. My oversize frame problems are greatly reduced by reverting rgephy.c as above, I have however still seen 1 instance of an oversize frame on a system that was at the time experiencing very high disk I/O load. The network device stopped working on this system until the system was rebooted. So as I see it there appear to be 2 problems which may or may not be related: 1. Something in the netbsd-4 branch version of rgephy.c causes the system to experience a high number of what it thinks are oversize frames (on my hardware, given the nature of the network traffic in my test). Reverting this file to netbsd-4-0-1-RELEASE cures this. 2. Once the driver does handle an oversize frame, the kernel will either panic with what appears to be memory corruption or the LAN will stop working. Brad
Re: Potential re(4) / netbsd-4 / i386 problem?
Hi, I've been seeing panics on a netbsd-4/i386 machine which appears to be related to the reception of oversized frames: re0: discarding oversize frame (len=8813) I've narrowed down the problem here to a specific change. Basically with netbsd-4 branch I see the failure, but if I revert only the file: ./src/sys/dev/mii/rgephy.c to netbsd-4-0-1-RELEASE the problem goes away. Looking at the difference between the 2 revisions I would guess the most likely cause is the difference in register writes in rgephy_reset? Unfortunately for my purposes one of the two motherboard types I have exhibiting the problem has an RTL8111C which (without the netbsd-4 changes) fails to detect the media automatically (forcing it to 1000baseT has it sync at 100baseTX for some reason). Are there any changes I could make to the netbsd-4 rgephy.c to find a fix for this? (netbsd-5 has the same problem by the way) Thanks, Brad # cd /usr/src/sys/dev/mii # cvs diff -u -r netbsd-4-0-1-RELEASE -r netbsd-4 rgephy.c Index: rgephy.c === RCS file: /cvsroot/src/sys/dev/mii/rgephy.c,v retrieving revision 1.15 retrieving revision 1.15.2.1 diff -u -r1.15 -r1.15.2.1 --- rgephy.c 29 Nov 2006 13:57:59 -0000 1.15 +++ rgephy.c 18 Aug 2009 09:46:50 -0000 1.15.2.1 @@ -1,4 +1,4 @@ -/* $NetBSD: rgephy.c,v 1.15 2006/11/29 13:57:59 tsutsui Exp $ */ +/* $NetBSD: rgephy.c,v 1.15.2.1 2009/08/18 09:46:50 bouyer Exp $ */ /* * Copyright (c) 2003 @@ -33,7 +33,7 @@ */ #include <sys/cdefs.h> -__KERNEL_RCSID(0, "$NetBSD: rgephy.c,v 1.15 2006/11/29 13:57:59 tsutsui Exp $"); +__KERNEL_RCSID(0, "$NetBSD: rgephy.c,v 1.15.2.1 2009/08/18 09:46:50 bouyer Exp $"); /* @@ -61,7 +61,12 @@ static int rgephy_match(struct device *, struct cfdata *, void *); static void rgephy_attach(struct device *, struct device *, void *); -CFATTACH_DECL(rgephy, sizeof(struct mii_softc), +struct rgephy_softc { +struct mii_softc mii_sc; +int mii_revision; +}; + +CFATTACH_DECL(rgephy, sizeof(struct rgephy_softc), rgephy_match, rgephy_attach, mii_phy_detach, mii_phy_activate); @@ 
-72,8 +77,6 @@ static void rgephy_loop(struct mii_softc *); static void rgephy_load_dspcode(struct mii_softc *); -static int rgephy_mii_model; - static const struct mii_phy_funcs rgephy_funcs = { rgephy_service, rgephy_status, rgephy_reset, }; @@ -103,19 +106,26 @@ static void rgephy_attach(struct device *parent, struct device *self, void *aux) { -struct mii_softc *sc = device_private(self); +struct rgephy_softc *rsc = device_private(self); +struct mii_softc *sc = &rsc->mii_sc; struct mii_attach_args *ma = aux; struct mii_data *mii = ma->mii_data; const struct mii_phydesc *mpd; int rev; const char *sep = ""; +rsc = device_private(self); +sc = &rsc->mii_sc; +ma = aux; +mii = ma->mii_data; + rev = MII_REV(ma->mii_id2); mpd = mii_phy_match(ma, rgephys); aprint_naive(": Media interface\n"); aprint_normal(": %s, rev. %d\n", mpd->mpd_name, rev); -sc->mii_mpd_model = rev; /* XXX miivar.h comment vs usage? */ +rsc->mii_revision = rev; + sc->mii_inst = mii->mii_instance; sc->mii_phy = ma->mii_phyno; sc->mii_pdata = mii; @@ -124,23 +134,14 @@ sc->mii_funcs = &rgephy_funcs; -/* Don't do isolate on this PHY. */ -sc->mii_flags |= MIIF_NOISOLATE; - #define ADD(m, c) ifmedia_add(&mii->mii_media, (m), (c), NULL) #define PRINT(n) aprint_normal("%s%s", sep, (n)); sep = ", " -#if 0 -ADD(IFM_MAKEWORD(IFM_ETHER, IFM_NONE, 0, sc->mii_inst), -BMCR_ISO); -#endif #ifdef __FreeBSD__ ADD(IFM_MAKEWORD(IFM_ETHER, IFM_100_TX, IFM_LOOP, sc->mii_inst), BMCR_LOOP|BMCR_S100); #endif -rgephy_mii_model = MII_MODEL(ma->mii_id2); - sc->mii_capabilities = PHY_READ(sc, MII_BMSR) & ma->mii_capmask; sc->mii_capabilities &= ~BMSR_ANEG; @@ -149,19 +150,11 @@ * media explicitly. Why? 
*/ aprint_normal("%s: ", sc->mii_dev.dv_xname); -#ifdef __FreeBSD__ -mii_phy_add_media(sc); -ADD(IFM_MAKEWORD(IFM_ETHER, IFM_1000_T, 0, sc->mii_inst), -RGEPHY_BMCR_FDX); -PRINT(", 1000baseTX"); -ADD(IFM_MAKEWORD(IFM_ETHER, IFM_1000_T, IFM_FDX, sc->mii_inst), 0); -PRINT("1000baseTX-FDX"); -#else if (sc->mii_capabilities & BMSR_EXTSTAT) { sc->mii_extcapabilities = PHY_READ(sc, MII_EXTSR); } mii_phy_add_media(sc); -#endif + /* rtl8169S does not report auto-sense; add manually. */ ADD(IFM_MAKEWORD(IFM_ETHER, IFM_AUTO, 0, sc->mii_inst), MII_NMEDIA); sep =", "; @@ -177,9 +170,12 @@ static int rgephy_service(struct mii_softc *sc, struct mii_data *mii, int cmd) { +struct rgephy_softc *rsc; struct ifmedia_entry *ife = mii->mii_media.ifm_cur; int reg, speed, gig, anar; +rsc = (struct rgephy_softc *)sc; + switch (cmd) { case MII_POLLSTAT: /* @@ -254,7 +250,7 @@ } /* -
Re: Potential re(4) / netbsd-4 / i386 problem?
On 7/23/2010 12:54 PM, der Mouse wrote: Well, use machines whose designers cut corners on hardware design and guess what happens. Actually, my main reason for writing is to mention that I have a laptop, running 4.0.1, with an re onboard, and have never seen such random crashes. I can give more details if they matter. I've got 3 motherboards with re onboard that I've tested; 2 of the 3 have the problem. I checked the re hwrev and the one that works fine is 0x2800. The 2 boards that don't work have hwrev 0x3800 and 0x3C40. The board that's fine is a commercial Intel DG41MJ while the other 2 are both DFI industrial boards (LT600-DR, LT330-B).
Re: Potential re(4) / netbsd-4 / i386 problem?
On 7/26/2010 8:50 AM, der Mouse wrote: I don't see anything there that looks like the rev numbers you're talking about. While now is not a good time, I'll have a look at the code and see if I can find the hwrev value you're talking about and print out its value for my hardware. I manually printed it out in re_attach in rtl8169.c. Thanks, Brad
Re: Potential re(4) / netbsd-4 / i386 problem?
On 7/21/2010 3:17 PM, Izumi Tsutsui wrote: I'm afraid it's memory corruption caused by hardware or PCI BIOS problem. What happens if you change cfg = RE_CPLUSCMD_PCI_MRW; to cfg = 0; in re_init()? Or can you have newer BIOS for your hardware? I've tried this and I'm still able to reproduce the problem. On 7/23/2010 12:12 PM, Manuel Bouyer wrote: is it possible that the re device is writing past its buffer (via DMA) and overwriting random memory? Is it possible that I can increase this buffer and check it for overrun when data arrives? If so, where can I find this buffer? Brad
Re: Potential re(4) / netbsd-4 / i386 problem?
On 3/5/2010 12:39 PM, Brad du Plessis wrote: On 3/5/2010 12:18 PM, Izumi Tsutsui wrote: I've been seeing panics on a netbsd-4/i386 machine which appears to be related to the reception of oversized frames: re0: discarding oversize frame (len=8813) : Can anyone help me? - options DIAGNOSTIC might help debug I'm busy trying to reproduce with options DIAGNOSTIC right now. So far it hasn't panicked in the last 3 hours. - does it happen on UP kernel (GENERIC, not GENERIC.MP), or netbsd-5? I've managed to reproduce this now in netbsd-5 too (source is about 3 months old, not sure if there have been any changes since): re0: discarding oversize frame (len=9041) re0: discarding oversize frame (len=16158) panic: kernel diagnostic assertion pcg->pcg_avail == 0 failed: file ../../../../kern_subr_pool.c, line 2580 As I think I've said before, the actual crash point is different every time but the panic is always preceded by the "discarding oversize frame" message. Sometimes the len in the oversize frame message is len=-1. Any advice? Thanks, Brad
Re: Potential re(4) / netbsd-4 / i386 problem?
On 3/5/2010 12:18 PM, Izumi Tsutsui wrote: I've been seeing panics on a netbsd-4/i386 machine which appears to be related to the reception of oversized frames: re0: discarding oversize frame (len=8813) : Can anyone help me? - options DIAGNOSTIC might help debug Had a kernel running with options DIAGNOSTIC and while I didn't see any printout other than those I saw before, I was able to get a kgdb session up and I have a back trace. Printed out a few bits and pieces, not sure what would be of real interest: Program received signal SIGSEGV, Segmentation fault. 0xc05dba53 in ether_input (ifp=0xc21f503c, m=0xc24a4800) at ../../../../net/if_ethersubr.c:648 648 etype = ntohs(eh->ether_type); (gdb) bt #0 0xc05dba53 in ether_input (ifp=0xc21f503c, m=0xc24a4800) at ../../../../net/if_ethersubr.c:648 #1 0xc0382d93 in re_rxeof (sc=0xc21f5000) at ../../../../dev/ic/rtl8169.c:1374 #2 0xc03832a2 in re_intr (arg=0xc21f5000) at ../../../../dev/ic/rtl8169.c:1565 #3 0xc062a5d5 in intr_biglock_wrapper (vp=0xc21aef80) at ../../../../arch/x86/x86/intr.c:544 #4 0xc0108198 in Xintr_ioapic_level11 () #5 0xc21aef80 in ?? () #6 0x in ?? 
() (gdb) p eh $1 = (struct ether_header *) 0x58a07f87 (gdb) p m $2 = (struct mbuf *) 0xc24a4800 (gdb) p *eh Cannot access memory at address 0x58a07f87 (gdb) p *m $3 = {m_hdr = {mh_next = 0x388a43eb, mh_nextpkt = 0x3bd2a781, mh_data = 0x58a07f87 Address 0x58a07f87 out of bounds, mh_owner = 0x8e878f3c, mh_len = 1082, mh_flags = -187037215, mh_paddr = 515969279, mh_type = 27862}, M_dat = {MH = {MH_pkthdr = { rcvif = 0xc21f503c, tags = {slh_first = 0xaf76}, len = 1082, csum_flags = 197, csum_data = 2687232, segsz = 419436127}, MH_dat = {MH_ext = { ext_buf = 0x1d010ad0 Address 0x1d010ad0 out of bounds, ext_free = 0x450008, ext_arg = 0x84912800, ext_size = 104857600, ext_type = 0xa8c03063, ext_nextref = 0xa8c00102, ext_prevref = 0x657dca02, ext_un = {extun_paddr = 1729288689, extun_pgs = {0x6712d9f1, 0x9d4a3f62, 0x1050e731, 0xeae401b, 0x0, 0x0, 0xd0dcbbd0, 0x5b2500f, 0x0, 0x290100, 0x1900165f, 0x80010ad0, 0x450008, 0x7dc2a005, 0x640, 0xa8c0b12c, 0xa8c00f02}}, ext_ofile = 0x667dca02 Address 0x667dca02 out of bounds, ext_nfile = 0xd00e21ff Address 0xd00e21ff out of bounds, ext_oline = 2029390742, ext_nline = 407935915}, MH_databuf = Ð\n\001\035\b\000E\000\000(\221\204\000\...@\006c0À¨\002\001À¨\002Ê}eñÙ\022gb?j\2351 çp\020\...@®\016\000\000\000\000\000\000\000\000лÜÐ\017p²\005\000\000\000\000\000\001)\000_\026\000\031Ð\ n\001\200\b\000E\000\005 Â}\000\...@\006,±À¨\002\017À¨\002Ê}fÿ!\016Ð\226\vöx«\233p\030\...@ÕÏ\000\0003--+ ,.+,*')\001\017\017\020\020\020\020\021\017\020\020\020\020\020\017\017\017\021\020\020\020\017\017\020\02 0\017\020\020\021\017\f\020\020\020\020\020\017\017\017\020\021\017\017\022\031'=Rbmty\177|~}\200\202\200\ 177\177\201~}~\201\201\201\203\205}}, M_databuf = P\037Âv¯\000\000:\004\000\000Å\000\000\000\000\001)\000_\026\000\031Ð\n\001\035\b\000E\0 00\000(\221\204\000\...@\006c0À¨\002\001À¨\002Ê}eñÙ\022gb?j\2351çp\020\033@®\016\000\000\000\000\000\000\0 00\000лÜÐ\017P²\005\000\000\000\000\000\001)\000_\026\000\031Ð\n\001\200\b\000E\000\005 
Â}\000\...@\006,± À¨\002\017À¨\002Ê}fÿ!\016Ð\226\vöx«\233p\030\...@ÕÏ\000\0003--+,.+,*')\001\017\017\020\020\020\020\021\01 7\020\020\020\020\020\017\017\017\021\020\020\020\017\017\020\020\017\020\020\021\017\f\020\020\020\020\02 0\017\017\017\020\021\017\017\022\031'...}} (gdb) up #1 0xc0382d93 in re_rxeof (sc=0xc21f5000) at ../../../../dev/ic/rtl8169.c:1374 1374(*ifp-if_input)(ifp, m); (gdb) p ifp $4 = (struct ifnet *) 0xc21f503c (gdb) p *ifp $5 = {if_softc = 0xc21f5000, if_list = {tqe_next = 0xc2253400, tqe_prev = 0xc0c98f80}, if_addrlist = { tqh_first = 0xc21dfe80, tqh_last = 0xc22dbb10}, if_xname = re0, '\0' repeats 12 times, if_pcount = 0, if_bpf = 0x0, if_index = 1, if_timer = 0, if_flags = -30653, if__pad1 = 0, if_data = { ifi_type = 6 '\006', ifi_addrlen = 6 '\006', ifi_hdrlen = 14 '\016', ifi_link_state = 2, ifi_mtu = 1500, ifi_metric = 0, ifi_baudrate = 10, ifi_ipackets = 13724322, ifi_ierrors = 5, ifi_opackets = 4730575, ifi_oerrors = 0, ifi_collisions = 0, ifi_ibytes = 11913686395, ifi_obytes = 256544336, ifi_imcasts = 12006, ifi_omcasts = 5, ifi_iqdrops = 0, ifi_noproto = 0, ifi_lastchange = {tv_sec = 0, tv_usec = 0}}, if_output = 0xc05db01c ether_output, if_input = 0xc05dba19 ether_input, if_start = 0xc0383326 re_start, if_ioctl = 0xc0384654 re_ioctl, if_init = 0xc0383ca1 re_init, if_stop = 0xc03847e6 re_stop, if_watchdog = 0xc0384774 re_watchdog, if_drain = 0, if_snd = {ifq_head = 0x0, ifq_tail = 0x0, ifq_len = 0,