Re: Potential re(4) / netbsd-4 / i386 problem?

2011-01-03 Thread Brad du Plessis



On 10/28/2010 9:42 AM, Brad du Plessis wrote:

Hi,


I've been seeing panics on a netbsd-4/i386 machine which appear to be
related to the reception of oversized frames:

re0: discarding oversize frame (len=8813) 


I've narrowed down the problem here to a specific change.
Basically with netbsd-4 branch I see the failure, but if I revert
only the file:

./src/sys/dev/mii/rgephy.c

to netbsd-4-0-1-RELEASE the problem goes away.


A quick update.

My oversize frame problems are greatly reduced by
reverting rgephy.c as above. However, I have still seen
1 instance of an oversize frame on a system that was at
the time experiencing very high disk I/O load. The
network device stopped working on this system until the
system was rebooted.

So as I see it there appear to be 2 problems which may or
may not be related:

1. Something in the netbsd-4 branch version of rgephy.c
   causes the system to experience a high number of what
   it thinks are oversize frames (on my hardware, given the
   nature of the network traffic in my test). Reverting this file to
   netbsd-4-0-1-RELEASE cures this.

2. Once the driver does handle an oversize frame, the
   kernel will either panic with what appears to be memory
   corruption or the LAN will stop working.


 Brad


Re: Potential re(4) / netbsd-4 / i386 problem?

2010-10-28 Thread Brad du Plessis

Hi,


I've been seeing panics on a netbsd-4/i386 machine which appear to be
related to the reception of oversized frames:

re0: discarding oversize frame (len=8813) 


I've narrowed down the problem here to a specific change.
Basically with netbsd-4 branch I see the failure, but if I revert
only the file:

./src/sys/dev/mii/rgephy.c

to netbsd-4-0-1-RELEASE the problem goes away. Looking at the difference
between the 2 revisions I would guess the most likely cause is the
difference in register writes in rgephy_reset?

Unfortunately for my purposes, one of the two motherboard types I have
exhibiting the problem has an RTL8111C which (without the netbsd-4 changes)
fails to detect the media automatically (forcing it to 1000baseT has it
sync at 100baseTX for some reason).


Are there any changes I could make to the netbsd-4 rgephy.c to find a
fix for this? (netbsd-5 has the same problem, by the way.)

Thanks,
 Brad


# cd /usr/src/sys/dev/mii
# cvs diff -u -r netbsd-4-0-1-RELEASE -r netbsd-4 rgephy.c

Index: rgephy.c
===
RCS file: /cvsroot/src/sys/dev/mii/rgephy.c,v
retrieving revision 1.15
retrieving revision 1.15.2.1
diff -u -r1.15 -r1.15.2.1
--- rgephy.c29 Nov 2006 13:57:59 -1.15
+++ rgephy.c18 Aug 2009 09:46:50 -1.15.2.1
@@ -1,4 +1,4 @@
-/*$NetBSD: rgephy.c,v 1.15 2006/11/29 13:57:59 tsutsui Exp $*/
+/*$NetBSD: rgephy.c,v 1.15.2.1 2009/08/18 09:46:50 bouyer Exp $*/

 /*
  * Copyright (c) 2003
@@ -33,7 +33,7 @@
  */

 #include sys/cdefs.h
-__KERNEL_RCSID(0, $NetBSD: rgephy.c,v 1.15 2006/11/29 13:57:59 tsutsui 
Exp $);
+__KERNEL_RCSID(0, $NetBSD: rgephy.c,v 1.15.2.1 2009/08/18 09:46:50 
bouyer Exp $);



 /*
@@ -61,7 +61,12 @@
 static intrgephy_match(struct device *, struct cfdata *, void *);
 static voidrgephy_attach(struct device *, struct device *, void *);

-CFATTACH_DECL(rgephy, sizeof(struct mii_softc),
+struct rgephy_softc {
+struct mii_softc mii_sc;
+int mii_revision;
+};
+
+CFATTACH_DECL(rgephy, sizeof(struct rgephy_softc),
 rgephy_match, rgephy_attach, mii_phy_detach, mii_phy_activate);


@@ -72,8 +77,6 @@
 static voidrgephy_loop(struct mii_softc *);
 static voidrgephy_load_dspcode(struct mii_softc *);

-static intrgephy_mii_model;
-
 static const struct mii_phy_funcs rgephy_funcs = {
 rgephy_service, rgephy_status, rgephy_reset,
 };
@@ -103,19 +106,26 @@
 static void
 rgephy_attach(struct device *parent, struct device *self, void *aux)
 {
-struct mii_softc *sc = device_private(self);
+struct rgephy_softc *rsc = device_private(self);
+struct mii_softc *sc = rsc-mii_sc;
 struct mii_attach_args *ma = aux;
 struct mii_data *mii = ma-mii_data;
 const struct mii_phydesc *mpd;
 int rev;
 const char *sep = ;

+rsc = device_private(self);
+sc = rsc-mii_sc;
+ma = aux;
+mii = ma-mii_data;
+
 rev = MII_REV(ma-mii_id2);
 mpd = mii_phy_match(ma, rgephys);
 aprint_naive(: Media interface\n);
 aprint_normal(: %s, rev. %d\n, mpd-mpd_name, rev);

-sc-mii_mpd_model = rev;/* XXX miivar.h comment vs usage? */
+rsc-mii_revision = rev;
+
 sc-mii_inst = mii-mii_instance;
 sc-mii_phy = ma-mii_phyno;
 sc-mii_pdata = mii;
@@ -124,23 +134,14 @@

 sc-mii_funcs = rgephy_funcs;

-/* Don't do isolate on this PHY. */
-sc-mii_flags |= MIIF_NOISOLATE;
-
 #defineADD(m, c)ifmedia_add(mii-mii_media, (m), (c), NULL)
 #definePRINT(n)aprint_normal(%s%s, sep, (n)); sep = , 

-#if 0
-ADD(IFM_MAKEWORD(IFM_ETHER, IFM_NONE, 0, sc-mii_inst),
-BMCR_ISO);
-#endif
 #ifdef __FreeBSD__
 ADD(IFM_MAKEWORD(IFM_ETHER, IFM_100_TX, IFM_LOOP, sc-mii_inst),
 BMCR_LOOP|BMCR_S100);
 #endif

-rgephy_mii_model = MII_MODEL(ma-mii_id2);
-
 sc-mii_capabilities = PHY_READ(sc, MII_BMSR)  ma-mii_capmask;
 sc-mii_capabilities = ~BMSR_ANEG;

@@ -149,19 +150,11 @@
  * media explicitly. Why?
  */
 aprint_normal(%s: , sc-mii_dev.dv_xname);
-#ifdef __FreeBSD__
-mii_phy_add_media(sc);
-ADD(IFM_MAKEWORD(IFM_ETHER, IFM_1000_T, 0, sc-mii_inst),
-RGEPHY_BMCR_FDX);
-PRINT(, 1000baseTX);
-ADD(IFM_MAKEWORD(IFM_ETHER, IFM_1000_T, IFM_FDX, sc-mii_inst), 0);
-PRINT(1000baseTX-FDX);
-#else
 if (sc-mii_capabilities  BMSR_EXTSTAT) {
 sc-mii_extcapabilities = PHY_READ(sc, MII_EXTSR);
 }
 mii_phy_add_media(sc);
-#endif
+
 /* rtl8169S does not report auto-sense; add manually.  */
 ADD(IFM_MAKEWORD(IFM_ETHER, IFM_AUTO, 0, sc-mii_inst), MII_NMEDIA);
 sep =, ;
@@ -177,9 +170,12 @@
 static int
 rgephy_service(struct mii_softc *sc, struct mii_data *mii, int cmd)
 {
+struct rgephy_softc *rsc;
 struct ifmedia_entry *ife = mii-mii_media.ifm_cur;
 int reg, speed, gig, anar;

+rsc = (struct rgephy_softc *)sc;
+
 switch (cmd) {
 case MII_POLLSTAT:
 /*
@@ -254,7 +250,7 @@
 }

 /*
-   

Re: Potential re(4) / netbsd-4 / i386 problem?

2010-07-26 Thread Brad du Plessis



On 7/23/2010 12:54 PM, der Mouse wrote:

Well, use machines whose designers cut corners on hardware design and
guess what happens.

Actually, my main reason for writing is to mention that I have a
laptop, running 4.0.1, with an re onboard, and have never seen such
random crashes.  I can give more details if they matter.
   


I've got 3 motherboards with re onboard that I've tested; 2 of the 3
have the problem. I checked the re hwrev and the one that works fine is
0x2800. The 2 boards that don't work have hwrev 0x3800 and 0x3C40. The
board that's fine is a commercial Intel DG41MJ, while the other 2 are
both DFI industrial boards (LT600-DR, LT330-B).


Re: Potential re(4) / netbsd-4 / i386 problem?

2010-07-26 Thread Brad du Plessis



On 7/26/2010 8:50 AM, der Mouse wrote:

I don't see anything there that looks like the rev numbers you're
talking about.  While now is not a good time, I'll have a look at the
code and see if I can find the hwrev value you're talking about and
print out its value for my hardware.


I manually printed it out in re_attach in rtl8169.c.

Thanks,
 Brad


Re: Potential re(4) / netbsd-4 / i386 problem?

2010-07-23 Thread Brad du Plessis


On 7/21/2010 3:17 PM, Izumi Tsutsui wrote:

I'm afraid it's memory corruption caused by hardware or PCI BIOS problem.

What happens if you change
   

cfg = RE_CPLUSCMD_PCI_MRW;
 

to
   

cfg = 0;
 

in re_init()? Or can you have newer BIOS for your hardware?
   


I've tried this and I'm still able to reproduce the problem.


On 7/23/2010 12:12 PM, Manuel Bouyer wrote:

Is it possible that the re device is writing past its buffer (via DMA) and
overwriting random memory?
   


Is it possible that I can increase this buffer and check it for overrun 
when data arrives? If so, where can I find this buffer?


Brad


Re: Potential re(4) / netbsd-4 / i386 problem?

2010-07-22 Thread Brad du Plessis

On 3/5/2010 12:39 PM, Brad du Plessis wrote:


On 3/5/2010 12:18 PM, Izumi Tsutsui wrote:

I've been seeing panics on a netbsd-4/i386 machine which appears to be
related to the reception of oversized frames:

re0: discarding oversize frame (len=8813)

:

Can anyone help me?


- options DIAGNOSTIC might help debug


I'm busy trying to reproduce with options DIAGNOSTIC right now. So far 
it hasn't panicked in the last 3 hours.



- does it happen on UP kernel (GENERIC, not GENERIC.MP), or netbsd-5?


I've managed to reproduce this now in netbsd-5 too (source is about 3 
months old, not sure if there have been any changes since):


re0: discarding oversize frame (len=9041)
re0: discarding oversize frame (len=16158)
panic: kernel diagnostic assertion "pcg->pcg_avail == 0" failed: file
"../../../../kern_subr_pool.c", line 2580



As I think I've said before, the actual crash point is different
every time, but the panic is always preceded by the "discarding
oversize frame" message. Sometimes the len in the oversize frame
message is len=-1.

Any advice?

Thanks,
 Brad


Re: Potential re(4) / netbsd-4 / i386 problem?

2010-03-08 Thread Brad du Plessis


On 3/5/2010 12:18 PM, Izumi Tsutsui wrote:

I've been seeing panics on a netbsd-4/i386 machine which appears to be
related to the reception of oversized frames:

re0: discarding oversize frame (len=8813)

  :

Can anyone help me?


- options DIAGNOSTIC might help debug


Had a kernel running with options DIAGNOSTIC and, while I didn't see any
printout other than those I saw before, I was able to get a kgdb session
up and I have a backtrace. I printed out a few bits and pieces; not sure
what would be of real interest:

Program received signal SIGSEGV, Segmentation fault.
0xc05dba53 in ether_input (ifp=0xc21f503c, m=0xc24a4800) at ../../../../net/if_ethersubr.c:648
648 etype = ntohs(eh->ether_type);
(gdb) bt
#0  0xc05dba53 in ether_input (ifp=0xc21f503c, m=0xc24a4800) at 
../../../../net/if_ethersubr.c:648
#1  0xc0382d93 in re_rxeof (sc=0xc21f5000) at 
../../../../dev/ic/rtl8169.c:1374
#2  0xc03832a2 in re_intr (arg=0xc21f5000) at 
../../../../dev/ic/rtl8169.c:1565
#3  0xc062a5d5 in intr_biglock_wrapper (vp=0xc21aef80) at 
../../../../arch/x86/x86/intr.c:544

#4  0xc0108198 in Xintr_ioapic_level11 ()
#5  0xc21aef80 in ?? ()
#6  0x in ?? ()
(gdb) p eh
$1 = (struct ether_header *) 0x58a07f87
(gdb) p m
$2 = (struct mbuf *) 0xc24a4800
(gdb) p *eh
Cannot access memory at address 0x58a07f87
(gdb) p *m
$3 = {m_hdr = {mh_next = 0x388a43eb, mh_nextpkt = 0x3bd2a781,
mh_data = 0x58a07f87 Address 0x58a07f87 out of bounds, mh_owner = 
0x8e878f3c, mh_len = 1082,
mh_flags = -187037215, mh_paddr = 515969279, mh_type = 27862}, 
M_dat = {MH = {MH_pkthdr = {
rcvif = 0xc21f503c, tags = {slh_first = 0xaf76}, len = 1082, 
csum_flags = 197,

csum_data = 2687232, segsz = 419436127}, MH_dat = {MH_ext = {
  ext_buf = 0x1d010ad0 Address 0x1d010ad0 out of bounds, 
ext_free = 0x450008,
  ext_arg = 0x84912800, ext_size = 104857600, ext_type = 
0xa8c03063, ext_nextref = 0xa8c00102,
  ext_prevref = 0x657dca02, ext_un = {extun_paddr = 1729288689, 
extun_pgs = {0x6712d9f1,
  0x9d4a3f62, 0x1050e731, 0xeae401b, 0x0, 0x0, 0xd0dcbbd0, 
0x5b2500f, 0x0, 0x290100,
  0x1900165f, 0x80010ad0, 0x450008, 0x7dc2a005, 0x640, 
0xa8c0b12c, 0xa8c00f02}},

  ext_ofile = 0x667dca02 Address 0x667dca02 out of bounds,
  ext_nfile = 0xd00e21ff Address 0xd00e21ff out of bounds, 
ext_oline = 2029390742,

  ext_nline = 407935915},
MH_databuf = 
Ð\n\001\035\b\000E\000\000(\221\204\000\...@\006c0À¨\002\001À¨\002Ê}eñÙ\022gb?j\2351

çp\020\...@®\016\000\000\000\000\000\000\000\000лÜÐ\017p²\005\000\000\000\000\000\001)\000_\026\000\031Ð\
n\001\200\b\000E\000\005 
Â}\000\...@\006,±À¨\002\017À¨\002Ê}fÿ!\016Ð\226\vöx«\233p\030\...@ÕÏ\000\0003--+
,.+,*')\001\017\017\020\020\020\020\021\017\020\020\020\020\020\017\017\017\021\020\020\020\017\017\020\02
0\017\020\020\021\017\f\020\020\020\020\020\017\017\017\020\021\017\017\022\031'=Rbmty\177|~}\200\202\200\
177\177\201~}~\201\201\201\203\205}},
M_databuf = 
P\037Âv¯\000\000:\004\000\000Å\000\000\000\000\001)\000_\026\000\031Ð\n\001\035\b\000E\0

00\000(\221\204\000\...@\006c0À¨\002\001À¨\002Ê}eñÙ\022gb?j\2351çp\020\033@®\016\000\000\000\000\000\000\0
00\000лÜÐ\017P²\005\000\000\000\000\000\001)\000_\026\000\031Ð\n\001\200\b\000E\000\005 Â}\000\...@\006,± 
À¨\002\017À¨\002Ê}fÿ!\016Ð\226\vöx«\233p\030\...@ÕÏ\000\0003--+,.+,*')\001\017\017\020\020\020\020\021\01 
7\020\020\020\020\020\017\017\017\021\020\020\020\017\017\020\020\017\020\020\021\017\f\020\020\020\020\02 
0\017\017\017\020\021\017\017\022\031'...}}

(gdb) up
#1  0xc0382d93 in re_rxeof (sc=0xc21f5000) at ../../../../dev/ic/rtl8169.c:1374
1374 (*ifp->if_input)(ifp, m);
(gdb) p ifp
$4 = (struct ifnet *) 0xc21f503c
(gdb) p *ifp
$5 = {if_softc = 0xc21f5000, if_list = {tqe_next = 0xc2253400, tqe_prev 
= 0xc0c98f80}, if_addrlist = {
tqh_first = 0xc21dfe80, tqh_last = 0xc22dbb10}, if_xname = re0, 
'\0' repeats 12 times,
  if_pcount = 0, if_bpf = 0x0, if_index = 1, if_timer = 0, if_flags = 
-30653, if__pad1 = 0, if_data = {
ifi_type = 6 '\006', ifi_addrlen = 6 '\006', ifi_hdrlen = 14 
'\016', ifi_link_state = 2,
ifi_mtu = 1500, ifi_metric = 0, ifi_baudrate = 10, 
ifi_ipackets = 13724322, ifi_ierrors = 5,
ifi_opackets = 4730575, ifi_oerrors = 0, ifi_collisions = 0, 
ifi_ibytes = 11913686395,
ifi_obytes = 256544336, ifi_imcasts = 12006, ifi_omcasts = 5, 
ifi_iqdrops = 0, ifi_noproto = 0,
ifi_lastchange = {tv_sec = 0, tv_usec = 0}}, if_output = 0xc05db01c 
ether_output,

  if_input = 0xc05dba19 ether_input, if_start = 0xc0383326 re_start,
  if_ioctl = 0xc0384654 re_ioctl, if_init = 0xc0383ca1 re_init, 
if_stop = 0xc03847e6 re_stop,
  if_watchdog = 0xc0384774 re_watchdog, if_drain = 0, if_snd = 
{ifq_head = 0x0, ifq_tail = 0x0,
ifq_len = 0,