On Mon, Oct 21, 2019 at 11:19:57AM +0200, Matthieu Herrb wrote: > On Mon, Oct 21, 2019 at 09:52:57AM +0100, Stuart Henderson wrote: > > On 2019/10/21 10:44, Matthieu Herrb wrote: > > > I've observed this on several of our VMs after upgrading them to OpenBSD > > > 6.6. > > > > This has been a problem for ages with no suggested ideas for what might be > > wrong. I suggest replacing the virtual nics with e1000 if you want them to > > actually work.. > > The strange thing is that the VMs have been working for me since at > least 6.3 without ever stumbling on this problem. > > On the critical VM bumping the RAM to 1024M seems to have "fixed" the > issue. It has been running stable for almost 48h now. >
I've done some more experiments. It is clear now that it is caused by memory pressure. I'm experimenting with the small memory-consuming program below, and some printfs added (patches below too) and here is a sample run. This is on a VM with 256MB of RAM. #include <stdlib.h> #include <stdio.h> #include <unistd.h> #include <string.h> int main(int argc, char *argv[]) { int i; char *buf; for (i = 0; i < 8192; i++) { printf("malloc(%dM)\n", i); buf = malloc(i*1024*1024); if (buf != NULL) { memset(buf, 0x55, i*1024*1024); free(buf); printf("success\n"); } else { printf("failure\n"); exit(0); } } } Driver patch: Index: if_vmx.c =================================================================== RCS file: /cvs/src/sys/dev/pci/if_vmx.c,v retrieving revision 1.50 diff -u -r1.50 if_vmx.c --- if_vmx.c 6 Aug 2019 10:54:40 -0000 1.50 +++ if_vmx.c 21 Oct 2019 13:15:43 -0000 @@ -507,13 +507,17 @@ u_int slots; for (slots = if_rxr_get(&ring->rxr, NRXDESC); slots > 0; slots--) { - if (vmxnet3_getbuf(sc, ring)) + if (vmxnet3_getbuf(sc, ring)) { + printf("getbuf error slots %d\n", slots); break; + } } if_rxr_put(&ring->rxr, slots); - if (if_rxr_inuse(&ring->rxr) == 0) + if (if_rxr_inuse(&ring->rxr) == 0) { + printf("timeout_add\n"); timeout_add(&ring->refill, 1); + } } void @@ -923,7 +927,7 @@ int btype; if (ring->m[idx]) - panic("vmxnet3_getbuf: buffer has mbuf"); + panic("vmxnet3_getbuf: buffer has mbuf idx %d %p", idx, ring->m[idx]); #if 1 /* XXX Don't allocate buffers for ring 2 for now. 
*/ @@ -938,9 +942,10 @@ #endif m = MCLGETI(NULL, M_DONTWAIT, NULL, JUMBO_LEN); - if (m == NULL) + if (m == NULL) { + printf("MCLGETI returns NULL\n"); return -1; - + } m->m_pkthdr.len = m->m_len = JUMBO_LEN; m_adj(m, ETHER_ALIGN); ring->m[idx] = m; Results MCLGETI returns NULL getbuf error slots 9 MCLGETI returns NULL getbuf error slots 78 MCLGETI returns NULL getbuf error slots 93 timeout_add MCLGETI returns NULL getbuf error slots 93 timeout_add MCLGETI returns NULL getbuf error slots 14 MCLGETI returns NULL getbuf error slots 33 MCLGETI returns NULL getbuf error slots 55 MCLGETI returns NULL getbuf error slots 71 MCLGETI returns NULL getbuf error slots 81 MCLGETI returns NULL getbuf error slots 95 timeout_add MCLGETI returns NULL getbuf error slots 95 timeout_add MCLGETI returns NULL getbuf error slots 96 timeout_add MCLGETI returns NULL getbuf error slots 16 MCLGETI returns NULL getbuf error slots 38 MCLGETI returns NULL getbuf error slots 57 MCLGETI returns NULL getbuf error slots 78 MCLGETI returns NULL getbuf error slots 97 timeout_add MCLGETI returns NULL getbuf error slots 98 timeout_add MCLGETI returns NULL getbuf error slots 99 timeout_add MCLGETI returns NULL getbuf error slots 100 timeout_add MCLGETI returns NULL getbuf error slots 101 timeout_add MCLGETI returns NULL getbuf error slots 102 timeout_add MCLGETI returns NULL getbuf error slots 103 timeout_add MCLGETI returns NULL getbuf error slots 104 timeout_add MCLGETI returns NULL getbuf error slots 105 timeout_add MCLGETI returns NULL getbuf error slots 106 timeout_add MCLGETI returns NULL getbuf error slots 1 MCLGETI returns NULL getbuf error slots 63 MCLGETI returns NULL getbuf error slots 108 timeout_add MCLGETI returns NULL getbuf error slots 108 timeout_add MCLGETI returns NULL getbuf error slots 1 MCLGETI returns NULL getbuf error slots 52 MCLGETI returns NULL getbuf error slots 88 MCLGETI returns NULL getbuf error slots 109 MCLGETI returns NULL getbuf error slots 17 MCLGETI returns NULL 
getbuf error slots 70 MCLGETI returns NULL getbuf error slots 110 timeout_add MCLGETI returns NULL getbuf error slots 3 MCLGETI returns NULL getbuf error slots 66 MCLGETI returns NULL getbuf error slots 111 timeout_add MCLGETI returns NULL getbuf error slots 112 timeout_add MCLGETI returns NULL getbuf error slots 113 timeout_add MCLGETI returns NULL getbuf error slots 114 timeout_add MCLGETI returns NULL getbuf error slots 115 timeout_add MCLGETI returns NULL getbuf error slots 116 timeout_add MCLGETI returns NULL getbuf error slots 117 timeout_add MCLGETI returns NULL getbuf error slots 118 timeout_add MCLGETI returns NULL getbuf error slots 119 timeout_add MCLGETI returns NULL getbuf error slots 120 timeout_add MCLGETI returns NULL getbuf error slots 121 timeout_add MCLGETI returns NULL getbuf error slots 122 timeout_add MCLGETI returns NULL getbuf error slots 123 timeout_add MCLGETI returns NULL getbuf error slots 124 timeout_add MCLGETI returns NULL getbuf error slots 8 MCLGETI returns NULL getbuf error slots 83 MCLGETI returns NULL getbuf error slots 126 timeout_add MCLGETI returns NULL getbuf error slots 126 timeout_add panic: vmxnet3_getbuf: buffer has mbuf idx 326 0xfffffd8003d58d00 Stopped at db_enter+0x10: popq %rbp TID PID UID PRFLAGS PFLAGS CPU COMMAND * 96708 71551 0 0x3 0 0 a.out db_enter() at db_enter+0x10 panic() at panic+0x128 vmxnet3_getbuf(ffff8000001bc000,ffff8000001be680) at vmxnet3_getbuf+0x161 vmxnet3_rxfill(ffff8000001be680) at vmxnet3_rxfill+0x4b vmxnet3_rxintr(ffff8000001bc000,ffff8000001be680) at vmxnet3_rxintr+0x2e5 vmxnet3_intr_intx(ffff8000001bc000) at vmxnet3_intr_intx+0x72 intr_handler(ffff800007250a30,ffff80000020d680) at intr_handler+0x3a Xintr_ioapic_level10_untramp(4,0,2,0,ffff800007250c30,78) at Xintr_ioapic_level10_untramp+0x1a3 kprintf() at kprintf+0x97c panic() at panic+0x8e vmxnet3_getbuf(ffff8000001bc000,ffff8000001be680) at vmxnet3_getbuf+0x161 vmxnet3_rxfill(ffff8000001be680) at vmxnet3_rxfill+0x4b softclock(0) at 
softclock+0x125 softintr_dispatch(0) at softintr_dispatch+0xd2 end trace frame: 0xffff800007250e00, count: 0 https://www.openbsd.org/ddb.html describes the minimum info required in bug -- Matthieu Herrb