Hi Eelco,

I think it's either a bug in kernel or my misunderstanding about how to
process the xsk cq ring. I posted the issue here
https://marc.info/?l=xdp-newbies&m=156055471727857&w=2

And apply this to v11 fix my crash.
Do you mind testing it again on your system?
Thanks for your time for trial and error

diff --git a/lib/netdev-afxdp.c b/lib/netdev-afxdp.c
index a6543e8f5126..800c047b71f9 100644
--- a/lib/netdev-afxdp.c
+++ b/lib/netdev-afxdp.c
@@ -705,6 +705,7 @@ afxdp_complete_tx(struct xsk_socket_info *xsk)
     struct umem_elem *elems_push[BATCH_SIZE];
     uint32_t idx_cq = 0;
     int tx_done, j, ret;
+    int tx = 0;

     if (!xsk->outstanding_tx) {
         return;
@@ -717,23 +718,29 @@ afxdp_complete_tx(struct xsk_socket_info *xsk)
     }

     tx_done = xsk_ring_cons__peek(&xsk->umem->cq, BATCH_SIZE, &idx_cq);
-    if (tx_done > 0) {
-        xsk_ring_cons__release(&xsk->umem->cq, tx_done);
-        xsk->outstanding_tx -= tx_done;
-    }

     /* Recycle back to umem pool */
     for (j = 0; j < tx_done; j++) {
         struct umem_elem *elem;
-        uint64_t addr;
+        uint64_t *addr;

-        addr = *xsk_ring_cons__comp_addr(&xsk->umem->cq, idx_cq++);
+        addr = xsk_ring_cons__comp_addr(&xsk->umem->cq, idx_cq++);
+        if (*addr == 0) {
+            continue;
+        }
         elem = ALIGNED_CAST(struct umem_elem *,
-                            (char *)xsk->umem->buffer + addr);
+                            (char *)xsk->umem->buffer + *addr);
         elems_push[j] = elem;
+        *addr = 0;
+        tx++;
     }

-    umem_elem_push_n(&xsk->umem->mpool, tx_done, (void **)elems_push);
+    umem_elem_push_n(&xsk->umem->mpool, tx, (void **)elems_push);
+
+    if (tx_done > 0) {
+        xsk_ring_cons__release(&xsk->umem->cq, tx_done);
+        xsk->outstanding_tx -= tx_done;
+    }
 }

 int

On Thu, Jun 13, 2019 at 12:17 AM Eelco Chaudron <echau...@redhat.com> wrote:
>
> On 13 Jun 2019, at 2:37, William Tu wrote:
>
> Hi Eelco,
>
> #0 0x00007fbc6a78193f in raise () from /lib64/libc.so.6
> #1 0x00007fbc6a76bc95 in abort () from /lib64/libc.so.6
> #2 0x00000000004ed1a1 in __umem_elem_push_n (addrs=0x7fbc40f2ec50,
> n=32, umemp=0x24cc790) at lib/xdpsock.c:32
> #3 umem_elem_push_n (umemp=0x24cc790, n=32,
>
> I've found that it's due to free the afxdp twice.
> The free_afxdp_buf() should be called once per dp_packet, somehow
> it gets called twice.
> Applying this on v11 fixes the issue
> --- a/lib/dp-packet.c
> +++ b/lib/dp-packet.c
> @@ -145,7 +145,7 @@ dp_packet_uninit(struct dp_packet *b)
> free_dpdk_buf((struct dp_packet*) b);
> #endif
> } else if (b->source == DPBUF_AFXDP) {
> - free_afxdp_buf(b);
> + ;
> }
> }
> }
>
> It’s still crashing for me, even with the change. I’m using a single nic, 
> same port in as out:
>
> @@ -122,6 +144,8 @@ dp_packet_uninit(struct dp_packet *b)
>               * created as a dp_packet */
>              free_dpdk_buf((struct dp_packet*) b);
>  #endif
> +        } else if (b->source == DPBUF_AFXDP) {
> +//            free_afxdp_buf(b);
>          }
>      }
>  }
>
> Core was generated by `ovs-vswitchd unix:/var/run/openvswitch/db.sock 
> -vconsole:emer -vsyslog:err -vfi'.
> Program terminated with signal SIGABRT, Aborted.
> #0  0x00007f62ef71193f in raise () from /lib64/libc.so.6
> [Current thread is 1 (Thread 0x7f62bdf33700 (LWP 24737))]
> Missing separate debuginfos, use: dnf debuginfo-install 
> elfutils-libelf-0.174-6.el8.x86_64 glibc-2.28-42.el8_0.1.x86_64 
> libatomic-8.2.1-3.5.el8.x86_64 libcap-ng-0.7.9-4.el8.x86_64 
> numactl-libs-2.0.12-2.el8.x86_64 openssl-libs-1.1.1-8.el8.x86_64 
> zlib-1.2.11-10.el8.x86_64
> (gdb) bt
> #0  0x00007f62ef71193f in raise () from /lib64/libc.so.6
> #1  0x00007f62ef6fbc95 in abort () from /lib64/libc.so.6
> #2  0x00000000004ed1a1 in __umem_elem_push_n (addrs=0x7f62bdf30c50, n=32, 
> umemp=0x2378660) at lib/xdpsock.c:32
> #3  umem_elem_push_n (umemp=0x2378660, n=32, 
> addrs=addrs@entry=0x7f62bdf30ea0) at lib/xdpsock.c:43
> #4  0x00000000009b4f41 in afxdp_complete_tx (xsk=0x2378250) at 
> lib/netdev-afxdp.c:736
> #5  netdev_afxdp_batch_send (netdev=<optimized out>, qid=0, 
> batch=0x7f62a4004e80, concurrent_txq=<optimized out>) at 
> lib/netdev-afxdp.c:763
> #6  0x0000000000908031 in netdev_send (netdev=<optimized out>, 
> qid=qid@entry=0, batch=batch@entry=0x7f62a4004e80, 
> concurrent_txq=concurrent_txq@entry=true) at lib/netdev.c:800
> #7  0x00000000008d4c24 in dp_netdev_pmd_flush_output_on_port 
> (pmd=pmd@entry=0x7f62bdf34010, p=p@entry=0x7f62a4004e50) at 
> lib/dpif-netdev.c:4187
> #8  0x00000000008d4f3f in dp_netdev_pmd_flush_output_packets 
> (pmd=pmd@entry=0x7f62bdf34010, force=force@entry=false) at 
> lib/dpif-netdev.c:4227
> #9  0x00000000008dd2d7 in dp_netdev_pmd_flush_output_packets (force=false, 
> pmd=0x7f62bdf34010) at lib/dpif-netdev.c:4282
> #10 dp_netdev_process_rxq_port (pmd=pmd@entry=0x7f62bdf34010, rxq=0x237e650, 
> port_no=1) at lib/dpif-netdev.c:4282
> #11 0x00000000008dd63d in pmd_thread_main (f_=<optimized out>) at 
> lib/dpif-netdev.c:5449
> #12 0x000000000095e94d in ovsthread_wrapper (aux_=<optimized out>) at 
> lib/ovs-thread.c:352
> #13 0x00007f62f00312de in start_thread () from /lib64/libpthread.so.0
> #14 0x00007f62ef7d6a63 in clone () from /lib64/libc.so.6
>
> Core was generated by `ovs-vswitchd unix:/var/run/openvswitch/db.sock 
> -vconsole:emer -vsyslog:err -vfi'.
> Program terminated with signal SIGSEGV, Segmentation fault.
> #0  netdev_afxdp_rxq_recv (rxq_=0x2bd75d0, batch=0x7f399f7fc0d0, qfill=0x0) 
> at lib/netdev-afxdp.c:583
> 583        rx->fd = xsk_socket__fd(xsk->xsk);
> [Current thread is 1 (Thread 0x7f399f7fe700 (LWP 24597))]
> Missing separate debuginfos, use: dnf debuginfo-install 
> elfutils-libelf-0.174-6.el8.x86_64 glibc-2.28-42.el8_0.1.x86_64 
> libatomic-8.2.1-3.5.el8.x86_64 libcap-ng-0.7.9-4.el8.x86_64 
> numactl-libs-2.0.12-2.el8.x86_64 openssl-libs-1.1.1-8.el8.x86_64 
> zlib-1.2.11-10.el8.x86_64
> (gdb) bt
> #0  netdev_afxdp_rxq_recv (rxq_=0x2bd75d0, batch=0x7f399f7fc0d0, qfill=0x0) 
> at lib/netdev-afxdp.c:583
> #1  0x0000000000907f21 in netdev_rxq_recv (rx=<optimized out>, 
> batch=batch@entry=0x7f399f7fc0d0, qfill=<optimized out>) at lib/netdev.c:710
> #2  0x00000000008dd1c3 in dp_netdev_process_rxq_port 
> (pmd=pmd@entry=0x2bf80c0, rxq=0x2bd63e0, port_no=2) at lib/dpif-netdev.c:4257
> #3  0x00000000008dd63d in pmd_thread_main (f_=<optimized out>) at 
> lib/dpif-netdev.c:5449
> #4  0x000000000095e94d in ovsthread_wrapper (aux_=<optimized out>) at 
> lib/ovs-thread.c:352
> #5  0x00007f39d86de2de in start_thread () from /lib64/libpthread.so.0
> #6  0x00007f39d7e83a63 in clone () from /lib64/libc.so.6
> (gdb) p xsk->xsk
> Cannot access memory at address 0x316f6ebd
> (gdb) p xsk
> $1 = (struct xsk_socket_info *) 0x316f6e65
>
> I will work on next version
> Thank you
> William
>
> <SNIP>
_______________________________________________
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

Reply via email to