Thanks for the report, I didn't realize that the callback could come in the
same thread.

I sent a patch that I believe should fix the deadlock here:

http://openvswitch.org/pipermail/dev/2016-August/077315.html

2016-08-05 7:48 GMT-07:00 Ilya Maximets <i.maxim...@samsung.com>:

> On 04.08.2016 12:49, Mark Kavanagh wrote:
> > DPDK v16.07 introduces the ability to free memzones.
> > Up until this point, DPDK memory pools created in OVS could
> > not be destroyed, thus incurring a memory leak.
> >
> > Leverage the DPDK v16.07 rte_mempool API to free DPDK
> > mempools when their associated reference count reaches 0 (this
> > indicates that the memory pool is no longer in use).
> >
> > Signed-off-by: Mark Kavanagh <mark.b.kavan...@intel.com>
> > ---
> >
> > v2->v1: rebase to head of master, and remove 'RFC' tag
> >
> >  lib/netdev-dpdk.c | 29 +++++++++++++++--------------
> >  1 file changed, 15 insertions(+), 14 deletions(-)
> >
> > diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
> > index aaac0d1..ffcd35c 100644
> > --- a/lib/netdev-dpdk.c
> > +++ b/lib/netdev-dpdk.c
> > @@ -506,7 +506,7 @@ dpdk_mp_get(int socket_id, int mtu)
> OVS_REQUIRES(dpdk_mutex)
> >  }
> >
> >  static void
> > -dpdk_mp_put(struct dpdk_mp *dmp)
> > +dpdk_mp_put(struct dpdk_mp *dmp) OVS_REQUIRES(dpdk_mutex)
> >  {
> >
> >      if (!dmp) {
> > @@ -514,15 +514,12 @@ dpdk_mp_put(struct dpdk_mp *dmp)
> >      }
> >
> >      dmp->refcount--;
> > -    ovs_assert(dmp->refcount >= 0);
> >
> > -#if 0
> > -    /* I could not find any API to destroy mp. */
> > -    if (dmp->refcount == 0) {
> > -        list_delete(dmp->list_node);
> > -        /* destroy mp-pool. */
> > -    }
> > -#endif
> > +    if (OVS_UNLIKELY(!dmp->refcount)) {
> > +        ovs_list_remove(&dmp->list_node);
> > +        rte_mempool_free(dmp->mp);
> > +     }
> > +
> >  }
> >
> >  static void
> > @@ -928,16 +925,18 @@ netdev_dpdk_destruct(struct netdev *netdev)
> >  {
> >      struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
> >
> > +    ovs_mutex_lock(&dpdk_mutex);
> >      ovs_mutex_lock(&dev->mutex);
> > +
> >      rte_eth_dev_stop(dev->port_id);
> >      free(ovsrcu_get_protected(struct ingress_policer *,
> >                                &dev->ingress_policer));
> > -    ovs_mutex_unlock(&dev->mutex);
> >
> > -    ovs_mutex_lock(&dpdk_mutex);
> >      rte_free(dev->tx_q);
> >      ovs_list_remove(&dev->list_node);
> >      dpdk_mp_put(dev->dpdk_mp);
> > +
> > +    ovs_mutex_unlock(&dev->mutex);
> >      ovs_mutex_unlock(&dpdk_mutex);
> >  }
> >
> > @@ -946,6 +945,9 @@ netdev_dpdk_vhost_destruct(struct netdev *netdev)
> >  {
> >      struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
> >
> > +    ovs_mutex_lock(&dpdk_mutex);
> > +    ovs_mutex_lock(&dev->mutex);
> > +
> >      /* Guest becomes an orphan if still attached. */
> >      if (netdev_dpdk_get_vid(dev) >= 0) {
> >          VLOG_ERR("Removing port '%s' while vhost device still
> attached.",
> > @@ -961,15 +963,14 @@ netdev_dpdk_vhost_destruct(struct netdev *netdev)
> >          fatal_signal_remove_file_to_unlink(dev->vhost_id);
> >      }
> >
> > -    ovs_mutex_lock(&dev->mutex);
> >      free(ovsrcu_get_protected(struct ingress_policer *,
> >                                &dev->ingress_policer));
> > -    ovs_mutex_unlock(&dev->mutex);
> >
> > -    ovs_mutex_lock(&dpdk_mutex);
> >      rte_free(dev->tx_q);
> >      ovs_list_remove(&dev->list_node);
> >      dpdk_mp_put(dev->dpdk_mp);
> > +
> > +    ovs_mutex_unlock(&dev->mutex);
> >      ovs_mutex_unlock(&dpdk_mutex);
> >  }
>
> I agree that locking here was wrong but this change introduces issue
> because
> 'rte_vhost_driver_unregister()' may call 'destroy_device()' and OVS will
> be aborted
> on attempt to lock 'dpdk_mutex' again:
>
> VHOST_CONFIG: free connfd = 37 for device '/vhost1'
> ovs-vswitchd: lib/netdev-dpdk.c:2305: pthread_mutex_lock failed (Resource
> deadlock avoided)
>
> Program received signal SIGABRT, Aborted.
> 0x0000007fb7ad6d38 in raise () from /lib64/libc.so.6
> (gdb) bt
> #0  0x0000007fb7ad6d38 in raise () from /lib64/libc.so.6
> #1  0x0000007fb7ad8aa8 in abort () from /lib64/libc.so.6
> #2  0x0000000000692be0 in ovs_abort_valist at lib/util.c:335
> #3  0x0000000000692ba0 in ovs_abort at lib/util.c:327
> #4  0x0000000000651800 in ovs_mutex_lock_at (l_=0x899ab0 <dpdk_mutex>,
> where=0x78a458 "lib/netdev-dpdk.c:2305") at lib/ovs-thread.c:76
> #5  0x00000000006c0190 in destroy_device (vid=0) at lib/netdev-dpdk.c:2305
> #6  0x00000000004ea850 in vhost_destroy_device ()
> #7  0x00000000004ee578 in rte_vhost_driver_unregister ()
> #8  0x00000000006bc8c8 in netdev_dpdk_vhost_destruct (netdev=0x7f6bffed00)
> at lib/netdev-dpdk.c:944
> #9  0x00000000005e4ad4 in netdev_unref (dev=0x7f6bffed00) at
> lib/netdev.c:499
> #10 0x00000000005e4b9c in netdev_close (netdev=0x7f6bffed00) at
> lib/netdev.c:523
> [...]
> #20 0x000000000053ad94 in main (argc=7, argv=0x7ffffff318) at
> vswitchd/ovs-vswitchd.c:112
>
> May be reproduced by removing port while virtio still attached.
> This blocks reconnection feature and deletion of port while QEMU still
> attached.
>
> Someone should fix this. Any thoughts?
>
> Best regards, Ilya Maximets.
> _______________________________________________
> dev mailing list
> dev@openvswitch.org
> http://openvswitch.org/mailman/listinfo/dev
>
_______________________________________________
dev mailing list
dev@openvswitch.org
http://openvswitch.org/mailman/listinfo/dev

Reply via email to