On 11/16/2016 10:14 PM, Parthasarathy Bhuvaragan wrote:
> Commit 333f796235a527 ("tipc: fix a race condition leading to
> subscriber refcnt bug") reveals a soft lockup while acquiring
> nametbl_lock.

Please explain why commit 333f796235a527 introduces the soft lockup.

>
> The soft lock is the call chain of tipc_nametbl_withdraw(), when it
> performs the tipc_conn_kref_release() as it tries to grab nametbl_lock
> again while holding it already.
> tipc_nametbl_withdraw
>   spin_lock_bh(&tn->nametbl_lock);
>   tipc_nametbl_remove_publ
>      tipc_nameseq_remove_publ
>        tipc_subscrp_report_overlap
>          tipc_subscrp_send_event
>             tipc_conn_sendmsg
> Here, the (test_bit(CF_CONNECTED, &con->flags)) fails, leading to the
> else case where do do a conn_put() and that triggers the cleanup as
> refcount reached 0. Leading the call chain below:
> tipc_conn_kref_release
>     tipc_sock_release
>       tipc_conn_release
>          tipc_subscrb_delete
>             tipc_subscrp_delete
>                tipc_nametbl_unsubscribe
>                   spin_lock_bh(&tn->nametbl_lock);  << Soft Lockup >>
>
> The above lockup is caused due to two reasons:
> 1. During module exit tipc_exit_net() first calls tipc_topsrv_stop()
>    followed by tipc_nametbl_withdraw() in scope of tipc_net_stop().
> 2. tipc_server_stop() grabs and releases the idr_lock twice for every
>    connection as it needs to execute functions that might sleep.
>    The first lock is to find the connection and the second sets the
>    connection flag, removes it from list and decrements the refcount.
>    There is a possibility for another thread to grab the connection
>    in between the two, thus owning the connection and triggering the
>    cleanup while decrementing the refcount later.
>
> In this commit, we perform:
> - call tipc_nametbl_withdraw() before tipc_topsrv_stop()
> - In tipc_server_stop():
>   we remove the connection id's of all connections from connection
>   list in the scope of the idr_lock.
>   copy s->idr_in_use to stack, as this is being updated in the loop.
>   Thus no other thread will find a connection which has unset
>   CF_CONNECTED in its flags.
>
> Fixes: 333f796235a52727 ("tipc: fix a race condition leading to subscriber 
> refcnt bug")
> Signed-off-by: Parthasarathy Bhuvaragan 
> <parthasarathy.bhuvara...@ericsson.com>
> ---
>  net/tipc/core.c   |  4 ++++
>  net/tipc/net.c    |  2 --
>  net/tipc/server.c | 19 +++++++++++++++----
>  3 files changed, 19 insertions(+), 6 deletions(-)
>
> diff --git a/net/tipc/core.c b/net/tipc/core.c
> index 236b043a4156..3ea1452e2f06 100644
> --- a/net/tipc/core.c
> +++ b/net/tipc/core.c
> @@ -93,6 +93,10 @@ static int __net_init tipc_init_net(struct net *net)
>
>  static void __net_exit tipc_exit_net(struct net *net)
>  {
> +     struct tipc_net *tn = net_generic(net, tipc_net_id);
> +
> +     tipc_nametbl_withdraw(net, TIPC_CFG_SRV, tn->own_addr, 0,
> +                           tn->own_addr);

I could not understand why we move tipc_nametbl_withdraw to here.

>       tipc_topsrv_stop(net);
>       tipc_net_stop(net);
>       tipc_bcast_stop(net);
> diff --git a/net/tipc/net.c b/net/tipc/net.c
> index 28bf4feeb81c..92696cc6e763 100644
> --- a/net/tipc/net.c
> +++ b/net/tipc/net.c
> @@ -130,8 +130,6 @@ void tipc_net_stop(struct net *net)
>       if (!tn->own_addr)
>               return;
>
> -     tipc_nametbl_withdraw(net, TIPC_CFG_SRV, tn->own_addr, 0,
> -                           tn->own_addr);
>       rtnl_lock();
>       tipc_bearer_stop(net);
>       tipc_node_stop(net);
> diff --git a/net/tipc/server.c b/net/tipc/server.c
> index 215849ce453d..22a4caf66065 100644
> --- a/net/tipc/server.c
> +++ b/net/tipc/server.c
> @@ -622,22 +622,33 @@ int tipc_server_start(struct tipc_server *s)
>
>  void tipc_server_stop(struct tipc_server *s)
>  {
> +     int max_in_use = s->idr_in_use;
>       struct tipc_conn *con;
>       int total = 0;
>       int id;
>
>       spin_lock_bh(&s->idr_lock);
> -     for (id = 0; total < s->idr_in_use; id++) {
> +     for (id = 0; total < max_in_use; id++) {
>               con = idr_find(&s->conn_idr, id);
>               if (con) {
>                       total++;
> -                     spin_unlock_bh(&s->idr_lock);
> -                     tipc_close_conn(con);
> -                     spin_lock_bh(&s->idr_lock);

In fact the change is unnecessarily so complex at all, instead we can do 
this like:

spin_lock_bh(&s->idr_lock);
for (id = 0; total < s->idr_in_use; id++) {
        con = idr_find(&s->conn_idr, id);
        if (con) {
                total++;
                if (test_and_clear_bit(CF_CONNECTED, &con->flags))
                        idr_remove(&s->conn_idr, con->conid);
                        s->idr_in_use--;
                        spin_unlock_bh(&s->idr_lock);
                        kernel_sock_shutdown(con->sock, SHUT_RDWR);
                        conn_put(con);
                         spin_lock_bh(&s->idr_lock);
                 }
         }
}
spin_unlock_bh(&s->idr_lock);

Regards,
Ying

> +                     if (test_and_clear_bit(CF_CONNECTED, &con->flags)) {
> +                             idr_remove(&s->conn_idr, con->conid);
> +                             s->idr_in_use--;
> +                     }
>               }
>       }
>       spin_unlock_bh(&s->idr_lock);
>
> +     for (id = 0; total < max_in_use; id++) {
> +             con = idr_find(&s->conn_idr, id);
> +             if (con) {
> +                     total++;
> +                     kernel_sock_shutdown(con->sock, SHUT_RDWR);
> +                     conn_put(con);
> +             }
> +     }
> +
>       tipc_work_stop(s);
>       kmem_cache_destroy(s->rcvbuf_cache);
>       idr_destroy(&s->conn_idr);
>


------------------------------------------------------------------------------
_______________________________________________
tipc-discussion mailing list
tipc-discussion@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/tipc-discussion

Reply via email to