[Fwd: [PATCH 3/4] 8139too: RTNL and flush_scheduled_work deadlock]

2007-04-05 Thread Ben Greear
Please consider this for the 2.6.20 stable branch.  This fixes a 
deadlock between the
bridging code and the 8139too driver.  I am not the author of this 
patch, but I have
tested a slightly modified version (so that it works with the 2.6.18 
kernel) extensively

and it solves the deadlock.  Mr. Romieu suggested I forward this to you...

Thanks,
Ben

 Original Message 
Subject:[PATCH 3/4] 8139too: RTNL and flush_scheduled_work deadlock
Date:   Thu, 15 Feb 2007 23:37:44 +0100
From:   Francois Romieu <[EMAIL PROTECTED]>
To: [EMAIL PROTECTED]
CC: 	Stephen Hemminger <[EMAIL PROTECTED]>, 
[EMAIL PROTECTED], netdev@vger.kernel.org, Ben Greear 
<[EMAIL PROTECTED]>, Kyle Lucke <[EMAIL PROTECTED]>, Raghavendra 
Koushik <[EMAIL PROTECTED]>, Al Viro <[EMAIL PROTECTED]>




Your usual dont-flush_scheduled_work-with-RTNL-held stuff.

It is a bit different here since the thread runs permanently
or is only occasionally kicked for recovery depending on the
hardware revision.

Signed-off-by: Francois Romieu <[EMAIL PROTECTED]>
---
drivers/net/8139too.c |   40 +---
1 files changed, 17 insertions(+), 23 deletions(-)

diff --git a/drivers/net/8139too.c b/drivers/net/8139too.c
index 35ad5cf..99304b2 100644
--- a/drivers/net/8139too.c
+++ b/drivers/net/8139too.c
@@ -1109,6 +1109,8 @@ static void __devexit rtl8139_remove_one (struct pci_dev 
*pdev)

assert (dev != NULL);

+   flush_scheduled_work();
+
unregister_netdev (dev);

__rtl8139_cleanup_dev (dev);
@@ -1603,18 +1605,21 @@ static void rtl8139_thread (struct work_struct *work)
struct net_device *dev = tp->mii.dev;
unsigned long thr_delay = next_tick;

+   rtnl_lock();
+
+   if (!netif_running(dev))
+   goto out_unlock;
+
if (tp->watchdog_fired) {
tp->watchdog_fired = 0;
rtl8139_tx_timeout_task(work);
-   } else if (rtnl_trylock()) {
-   rtl8139_thread_iter (dev, tp, tp->mmio_addr);
-   rtnl_unlock ();
-   } else {
-   /* unlikely race.  mitigate with fast poll. */
-   thr_delay = HZ / 2;
-   }
+   } else
+   rtl8139_thread_iter(dev, tp, tp->mmio_addr);

-   schedule_delayed_work(&tp->thread, thr_delay);
+   if (tp->have_thread)
+   schedule_delayed_work(&tp->thread, thr_delay);
+out_unlock:
+   rtnl_unlock ();
}

static void rtl8139_start_thread(struct rtl8139_private *tp)
@@ -1626,19 +1631,11 @@ static void rtl8139_start_thread(struct rtl8139_private 
*tp)
return;

tp->have_thread = 1;
+   tp->watchdog_fired = 0;

schedule_delayed_work(&tp->thread, next_tick);
}

-static void rtl8139_stop_thread(struct rtl8139_private *tp)
-{
-   if (tp->have_thread) {
-   cancel_rearming_delayed_work(&tp->thread);
-   tp->have_thread = 0;
-   } else
-   flush_scheduled_work();
-}
-
static inline void rtl8139_tx_clear (struct rtl8139_private *tp)
{
tp->cur_tx = 0;
@@ -1696,12 +1693,11 @@ static void rtl8139_tx_timeout (struct net_device *dev)
{
struct rtl8139_private *tp = netdev_priv(dev);

+   tp->watchdog_fired = 1;
if (!tp->have_thread) {
-   INIT_DELAYED_WORK(&tp->thread, rtl8139_tx_timeout_task);
+   INIT_DELAYED_WORK(&tp->thread, rtl8139_thread);
schedule_delayed_work(&tp->thread, next_tick);
-   } else
-   tp->watchdog_fired = 1;
-
+   }
}

static int rtl8139_start_xmit (struct sk_buff *skb, struct net_device *dev)
@@ -2233,8 +2229,6 @@ static int rtl8139_close (struct net_device *dev)

netif_stop_queue (dev);

-   rtl8139_stop_thread(tp);
-
if (netif_msg_ifdown(tp))
printk(KERN_DEBUG "%s: Shutting down ethercard, status was 
0x%4.4x.\n",
dev->name, RTL_R16 (IntrStatus));
--
1.4.4.4

-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


--
Ben Greear <[EMAIL PROTECTED]> 
Candela Technologies Inc  http://www.candelatech.com



-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 3/4] 8139too: RTNL and flush_scheduled_work deadlock

2007-04-05 Thread Francois Romieu
Ben Greear <[EMAIL PROTECTED]> :
[...]
> It looks like this has not made it into the 2.6.20 stable series
> patches...  Any reason not to add it there?

No. Go ahead and submit it.

-- 
Ueimor

Anybody got a battery for my Ultra 10 ?
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 3/4] 8139too: RTNL and flush_scheduled_work deadlock

2007-04-04 Thread Ben Greear

Francois Romieu wrote:

Your usual dont-flush_scheduled_work-with-RTNL-held stuff.

It is a bit different here since the thread runs permanently
or is only occasionally kicked for recovery depending on the
hardware revision.


It looks like this has not made it into the 2.6.20 stable series
patches...  Any reason not to add it there?

Thanks,
Ben

--
Ben Greear <[EMAIL PROTECTED]>
Candela Technologies Inc  http://www.candelatech.com

-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 3/4] 8139too: RTNL and flush_scheduled_work deadlock

2007-02-19 Thread Francois Romieu
Cc: list trimmed.

Jarek Poplawski <[EMAIL PROTECTED]> :
> On Fri, Feb 16, 2007 at 09:20:34PM +0100, Francois Romieu wrote:
[...]
> > Btw, the thread runs every 3*HZ at most.
> 
> You are right (mostly)! But I think rtnl_lock is special
> and should be spared (even this 3*HZ) and here it's used
> for some mainly internal purpose (close synchronization).
> And it looks like mainly for this internal reason holding
> of rtnl_lock is increased. And because rtnl_lock is quite
> popular you have to take into consideration that after
> this 3*HZ it could spend some time waiting for the lock.
> So, maybe it would be nicer to check this netif_running
> twice (after rtnl_lock where needed), but maybe it's a
> mater of taste only, and yours is better, as well.

The region protected by RTNL has been widened to include a tx_timeout
handler. It is supposed to handle an occasional error, something that
should not even happen at 3*HZ. Optimizing it is useless, especially
on an high-end performer like the 8139.

> (Btw. I didn't verify this, but I hope you checked that
> places not under rtnl_lock before the patch are safe from
> some locking problems now.)

I did. It is not a reason to trust the patch though.

-- 
Ueimor
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 3/4] 8139too: RTNL and flush_scheduled_work deadlock

2007-02-19 Thread Jarek Poplawski
On Fri, Feb 16, 2007 at 09:20:34PM +0100, Francois Romieu wrote:
> Jarek Poplawski <[EMAIL PROTECTED]> :
...
> > > @@ -1603,18 +1605,21 @@ static void rtl8139_thread (struct work_struct 
> > > *work)
> > >   struct net_device *dev = tp->mii.dev;
> > >   unsigned long thr_delay = next_tick;
> > >  
> > > + rtnl_lock();
> > > +
> > > + if (!netif_running(dev))
> > > + goto out_unlock;
> > 
> > I wonder, why you don't do netif_running before
> > rtnl_lock ? It's an atomic operation. And I'm not sure if increasing
> > rtnl_lock range is really needed here.
> 
> threadA: netif_running()
> user task B: rtnl_lock()
> user task B: dev->close()
> user task B: rtnl_unlock()
> threadA: rtnl_lock()
> threadA: mess with closed device
> 
> Btw, the thread runs every 3*HZ at most.

You are right (mostly)! But I think rtnl_lock is special
and should be spared (even this 3*HZ) and here it's used
for some mainly internal purpose (close synchronization).
And it looks like mainly for this internal reason holding
of rtnl_lock is increased. And because rtnl_lock is quite
popular you have to take into consideration that after
this 3*HZ it could spend some time waiting for the lock.
So, maybe it would be nicer to check this netif_running
twice (after rtnl_lock where needed), but maybe it's a
mater of taste only, and yours is better, as well.
(Btw. I didn't verify this, but I hope you checked that
places not under rtnl_lock before the patch are safe from
some locking problems now.)

Jarek P.
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 3/4] 8139too: RTNL and flush_scheduled_work deadlock

2007-02-17 Thread Francois Romieu
Stephen Hemminger <[EMAIL PROTECTED]> :
[...]
> You need to hold a dev reference (dev_hold) as well. to keep the device
> from disappearing. or do a flush_scheduled_work in the remove routine.

The patched drivers do #2 (before unregister_netdev).

-- 
Ueimor
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 3/4] 8139too: RTNL and flush_scheduled_work deadlock

2007-02-16 Thread Stephen Hemminger
On Fri, 16 Feb 2007 21:20:34 +0100
Francois Romieu <[EMAIL PROTECTED]> wrote:

> Jarek Poplawski <[EMAIL PROTECTED]> :
> [...]
> > > diff --git a/drivers/net/8139too.c b/drivers/net/8139too.c
> > > index 35ad5cf..99304b2 100644
> > > --- a/drivers/net/8139too.c
> > > +++ b/drivers/net/8139too.c
> > > @@ -1109,6 +1109,8 @@ static void __devexit rtl8139_remove_one (struct 
> > > pci_dev *pdev)
> > >  
> > >   assert (dev != NULL);
> > >  
> > > + flush_scheduled_work();
> > > +
> > 
> > IMHO there should be rather cancel_rearming_delayed_work
> > instead of this.
> 
> The delayed_work is initialized even if tp->have_thread is false,
> so cancel_rearming_delayed_work() will work, yes. Feel free to
> send a patch.
> 
> [...]
> > > @@ -1603,18 +1605,21 @@ static void rtl8139_thread (struct work_struct 
> > > *work)
> > >   struct net_device *dev = tp->mii.dev;
> > >   unsigned long thr_delay = next_tick;
> > >  
> > > + rtnl_lock();
> > > +
> > > + if (!netif_running(dev))
> > > + goto out_unlock;
> > 
> > I wonder, why you don't do netif_running before
> > rtnl_lock ? It's an atomic operation. And I'm not sure if increasing
> > rtnl_lock range is really needed here.
> 
> threadA: netif_running()
> user task B: rtnl_lock()
> user task B: dev->close()
> user task B: rtnl_unlock()
> threadA: rtnl_lock()
> threadA: mess with closed device
> 
> Btw, the thread runs every 3*HZ at most.

You need to hold a dev reference (dev_hold) as well. to keep the device
from disappearing. or do a flush_scheduled_work in the remove routine.

-- 
Stephen Hemminger <[EMAIL PROTECTED]>
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 3/4] 8139too: RTNL and flush_scheduled_work deadlock

2007-02-16 Thread Francois Romieu
Jarek Poplawski <[EMAIL PROTECTED]> :
[...]
> > diff --git a/drivers/net/8139too.c b/drivers/net/8139too.c
> > index 35ad5cf..99304b2 100644
> > --- a/drivers/net/8139too.c
> > +++ b/drivers/net/8139too.c
> > @@ -1109,6 +1109,8 @@ static void __devexit rtl8139_remove_one (struct 
> > pci_dev *pdev)
> >  
> > assert (dev != NULL);
> >  
> > +   flush_scheduled_work();
> > +
> 
> IMHO there should be rather cancel_rearming_delayed_work
> instead of this.

The delayed_work is initialized even if tp->have_thread is false,
so cancel_rearming_delayed_work() will work, yes. Feel free to
send a patch.

[...]
> > @@ -1603,18 +1605,21 @@ static void rtl8139_thread (struct work_struct 
> > *work)
> > struct net_device *dev = tp->mii.dev;
> > unsigned long thr_delay = next_tick;
> >  
> > +   rtnl_lock();
> > +
> > +   if (!netif_running(dev))
> > +   goto out_unlock;
> 
> I wonder, why you don't do netif_running before
> rtnl_lock ? It's an atomic operation. And I'm not sure if increasing
> rtnl_lock range is really needed here.

threadA: netif_running()
user task B: rtnl_lock()
user task B: dev->close()
user task B: rtnl_unlock()
threadA: rtnl_lock()
threadA: mess with closed device

Btw, the thread runs every 3*HZ at most.

-- 
Ueimor
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 3/4] 8139too: RTNL and flush_scheduled_work deadlock

2007-02-15 Thread Jarek Poplawski
On 15-02-2007 23:37, Francois Romieu wrote:
> Your usual dont-flush_scheduled_work-with-RTNL-held stuff.
> 
> It is a bit different here since the thread runs permanently
> or is only occasionally kicked for recovery depending on the
> hardware revision.
> 
> Signed-off-by: Francois Romieu <[EMAIL PROTECTED]>
> ---
>  drivers/net/8139too.c |   40 +---
>  1 files changed, 17 insertions(+), 23 deletions(-)
> 
> diff --git a/drivers/net/8139too.c b/drivers/net/8139too.c
> index 35ad5cf..99304b2 100644
> --- a/drivers/net/8139too.c
> +++ b/drivers/net/8139too.c
> @@ -1109,6 +1109,8 @@ static void __devexit rtl8139_remove_one (struct 
> pci_dev *pdev)
>  
>   assert (dev != NULL);
>  
> + flush_scheduled_work();
> +

IMHO there should be rather cancel_rearming_delayed_work
instead of this.

>   unregister_netdev (dev);
>  
>   __rtl8139_cleanup_dev (dev);
> @@ -1603,18 +1605,21 @@ static void rtl8139_thread (struct work_struct *work)
>   struct net_device *dev = tp->mii.dev;
>   unsigned long thr_delay = next_tick;
>  
> + rtnl_lock();
> +
> + if (!netif_running(dev))
> + goto out_unlock;

I wonder, why you don't do netif_running before
rtnl_lock? It's an atomic operation.

And I'm not sure if increasing rtnl_lock range
is really needed here.

Regards,
Jarek P.
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 3/4] 8139too: RTNL and flush_scheduled_work deadlock

2007-02-15 Thread Francois Romieu
Your usual dont-flush_scheduled_work-with-RTNL-held stuff.

It is a bit different here since the thread runs permanently
or is only occasionally kicked for recovery depending on the
hardware revision.

Signed-off-by: Francois Romieu <[EMAIL PROTECTED]>
---
 drivers/net/8139too.c |   40 +---
 1 files changed, 17 insertions(+), 23 deletions(-)

diff --git a/drivers/net/8139too.c b/drivers/net/8139too.c
index 35ad5cf..99304b2 100644
--- a/drivers/net/8139too.c
+++ b/drivers/net/8139too.c
@@ -1109,6 +1109,8 @@ static void __devexit rtl8139_remove_one (struct pci_dev 
*pdev)
 
assert (dev != NULL);
 
+   flush_scheduled_work();
+
unregister_netdev (dev);
 
__rtl8139_cleanup_dev (dev);
@@ -1603,18 +1605,21 @@ static void rtl8139_thread (struct work_struct *work)
struct net_device *dev = tp->mii.dev;
unsigned long thr_delay = next_tick;
 
+   rtnl_lock();
+
+   if (!netif_running(dev))
+   goto out_unlock;
+
if (tp->watchdog_fired) {
tp->watchdog_fired = 0;
rtl8139_tx_timeout_task(work);
-   } else if (rtnl_trylock()) {
-   rtl8139_thread_iter (dev, tp, tp->mmio_addr);
-   rtnl_unlock ();
-   } else {
-   /* unlikely race.  mitigate with fast poll. */
-   thr_delay = HZ / 2;
-   }
+   } else
+   rtl8139_thread_iter(dev, tp, tp->mmio_addr);
 
-   schedule_delayed_work(&tp->thread, thr_delay);
+   if (tp->have_thread)
+   schedule_delayed_work(&tp->thread, thr_delay);
+out_unlock:
+   rtnl_unlock ();
 }
 
 static void rtl8139_start_thread(struct rtl8139_private *tp)
@@ -1626,19 +1631,11 @@ static void rtl8139_start_thread(struct rtl8139_private 
*tp)
return;
 
tp->have_thread = 1;
+   tp->watchdog_fired = 0;
 
schedule_delayed_work(&tp->thread, next_tick);
 }
 
-static void rtl8139_stop_thread(struct rtl8139_private *tp)
-{
-   if (tp->have_thread) {
-   cancel_rearming_delayed_work(&tp->thread);
-   tp->have_thread = 0;
-   } else
-   flush_scheduled_work();
-}
-
 static inline void rtl8139_tx_clear (struct rtl8139_private *tp)
 {
tp->cur_tx = 0;
@@ -1696,12 +1693,11 @@ static void rtl8139_tx_timeout (struct net_device *dev)
 {
struct rtl8139_private *tp = netdev_priv(dev);
 
+   tp->watchdog_fired = 1;
if (!tp->have_thread) {
-   INIT_DELAYED_WORK(&tp->thread, rtl8139_tx_timeout_task);
+   INIT_DELAYED_WORK(&tp->thread, rtl8139_thread);
schedule_delayed_work(&tp->thread, next_tick);
-   } else
-   tp->watchdog_fired = 1;
-
+   }
 }
 
 static int rtl8139_start_xmit (struct sk_buff *skb, struct net_device *dev)
@@ -2233,8 +2229,6 @@ static int rtl8139_close (struct net_device *dev)
 
netif_stop_queue (dev);
 
-   rtl8139_stop_thread(tp);
-
if (netif_msg_ifdown(tp))
printk(KERN_DEBUG "%s: Shutting down ethercard, status was 
0x%4.4x.\n",
dev->name, RTL_R16 (IntrStatus));
-- 
1.4.4.4

-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html