Re: [PATCH] usb: host: xhci: Replace bus lock with host controller lock

2016-02-10 Thread Mathias Nyman

On 05.02.2016 17:14, Chris Bainbridge wrote:

Running task list at fail point:


...


Some of the functions appear to be inlined, the exact call chain is:

hub_port_init
 usb_get_device_descriptor
 usb_get_descriptor
 usb_control_msg
 usb_internal_control_msg
 usb_start_wait_urb
 usb_submit_urb / wait_for_completion_timeout / 
usb_kill_urb

hub_port_init
 hub_set_address
 xhci_address_device
 xhci_setup_device



hub_port_reset() will end up moving the corresponding xhci device slot to 
default state.

As hub_port_reset() is called several times in hub_port_init() It sounds 
reasonable
that we could end up with two threads having their xhci device slots in default 
state at
the same time,  which according to xhci 4.5.3 specs still is a big no no.

So both threads fail at their next task after this.
One fails to read the descriptor, and the other fails addressing the device.

Nice catch btw.


So xhci_setup_device is entered while there is an outstanding URB on the
other bus. Unless anyone can think of a better way to fix this I'll make
the requested changes and resend my patch.



For what it's wort I think that this suggested controller mutex sounds like a 
good idea.
Should work for xhci at least.

-Mathias





Re: [PATCH] usb: host: xhci: Replace bus lock with host controller lock

2016-02-05 Thread Chris Bainbridge
Running task list at fail point:

[8.978405] kworker/3:0 R  running task1176024  2 0x
[8.979624] Workqueue: usb_hub_wq hub_event
[8.980831]  880260613af8 880260613ac0 811247ed 
8802604597c0
[8.982061]  880260608000 880260614000 0100 
880260613b70
[8.983280]  880260613c14 1388 880260613b10 
81b9bab7
[8.984500] Call Trace:
[8.985698]  [] schedule+0x37/0x90
[8.986918]  [] usb_kill_urb+0x8d/0xd0
[8.988130]  [] ? wake_up_atomic_t+0x30/0x30
[8.989343]  [] usb_start_wait_urb+0xbe/0x150
[8.990561]  [] usb_control_msg+0xbc/0xf0
[8.991766]  [] hub_port_init+0x51e/0xb70
[8.992964]  [] hub_event+0x817/0x1570
[8.994156]  [] process_one_work+0x1ff/0x620
[8.995342]  [] ? process_one_work+0x15f/0x620
[8.996528]  [] worker_thread+0x64/0x4b0
[8.997707]  [] ? rescuer_thread+0x390/0x390  
[8.998883]  [] kthread+0x105/0x120
[9.56]  [] ? kthread_create_on_node+0x200/0x200
[9.001241]  [] ret_from_fork+0x3f/0x70
[9.002420]  [] ? kthread_create_on_node+0x200/0x200

[9.094152] kworker/3:1 R  running task11496   238  2 0x
[9.095361] Workqueue: pm pm_runtime_work
[9.096545]  88025fc13ae0 88025fc13aa8 0003811247ed 
880260bc
[9.097767]  8802603aaf80 88025fc14000 88025cc7a520 
88007c4b7978
[9.098986]  88025cc7a520 88007c4b7978 88025fc13af8 
81b9bab7
[9.100213] Call Trace:
[9.101415]  [] schedule+0x37/0x90
[9.102626]  [] usb_kill_urb+0x8d/0xd0
[9.103835]  [] ? wake_up_atomic_t+0x30/0x30
[9.105035]  [] hub_quiesce+0x6b/0xa0
[9.106226]  [] hub_suspend+0x12a/0x250
[9.107417]  [] usb_suspend_both+0x95/0x1d0
[9.108600]  [] usb_runtime_suspend+0x2e/0x70
[9.109783]  [] ? usb_probe_interface+0x310/0x310
[9.110958]  [] __rpm_callback+0x2d/0x70
[9.112130]  [] ? usb_probe_interface+0x310/0x310
[9.113305]  [] rpm_callback+0x1d/0x90
[9.114482]  [] rpm_suspend+0x14b/0x750
[9.115664]  [] __pm_runtime_suspend+0x57/0x90
[9.116852]  [] ? usb_runtime_resume+0x20/0x20
[9.118034]  [] usb_runtime_idle+0x25/0x30
[9.119210]  [] __rpm_callback+0x2d/0x70
[9.120382]  [] rpm_idle+0x221/0x410
[9.121541]  [] pm_runtime_work+0xa9/0xc0
[9.122693]  [] process_one_work+0x1ff/0x620
[9.123850]  [] ? process_one_work+0x15f/0x620
[9.124993]  [] worker_thread+0x64/0x4b0
[9.126134]  [] ? rescuer_thread+0x390/0x390
[9.127272]  [] ? rescuer_thread+0x390/0x390  
[9.128403]  [] kthread+0x105/0x120
[9.129524]  [] ? kthread_create_on_node+0x200/0x200
[9.130653]  [] ret_from_fork+0x3f/0x70
[9.131781]  [] ? kthread_create_on_node+0x200/0x200

[9.864737] kworker/3:2 R  running task11888  1348  2 0x0008
[9.865947] Workqueue: usb_hub_wq hub_event
[9.867152]  88025dbcdf00 88025dbd3b88 81109a8c 
811099a9
[9.868386]  88025dbcdf00  88025dbce270 
88025dbd3bb8
[9.869610]  81109c04 88025dac8000 88025dac8368 
88025c8a4000
[9.870837] Call Trace:
[9.872044]  [] sched_show_task+0x15c/0x260
[9.873248]  [] ? sched_show_task+0x79/0x260
[9.874456]  [] show_state_filter+0x74/0xc0   
[9.875664]  [] xhci_setup_device+0x53d/0xa40 
[9.876871]  [] xhci_address_device+0xe/0x10
[9.878068]  [] hub_port_init+0x1bf/0xb70
[9.879262]  [] ? trace_hardirqs_on+0xd/0x10
[9.880465]  [] hub_event+0x817/0x1570
[9.881668]  [] process_one_work+0x1ff/0x620
[9.882869]  [] ? process_one_work+0x15f/0x620
[9.884074]  [] worker_thread+0x64/0x4b0
[9.885268]  [] ? rescuer_thread+0x390/0x390
[9.886457]  [] kthread+0x105/0x120
[9.887634]  [] ? kthread_create_on_node+0x200/0x200
[9.17]  [] ret_from_fork+0x3f/0x70
[9.889995]  [] ? kthread_create_on_node+0x200/0x200

[9.891174] kworker/3:3 R  running task10808  1350  2 0x
[9.892367] Workqueue: events_freezable usb_stor_scan_dwork
[9.893560]  88025dbdbb78 88026a0ccd80 6a0ccd80 
8802604597c0
[9.894773]  88025dbcc740 88025dbdc000 88025dbdbbb0 
88026a0ccd80
[9.896004]  88026a0ccd80 0003 88025dbdbb90 
81b9bab7
[9.897233] Call Trace:
[9.898444]  [] schedule+0x37/0x90
[9.899670]  [] schedule_timeout+0x17c/0x2f0
[9.900893]  [] ? init_timer_on_stack_key+0x40/0x40
[9.902099]  [] 
wait_for_completion_interruptible_timeout+0xcd/0x160
[9.903324]  [] ? wake_up_q+0x70/0x70
[9.904536]  [] usb_stor_msg_common+0xd8/0x130
[9.905751]  [] usb_stor_control_msg+0x96/0xb0
[9.906961]  [] usb_stor_Bulk_max_lun+0x51/0xa0
[9.908163]  [] usb_stor_scan_dwork+0x7f/0xe0 
[9.909361]  [] process_one_work+0x1ff/0x620
[9.910563]  [] ? process_one_work+0x15f/0x620
[9.911762]  [] worker_thread+0x64/0x4b0
[

Re: [PATCH] usb: host: xhci: Replace bus lock with host controller lock

2016-02-04 Thread Alan Stern
On Thu, 4 Feb 2016, Chris Bainbridge wrote:

> On Thu, Feb 04, 2016 at 04:00:51PM -0500, Alan Stern wrote:
> > On Thu, 4 Feb 2016, Chris Bainbridge wrote:
> > 
> > > The XHCI controller presents two USB buses to the system - one for USB 2
> > > and one for USB 3. When only one bus is locked there is a race condition
> > > during hub init that results in errors like:
> > > 
> > >  [   13.183701] usb 3-3: device descriptor read/all, error -110
> > 
> > What exactly is the race condition?  Why does locking both buses fix 
> > it?

...

> hub_port_init is called in parallel for both buses.
> The first thread is in usb_get_device_descriptor when the second one
> enters the function and calls the code to get an address. I don't know
> precisely how it fails - it looks like the functions for doing the
> initialisation are synchronous and sleeping waiting for a response and
> that gets disrupted when the second thread tries to initialise the hub.
> What was the basis for using a lock on the bus rather than the
> controller?

I don't remember exactly.  At the time the code was written, there was
no important distinction between a bus and a controller.  This was long
before USB-3 appeared.

When USB-3 support was added, the basis for keeping the lock on the bus 
was that I assumed there would be no problem talking to different 
devices at address 0 if they were on different buses.

>  Does the spec say that buses of the same controller can be
> initialised in parallel? Mathias previously said:
> 
> > Just found an additional note in the xhci specs section 4.5.3 saying that:
> > "Note: Software shall not transition more than one Device Slot to the 
> > Default State at a time"
> > which is what xhci_setup_device() does in addition to moving slots to the 
> > addressed state
> 
> But I don't know if that means you can do the reset/set address/read
> descriptors in parallel?

In fact you can do the Set-Address and Read-Descriptor parts in
parallel.  (In USB-3 the Set-Address thing is a no-op anyhow; the
hardware takes over that role completely and does it during the reset.)  
But that quote from the spec implies that the resets must not be done
in parallel.

> > I don't think this is a good idea.  The driver core needs to be able to
> > access the controller while this function is running.  You can
> > introduce a new mutex if you want, perhaps in the primary hcd
> > structure, but don't use bus->controller->mutex.
> 
> An explicit lock might be a good idea. I was trying to avoid adding
> another lock so used the one in struct device as it appeared unused.

It gets used by the driver core.  Don't worry about the overhead of 
adding a new lock if it really is needed; the number of USB buses or 
controllers on any computer isn't big enough to matter.

> The
> XHCI code seems to only use the lock in struct xhci_hcd and ehci uses
> struct ehci->lock.
> 
> btw I think this bug may be the same as reported at
> https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1437492

It could well be.

Alan Stern



Re: [PATCH] usb: host: xhci: Replace bus lock with host controller lock

2016-02-04 Thread Chris Bainbridge
On Thu, Feb 04, 2016 at 04:00:51PM -0500, Alan Stern wrote:
> On Thu, 4 Feb 2016, Chris Bainbridge wrote:
> 
> > The XHCI controller presents two USB buses to the system - one for USB 2
> > and one for USB 3. When only one bus is locked there is a race condition
> > during hub init that results in errors like:
> > 
> >  [   13.183701] usb 3-3: device descriptor read/all, error -110
> 
> What exactly is the race condition?  Why does locking both buses fix 
> it?

[2.692571] xhci_hcd :00:14.0: xHCI Host Controller
[2.693279] xhci_hcd :00:14.0: new USB bus registered, assigned bus 
number 3
[2.694867] xhci_hcd :00:14.0: hcc params 0x20007181 hci version 0x100 
quirks 0xb930
[2.694880] xhci_hcd :00:14.0: cache line size of 256 is not supported
[2.695995] usb usb3: New USB device found, idVendor=1d6b, idProduct=0002
[2.696005] usb usb3: New USB device strings: Mfr=3, Product=2, 
SerialNumber=1
[2.696016] usb usb3: Product: xHCI Host Controller
[2.696024] usb usb3: Manufacturer: Linux 4.4.0 xhci-hcd
[2.696031] usb usb3: SerialNumber: :00:14.0
[2.698414] hub 3-0:1.0: USB hub found
[2.704723] xhci_hcd :00:14.0: xHCI Host Controller
[2.705502] xhci_hcd :00:14.0: new USB bus registered, assigned bus 
number 4
[2.706136] usb usb4: New USB device found, idVendor=1d6b, idProduct=0003
[2.706138] usb usb4: New USB device strings: Mfr=3, Product=2, 
SerialNumber=1
[2.706139] usb usb4: Product: xHCI Host Controller
[2.706140] usb usb4: Manufacturer: Linux 4.4.0 xhci-hcd
[2.706141] usb usb4: SerialNumber: :00:14.0
[2.708467] hub 4-0:1.0: USB hub found
[3.021779] usb 3-3: new high-speed USB device number 2 using xhci_hcd
[8.034843] xhci_hcd :00:14.0: Timeout while waiting for setup device 
command
...bus4 enumeration...
[   13.297835] usb 3-3: device descriptor read/all, error -110
...bus3 enumeration...

Note there seem to be two timeouts of 5 seconds/10 seconds after the
last message at 3.021779 seconds.

hub_port_init is called in parallel for both buses.
The first thread is in usb_get_device_descriptor when the second one
enters the function and calls the code to get an address. I don't know
precisely how it fails - it looks like the functions for doing the
initialisation are synchronous and sleeping waiting for a response and
that gets disrupted when the second thread tries to initialise the hub.
What was the basis for using a lock on the bus rather than the
controller?  Does the spec say that buses of the same controller can be
initialised in parallel? Mathias previously said:

> Just found an additional note in the xhci specs section 4.5.3 saying that:
> "Note: Software shall not transition more than one Device Slot to the Default 
> State at a time"
> which is what xhci_setup_device() does in addition to moving slots to the 
> addressed state

But I don't know if that means you can do the reset/set address/read
descriptors in parallel?


> > @@ -4312,7 +4312,7 @@ hub_port_init(struct usb_hub *hub, struct usb_device 
> > *udev, int port1,
> > if (oldspeed == USB_SPEED_LOW)
> > delay = HUB_LONG_RESET_TIME;
> >  
> > -   mutex_lock(&hdev->bus->usb_address0_mutex);
> > +   mutex_lock(&hdev->bus->controller->mutex);
> >  
> > /* Reset the device; full speed may morph to high speed */
> > /* FIXME a USB 2.0 device may morph into SuperSpeed on reset. */
> > @@ -4588,7 +4588,7 @@ fail:
> > hub_port_disable(hub, port1, 0);
> > update_devnum(udev, devnum);/* for disconnect processing */
> > }
> > -   mutex_unlock(&hdev->bus->usb_address0_mutex);
> > +   mutex_unlock(&hdev->bus->controller->mutex);
> > return retval;
> >  }
> 
> I don't think this is a good idea.  The driver core needs to be able to
> access the controller while this function is running.  You can
> introduce a new mutex if you want, perhaps in the primary hcd
> structure, but don't use bus->controller->mutex.

An explicit lock might be a good idea. I was trying to avoid adding
another lock so used the one in struct device as it appeared unused. The
XHCI code seems to only use the lock in struct xhci_hcd and ehci uses
struct ehci->lock.

btw I think this bug may be the same as reported at
https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1437492


Re: [PATCH] usb: host: xhci: Replace bus lock with host controller lock

2016-02-04 Thread Alan Stern
On Thu, 4 Feb 2016, Chris Bainbridge wrote:

> The XHCI controller presents two USB buses to the system - one for USB 2
> and one for USB 3. When only one bus is locked there is a race condition
> during hub init that results in errors like:
> 
>  [   13.183701] usb 3-3: device descriptor read/all, error -110

What exactly is the race condition?  Why does locking both buses fix 
it?


> @@ -4312,7 +4312,7 @@ hub_port_init(struct usb_hub *hub, struct usb_device 
> *udev, int port1,
>   if (oldspeed == USB_SPEED_LOW)
>   delay = HUB_LONG_RESET_TIME;
>  
> - mutex_lock(&hdev->bus->usb_address0_mutex);
> + mutex_lock(&hdev->bus->controller->mutex);
>  
>   /* Reset the device; full speed may morph to high speed */
>   /* FIXME a USB 2.0 device may morph into SuperSpeed on reset. */
> @@ -4588,7 +4588,7 @@ fail:
>   hub_port_disable(hub, port1, 0);
>   update_devnum(udev, devnum);/* for disconnect processing */
>   }
> - mutex_unlock(&hdev->bus->usb_address0_mutex);
> + mutex_unlock(&hdev->bus->controller->mutex);
>   return retval;
>  }

I don't think this is a good idea.  The driver core needs to be able to
access the controller while this function is running.  You can
introduce a new mutex if you want, perhaps in the primary hcd
structure, but don't use bus->controller->mutex.

Alan Stern



[PATCH] usb: host: xhci: Replace bus lock with host controller lock

2016-02-04 Thread Chris Bainbridge
The XHCI controller presents two USB buses to the system - one for USB 2
and one for USB 3. When only one bus is locked there is a race condition
during hub init that results in errors like:

 [   13.183701] usb 3-3: device descriptor read/all, error -110

On a test system this error occurred on 6% of boots. Fix this by locking
the bus controller instead of the bus, thus preventing simultaneous hub
init on both buses.

Fixes: 638139eb95d2 ("usb: hub: allow to process more usb hub events in 
parallel")
Signed-off-by: Chris Bainbridge 
---
 drivers/usb/core/hcd.c | 2 +-
 drivers/usb/core/hub.c | 8 
 include/linux/usb.h| 3 +--
 3 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/drivers/usb/core/hcd.c b/drivers/usb/core/hcd.c
index df0e3b92533a..c468d047ec5e 100644
--- a/drivers/usb/core/hcd.c
+++ b/drivers/usb/core/hcd.c
@@ -966,7 +966,7 @@ static void usb_bus_init (struct usb_bus *bus)
bus->bandwidth_allocated = 0;
bus->bandwidth_int_reqs  = 0;
bus->bandwidth_isoc_reqs = 0;
-   mutex_init(&bus->usb_address0_mutex);
+   mutex_init(&bus->devnum_next_mutex);
 
INIT_LIST_HEAD (&bus->bus_list);
 }
diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c
index 350dcd9af5d8..e03c2f05c2f6 100644
--- a/drivers/usb/core/hub.c
+++ b/drivers/usb/core/hub.c
@@ -2066,7 +2066,7 @@ static void choose_devnum(struct usb_device *udev)
struct usb_bus  *bus = udev->bus;
 
/* be safe when more hub events are proceed in parallel */
-   mutex_lock(&bus->usb_address0_mutex);
+   mutex_lock(&bus->devnum_next_mutex);
if (udev->wusb) {
devnum = udev->portnum + 1;
BUG_ON(test_bit(devnum, bus->devmap.devicemap));
@@ -2084,7 +2084,7 @@ static void choose_devnum(struct usb_device *udev)
set_bit(devnum, bus->devmap.devicemap);
udev->devnum = devnum;
}
-   mutex_unlock(&bus->usb_address0_mutex);
+   mutex_unlock(&bus->devnum_next_mutex);
 }
 
 static void release_devnum(struct usb_device *udev)
@@ -4312,7 +4312,7 @@ hub_port_init(struct usb_hub *hub, struct usb_device 
*udev, int port1,
if (oldspeed == USB_SPEED_LOW)
delay = HUB_LONG_RESET_TIME;
 
-   mutex_lock(&hdev->bus->usb_address0_mutex);
+   mutex_lock(&hdev->bus->controller->mutex);
 
/* Reset the device; full speed may morph to high speed */
/* FIXME a USB 2.0 device may morph into SuperSpeed on reset. */
@@ -4588,7 +4588,7 @@ fail:
hub_port_disable(hub, port1, 0);
update_devnum(udev, devnum);/* for disconnect processing */
}
-   mutex_unlock(&hdev->bus->usb_address0_mutex);
+   mutex_unlock(&hdev->bus->controller->mutex);
return retval;
 }
 
diff --git a/include/linux/usb.h b/include/linux/usb.h
index 89533ba38691..6b736c82b9d1 100644
--- a/include/linux/usb.h
+++ b/include/linux/usb.h
@@ -371,14 +371,13 @@ struct usb_bus {
 
int devnum_next;/* Next open device number in
 * round-robin allocation */
+   struct mutex devnum_next_mutex; /* devnum_next mutex */
 
struct usb_devmap devmap;   /* device address allocation map */
struct usb_device *root_hub;/* Root hub */
struct usb_bus *hs_companion;   /* Companion EHCI bus, if any */
struct list_head bus_list;  /* list of busses */
 
-   struct mutex usb_address0_mutex; /* unaddressed device mutex */
-
int bandwidth_allocated;/* on this bus: how much of the time
 * reserved for periodic (intr/iso)
 * requests is used, on average?
-- 
2.1.4