[PATCH 1/2] Revert "usb: gadget: ffs: Fix BUG when userland exits with submitted AIO transfers"
From: Shen Jing This reverts commit b4194da3f9087dd38d91b40f9bec42d59ce589a8 since it causes list corruption followed by kernel panic: Workqueue: adb ffs_aio_cancel_worker RIP: 0010:__list_add_valid+0x4d/0x70 Call Trace: insert_work+0x47/0xb0 __queue_work+0xf6/0x400 queue_work_on+0x65/0x70 dwc3_gadget_giveback+0x44/0x50 [dwc3] dwc3_gadget_ep_dequeue+0x83/0x2d0 [dwc3] ? finish_wait+0x80/0x80 usb_ep_dequeue+0x1e/0x90 process_one_work+0x18c/0x3b0 worker_thread+0x3c/0x390 ? process_one_work+0x3b0/0x3b0 kthread+0x11e/0x140 ? kthread_create_worker_on_cpu+0x70/0x70 ret_from_fork+0x3a/0x50 This issue is seen with warm reboot stability testing. Signed-off-by: Shen Jing Signed-off-by: Saranya Gopal Signed-off-by: Felipe Balbi --- drivers/usb/gadget/function/f_fs.c | 26 -- 1 file changed, 8 insertions(+), 18 deletions(-) diff --git a/drivers/usb/gadget/function/f_fs.c b/drivers/usb/gadget/function/f_fs.c index 3ada83d81bda..31e8bf3578c8 100644 --- a/drivers/usb/gadget/function/f_fs.c +++ b/drivers/usb/gadget/function/f_fs.c @@ -215,7 +215,6 @@ struct ffs_io_data { struct mm_struct *mm; struct work_struct work; - struct work_struct cancellation_work; struct usb_ep *ep; struct usb_request *req; @@ -1073,31 +1072,22 @@ ffs_epfile_open(struct inode *inode, struct file *file) return 0; } -static void ffs_aio_cancel_worker(struct work_struct *work) -{ - struct ffs_io_data *io_data = container_of(work, struct ffs_io_data, - cancellation_work); - - ENTER(); - - usb_ep_dequeue(io_data->ep, io_data->req); -} - static int ffs_aio_cancel(struct kiocb *kiocb) { struct ffs_io_data *io_data = kiocb->private; - struct ffs_data *ffs = io_data->ffs; + struct ffs_epfile *epfile = kiocb->ki_filp->private_data; int value; ENTER(); - if (likely(io_data && io_data->ep && io_data->req)) { - INIT_WORK(_data->cancellation_work, ffs_aio_cancel_worker); - queue_work(ffs->io_completion_wq, _data->cancellation_work); - value = -EINPROGRESS; - } else { + spin_lock_irq(>ffs->eps_lock); + + if (likely(io_data && io_data->ep && io_data->req)) + value = usb_ep_dequeue(io_data->ep, io_data->req); + else value = -EINVAL; - } + + spin_unlock_irq(>ffs->eps_lock); return value; } -- 2.19.1
[PATCH] Revert "usb: gadget: ffs: Fix BUG when userland exits with submitted AIO transfers"
From: Shen Jing This reverts commit b4194da3f9087dd38d91b40f9bec42d59ce589a8 since it causes list corruption followed by kernel panic: Workqueue: adb ffs_aio_cancel_worker RIP: 0010:__list_add_valid+0x4d/0x70 Call Trace: insert_work+0x47/0xb0 __queue_work+0xf6/0x400 queue_work_on+0x65/0x70 dwc3_gadget_giveback+0x44/0x50 [dwc3] dwc3_gadget_ep_dequeue+0x83/0x2d0 [dwc3] ? finish_wait+0x80/0x80 usb_ep_dequeue+0x1e/0x90 process_one_work+0x18c/0x3b0 worker_thread+0x3c/0x390 ? process_one_work+0x3b0/0x3b0 kthread+0x11e/0x140 ? kthread_create_worker_on_cpu+0x70/0x70 ret_from_fork+0x3a/0x50 This issue is seen with warm reboot stability testing. Signed-off-by: Shen Jing Signed-off-by: Saranya Gopal Signed-off-by: Felipe Balbi --- drivers/usb/gadget/function/f_fs.c | 26 -- 1 file changed, 8 insertions(+), 18 deletions(-) diff --git a/drivers/usb/gadget/function/f_fs.c b/drivers/usb/gadget/function/f_fs.c index 645b0c0a763f..9ec666299430 100644 --- a/drivers/usb/gadget/function/f_fs.c +++ b/drivers/usb/gadget/function/f_fs.c @@ -219,7 +219,6 @@ struct ffs_io_data { struct mm_struct *mm; struct work_struct work; - struct work_struct cancellation_work; struct usb_ep *ep; struct usb_request *req; @@ -1154,31 +1153,22 @@ ffs_epfile_open(struct inode *inode, struct file *file) return 0; } -static void ffs_aio_cancel_worker(struct work_struct *work) -{ - struct ffs_io_data *io_data = container_of(work, struct ffs_io_data, - cancellation_work); - - ENTER(); - - usb_ep_dequeue(io_data->ep, io_data->req); -} - static int ffs_aio_cancel(struct kiocb *kiocb) { struct ffs_io_data *io_data = kiocb->private; - struct ffs_data *ffs = io_data->ffs; + struct ffs_epfile *epfile = kiocb->ki_filp->private_data; int value; ENTER(); - if (likely(io_data && io_data->ep && io_data->req)) { - INIT_WORK(_data->cancellation_work, ffs_aio_cancel_worker); - queue_work(ffs->io_completion_wq, _data->cancellation_work); - value = -EINPROGRESS; - } else { + spin_lock_irq(>ffs->eps_lock); + + if (likely(io_data && io_data->ep && io_data->req)) + value = usb_ep_dequeue(io_data->ep, io_data->req); + else value = -EINVAL; - } + + spin_unlock_irq(>ffs->eps_lock); return value; } -- 2.19.1
Re: usb: gadget: ffs: Fix BUG when userland exits with submitted AIO transfers
On 06/29/2018 08:32 AM, Felipe Balbi wrote: > > Hi, > > Alan Stern writes: >> On Thu, 21 Jun 2018, Roger Quadros wrote: >> > Can we avoid the spin_lock() and the work-queue and call usb_ep_dequeue() > directly from here? >> What is the purpose of the spin_lock()? I agree that the lock doesn't seem to be necessary. But I believe the whole thing is already running in non-sleeping context, even before the spinlock is taken. So this wouldn't help much. Even the io_cancel() syscall takes a spinlock before invoking the cancel function. So this issue is not exclusive to program termination. Are there any documented guidelines on which context usb_ep_dequeue() should be able to be called in? The sleep in the dwc3 driver seems to be a recent addition. >>> >>> drivers/usb/udc/gadget/core.c has the only documentation, but context is >>> not mentioned there. >>> Felipe, what do you suggest? >> >> As far as I remember, usb_ep_dequeue() is supposed to be more or less >> analogous to usb_ep_queue(); drivers should be allowed to call either >> routine in an atomic context. > > Hmm, that's what I remember, but we don't have that documented and dwc3 > has a sleep in its dequeue, which I need to remove for other reasons > anyway. > > Can we get a patch updating documentation to make it clear that both > queue and dequeue should be callable from any context? > Felipe, for the DWC3 do you see an issue with moving the giveback and the cleanup of the TRBs simply into END_TRANSFER interrupt? That seems to work for me for fixing the issue, but I don't have a full understanding of all the dependencies of the DWC3 gadget driver and don't know if it could introduce other issues.
Re: usb: gadget: ffs: Fix BUG when userland exits with submitted AIO transfers
On 09/25/2018 02:46 PM, Vincent Pelletier wrote: > Hello, > > On Thu, 2 Aug 2018 14:23:51 +, Vincent Pelletier > wrote: >> On Thu, 2 Aug 2018 00:45:14 +, "He, Bo" wrote: >>> Your patch fix the issue BUG: scheduling while atomic: >> >> Yes, although from my understanding of Felipe's answer, the actual bug >> is the "scheduling" part (sleeping in dwc3 UDC) rather than the >> "atomic" part. >> >> So my patch addresses, still if my understanding is correct, the wrong >> half of the problem, and even introduced the regression you identified. >> Hence my uncertainty... > > I notice that neither He's patch, nor a dwc3 change to prevent it from > scheduling inside usb_ep_dequeue are in Linus' master. Please correct > me if I missed something. > > Just in case my previous emails were not clear: > - I have no objection to He's patch on its own (and I do not know the > code nearly enough to provide a meaningful reviewed-by). > - I do not intend to work on making changes to dwc3 gadget to stop it > from scheduling in usb_ep_dequeue in the foreseeable future. > > So if there is no ongoing work on dwc3 cancel behaviour (Felipe ?), > please do resume/carry on with reviewing and integrating He's patch. > > It is only *if* dwc3 cancel stops scheduling that I believe my patch > should be reverted (here is the hash as of Linus' master): > commit d52e4d0c0c428bf2ba35074a7495cdb28e2efbae > Author: Vincent Pelletier > Date: Wed Jun 13 11:05:06 2018 + > > usb: gadget: ffs: Fix BUG when userland exits with submitted AIO > transfers > > This bug happens only when the UDC needs to sleep during usb_ep_dequeue, > as is the case for (at least) dwc3. > and, in my understanding, a consequence is that He's fix would not > be needed anymore - the bug my patch introduced disappearing in the > revert. Hi, I just started testing with your patch applied and the patch seems to contain a race condition on its own. When a aio_cancel() is called ffs_aio_cancel_worker() is queued. Now what might happen is that the URB completes on its own before the work item has a chance to run. And it seems that ffs_copy_worker() can even overtake the cancel work. In that case the data structure associated with the URB is deleted while the cancel_worker is still queued. I get the following output on the console [ 91.974151] ODEBUG: free active (active state 0) object type: work_struct hint: ffs_aio_cancel_worker+0x0/0x20 [ 92.211124] [] debug_print_object+0xac/0xc0 [ 92.216854] [] __debug_check_no_obj_freed+0x1b0/0x208 [ 92.223451] [] debug_check_no_obj_freed+0x1c/0x28 [ 92.229701] [] kfree+0x78/0xc8 [ 92.234302] [] ffs_user_copy_worker+0xd0/0x128 [ 92.240293] [] process_one_work+0x1e0/0x3c0 [ 92.246022] [] worker_thread+0x54/0x410 [ 92.251403] [] kthread+0x104/0x130 [ 92.256351] [] ret_from_fork+0x10/0x18 - Lars
RE: usb: gadget: ffs: Fix BUG when userland exits with submitted AIO transfers
Hi, Vincent: We tried my patch doesn't fix the issue totally, we still see the kernel panic. currently solution in our test is: Rvert "usb: gadget: ffs: Fix BUG when userland exits with submitted AIO transfers" and only add the below patch to use the in_interrupt() to avoid the wait_event_lock_irq() is called in interrupt context. Subject: [PATCH] fix the wait_event_lock_irq run in interrupt context the patch is to fix the below bug: BUG: scheduling while atomic: SettingsProvide/3433/0x0104 Preemption disabled at: [] __do_softirq+0x53/0x31b CPU: 1 PID: 3433 Comm: SettingsProvide Tainted: P U ilt-2e5dc0ac-g3f662bf-dirty #8 Call Trace: dump_stack+0x70/0x9e ? __do_softirq+0x53/0x31b __schedule_bug+0x7f/0xd0 __schedule+0x61d/0x860 ? _raw_spin_unlock_irqrestore+0x28/0x50 ? prepare_to_wait_event+0x87/0x170 schedule+0x36/0x90 dwc3_gadget_ep_dequeue+0x27a/0x2e0 [dwc3] ? finish_wait+0x90/0x90 usb_ep_dequeue+0x23/0x90 ffs_aio_cancel+0x4c/0x70 kiocb_cancel+0x40/0x50 free_ioctx_users+0x6e/0x100 percpu_ref_switch_to_atomic_rcu+0x159/0x160 rcu_process_callbacks+0x1a7/0x520 __do_softirq+0x11a/0x31b irq_exit+0xb5/0xc0 smp_apic_timer_interrupt+0x7c/0x160 the BUG is introduced form the patch: commit: cf3113d89 usb: dwc3: gadget: properly increment dequeue pointer on ep_dequeue Signed-off-by: he, bo --- drivers/usb/dwc3/gadget.c | 112 +++--- 1 file changed, 57 insertions(+), 55 deletions(-) diff --git a/drivers/usb/dwc3/gadget.c b/drivers/usb/dwc3/gadget.c index 7120678..386fd82 100644 --- a/drivers/usb/dwc3/gadget.c +++ b/drivers/usb/dwc3/gadget.c @@ -1413,65 +1413,67 @@ static int dwc3_gadget_ep_dequeue(struct usb_ep *ep, /* wait until it is processed */ dwc3_stop_active_transfer(dwc, dep->number, true); - /* -* If request was already started, this means we had to -* stop the transfer. With that we also need to ignore -* all TRBs used by the request, however TRBs can only -* be modified after completion of END_TRANSFER -* command. So what we do here is that we wait for -* END_TRANSFER completion and only after that, we jump -* over TRBs by clearing HWO and incrementing dequeue -* pointer. -* -* Note that we have 2 possible types of transfers here: -* -* i) Linear buffer request -* ii) SG-list based request -* -* SG-list based requests will have r->num_pending_sgs -* set to a valid number (> 0). Linear requests, -* normally use a single TRB. -* -* For each of these two cases, if r->unaligned flag is -* set, one extra TRB has been used to align transfer -* size to wMaxPacketSize. -* -* All of these cases need to be taken into -* consideration so we don't mess up our TRB ring -* pointers. -*/ - wait_event_lock_irq(dep->wait_end_transfer, - !(dep->flags & DWC3_EP_END_TRANSFER_PENDING), - dwc->lock); - - if (!r->trb) - goto out0; - - if (r->num_pending_sgs) { - struct dwc3_trb *trb; - int i = 0; - - for (i = 0; i < r->num_pending_sgs; i++) { - trb = r->trb + i; - trb->ctrl &= ~DWC3_TRB_CTRL_HWO; - dwc3_ep_inc_deq(dep); - } + if(likely(!in_interrupt())) { + /* +* If request was already started, this means we had to +* stop the transfer. With that we also need to ignore +* all TRBs used by the request, however TRBs can only +* be modified after completion of END_TRANSFER +* command. So what we do here is that we wait for +* END_TRANSFER completion and only after that, we jump +* over TRBs by clearing HWO and incrementing dequeue +* pointer. +* +
Re: usb: gadget: ffs: Fix BUG when userland exits with submitted AIO transfers
Hello, On Thu, 2 Aug 2018 14:23:51 +, Vincent Pelletier wrote: > On Thu, 2 Aug 2018 00:45:14 +, "He, Bo" wrote: > > Your patch fix the issue BUG: scheduling while atomic: > > Yes, although from my understanding of Felipe's answer, the actual bug > is the "scheduling" part (sleeping in dwc3 UDC) rather than the > "atomic" part. > > So my patch addresses, still if my understanding is correct, the wrong > half of the problem, and even introduced the regression you identified. > Hence my uncertainty... I notice that neither He's patch, nor a dwc3 change to prevent it from scheduling inside usb_ep_dequeue are in Linus' master. Please correct me if I missed something. Just in case my previous emails were not clear: - I have no objection to He's patch on its own (and I do not know the code nearly enough to provide a meaningful reviewed-by). - I do not intend to work on making changes to dwc3 gadget to stop it from scheduling in usb_ep_dequeue in the foreseeable future. So if there is no ongoing work on dwc3 cancel behaviour (Felipe ?), please do resume/carry on with reviewing and integrating He's patch. It is only *if* dwc3 cancel stops scheduling that I believe my patch should be reverted (here is the hash as of Linus' master): commit d52e4d0c0c428bf2ba35074a7495cdb28e2efbae Author: Vincent Pelletier Date: Wed Jun 13 11:05:06 2018 + usb: gadget: ffs: Fix BUG when userland exits with submitted AIO transfers This bug happens only when the UDC needs to sleep during usb_ep_dequeue, as is the case for (at least) dwc3. and, in my understanding, a consequence is that He's fix would not be needed anymore - the bug my patch introduced disappearing in the revert. Regards, -- Vincent Pelletier
Re: usb: gadget: ffs: Fix BUG when userland exits with submitted AIO transfers
On Thu, 2 Aug 2018 00:45:14 +, "He, Bo" wrote: > Your patch fix the issue BUG: scheduling while atomic: Yes, although from my understanding of Felipe's answer, the actual bug is the "scheduling" part (sleeping in dwc3 UDC) rather than the "atomic" part. So my patch addresses, still if my understanding is correct, the wrong half of the problem, and even introduced the regression you identified. Hence my uncertainty... -- Vincent Pelletier -- To unsubscribe from this list: send the line "unsubscribe linux-usb" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
RE: usb: gadget: ffs: Fix BUG when userland exits with submitted AIO transfers
Your patch fix the issue BUG: scheduling while atomic: BUG: scheduling while atomic: SettingsProvide/3433/0x0104 Preemption disabled at: [] __do_softirq+0x53/0x31b CPU: 1 PID: 3433 Comm: SettingsProvide Tainted: P U ilt-2e5dc0ac-g3f662bf-dirty #8 Call Trace: dump_stack+0x70/0x9e __schedule_bug+0x7f/0xd0 __schedule+0x61d/0x860 schedule+0x36/0x90 dwc3_gadget_ep_dequeue+0x27a/0x2e0 [dwc3] usb_ep_dequeue+0x23/0x90 ffs_aio_cancel+0x4c/0x70 kiocb_cancel+0x40/0x50 free_ioctx_users+0x6e/0x100 percpu_ref_switch_to_atomic_rcu+0x159/0x160 rcu_process_callbacks+0x1a7/0x520 __do_softirq+0x11a/0x31b irq_exit+0xb5/0xc0 smp_apic_timer_interrupt+0x7c/0x160 the BUG is introduced form the patch: commit: cf3113d89 usb: dwc3: gadget: properly increment dequeue pointer on ep_dequeue the commit: cf3113d89 call the below function maybe in softirq context: wait_event_lock_irq(dep->wait_end_transfer, !(dep->flags & DWC3_EP_END_TRANSFER_PENDING), dwc->lock); -Original Message- From: Vincent Pelletier Sent: Wednesday, August 1, 2018 11:04 PM To: Felipe Balbi Cc: Alan Stern ; Roger Quadros ; Lars-Peter Clausen ; Sam Protsenko ; linux-usb@vger.kernel.org; Praneeth Bajjuri ; He, Bo ; Bai, Jie A Subject: Re: usb: gadget: ffs: Fix BUG when userland exits with submitted AIO transfers Hello, Bo He, CC'ed, found a regression introduced in my patch, discussed in this thread, and submitted a patch: Subject: [PATCH] fix panic at pwq_activate_delayed_work. Date: Wed, 1 Aug 2018 10:14:38 + Message-ID: On Fri, 29 Jun 2018 09:32:43 +0300, Felipe Balbi wrote: > Hmm, that's what I remember, but we don't have that documented and > dwc3 has a sleep in its dequeue, which I need to remove for other > reasons anyway. Given the above comment from Felipe, I expected my patch would be dropped in favour of making dwc3 not sleep in dequeue, as it seems to be the actual root cause. Should my patch be reverted ? It adds complexity which, I believe, becomes superfluous if dequeue does not sleep anywhere. Or maybe non-sleeping dequeue is not there yet, and a solution right now (later revertable) is better, in which case my change would be worth fixing ? -- Vincent Pelletier -- To unsubscribe from this list: send the line "unsubscribe linux-usb" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: usb: gadget: ffs: Fix BUG when userland exits with submitted AIO transfers
Hello, Bo He, CC'ed, found a regression introduced in my patch, discussed in this thread, and submitted a patch: Subject: [PATCH] fix panic at pwq_activate_delayed_work. Date: Wed, 1 Aug 2018 10:14:38 + Message-ID: On Fri, 29 Jun 2018 09:32:43 +0300, Felipe Balbi wrote: > Hmm, that's what I remember, but we don't have that documented and dwc3 > has a sleep in its dequeue, which I need to remove for other reasons > anyway. Given the above comment from Felipe, I expected my patch would be dropped in favour of making dwc3 not sleep in dequeue, as it seems to be the actual root cause. Should my patch be reverted ? It adds complexity which, I believe, becomes superfluous if dequeue does not sleep anywhere. Or maybe non-sleeping dequeue is not there yet, and a solution right now (later revertable) is better, in which case my change would be worth fixing ? -- Vincent Pelletier -- To unsubscribe from this list: send the line "unsubscribe linux-usb" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: usb: gadget: ffs: Fix BUG when userland exits with submitted AIO transfers
Hi, Alan Stern writes: > On Thu, 21 Jun 2018, Roger Quadros wrote: > >> >> Can we avoid the spin_lock() and the work-queue and call usb_ep_dequeue() >> >> directly from here? >> >>> What is the purpose of the spin_lock()? >> > >> > I agree that the lock doesn't seem to be necessary. But I believe the whole >> > thing is already running in non-sleeping context, even before the spinlock >> > is taken. So this wouldn't help much. >> > >> > Even the io_cancel() syscall takes a spinlock before invoking the cancel >> > function. So this issue is not exclusive to program termination. >> > >> > Are there any documented guidelines on which context usb_ep_dequeue() >> > should >> > be able to be called in? The sleep in the dwc3 driver seems to be a recent >> > addition. >> >> drivers/usb/udc/gadget/core.c has the only documentation, but context is not >> mentioned there. >> Felipe, what do you suggest? > > As far as I remember, usb_ep_dequeue() is supposed to be more or less > analogous to usb_ep_queue(); drivers should be allowed to call either > routine in an atomic context. Hmm, that's what I remember, but we don't have that documented and dwc3 has a sleep in its dequeue, which I need to remove for other reasons anyway. Can we get a patch updating documentation to make it clear that both queue and dequeue should be callable from any context? -- balbi -- To unsubscribe from this list: send the line "unsubscribe linux-usb" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: usb: gadget: ffs: Fix BUG when userland exits with submitted AIO transfers
On Thu, 21 Jun 2018, Roger Quadros wrote: > >> Can we avoid the spin_lock() and the work-queue and call usb_ep_dequeue() > >> directly from here? > >>> What is the purpose of the spin_lock()? > > > > I agree that the lock doesn't seem to be necessary. But I believe the whole > > thing is already running in non-sleeping context, even before the spinlock > > is taken. So this wouldn't help much. > > > > Even the io_cancel() syscall takes a spinlock before invoking the cancel > > function. So this issue is not exclusive to program termination. > > > > Are there any documented guidelines on which context usb_ep_dequeue() should > > be able to be called in? The sleep in the dwc3 driver seems to be a recent > > addition. > > drivers/usb/udc/gadget/core.c has the only documentation, but context is not > mentioned there. > Felipe, what do you suggest? As far as I remember, usb_ep_dequeue() is supposed to be more or less analogous to usb_ep_queue(); drivers should be allowed to call either routine in an atomic context. Alan Stern -- To unsubscribe from this list: send the line "unsubscribe linux-usb" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: usb: gadget: ffs: Fix BUG when userland exits with submitted AIO transfers
On 21/06/18 13:52, Lars-Peter Clausen wrote: > On 06/21/2018 10:29 AM, Roger Quadros wrote: > [...] static int ffs_aio_cancel(struct kiocb *kiocb) { struct ffs_io_data *io_data = kiocb->private; - struct ffs_epfile *epfile = kiocb->ki_filp->private_data; + struct ffs_data *ffs = io_data->ffs; int value; ENTER(); - spin_lock_irq(>ffs->eps_lock); - - if (likely(io_data && io_data->ep && io_data->req)) - value = usb_ep_dequeue(io_data->ep, io_data->req); - else + if (likely(io_data && io_data->ep && io_data->req)) { + INIT_WORK(_data->cancellation_work, ffs_aio_cancel_worker); + queue_work(ffs->io_completion_wq, _data->cancellation_work); + value = -EINPROGRESS; + } else { value = -EINVAL; - - spin_unlock_irq(>ffs->eps_lock); + } >> >> Can we avoid the spin_lock() and the work-queue and call usb_ep_dequeue() >> directly from here? >>> What is the purpose of the spin_lock()? > > I agree that the lock doesn't seem to be necessary. But I believe the whole > thing is already running in non-sleeping context, even before the spinlock > is taken. So this wouldn't help much. > > Even the io_cancel() syscall takes a spinlock before invoking the cancel > function. So this issue is not exclusive to program termination. > > Are there any documented guidelines on which context usb_ep_dequeue() should > be able to be called in? The sleep in the dwc3 driver seems to be a recent > addition. drivers/usb/udc/gadget/core.c has the only documentation, but context is not mentioned there. Felipe, what do you suggest? > >> return value; } -- 2.17.1 >> > -- cheers, -roger Texas Instruments Finland Oy, Porkkalankatu 22, 00180 Helsinki. Y-tunnus/Business ID: 0615521-4. Kotipaikka/Domicile: Helsinki -- To unsubscribe from this list: send the line "unsubscribe linux-usb" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: usb: gadget: ffs: Fix BUG when userland exits with submitted AIO transfers
On 06/21/2018 10:29 AM, Roger Quadros wrote: [...] >>> static int ffs_aio_cancel(struct kiocb *kiocb) >>> { >>> struct ffs_io_data *io_data = kiocb->private; >>> - struct ffs_epfile *epfile = kiocb->ki_filp->private_data; >>> + struct ffs_data *ffs = io_data->ffs; >>> int value; >>> >>> ENTER(); >>> >>> - spin_lock_irq(>ffs->eps_lock); >>> - >>> - if (likely(io_data && io_data->ep && io_data->req)) >>> - value = usb_ep_dequeue(io_data->ep, io_data->req); >>> - else >>> + if (likely(io_data && io_data->ep && io_data->req)) { >>> + INIT_WORK(_data->cancellation_work, >>> ffs_aio_cancel_worker); >>> + queue_work(ffs->io_completion_wq, >>> _data->cancellation_work); >>> + value = -EINPROGRESS; >>> + } else { >>> value = -EINVAL; >>> - >>> - spin_unlock_irq(>ffs->eps_lock); >>> + } > > Can we avoid the spin_lock() and the work-queue and call usb_ep_dequeue() > directly from here? >> What is the purpose of the spin_lock()? I agree that the lock doesn't seem to be necessary. But I believe the whole thing is already running in non-sleeping context, even before the spinlock is taken. So this wouldn't help much. Even the io_cancel() syscall takes a spinlock before invoking the cancel function. So this issue is not exclusive to program termination. Are there any documented guidelines on which context usb_ep_dequeue() should be able to be called in? The sleep in the dwc3 driver seems to be a recent addition. > >>> >>> return value; >>> } >>> -- >>> 2.17.1 >>> > -- To unsubscribe from this list: send the line "unsubscribe linux-usb" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: usb: gadget: ffs: Fix BUG when userland exits with submitted AIO transfers
+Lars-Peter Vincent, On 14/06/18 16:23, Sam Protsenko wrote: > + Roger Quadros > + Praneeth Bajjuri > > Tested-by: Sam Protsenko > > I've tested it on X15 board (DWC3 controller) on Android master, by > doing "adb root". Without this patch I see backtrace and kernel panic > (the same error as described in commit message). When this patch is > applied, everything works fine. > > Can we please merge this patch, it fixes major bug for us? > > Thanks! > > > On 13 June 2018 at 14:05, Vincent Pelletier wrote: >> This bug happens only when the UDC needs to sleep during usb_ep_dequeue, >> as is the case for (at least) dwc3. >> >> [ 382.200896] BUG: scheduling while atomic: screen/1808/0x0100 >> [ 382.207124] 4 locks held by screen/1808: >> [ 382.211266] #0: (rcu_callback){}, at: [] >> rcu_process_callbacks+0x260/0x440 >> [ 382.219949] #1: (rcu_read_lock_sched){}, at: [] >> percpu_ref_switch_to_atomic_rcu+0xb0/0x130 >> [ 382.230034] #2: (&(>ctx_lock)->rlock){}, at: [] >> free_ioctx_users+0x23/0xd0 >> [ 382.230096] #3: (&(>eps_lock)->rlock){}, at: [] >> ffs_aio_cancel+0x20/0x60 [usb_f_fs] >> [ 382.230160] Modules linked in: usb_f_fs libcomposite configfs bnep btsdio >> bluetooth ecdh_generic brcmfmac brcmutil intel_powerclamp coretemp dwc3 >> kvm_intel ulpi udc_core kvm irqbypass crc32_pclmul crc32c_intel pcbc >> dwc3_pci aesni_intel aes_i586 crypto_simd cryptd ehci_pci ehci_hcd gpio_keys >> usbcore basincove_gpadc industrialio usb_common >> [ 382.230407] CPU: 1 PID: 1808 Comm: screen Not tainted 4.14.0-edison+ #117 >> [ 382.230416] Hardware name: Intel Corporation Merrifield/BODEGA BAY, BIOS >> 542 2015.01.21:18.19.48 >> [ 382.230425] Call Trace: >> [ 382.230438] >> [ 382.230466] dump_stack+0x47/0x62 >> [ 382.230498] __schedule_bug+0x61/0x80 >> [ 382.230522] __schedule+0x43/0x7a0 >> [ 382.230587] schedule+0x5f/0x70 >> [ 382.230625] dwc3_gadget_ep_dequeue+0x14c/0x270 [dwc3] >> [ 382.230669] ? do_wait_intr_irq+0x70/0x70 >> [ 382.230724] usb_ep_dequeue+0x19/0x90 [udc_core] >> [ 382.230770] ffs_aio_cancel+0x37/0x60 [usb_f_fs] >> [ 382.230798] kiocb_cancel+0x31/0x40 >> [ 382.230822] free_ioctx_users+0x4d/0xd0 >> [ 382.230858] percpu_ref_switch_to_atomic_rcu+0x10a/0x130 >> [ 382.230881] ? percpu_ref_exit+0x40/0x40 >> [ 382.230904] rcu_process_callbacks+0x2b3/0x440 >> [ 382.230965] __do_softirq+0xf8/0x26b >> [ 382.231011] ? __softirqentry_text_start+0x8/0x8 >> [ 382.231033] do_softirq_own_stack+0x22/0x30 >> [ 382.231042] >> [ 382.231071] irq_exit+0x45/0xc0 >> [ 382.231089] smp_apic_timer_interrupt+0x13c/0x150 >> [ 382.231118] apic_timer_interrupt+0x35/0x3c >> [ 382.231132] EIP: __copy_user_ll+0xe2/0xf0 >> [ 382.231142] EFLAGS: 00210293 CPU: 1 >> [ 382.231154] EAX: bfd4508c EBX: 0004 ECX: 0003 EDX: f3d8fe50 >> [ 382.231165] ESI: f3d8fe51 EDI: bfd4508d EBP: f3d8fe14 ESP: f3d8fe08 >> [ 382.231176] DS: 007b ES: 007b FS: 00d8 GS: 0033 SS: 0068 >> [ 382.231265] core_sys_select+0x25f/0x320 >> [ 382.231346] ? __wake_up_common_lock+0x62/0x80 >> [ 382.231399] ? tty_ldisc_deref+0x13/0x20 >> [ 382.231438] ? ldsem_up_read+0x1b/0x40 >> [ 382.231459] ? tty_ldisc_deref+0x13/0x20 >> [ 382.231479] ? tty_write+0x29f/0x2e0 >> [ 382.231514] ? n_tty_ioctl+0xe0/0xe0 >> [ 382.231541] ? tty_write_unlock+0x30/0x30 >> [ 382.231566] ? __vfs_write+0x22/0x110 >> [ 382.231604] ? security_file_permission+0x2f/0xd0 >> [ 382.231635] ? rw_verify_area+0xac/0x120 >> [ 382.231677] ? vfs_write+0x103/0x180 >> [ 382.231711] SyS_select+0x87/0xc0 >> [ 382.231739] ? SyS_write+0x42/0x90 >> [ 382.231781] do_fast_syscall_32+0xd6/0x1a0 >> [ 382.231836] entry_SYSENTER_32+0x47/0x71 >> [ 382.231848] EIP: 0xb7f75b05 >> [ 382.231857] EFLAGS: 0246 CPU: 1 >> [ 382.231868] EAX: ffda EBX: 0400 ECX: bfd4508c EDX: bfd4510c >> [ 382.231878] ESI: EDI: EBP: ESP: bfd45020 >> [ 382.231889] DS: 007b ES: 007b FS: GS: 0033 SS: 007b >> [ 382.232281] softirq: huh, entered softirq 9 RCU c10b4d90 with >> preempt_count 0100, exited with ? >> >> Signed-off-by: Vincent Pelletier >> --- >> drivers/usb/gadget/function/f_fs.c | 26 ++ >> 1 file changed, 18 insertions(+), 8 deletions(-) >> >> diff --git a/drivers/usb/gadget/function/f_fs.c >> b/drivers/usb/gadget/function/f_fs.c >> index 199d25700050..01841fdb3144 100644 >> --- a/drivers/usb/gadget/function/f_fs.c >> +++ b/drivers/usb/gadget/function/f_fs.c >> @@ -215,6 +215,7 @@ struct ffs_io_data { >> >> struct mm_struct *mm; >> struct work_struct work; >> + struct work_struct cancellation_work; >> >> struct usb_ep *ep; >> struct usb_request *req; >> @@ -1072,22 +1073,31 @@ ffs_epfile_open(struct inode *inode, struct file >> *file) >> return 0; >> } >> >> +static void ffs_aio_cancel_worker(struct work_struct *work) >> +{ >> + struct ffs_io_data *io_data =
Re: usb: gadget: ffs: Fix BUG when userland exits with submitted AIO transfers
Hi Felipe, If there is no objections, can we please pull this patch? It fixes kernel panic for us (scheduling in atomic context). Thanks! On 14 June 2018 at 16:23, Sam Protsenko wrote: > + Roger Quadros > + Praneeth Bajjuri > > Tested-by: Sam Protsenko > > I've tested it on X15 board (DWC3 controller) on Android master, by > doing "adb root". Without this patch I see backtrace and kernel panic > (the same error as described in commit message). When this patch is > applied, everything works fine. > > Can we please merge this patch, it fixes major bug for us? > > Thanks! > > > On 13 June 2018 at 14:05, Vincent Pelletier wrote: >> This bug happens only when the UDC needs to sleep during usb_ep_dequeue, >> as is the case for (at least) dwc3. >> >> [ 382.200896] BUG: scheduling while atomic: screen/1808/0x0100 >> [ 382.207124] 4 locks held by screen/1808: >> [ 382.211266] #0: (rcu_callback){}, at: [] >> rcu_process_callbacks+0x260/0x440 >> [ 382.219949] #1: (rcu_read_lock_sched){}, at: [] >> percpu_ref_switch_to_atomic_rcu+0xb0/0x130 >> [ 382.230034] #2: (&(>ctx_lock)->rlock){}, at: [] >> free_ioctx_users+0x23/0xd0 >> [ 382.230096] #3: (&(>eps_lock)->rlock){}, at: [] >> ffs_aio_cancel+0x20/0x60 [usb_f_fs] >> [ 382.230160] Modules linked in: usb_f_fs libcomposite configfs bnep btsdio >> bluetooth ecdh_generic brcmfmac brcmutil intel_powerclamp coretemp dwc3 >> kvm_intel ulpi udc_core kvm irqbypass crc32_pclmul crc32c_intel pcbc >> dwc3_pci aesni_intel aes_i586 crypto_simd cryptd ehci_pci ehci_hcd gpio_keys >> usbcore basincove_gpadc industrialio usb_common >> [ 382.230407] CPU: 1 PID: 1808 Comm: screen Not tainted 4.14.0-edison+ #117 >> [ 382.230416] Hardware name: Intel Corporation Merrifield/BODEGA BAY, BIOS >> 542 2015.01.21:18.19.48 >> [ 382.230425] Call Trace: >> [ 382.230438] >> [ 382.230466] dump_stack+0x47/0x62 >> [ 382.230498] __schedule_bug+0x61/0x80 >> [ 382.230522] __schedule+0x43/0x7a0 >> [ 382.230587] schedule+0x5f/0x70 >> [ 382.230625] dwc3_gadget_ep_dequeue+0x14c/0x270 [dwc3] >> [ 382.230669] ? do_wait_intr_irq+0x70/0x70 >> [ 382.230724] usb_ep_dequeue+0x19/0x90 [udc_core] >> [ 382.230770] ffs_aio_cancel+0x37/0x60 [usb_f_fs] >> [ 382.230798] kiocb_cancel+0x31/0x40 >> [ 382.230822] free_ioctx_users+0x4d/0xd0 >> [ 382.230858] percpu_ref_switch_to_atomic_rcu+0x10a/0x130 >> [ 382.230881] ? percpu_ref_exit+0x40/0x40 >> [ 382.230904] rcu_process_callbacks+0x2b3/0x440 >> [ 382.230965] __do_softirq+0xf8/0x26b >> [ 382.231011] ? __softirqentry_text_start+0x8/0x8 >> [ 382.231033] do_softirq_own_stack+0x22/0x30 >> [ 382.231042] >> [ 382.231071] irq_exit+0x45/0xc0 >> [ 382.231089] smp_apic_timer_interrupt+0x13c/0x150 >> [ 382.231118] apic_timer_interrupt+0x35/0x3c >> [ 382.231132] EIP: __copy_user_ll+0xe2/0xf0 >> [ 382.231142] EFLAGS: 00210293 CPU: 1 >> [ 382.231154] EAX: bfd4508c EBX: 0004 ECX: 0003 EDX: f3d8fe50 >> [ 382.231165] ESI: f3d8fe51 EDI: bfd4508d EBP: f3d8fe14 ESP: f3d8fe08 >> [ 382.231176] DS: 007b ES: 007b FS: 00d8 GS: 0033 SS: 0068 >> [ 382.231265] core_sys_select+0x25f/0x320 >> [ 382.231346] ? __wake_up_common_lock+0x62/0x80 >> [ 382.231399] ? tty_ldisc_deref+0x13/0x20 >> [ 382.231438] ? ldsem_up_read+0x1b/0x40 >> [ 382.231459] ? tty_ldisc_deref+0x13/0x20 >> [ 382.231479] ? tty_write+0x29f/0x2e0 >> [ 382.231514] ? n_tty_ioctl+0xe0/0xe0 >> [ 382.231541] ? tty_write_unlock+0x30/0x30 >> [ 382.231566] ? __vfs_write+0x22/0x110 >> [ 382.231604] ? security_file_permission+0x2f/0xd0 >> [ 382.231635] ? rw_verify_area+0xac/0x120 >> [ 382.231677] ? vfs_write+0x103/0x180 >> [ 382.231711] SyS_select+0x87/0xc0 >> [ 382.231739] ? SyS_write+0x42/0x90 >> [ 382.231781] do_fast_syscall_32+0xd6/0x1a0 >> [ 382.231836] entry_SYSENTER_32+0x47/0x71 >> [ 382.231848] EIP: 0xb7f75b05 >> [ 382.231857] EFLAGS: 0246 CPU: 1 >> [ 382.231868] EAX: ffda EBX: 0400 ECX: bfd4508c EDX: bfd4510c >> [ 382.231878] ESI: EDI: EBP: ESP: bfd45020 >> [ 382.231889] DS: 007b ES: 007b FS: GS: 0033 SS: 007b >> [ 382.232281] softirq: huh, entered softirq 9 RCU c10b4d90 with >> preempt_count 0100, exited with ? >> >> Signed-off-by: Vincent Pelletier >> --- >> drivers/usb/gadget/function/f_fs.c | 26 ++ >> 1 file changed, 18 insertions(+), 8 deletions(-) >> >> diff --git a/drivers/usb/gadget/function/f_fs.c >> b/drivers/usb/gadget/function/f_fs.c >> index 199d25700050..01841fdb3144 100644 >> --- a/drivers/usb/gadget/function/f_fs.c >> +++ b/drivers/usb/gadget/function/f_fs.c >> @@ -215,6 +215,7 @@ struct ffs_io_data { >> >> struct mm_struct *mm; >> struct work_struct work; >> + struct work_struct cancellation_work; >> >> struct usb_ep *ep; >> struct usb_request *req; >> @@ -1072,22 +1073,31 @@ ffs_epfile_open(struct inode *inode, struct file >> *file) >> return 0; >>
Re: usb: gadget: ffs: Fix BUG when userland exits with submitted AIO transfers
+ Roger Quadros + Praneeth Bajjuri Tested-by: Sam Protsenko I've tested it on X15 board (DWC3 controller) on Android master, by doing "adb root". Without this patch I see backtrace and kernel panic (the same error as described in commit message). When this patch is applied, everything works fine. Can we please merge this patch, it fixes major bug for us? Thanks! On 13 June 2018 at 14:05, Vincent Pelletier wrote: > This bug happens only when the UDC needs to sleep during usb_ep_dequeue, > as is the case for (at least) dwc3. > > [ 382.200896] BUG: scheduling while atomic: screen/1808/0x0100 > [ 382.207124] 4 locks held by screen/1808: > [ 382.211266] #0: (rcu_callback){}, at: [] > rcu_process_callbacks+0x260/0x440 > [ 382.219949] #1: (rcu_read_lock_sched){}, at: [] > percpu_ref_switch_to_atomic_rcu+0xb0/0x130 > [ 382.230034] #2: (&(>ctx_lock)->rlock){}, at: [] > free_ioctx_users+0x23/0xd0 > [ 382.230096] #3: (&(>eps_lock)->rlock){}, at: [] > ffs_aio_cancel+0x20/0x60 [usb_f_fs] > [ 382.230160] Modules linked in: usb_f_fs libcomposite configfs bnep btsdio > bluetooth ecdh_generic brcmfmac brcmutil intel_powerclamp coretemp dwc3 > kvm_intel ulpi udc_core kvm irqbypass crc32_pclmul crc32c_intel pcbc dwc3_pci > aesni_intel aes_i586 crypto_simd cryptd ehci_pci ehci_hcd gpio_keys usbcore > basincove_gpadc industrialio usb_common > [ 382.230407] CPU: 1 PID: 1808 Comm: screen Not tainted 4.14.0-edison+ #117 > [ 382.230416] Hardware name: Intel Corporation Merrifield/BODEGA BAY, BIOS > 542 2015.01.21:18.19.48 > [ 382.230425] Call Trace: > [ 382.230438] > [ 382.230466] dump_stack+0x47/0x62 > [ 382.230498] __schedule_bug+0x61/0x80 > [ 382.230522] __schedule+0x43/0x7a0 > [ 382.230587] schedule+0x5f/0x70 > [ 382.230625] dwc3_gadget_ep_dequeue+0x14c/0x270 [dwc3] > [ 382.230669] ? do_wait_intr_irq+0x70/0x70 > [ 382.230724] usb_ep_dequeue+0x19/0x90 [udc_core] > [ 382.230770] ffs_aio_cancel+0x37/0x60 [usb_f_fs] > [ 382.230798] kiocb_cancel+0x31/0x40 > [ 382.230822] free_ioctx_users+0x4d/0xd0 > [ 382.230858] percpu_ref_switch_to_atomic_rcu+0x10a/0x130 > [ 382.230881] ? percpu_ref_exit+0x40/0x40 > [ 382.230904] rcu_process_callbacks+0x2b3/0x440 > [ 382.230965] __do_softirq+0xf8/0x26b > [ 382.231011] ? __softirqentry_text_start+0x8/0x8 > [ 382.231033] do_softirq_own_stack+0x22/0x30 > [ 382.231042] > [ 382.231071] irq_exit+0x45/0xc0 > [ 382.231089] smp_apic_timer_interrupt+0x13c/0x150 > [ 382.231118] apic_timer_interrupt+0x35/0x3c > [ 382.231132] EIP: __copy_user_ll+0xe2/0xf0 > [ 382.231142] EFLAGS: 00210293 CPU: 1 > [ 382.231154] EAX: bfd4508c EBX: 0004 ECX: 0003 EDX: f3d8fe50 > [ 382.231165] ESI: f3d8fe51 EDI: bfd4508d EBP: f3d8fe14 ESP: f3d8fe08 > [ 382.231176] DS: 007b ES: 007b FS: 00d8 GS: 0033 SS: 0068 > [ 382.231265] core_sys_select+0x25f/0x320 > [ 382.231346] ? __wake_up_common_lock+0x62/0x80 > [ 382.231399] ? tty_ldisc_deref+0x13/0x20 > [ 382.231438] ? ldsem_up_read+0x1b/0x40 > [ 382.231459] ? tty_ldisc_deref+0x13/0x20 > [ 382.231479] ? tty_write+0x29f/0x2e0 > [ 382.231514] ? n_tty_ioctl+0xe0/0xe0 > [ 382.231541] ? tty_write_unlock+0x30/0x30 > [ 382.231566] ? __vfs_write+0x22/0x110 > [ 382.231604] ? security_file_permission+0x2f/0xd0 > [ 382.231635] ? rw_verify_area+0xac/0x120 > [ 382.231677] ? vfs_write+0x103/0x180 > [ 382.231711] SyS_select+0x87/0xc0 > [ 382.231739] ? SyS_write+0x42/0x90 > [ 382.231781] do_fast_syscall_32+0xd6/0x1a0 > [ 382.231836] entry_SYSENTER_32+0x47/0x71 > [ 382.231848] EIP: 0xb7f75b05 > [ 382.231857] EFLAGS: 0246 CPU: 1 > [ 382.231868] EAX: ffda EBX: 0400 ECX: bfd4508c EDX: bfd4510c > [ 382.231878] ESI: EDI: EBP: ESP: bfd45020 > [ 382.231889] DS: 007b ES: 007b FS: GS: 0033 SS: 007b > [ 382.232281] softirq: huh, entered softirq 9 RCU c10b4d90 with > preempt_count 0100, exited with ? > > Signed-off-by: Vincent Pelletier > --- > drivers/usb/gadget/function/f_fs.c | 26 ++ > 1 file changed, 18 insertions(+), 8 deletions(-) > > diff --git a/drivers/usb/gadget/function/f_fs.c > b/drivers/usb/gadget/function/f_fs.c > index 199d25700050..01841fdb3144 100644 > --- a/drivers/usb/gadget/function/f_fs.c > +++ b/drivers/usb/gadget/function/f_fs.c > @@ -215,6 +215,7 @@ struct ffs_io_data { > > struct mm_struct *mm; > struct work_struct work; > + struct work_struct cancellation_work; > > struct usb_ep *ep; > struct usb_request *req; > @@ -1072,22 +1073,31 @@ ffs_epfile_open(struct inode *inode, struct file > *file) > return 0; > } > > +static void ffs_aio_cancel_worker(struct work_struct *work) > +{ > + struct ffs_io_data *io_data = container_of(work, struct ffs_io_data, > + cancellation_work); > + > + ENTER(); > + > + usb_ep_dequeue(io_data->ep, io_data->req); > +}
Re: Regarding usb: gadget: ffs: Fix BUG when userland exits with submitted AIO transfers
Hello, On Sat, 2 Jun 2018 01:02:24 +0300, Sam Protsenko wrote: > Hi Vincent, > > The issue you mentioned in your RFC patch is also reproducing on TI > X15 board (dwc3) with Android master, when doing "adb root" command. > Your patch fixes it just fine, for which I really thankful. Thanks a lot for digging the archives and finding my patch, and happy it could help ! > Do you know is there anything preventing it to be merged? I have X15 > (dwc3) and BeagleBone Black (I guess it has MUSB UDC controller on > it) which I can run some tests on, if it's a blocker for merge. Looking back at the mails I sent, I realise I did not resend the patch *without* an [RFC] subject prefix and without sign-off. My very bad. On the testing side, any testing is very welcome: I could test it on a DWC3 (intel edison) and DWC2 (raspberry pi zero), and have not heard of other tests until your mail. I reapplied my patch over Torvalds' master, cleaned the commit message and added a sign-off. I CC'ed you on the patch submission in case you want to add "tested-by". Regards, -- Vincent Pelletier -- To unsubscribe from this list: send the line "unsubscribe linux-usb" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
usb: gadget: ffs: Fix BUG when userland exits with submitted AIO transfers
This bug happens only when the UDC needs to sleep during usb_ep_dequeue, as is the case for (at least) dwc3. [ 382.200896] BUG: scheduling while atomic: screen/1808/0x0100 [ 382.207124] 4 locks held by screen/1808: [ 382.211266] #0: (rcu_callback){}, at: [] rcu_process_callbacks+0x260/0x440 [ 382.219949] #1: (rcu_read_lock_sched){}, at: [] percpu_ref_switch_to_atomic_rcu+0xb0/0x130 [ 382.230034] #2: (&(>ctx_lock)->rlock){}, at: [] free_ioctx_users+0x23/0xd0 [ 382.230096] #3: (&(>eps_lock)->rlock){}, at: [] ffs_aio_cancel+0x20/0x60 [usb_f_fs] [ 382.230160] Modules linked in: usb_f_fs libcomposite configfs bnep btsdio bluetooth ecdh_generic brcmfmac brcmutil intel_powerclamp coretemp dwc3 kvm_intel ulpi udc_core kvm irqbypass crc32_pclmul crc32c_intel pcbc dwc3_pci aesni_intel aes_i586 crypto_simd cryptd ehci_pci ehci_hcd gpio_keys usbcore basincove_gpadc industrialio usb_common [ 382.230407] CPU: 1 PID: 1808 Comm: screen Not tainted 4.14.0-edison+ #117 [ 382.230416] Hardware name: Intel Corporation Merrifield/BODEGA BAY, BIOS 542 2015.01.21:18.19.48 [ 382.230425] Call Trace: [ 382.230438] [ 382.230466] dump_stack+0x47/0x62 [ 382.230498] __schedule_bug+0x61/0x80 [ 382.230522] __schedule+0x43/0x7a0 [ 382.230587] schedule+0x5f/0x70 [ 382.230625] dwc3_gadget_ep_dequeue+0x14c/0x270 [dwc3] [ 382.230669] ? do_wait_intr_irq+0x70/0x70 [ 382.230724] usb_ep_dequeue+0x19/0x90 [udc_core] [ 382.230770] ffs_aio_cancel+0x37/0x60 [usb_f_fs] [ 382.230798] kiocb_cancel+0x31/0x40 [ 382.230822] free_ioctx_users+0x4d/0xd0 [ 382.230858] percpu_ref_switch_to_atomic_rcu+0x10a/0x130 [ 382.230881] ? percpu_ref_exit+0x40/0x40 [ 382.230904] rcu_process_callbacks+0x2b3/0x440 [ 382.230965] __do_softirq+0xf8/0x26b [ 382.231011] ? __softirqentry_text_start+0x8/0x8 [ 382.231033] do_softirq_own_stack+0x22/0x30 [ 382.231042] [ 382.231071] irq_exit+0x45/0xc0 [ 382.231089] smp_apic_timer_interrupt+0x13c/0x150 [ 382.231118] apic_timer_interrupt+0x35/0x3c [ 382.231132] EIP: __copy_user_ll+0xe2/0xf0 [ 382.231142] EFLAGS: 00210293 CPU: 1 [ 382.231154] EAX: bfd4508c EBX: 0004 ECX: 0003 EDX: f3d8fe50 [ 382.231165] ESI: f3d8fe51 EDI: bfd4508d EBP: f3d8fe14 ESP: f3d8fe08 [ 382.231176] DS: 007b ES: 007b FS: 00d8 GS: 0033 SS: 0068 [ 382.231265] core_sys_select+0x25f/0x320 [ 382.231346] ? __wake_up_common_lock+0x62/0x80 [ 382.231399] ? tty_ldisc_deref+0x13/0x20 [ 382.231438] ? ldsem_up_read+0x1b/0x40 [ 382.231459] ? tty_ldisc_deref+0x13/0x20 [ 382.231479] ? tty_write+0x29f/0x2e0 [ 382.231514] ? n_tty_ioctl+0xe0/0xe0 [ 382.231541] ? tty_write_unlock+0x30/0x30 [ 382.231566] ? __vfs_write+0x22/0x110 [ 382.231604] ? security_file_permission+0x2f/0xd0 [ 382.231635] ? rw_verify_area+0xac/0x120 [ 382.231677] ? vfs_write+0x103/0x180 [ 382.231711] SyS_select+0x87/0xc0 [ 382.231739] ? SyS_write+0x42/0x90 [ 382.231781] do_fast_syscall_32+0xd6/0x1a0 [ 382.231836] entry_SYSENTER_32+0x47/0x71 [ 382.231848] EIP: 0xb7f75b05 [ 382.231857] EFLAGS: 0246 CPU: 1 [ 382.231868] EAX: ffda EBX: 0400 ECX: bfd4508c EDX: bfd4510c [ 382.231878] ESI: EDI: EBP: ESP: bfd45020 [ 382.231889] DS: 007b ES: 007b FS: GS: 0033 SS: 007b [ 382.232281] softirq: huh, entered softirq 9 RCU c10b4d90 with preempt_count 0100, exited with ? Signed-off-by: Vincent Pelletier --- drivers/usb/gadget/function/f_fs.c | 26 ++ 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/drivers/usb/gadget/function/f_fs.c b/drivers/usb/gadget/function/f_fs.c index 199d25700050..01841fdb3144 100644 --- a/drivers/usb/gadget/function/f_fs.c +++ b/drivers/usb/gadget/function/f_fs.c @@ -215,6 +215,7 @@ struct ffs_io_data { struct mm_struct *mm; struct work_struct work; + struct work_struct cancellation_work; struct usb_ep *ep; struct usb_request *req; @@ -1072,22 +1073,31 @@ ffs_epfile_open(struct inode *inode, struct file *file) return 0; } +static void ffs_aio_cancel_worker(struct work_struct *work) +{ + struct ffs_io_data *io_data = container_of(work, struct ffs_io_data, + cancellation_work); + + ENTER(); + + usb_ep_dequeue(io_data->ep, io_data->req); +} + static int ffs_aio_cancel(struct kiocb *kiocb) { struct ffs_io_data *io_data = kiocb->private; - struct ffs_epfile *epfile = kiocb->ki_filp->private_data; + struct ffs_data *ffs = io_data->ffs; int value; ENTER(); - spin_lock_irq(>ffs->eps_lock); - - if (likely(io_data && io_data->ep && io_data->req)) - value = usb_ep_dequeue(io_data->ep, io_data->req); - else + if (likely(io_data && io_data->ep && io_data->req)) { + INIT_WORK(_data->cancellation_work, ffs_aio_cancel_worker); +
Regarding usb: gadget: ffs: Fix BUG when userland exits with submitted AIO transfers
Hi Vincent, The issue you mentioned in your RFC patch is also reproducing on TI X15 board (dwc3) with Android master, when doing "adb root" command. Your patch fixes it just fine, for which I really thankful. Do you know is there anything preventing it to be merged? I have X15 (dwc3) and BeagleBone Black (I guess it has MUSB UDC controller on it) which I can run some tests on, if it's a blocker for merge. Thanks! P.S. Sorry to not respond in existing thread, I wasn't subscribed to linux-usb before. -- To unsubscribe from this list: send the line "unsubscribe linux-usb" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC] Re: usb: gadget: ffs: Fix BUG when userland exits with submitted AIO transfers
Hello, On Sun, 7 Jan 2018 12:48:35 +, Vincent Pelletierwrote: > Some update: this patch looks good on DWC3: both unplugging and > unbinding the UDC while AOI transfers are queued works fine. > > What I intended to do is to also test it on DWC2, so I bought a > raspberry pi zero W (ie, with embedded wifi chip on SDIO), but the fail > is strong with me and I just cannot get a custom kernel build to boot > (blocks right after detecting mmc0 & mmc1 and listing MMC partitions), > so I could not extend my tests yet. I'll keep trying, but testers with > a DWC2 or other UDC are very welcome to give that patch a try. I could get around to build a kernel on the raspberry pi zero, rpi-4.16.y as of 6793af87063ddb793c9b40c77a0a70bad09375c7 on https://github.com/raspberrypi/linux with proposed patch applied. My test scenario passed with dwc2 module (dwc_otg, the default off-tree driver on raspberry-pi kernels, is host-only). There are a few caveat though, which I believe are unrelated to this patch: - upon first binding the UDC to the device, there is a WARNING (see the end of this mail) which seems to be a known issue: https://lkml.org/lkml/2017/2/24/723 In ly case, this does not seem to prevent the UDC from working. - upon unplugging USB, the function does not receive an onDisable event. I believe this is because of how the raspberry pi zero board is wired: USB power bus is the board main 5V power bus (shorted to the power-only micro usb plug), so to keep the raspberry pi powered on despite unplugging the fully-wired port, Vbus stays high, so AFAIK the bus is in a state indistinguishable from HSidle. It is only when re-plugging that onDisable fires (because of re-enumeration ?), and the gadget does not seem to recover (no onEnable happen). It is after I unbind the UDC (to do things cleanly), exit the function process, start it again and re-bind the UDC that stuff start working again (so kernel-side can recover without a reboot, there is that). I of course do get an onUnbind if I unbind the UDC while the function implementation is running. No issue when doing this (especially, no AIO-related issue). No issue either when killing function program while bound to UDC and with in-flight AIO requests. While I would like to test this code with more UDCs, I am not sure I can afford to buy development boards (or otherwise linux-friendly boards involving such chips), build linux for each and do comprehensive testing. Should I resubmit my patch ? It applies cleanly on linus master as of f3afe530d644488a074291da04a69a296ab63046 Merge branch 'fixes-v4.16-rc4' of git://git.kernel.org/pub/scm/linux/kernel/git/jmorris/linux-security Finally, the WARNING mentioned above, surrounded by UDC binding logs to illustrate the overall sequence: Mar 02 00:57:54 sushi kernel: dwc2 2098.usb: bound driver configfs-gadget Mar 02 00:57:54 sushi kernel: [ cut here ] Mar 02 00:57:55 sushi kernel: WARNING: CPU: 0 PID: 911 at drivers/usb/dwc2/gadget.c:289 dwc2_hsotg_init_fifo+0xf0/0x1d0 [dwc2] Mar 02 00:57:55 sushi kernel: insufficient fifo memory Mar 02 00:57:55 sushi kernel: Modules linked in: usb_f_fs libcomposite bnep hci_uart btbcm bluetooth ecdh_generic brcmfmac brcmutil sha256_generic cfg80211 snd_bcm2835(C) rfkill snd_pcm snd_timer snd dwc2 udc_core bcm2835_gpiomem uio_pdrv_genirq uio i2c_dev fuse ipv6 Mar 02 00:57:55 sushi kernel: CPU: 0 PID: 911 Comm: bash Tainted: G C 4.16.0-rc3+ #11 Mar 02 00:57:55 sushi kernel: Hardware name: BCM2835 Mar 02 00:57:55 sushi kernel: [] (unwind_backtrace) from [] (show_stack+0x20/0x24) Mar 02 00:57:55 sushi kernel: [] (show_stack) from [] (dump_stack+0x20/0x28) Mar 02 00:57:55 sushi kernel: [] (dump_stack) from [] (__warn+0xe0/0x108) Mar 02 00:57:55 sushi kernel: [] (__warn) from [] (warn_slowpath_fmt+0x44/0x4c) Mar 02 00:57:55 sushi kernel: [] (warn_slowpath_fmt) from [] (dwc2_hsotg_init_fifo+0xf0/0x1d0 [dwc2]) Mar 02 00:57:55 sushi kernel: [] (dwc2_hsotg_init_fifo [dwc2]) from [] (dwc2_hsotg_core_init_disconnected+0x88/0x3e4 [dwc2]) Mar 02 00:57:55 sushi kernel: [] (dwc2_hsotg_core_init_disconnected [dwc2]) from [] (dwc2_hsotg_pullup+0x108/0x14c [dwc2]) Mar 02 00:57:55 sushi kernel: [] (dwc2_hsotg_pullup [dwc2]) from [] (usb_gadget_connect+0x48/0xe0 [udc_core]) Mar 02 00:57:55 sushi kernel: [] (usb_gadget_connect [udc_core]) from [] (udc_bind_to_driver+0x100/0x108 [udc_core]) Mar 02 00:57:55 sushi kernel: [] (udc_bind_to_driver [udc_core]) from [] (usb_gadget_probe_driver+0xb0/0x168 [udc_core]) Mar 02 00:57:55 sushi kernel: [] (usb_gadget_probe_driver [udc_core]) from [] (gadget_dev_desc_UDC_store+0xc0/0xdc [libcomposite]) Mar 02 00:57:55 sushi kernel: [] (gadget_dev_desc_UDC_store [libcomposite]) from [] (configfs_write_file+0xe0/0x170) Mar 02 00:57:55 sushi kernel: [] (configfs_write_file) from [] (__vfs_write+0x38/0x130) Mar 02 00:57:55 sushi kernel: []
Re: [RFC] Re: usb: gadget: ffs: Fix BUG when userland exits with submitted AIO transfers
On Fri, 1 Dec 2017 00:14:52 +, Vincent Pelletierwrote: > I'll play some more with a kernel with this patch > to see if it breaks, and will resubmit if it looks fine. Some update: this patch looks good on DWC3: both unplugging and unbinding the UDC while AOI transfers are queued works fine. What I intended to do is to also test it on DWC2, so I bought a raspberry pi zero W (ie, with embedded wifi chip on SDIO), but the fail is strong with me and I just cannot get a custom kernel build to boot (blocks right after detecting mmc0 & mmc1 and listing MMC partitions), so I could not extend my tests yet. I'll keep trying, but testers with a DWC2 or other UDC are very welcome to give that patch a try. To help with tests, here is my test scenario: On both sides: $ git clone https://github.com/vpelletier/python-functionfs.git $ mkdir venv $ virtualenv venv/ $ cd python-functionfs/ $ ../venv/bin/python setup.py install On gadget only: $ cd .. $ sudo aptitude install libaio1 $ git clone https://github.com/vpelletier/python-libaio.git $ cd python-libaio/ $ ../venv/bin/python setup.py install Create a gadget device. My cheat sheet: https://github.com/vpelletier/python-functionfs/tree/master/functionfs/tests On gadget: $ python -u slowprinter.py | venv/bin/python -u device.py /path/to/ffs_mountpoint On host (re-run after disconnects, scripts come from python-functionfs repository): $ python -u slowprinter.py | venv/bin/python -u host.py Plus udev rules to grant current host user access to the usb device. Each side should print the time from other side every second, plus some tracing blather. Test passes if gadget kernel is happy after multiple unplug/plug cycles and both ends can still communicate after re-plugging and restarting host script. Regards, -- Vincent Pelletier -- To unsubscribe from this list: send the line "unsubscribe linux-usb" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC] Re: usb: gadget: ffs: Fix BUG when userland exits with submitted AIO transfers
Hello, On Thu, 30 Nov 2017 13:59:30 -0500 (EST), Alan Sternwrote: > This is a little misleading. The req->complete callback will be called > whether the request is dequeued or not. The real race is between > usb_ep_dequeue and completion of the transfer. Actually, this is how I understood it. Sorry for not providing enough to make that clear (...and sorry for the terrible typos in that mail). Thanks for checking. I'll play some more with a kernel with this patch to see if it breaks, and will resubmit if it looks fine. Regards, -- Vincent Pelletier -- To unsubscribe from this list: send the line "unsubscribe linux-usb" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC] Re: usb: gadget: ffs: Fix BUG when userland exits with submitted AIO transfers
On Thu, 30 Nov 2017, Vincent Pelletier wrote: > (tagging subject more ostensibly as RFC) > > On Tue, 28 Nov 2017 16:18:36 +, Vincent Pelletier >wrote: > > - This change should have no effect on existing cancel/complete race > > condition, which I expect is already handled in AIO. If not, then it's > > yet another ffs_aio_cancel bug, and this commit will get larger... > > I think I no understand this part: it does not belong to AIO, but to > gadget, namely usb_ep_dequeue vs. dwc3_gadget_giveback. > There actually is a commentin f_fs.c about this: > > usb_ep_dequeue API should guarantee no race condition with > req->complete callback. This is a little misleading. The req->complete callback will be called whether the request is dequeued or not. The real race is between usb_ep_dequeue and completion of the transfer. If usb_ep_dequeue is called before the transfer completes then the dequeue call will succeed, req->status will contain an error value, and the host probably will see some sort of error. If usb_ep_dequeue is called after the transfer completes then the dequeue call will get an error, req->status will be 0 (unless something else went wrong), and the host will see a normal transfer. > So I think there is nothing more to do on this point. > > > - Should anything be done with the return value of usb_ep_dequeue in > > ffs_aio_cancel_worker ? Failures to cancel because transfer is already > > completed or not known should be harmless, but could there be more > > serious issues lurking ? > > From a quick read of a few UDCs, it seems the only error code (EINVAL) > typically means there is nothing to cancel, so it would be fine to > ignore it. > But I would still prefer some advice here (...in addition to the > larger questions from my original mail). The only real reason for usb_ep_dequeue to return an error would be if it was asked to dequeue a request that wasn't currently active. Either the request was never submitted or else it has already completed. Either way, there is probably not much you can do about it. Alan Stern -- To unsubscribe from this list: send the line "unsubscribe linux-usb" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[RFC] Re: usb: gadget: ffs: Fix BUG when userland exits with submitted AIO transfers
(tagging subject more ostensibly as RFC) On Tue, 28 Nov 2017 16:18:36 +, Vincent Pelletierwrote: > - This change should have no effect on existing cancel/complete race > condition, which I expect is already handled in AIO. If not, then it's > yet another ffs_aio_cancel bug, and this commit will get larger... I think I no understand this part: it does not belong to AIO, but to gadget, namely usb_ep_dequeue vs. dwc3_gadget_giveback. There actually is a commentin f_fs.c about this: usb_ep_dequeue API should guarantee no race condition with req->complete callback. So I think there is nothing more to do on this point. > - Should anything be done with the return value of usb_ep_dequeue in > ffs_aio_cancel_worker ? Failures to cancel because transfer is already > completed or not known should be harmless, but could there be more > serious issues lurking ? From a quick read of a few UDCs, it seems the only error code (EINVAL) typically means there is nothing to cancel, so it would be fine to ignore it. But I would still prefer some advice here (...in addition to the larger questions from my original mail). Regards, -- Vincent Pelletier -- To unsubscribe from this list: send the line "unsubscribe linux-usb" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
usb: gadget: ffs: Fix BUG when userland exits with submitted AIO transfers
!! This is an RFC patch - hence sign-off absence !! This bug happens only when the UDC needs to sleep during usb_ep_dequeue, as is the case for (at least) dwc3. RFC: - I do not understand what eps_lock is supposed to protect in original ffs_aio_cancel implementation, especially once one considers usb_ep_dequeue should be allowed to sleep: no non-intricated locking scheme would allow this. I removed this lock, I may be doing something wrong. - Because the 3 other locks also disable interrupts, I believe the only valid fix is to move usb_ep_dequeue to another call stack. Not being sure I could safely reuse ffs_io_data.work (completion function will be called during usb_ep_dequeue call, so during ffs_aio_cancel_worker call), I added a new work_struct. I believe it is safe to deffer cancellation from io_data point of view, as ffs owns its buffer, and from aio point of view as regular io_cancel return -EINPROGRESS when kiocb_cancel succeeds, implying kiocb_cancel is not required to be fully effective synchronously. Sadly, kiocb_set_cancel_fn is only used in ffs and its legacy counterpart, so I have very little precedents to look at. read_buffer gets freed in ffs_func_eps_disable, but it does not matter for AIO, only synchronous reads use it. Are there more checks needed ? - This change should have no effect on existing cancel/complete race condition, which I expect is already handled in AIO. If not, then it's yet another ffs_aio_cancel bug, and this commit will get larger... - Should anything be done with the return value of usb_ep_dequeue in ffs_aio_cancel_worker ? Failures to cancel because transfer is already completed or not known should be harmless, but could there be more serious issues lurking ? [ 382.200896] BUG: scheduling while atomic: screen/1808/0x0100 [ 382.207124] 4 locks held by screen/1808: [ 382.211266] #0: (rcu_callback){}, at: [] rcu_process_callbacks+0x260/0x440 [ 382.219949] #1: (rcu_read_lock_sched){}, at: [] percpu_ref_switch_to_atomic_rcu+0xb0/0x130 [ 382.230034] #2: (&(>ctx_lock)->rlock){}, at: [] free_ioctx_users+0x23/0xd0 [ 382.230096] #3: (&(>eps_lock)->rlock){}, at: [] ffs_aio_cancel+0x20/0x60 [usb_f_fs] [ 382.230160] Modules linked in: usb_f_fs libcomposite configfs bnep btsdio bluetooth ecdh_generic brcmfmac brcmutil intel_powerclamp coretemp dwc3 kvm_intel ulpi udc_core kvm irqbypass crc32_pclmul crc32c_intel pcbc dwc3_pci aesni_intel aes_i586 crypto_simd cryptd ehci_pci ehci_hcd gpio_keys usbcore basincove_gpadc industrialio usb_common [ 382.230407] CPU: 1 PID: 1808 Comm: screen Not tainted 4.14.0-edison+ #117 [ 382.230416] Hardware name: Intel Corporation Merrifield/BODEGA BAY, BIOS 542 2015.01.21:18.19.48 [ 382.230425] Call Trace: [ 382.230438] [ 382.230466] dump_stack+0x47/0x62 [ 382.230498] __schedule_bug+0x61/0x80 [ 382.230522] __schedule+0x43/0x7a0 [ 382.230587] schedule+0x5f/0x70 [ 382.230625] dwc3_gadget_ep_dequeue+0x14c/0x270 [dwc3] [ 382.230669] ? do_wait_intr_irq+0x70/0x70 [ 382.230724] usb_ep_dequeue+0x19/0x90 [udc_core] [ 382.230770] ffs_aio_cancel+0x37/0x60 [usb_f_fs] [ 382.230798] kiocb_cancel+0x31/0x40 [ 382.230822] free_ioctx_users+0x4d/0xd0 [ 382.230858] percpu_ref_switch_to_atomic_rcu+0x10a/0x130 [ 382.230881] ? percpu_ref_exit+0x40/0x40 [ 382.230904] rcu_process_callbacks+0x2b3/0x440 [ 382.230965] __do_softirq+0xf8/0x26b [ 382.231011] ? __softirqentry_text_start+0x8/0x8 [ 382.231033] do_softirq_own_stack+0x22/0x30 [ 382.231042] [ 382.231071] irq_exit+0x45/0xc0 [ 382.231089] smp_apic_timer_interrupt+0x13c/0x150 [ 382.231118] apic_timer_interrupt+0x35/0x3c [ 382.231132] EIP: __copy_user_ll+0xe2/0xf0 [ 382.231142] EFLAGS: 00210293 CPU: 1 [ 382.231154] EAX: bfd4508c EBX: 0004 ECX: 0003 EDX: f3d8fe50 [ 382.231165] ESI: f3d8fe51 EDI: bfd4508d EBP: f3d8fe14 ESP: f3d8fe08 [ 382.231176] DS: 007b ES: 007b FS: 00d8 GS: 0033 SS: 0068 [ 382.231265] core_sys_select+0x25f/0x320 [ 382.231346] ? __wake_up_common_lock+0x62/0x80 [ 382.231399] ? tty_ldisc_deref+0x13/0x20 [ 382.231438] ? ldsem_up_read+0x1b/0x40 [ 382.231459] ? tty_ldisc_deref+0x13/0x20 [ 382.231479] ? tty_write+0x29f/0x2e0 [ 382.231514] ? n_tty_ioctl+0xe0/0xe0 [ 382.231541] ? tty_write_unlock+0x30/0x30 [ 382.231566] ? __vfs_write+0x22/0x110 [ 382.231604] ? security_file_permission+0x2f/0xd0 [ 382.231635] ? rw_verify_area+0xac/0x120 [ 382.231677] ? vfs_write+0x103/0x180 [ 382.231711] SyS_select+0x87/0xc0 [ 382.231739] ? SyS_write+0x42/0x90 [ 382.231781] do_fast_syscall_32+0xd6/0x1a0 [ 382.231836] entry_SYSENTER_32+0x47/0x71 [ 382.231848] EIP: 0xb7f75b05 [ 382.231857] EFLAGS: 0246 CPU: 1 [ 382.231868] EAX: ffda EBX: 0400 ECX: bfd4508c EDX: bfd4510c [ 382.231878] ESI: EDI: EBP: ESP: bfd45020 [ 382.231889] DS: 007b ES: 007b FS: GS: 0033 SS: 007b [ 382.232281] softirq: huh, entered