On Thu, 2018-10-25 at 10:11 -0700, Bart Van Assche wrote:
> On Thu, 2018-10-25 at 19:02 +0200, Johannes Berg wrote:
> > On Thu, 2018-10-25 at 15:05 +0000, Bart Van Assche wrote:
> > > It can happen that the direct I/O queue creates and destroys an empty
> > > workqueue from inside a work function.
> > 
> > So, thinking about this more, can you guarantee (somehow) that the
> > workqueue is empty at this point?
> 
> In general, no. But for the direct I/O case this can be guaranteed. Please
> have a look at the code in sb_init_dio_done_wq() if you would not yet have
> done this.

Indeed, obviously.

> > Do you know how to reproduce this?
> 
> The lockdep complaint in the patch description is easy to reproduce. The
> way I reproduce it is as follows:
> 
> git clone https://github.com/osandov/blktests
> (cd blktests && ./check -q nvmeof-mp)

I'm a bit scared by this, looks like that needs a lot of prerequisites?
I'll try it in a VM when it's done compiling with nvme.


Looking at the splat in more detail, I think we have the following:

__generic_file_fsync() takes i_mutex_key#14

__generic_file_fsync() this is called from dio_aio_complete_work()

dio_aio_complete_work() generally runs on the dio/%s workqueue



Lockdep also sees:

ext4_file_write_iter() takes i_mutex_key#14

depending on circumstances, it can then call do_blockdev_direct_IO(),
which needs to ensure the WQ exists for this SB, so it calls
sb_init_dio_done_wq() to allocate the dio/%s workqueue.



Since lockdep neither knows that the instance of the workqueue that was
executing dio_aio_complete_work() must be different from the instance
that's freed, it complains because this creates a circle of
dependencies.

Still, basically what I tried to say before - rather than track whether
a workqueue was ever used, which is error-prone since in other cases in
the kernel the usage might depend on whatever conditions, I think we
should either teach lockdep that this is guaranteed to be a different
workqueue, or perhaps we should just have a "free empty workqueue"
function. I tend to prefer the former as it's more general, so I'd
propose this (combined) patch:

diff --git a/fs/direct-io.c b/fs/direct-io.c
index 093fb54cd316..9ef33d6cba56 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -629,9 +629,16 @@ int sb_init_dio_done_wq(struct super_block *sb)
         * This has to be atomic as more DIOs can race to create the workqueue
         */
        old = cmpxchg(&sb->s_dio_done_wq, NULL, wq);
-       /* Someone created workqueue before us? Free ours... */
+       /*
+        * Someone created workqueue before us? Free ours...
+        * Note the _nested(), that pushes down to the (in this case actually
+        * pointless) flush_workqueue() happening inside, since this function
+        * might be called in contexts that hold the same locks that an fs may
+        * take while being called from dio_aio_complete_work() from another
+        * instance of the workqueue we allocate here.
+        */
        if (old)
-               destroy_workqueue(wq);
+               destroy_workqueue_nested(wq, SINGLE_DEPTH_NESTING);
        return 0;
 }
 
diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index 60d673e15632..0b36a7df61d4 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -453,7 +453,12 @@ __alloc_workqueue_key(const char *fmt, unsigned int flags, 
int max_active,
 #define create_singlethread_workqueue(name)                            \
        alloc_ordered_workqueue("%s", __WQ_LEGACY | WQ_MEM_RECLAIM, name)
 
-extern void destroy_workqueue(struct workqueue_struct *wq);
+extern void destroy_workqueue_nested(struct workqueue_struct *wq, int 
subclass);
+
+static inline void destroy_workqueue(struct workqueue_struct *wq)
+{
+       destroy_workqueue_nested(wq, 0);
+}
 
 struct workqueue_attrs *alloc_workqueue_attrs(gfp_t gfp_mask);
 void free_workqueue_attrs(struct workqueue_attrs *attrs);
@@ -469,8 +474,18 @@ extern bool mod_delayed_work_on(int cpu, struct 
workqueue_struct *wq,
                        struct delayed_work *dwork, unsigned long delay);
 extern bool queue_rcu_work(struct workqueue_struct *wq, struct rcu_work 
*rwork);
 
-extern void flush_workqueue(struct workqueue_struct *wq);
-extern void drain_workqueue(struct workqueue_struct *wq);
+extern void flush_workqueue_nested(struct workqueue_struct *wq, int subclass);
+extern void drain_workqueue_nested(struct workqueue_struct *wq, int subclass);
+
+static inline void flush_workqueue(struct workqueue_struct *wq)
+{
+       flush_workqueue_nested(wq, 0);
+}
+
+static inline void drain_workqueue(struct workqueue_struct *wq)
+{
+       drain_workqueue_nested(wq, 0);
+}
 
 extern int schedule_on_each_cpu(work_func_t func);
 
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 0280deac392e..6b00e062af96 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -2634,13 +2634,14 @@ static bool flush_workqueue_prep_pwqs(struct 
workqueue_struct *wq,
 }
 
 /**
- * flush_workqueue - ensure that any scheduled work has run to completion.
+ * flush_workqueue_nested - ensure that any scheduled work has run to 
completion.
  * @wq: workqueue to flush
+ * @subclass: subclass for lockdep
  *
  * This function sleeps until all work items which were queued on entry
  * have finished execution, but it is not livelocked by new incoming ones.
  */
-void flush_workqueue(struct workqueue_struct *wq)
+void flush_workqueue_nested(struct workqueue_struct *wq, int subclass)
 {
        struct wq_flusher this_flusher = {
                .list = LIST_HEAD_INIT(this_flusher.list),
@@ -2652,7 +2653,7 @@ void flush_workqueue(struct workqueue_struct *wq)
        if (WARN_ON(!wq_online))
                return;
 
-       lock_map_acquire(&wq->lockdep_map);
+       lock_acquire_exclusive(&wq->lockdep_map, subclass, 0, NULL, _THIS_IP_);
        lock_map_release(&wq->lockdep_map);
 
        mutex_lock(&wq->mutex);
@@ -2789,11 +2790,12 @@ void flush_workqueue(struct workqueue_struct *wq)
 out_unlock:
        mutex_unlock(&wq->mutex);
 }
-EXPORT_SYMBOL(flush_workqueue);
+EXPORT_SYMBOL(flush_workqueue_nested);
 
 /**
- * drain_workqueue - drain a workqueue
+ * drain_workqueue_nested - drain a workqueue
  * @wq: workqueue to drain
+ * @subclass: lockdep subclass
  *
  * Wait until the workqueue becomes empty.  While draining is in progress,
  * only chain queueing is allowed.  IOW, only currently pending or running
@@ -2802,7 +2804,7 @@ EXPORT_SYMBOL(flush_workqueue);
  * by the depth of chaining and should be relatively short.  Whine if it
  * takes too long.
  */
-void drain_workqueue(struct workqueue_struct *wq)
+void drain_workqueue_nested(struct workqueue_struct *wq, int subclass)
 {
        unsigned int flush_cnt = 0;
        struct pool_workqueue *pwq;
@@ -2817,7 +2819,7 @@ void drain_workqueue(struct workqueue_struct *wq)
                wq->flags |= __WQ_DRAINING;
        mutex_unlock(&wq->mutex);
 reflush:
-       flush_workqueue(wq);
+       flush_workqueue_nested(wq, subclass);
 
        mutex_lock(&wq->mutex);
 
@@ -2844,7 +2846,7 @@ void drain_workqueue(struct workqueue_struct *wq)
                wq->flags &= ~__WQ_DRAINING;
        mutex_unlock(&wq->mutex);
 }
-EXPORT_SYMBOL_GPL(drain_workqueue);
+EXPORT_SYMBOL_GPL(drain_workqueue_nested);
 
 static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr,
                             bool from_cancel)
@@ -4141,18 +4143,19 @@ struct workqueue_struct *__alloc_workqueue_key(const 
char *fmt,
 EXPORT_SYMBOL_GPL(__alloc_workqueue_key);
 
 /**
- * destroy_workqueue - safely terminate a workqueue
+ * destroy_workqueue_nested - safely terminate a workqueue
  * @wq: target workqueue
+ * @subclass: lockdep subclass
  *
  * Safely destroy a workqueue. All work currently pending will be done first.
  */
-void destroy_workqueue(struct workqueue_struct *wq)
+void destroy_workqueue_nested(struct workqueue_struct *wq, int subclass)
 {
        struct pool_workqueue *pwq;
        int node;
 
        /* drain it before proceeding with destruction */
-       drain_workqueue(wq);
+       drain_workqueue_nested(wq, subclass);
 
        /* sanity checks */
        mutex_lock(&wq->mutex);
@@ -4217,7 +4220,7 @@ void destroy_workqueue(struct workqueue_struct *wq)
                put_pwq_unlocked(pwq);
        }
 }
-EXPORT_SYMBOL_GPL(destroy_workqueue);
+EXPORT_SYMBOL_GPL(destroy_workqueue_nested);
 
 /**
  * workqueue_set_max_active - adjust max_active of a workqueue


We could avoid the useless subclass argument in the non-lockdep case
with some macro trickery, but for now I haven't done that.

johannes

Reply via email to