From: Dave Hansen <dave.han...@linux.intel.com>

Currently, __sb_start_write() and freeze_super() can race with
each other.  __sb_start_write() uses a smp_mb() to ensure that
freeze_super() can see its write to sb->s_writers.counter and
that it can see freeze_super()'s update to sb->s_writers.frozen.
This all seems to work fine.

But, this smp_mb() makes __sb_start_write() the single hottest
function in the kernel if I sit in a loop and do tiny write()s to
tmpfs over and over.  This is on a very small 2-core system, so
it will only get worse on larger systems.

This _seems_ like an ideal case for RCU.  __sb_start_write() is
the RCU read-side and is in a very fast, performance-sensitive
path.  freeze_super() is the RCU writer and is in an extremely
rare non-performance-sensitive path.

Instead of doing and smp_wmb() in __sb_start_write(), we do
rcu_read_lock().  This ensures that a CPU doing freeze_super()
can not proceed past its synchronize_rcu() until the grace
period has ended and the 's_writers.frozen = SB_FREEZE_WRITE'
is visible to __sb_start_write().

One question here: Does the work that __sb_start_write() does in
a previous grace period becomes visible to freeze_super() after
its call to synchronize_rcu()?  It _seems_ like it should, but it
seems backwards to me since __sb_start_write() is the "reader" in
this case.

This patch increases the number of writes/second that I can do
by 5.6%.

Does anybody see any holes with this?

Cc: Jan Kara <j...@suse.cz>
Cc: Alexander Viro <v...@zeniv.linux.org.uk>
Cc: linux-fsde...@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Cc: Paul E. McKenney <paul...@linux.vnet.ibm.com>
Cc: Tim Chen <tim.c.c...@linux.intel.com>
Cc: Andi Kleen <a...@linux.intel.com>
Signed-off-by: Dave Hansen <dave.han...@linux.intel.com>
---

 b/fs/super.c |   63 ++++++++++++++++++++++++++++++++++++-----------------------
 1 file changed, 39 insertions(+), 24 deletions(-)

diff -puN fs/super.c~rcu-__sb_start_write fs/super.c
--- a/fs/super.c~rcu-__sb_start_write   2015-06-24 17:14:34.939125713 -0700
+++ b/fs/super.c        2015-06-24 17:14:34.942125847 -0700
@@ -1190,27 +1190,21 @@ static void acquire_freeze_lock(struct s
  */
 int __sb_start_write(struct super_block *sb, int level, bool wait)
 {
-retry:
-       if (unlikely(sb->s_writers.frozen >= level)) {
+       rcu_read_lock();
+       while (unlikely(sb->s_writers.frozen >= level)) {
+               rcu_read_unlock();
                if (!wait)
                        return 0;
                wait_event(sb->s_writers.wait_unfrozen,
                           sb->s_writers.frozen < level);
+               rcu_read_lock();
        }
 
 #ifdef CONFIG_LOCKDEP
        acquire_freeze_lock(sb, level, !wait, _RET_IP_);
 #endif
        percpu_counter_inc(&sb->s_writers.counter[level-1]);
-       /*
-        * Make sure counter is updated before we check for frozen.
-        * freeze_super() first sets frozen and then checks the counter.
-        */
-       smp_mb();
-       if (unlikely(sb->s_writers.frozen >= level)) {
-               __sb_end_write(sb, level);
-               goto retry;
-       }
+       rcu_read_unlock();
        return 1;
 }
 EXPORT_SYMBOL(__sb_start_write);
@@ -1254,6 +1248,29 @@ static void sb_wait_write(struct super_b
        } while (writers);
 }
 
+static void __thaw_super(struct super_block *sb)
+{
+       sb->s_writers.frozen = SB_UNFROZEN;
+       /*
+        * RCU protects us against races where we are taking
+        * s_writers.frozen in to a less permissive state.  When
+        * that happens, __sb_start_write() might not yet have
+        * seen our write and might still increment
+        * s_writers.counter.
+        *
+        * Here, however, we are transitioning to a _more_
+        * permissive state.  The filesystem is frozen and no
+        * writes to s_writers.counter are being permitted.
+        *
+        * A smp_wmb() is sufficient here because we just need
+        * to ensure that new calls __sb_start_write() are
+        * allowed, not that _concurrent_ calls have finished.
+        */
+       smp_wmb();
+       wake_up(&sb->s_writers.wait_unfrozen);
+       deactivate_locked_super(sb);
+}
+
 /**
  * freeze_super - lock the filesystem and force it into a consistent state
  * @sb: the super to lock
@@ -1312,7 +1329,13 @@ int freeze_super(struct super_block *sb)
 
        /* From now on, no new normal writers can start */
        sb->s_writers.frozen = SB_FREEZE_WRITE;
-       smp_wmb();
+       /*
+        * After we synchronize_rcu(), we have ensured that everyone
+        * who reads sb->s_writers.frozen under rcu_read_lock() can
+        * now see our update.  This pretty much means that
+        * __sb_start_write() will not allow any new writers.
+        */
+       synchronize_rcu();
 
        /* Release s_umount to preserve sb_start_write -> s_umount ordering */
        up_write(&sb->s_umount);
@@ -1322,7 +1345,7 @@ int freeze_super(struct super_block *sb)
        /* Now we go and block page faults... */
        down_write(&sb->s_umount);
        sb->s_writers.frozen = SB_FREEZE_PAGEFAULT;
-       smp_wmb();
+       synchronize_rcu();
 
        sb_wait_write(sb, SB_FREEZE_PAGEFAULT);
 
@@ -1331,7 +1354,7 @@ int freeze_super(struct super_block *sb)
 
        /* Now wait for internal filesystem counter */
        sb->s_writers.frozen = SB_FREEZE_FS;
-       smp_wmb();
+       synchronize_rcu();
        sb_wait_write(sb, SB_FREEZE_FS);
 
        if (sb->s_op->freeze_fs) {
@@ -1339,11 +1362,7 @@ int freeze_super(struct super_block *sb)
                if (ret) {
                        printk(KERN_ERR
                                "VFS:Filesystem freeze failed\n");
-                       sb->s_writers.frozen = SB_UNFROZEN;
-                       smp_wmb();
-                       wake_up(&sb->s_writers.wait_unfrozen);
-                       deactivate_locked_super(sb);
-                       return ret;
+                       __thaw_super(sb);
                }
        }
        /*
@@ -1386,11 +1405,7 @@ int thaw_super(struct super_block *sb)
        }
 
 out:
-       sb->s_writers.frozen = SB_UNFROZEN;
-       smp_wmb();
-       wake_up(&sb->s_writers.wait_unfrozen);
-       deactivate_locked_super(sb);
-
+       __thaw_super(sb);
        return 0;
 }
 EXPORT_SYMBOL(thaw_super);
_
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Reply via email to