[PATCH v2 2/5] fuse: Optimize request_end() by not taking fiq->waitq.lock

2018-11-08 Thread Kirill Tkhai
We take the global fiq->waitq.lock every time we are in
this function, but interrupted requests are just a small
subset of all requests. This patch optimizes request_end()
and makes it take the lock only when it's really needed.

queue_interrupt() needs a small change for that. After req
is linked to the interrupt list, we do smp_mb() and check
for FR_FINISHED again. If the FR_FINISHED bit has
appeared, we remove req from the list and leave the function:

CPU 0                                                CPU 1
queue_interrupt()                                    request_end()

  spin_lock(&fiq->waitq.lock)


  list_add_tail(&req->intr_entry, &fiq->interrupts)  test_and_set_bit(FR_FINISHED, &req->flags)
  smp_mb()                                           (memory barrier implied by test_and_set_bit())
  if (test_bit(FR_FINISHED, &req->flags))            if (!list_empty(&req->intr_entry))

    list_del_init(&req->intr_entry)                      spin_lock(&fiq->waitq.lock)
                                                         list_del_init(&req->intr_entry)

Check the change is visible in perf report:

1) First, mount fusexmp_fh:
   $ fuse/example/.libs/fusexmp_fh mnt

2) Run a test doing
     futimes(fd, tv1);
     futimes(fd, tv2);
   in many threads endlessly.

3) perf record -g (all the system load)

Without the patch, we spend 62.58% of do_writev() time in request_end():
(= 12.58 / 20.10 * 100%)

   55,22% entry_SYSCALL_64
 20,10% do_writev
  ...
  18,08% fuse_dev_do_write
   12,58% request_end
10,08% __wake_up_common_lock
1,97% queued_spin_lock_slowpath
   1,31% fuse_copy_args
   1,04% fuse_copy_one
   0,85% queued_spin_lock_slowpath

With the patch, the perf report becomes better, and we spend only 58.16%
of do_writev() time in request_end() (= 10.61 / 18.24 * 100%):

   54,15% entry_SYSCALL_64
 18,24% do_writev
  ...
  16,25% fuse_dev_do_write
   10,61% request_end
10,25% __wake_up_common_lock
   1,34% fuse_copy_args
   1,06% fuse_copy_one
   0,88% queued_spin_lock_slowpath

v2: Remove unlocked test of FR_FINISHED from queue_interrupt()

Signed-off-by: Kirill Tkhai 
---
 fs/fuse/dev.c |   28 ++++++++++++++++++++--------
 1 file changed, 20 insertions(+), 8 deletions(-)

diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 1ecec7fcb841..7374a23b1bc8 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -427,10 +427,16 @@ static void request_end(struct fuse_conn *fc, struct fuse_req *req)
 
 	if (test_and_set_bit(FR_FINISHED, &req->flags))
 		goto put_request;
-
-	spin_lock(&fiq->waitq.lock);
-	list_del_init(&req->intr_entry);
-	spin_unlock(&fiq->waitq.lock);
+	/*
+	 * test_and_set_bit() implies smp_mb() between bit
+	 * changing and below intr_entry check. Pairs with
+	 * smp_mb() from queue_interrupt().
+	 */
+	if (!list_empty(&req->intr_entry)) {
+		spin_lock(&fiq->waitq.lock);
+		list_del_init(&req->intr_entry);
+		spin_unlock(&fiq->waitq.lock);
+	}
 	WARN_ON(test_bit(FR_PENDING, &req->flags));
 	WARN_ON(test_bit(FR_SENT, &req->flags));
 	if (test_bit(FR_BACKGROUND, &req->flags)) {
@@ -469,12 +475,18 @@ static void request_end(struct fuse_conn *fc, struct fuse_req *req)
 static void queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req)
 {
 	spin_lock(&fiq->waitq.lock);
-	if (test_bit(FR_FINISHED, &req->flags)) {
-		spin_unlock(&fiq->waitq.lock);
-		return;
-	}
 	if (list_empty(&req->intr_entry)) {
 		list_add_tail(&req->intr_entry, &fiq->interrupts);
+		/*
+		 * Pairs with smp_mb() implied by test_and_set_bit()
+		 * from request_end().
+		 */
+		smp_mb();
+		if (test_bit(FR_FINISHED, &req->flags)) {
+			list_del_init(&req->intr_entry);
+			spin_unlock(&fiq->waitq.lock);
+			return;
+		}
 		wake_up_locked(&fiq->waitq);
 		kill_fasync(&fiq->fasync, SIGIO, POLL_IN);
 	}



[PATCH v2 2/5] fuse: Optimize request_end() by not taking fiq->waitq.lock

2018-11-08 Thread Kirill Tkhai
We take the global fiq->waitq.lock every time we are in
this function, but interrupted requests are just a small
subset of all requests. This patch optimizes request_end()
and makes it take the lock only when it's really needed.

queue_interrupt() needs a small change for that. After req
is linked to the interrupt list, we do smp_mb() and check
for FR_FINISHED again. If the FR_FINISHED bit has
appeared, we remove req from the list and leave the function:

CPU 0                                                CPU 1
queue_interrupt()                                    request_end()

  spin_lock(&fiq->waitq.lock)


  list_add_tail(&req->intr_entry, &fiq->interrupts)  test_and_set_bit(FR_FINISHED, &req->flags)
  smp_mb()                                           (memory barrier implied by test_and_set_bit())
  if (test_bit(FR_FINISHED, &req->flags))            if (!list_empty(&req->intr_entry))

    list_del_init(&req->intr_entry)                      spin_lock(&fiq->waitq.lock)
                                                         list_del_init(&req->intr_entry)

Check the change is visible in perf report:

1) First, mount fusexmp_fh:
   $ fuse/example/.libs/fusexmp_fh mnt

2) Run a test doing
     futimes(fd, tv1);
     futimes(fd, tv2);
   in many threads endlessly.

3) perf record -g (all the system load)

Without the patch, we spend 62.58% of do_writev() time in request_end():
(= 12.58 / 20.10 * 100%)

   55,22% entry_SYSCALL_64
 20,10% do_writev
  ...
  18,08% fuse_dev_do_write
   12,58% request_end
10,08% __wake_up_common_lock
1,97% queued_spin_lock_slowpath
   1,31% fuse_copy_args
   1,04% fuse_copy_one
   0,85% queued_spin_lock_slowpath

With the patch, the perf report becomes better, and we spend only 58.16%
of do_writev() time in request_end() (= 10.61 / 18.24 * 100%):

   54,15% entry_SYSCALL_64
 18,24% do_writev
  ...
  16,25% fuse_dev_do_write
   10,61% request_end
10,25% __wake_up_common_lock
   1,34% fuse_copy_args
   1,06% fuse_copy_one
   0,88% queued_spin_lock_slowpath

v2: Remove unlocked test of FR_FINISHED from queue_interrupt()

Signed-off-by: Kirill Tkhai 
---
 fs/fuse/dev.c |   28 ++++++++++++++++++++--------
 1 file changed, 20 insertions(+), 8 deletions(-)

diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 1ecec7fcb841..7374a23b1bc8 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -427,10 +427,16 @@ static void request_end(struct fuse_conn *fc, struct fuse_req *req)
 
 	if (test_and_set_bit(FR_FINISHED, &req->flags))
 		goto put_request;
-
-	spin_lock(&fiq->waitq.lock);
-	list_del_init(&req->intr_entry);
-	spin_unlock(&fiq->waitq.lock);
+	/*
+	 * test_and_set_bit() implies smp_mb() between bit
+	 * changing and below intr_entry check. Pairs with
+	 * smp_mb() from queue_interrupt().
+	 */
+	if (!list_empty(&req->intr_entry)) {
+		spin_lock(&fiq->waitq.lock);
+		list_del_init(&req->intr_entry);
+		spin_unlock(&fiq->waitq.lock);
+	}
 	WARN_ON(test_bit(FR_PENDING, &req->flags));
 	WARN_ON(test_bit(FR_SENT, &req->flags));
 	if (test_bit(FR_BACKGROUND, &req->flags)) {
@@ -469,12 +475,18 @@ static void request_end(struct fuse_conn *fc, struct fuse_req *req)
 static void queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req)
 {
 	spin_lock(&fiq->waitq.lock);
-	if (test_bit(FR_FINISHED, &req->flags)) {
-		spin_unlock(&fiq->waitq.lock);
-		return;
-	}
 	if (list_empty(&req->intr_entry)) {
 		list_add_tail(&req->intr_entry, &fiq->interrupts);
+		/*
+		 * Pairs with smp_mb() implied by test_and_set_bit()
+		 * from request_end().
+		 */
+		smp_mb();
+		if (test_bit(FR_FINISHED, &req->flags)) {
+			list_del_init(&req->intr_entry);
+			spin_unlock(&fiq->waitq.lock);
+			return;
+		}
 		wake_up_locked(&fiq->waitq);
 		kill_fasync(&fiq->fasync, SIGIO, POLL_IN);
 	}