Dima,

I have not heard from you since 07/06/2016. Do you agree with that reasoning I provided in last email? What's your objection against the patch now?


Thanks,

Maxim


On 07/06/2016 11:10 AM, Maxim Patlasov wrote:
Dima,

On 07/06/2016 04:58 AM, Dmitry Monakhov wrote:

Maxim Patlasov <mpatla...@virtuozzo.com> writes:

Commit 9f860e606 introduced an engine to delay fsync: doing
fallocate(FALLOC_FL_CONVERT_UNWRITTEN) dio_post_submit marks
io as PLOOP_IO_FSYNC_DELAYED to ensure that fsync happens
later, when incoming FLUSH|FUA comes.

That was deemed as important because (PSBM-47026):

This optimization becomes more important due to the fact that customers tend to use pcompact heavily => ploop images grow each day.
Now, we can easily re-use the engine to delay fsync for reloc
requests as well. As explained in the description of commit
5aa3fe09:

     1->read_data_from_old_post
     2->write_to_new_pos
       ->sumbit_alloc
          ->submit_pad
      ->post_submit->convert_unwritten
     3->update_index ->write_page with FLUSH|FUA
     4->nullify_old_pos
    5->issue_flush
by the time of step 3 extent coversion is not yet stable because
belongs to uncommitted transaction. But instead of doing fsync
inside ->post_submit, we can fsync later, as the very first step
of write_page for index_update.
NAK from me. What is advantage of this patch?

The advantage is the following: in case of BAT multi-updates, instead of doing many fsync-s (one per dio_post_submit), we'll do only one (when final ->write_page is called).

Does it makes code more optimal? No

Yes, it does. In the same sense as 9f860e606: saving some fsync-s.

Does it makes main ploop more asynchronous? No.

Correct, the patch optimizes ploop in the other way. It's not about making ploop more asynchronous.



If you want to make optimization then it is reasonable to
queue preq with PLOOP_IO_FSYNC_DELAYED to top_io->fsync_queue
before processing PLOOP_E_DATA_WBI  state for  preq with FUA
So sequence will looks like follows:
->sumbit_alloc
   ->submit_pad
   ->post_submit->convert_unwritten-> tag PLOOP_IO_FSYNC_DELAYED
->ploop_req_state_process
   case PLOOP_E_DATA_WBI:
   if (preq->start & PLOOP_IO_FSYNC_DELAYED_FL) {
       preq->start &= ~PLOOP_IO_FSYNC_DELAYED_FL
       list_add_tail(&preq->list, &top_io->fsync_queue)
       return;
    }
##Let fsync_thread do it's work
->ploop_req_state_process
    case LOOP_E_DATA_WBI:
update_index->write_page with FUA (FLUSH is not required because we already done fsync)

That's another type of optimization: making ploop more asynchronous. I thought about it, but didn't come to conclusion whether it's worthy w.r.t. adding more complexity to ploop-state-machine and possible bugs introduced with that.

Thanks,
Maxim


https://jira.sw.ru/browse/PSBM-47026

Signed-off-by: Maxim Patlasov <mpatla...@virtuozzo.com>
---
  drivers/block/ploop/dev.c       |    4 ++--
  drivers/block/ploop/io_direct.c |   25 ++++++++++++++++++++++++-
  drivers/block/ploop/io_kaio.c   |    3 ++-
  drivers/block/ploop/map.c       |   17 ++++++++++++-----
  include/linux/ploop/ploop.h     |    3 ++-
  5 files changed, 42 insertions(+), 10 deletions(-)

diff --git a/drivers/block/ploop/dev.c b/drivers/block/ploop/dev.c
index e5f010b..40768b6 100644
--- a/drivers/block/ploop/dev.c
+++ b/drivers/block/ploop/dev.c
@@ -4097,7 +4097,7 @@ static void ploop_relocate(struct ploop_device * plo)
      preq->bl.tail = preq->bl.head = NULL;
      preq->req_cluster = 0;
      preq->req_size = 0;
-    preq->req_rw = WRITE_SYNC|REQ_FUA;
+    preq->req_rw = WRITE_SYNC;
      preq->eng_state = PLOOP_E_ENTRY;
      preq->state = (1 << PLOOP_REQ_SYNC) | (1 << PLOOP_REQ_RELOC_A);
      preq->error = 0;
@@ -4401,7 +4401,7 @@ static void ploop_relocblks_process(struct ploop_device *plo)
          preq->bl.tail = preq->bl.head = NULL;
          preq->req_cluster = ~0U; /* uninitialized */
          preq->req_size = 0;
-        preq->req_rw = WRITE_SYNC|REQ_FUA;
+        preq->req_rw = WRITE_SYNC;
          preq->eng_state = PLOOP_E_ENTRY;
preq->state = (1 << PLOOP_REQ_SYNC) | (1 << PLOOP_REQ_RELOC_S);
          preq->error = 0;
diff --git a/drivers/block/ploop/io_direct.c b/drivers/block/ploop/io_direct.c
index 1086850..0a5fb15 100644
--- a/drivers/block/ploop/io_direct.c
+++ b/drivers/block/ploop/io_direct.c
@@ -1494,13 +1494,36 @@ dio_read_page(struct ploop_io * io, struct ploop_request * preq,
    static void
  dio_write_page(struct ploop_io * io, struct ploop_request * preq,
-           struct page * page, sector_t sec, unsigned long rw)
+           struct page * page, sector_t sec, unsigned long rw,
+           int do_fsync_if_delayed)
  {
      if (!(io->files.file->f_mode & FMODE_WRITE)) {
          PLOOP_FAIL_REQUEST(preq, -EBADF);
          return;
      }
  +    if (do_fsync_if_delayed &&
+        test_bit(PLOOP_IO_FSYNC_DELAYED, &io->io_state)) {
+        struct ploop_device * plo = io->plo;
+        u64 io_count;
+        int err;
+
+        spin_lock_irq(&plo->lock);
+        io_count = io->io_count;
+        spin_unlock_irq(&plo->lock);
+
+        err = io->ops->sync(io);
+        if (err) {
+            PLOOP_FAIL_REQUEST(preq, -EBADF);
+            return;
+        }
+
+        spin_lock_irq(&plo->lock);
+        if (io_count == io->io_count && !(io_count & 1))
+            clear_bit(PLOOP_IO_FSYNC_DELAYED, &io->io_state);
+        spin_unlock_irq(&plo->lock);
+    }
+
      dio_io_page(io, rw | WRITE | REQ_SYNC, preq, page, sec);
  }
diff --git a/drivers/block/ploop/io_kaio.c b/drivers/block/ploop/io_kaio.c
index 85863df..0d731ef 100644
--- a/drivers/block/ploop/io_kaio.c
+++ b/drivers/block/ploop/io_kaio.c
@@ -614,7 +614,8 @@ kaio_read_page(struct ploop_io * io, struct ploop_request * preq,
    static void
  kaio_write_page(struct ploop_io * io, struct ploop_request * preq,
-         struct page * page, sector_t sec, unsigned long rw)
+        struct page * page, sector_t sec, unsigned long rw,
+        int do_fsync_if_delayed)
  {
      ploop_prepare_tracker(preq, sec);
  diff --git a/drivers/block/ploop/map.c b/drivers/block/ploop/map.c
index 1883674..96e428b 100644
--- a/drivers/block/ploop/map.c
+++ b/drivers/block/ploop/map.c
@@ -910,6 +910,7 @@ void ploop_index_update(struct ploop_request * preq)
      sector_t sec;
      unsigned long rw;
      unsigned long state = READ_ONCE(preq->state);
+    int do_fsync_if_delayed = 0;
        /* No way back, we are going to initiate index write. */
@@ -970,10 +971,13 @@ void ploop_index_update(struct ploop_request * preq)
      preq->req_rw &= ~REQ_FLUSH;
        /* Relocate requires consistent index update */
-    if (state & (PLOOP_REQ_RELOC_A_FL|PLOOP_REQ_RELOC_S_FL))
+    if (state & (PLOOP_REQ_RELOC_A_FL|PLOOP_REQ_RELOC_S_FL)) {
          rw |= (REQ_FLUSH | REQ_FUA);
+        do_fsync_if_delayed = 1;
+    }
  - top_delta->io.ops->write_page(&top_delta->io, preq, page, sec, rw);
+ top_delta->io.ops->write_page(&top_delta->io, preq, page, sec, rw,
+                      do_fsync_if_delayed);
        put_page(page);
      return;
@@ -1096,6 +1100,7 @@ static void map_wb_complete(struct map_node * m, int err)
      unsigned int idx;
      sector_t sec;
      unsigned long rw;
+    int do_fsync_if_delayed = 0;
        /* First, complete processing of written back indices,
       * finally instantiate indices in mapping cache.
@@ -1193,8 +1198,10 @@ static void map_wb_complete(struct map_node * m, int err)
                state = READ_ONCE(preq->state);
              /* Relocate requires consistent index update */
-            if (state & (PLOOP_REQ_RELOC_A_FL|PLOOP_REQ_RELOC_S_FL))
+            if (state & (PLOOP_REQ_RELOC_A_FL|PLOOP_REQ_RELOC_S_FL)) {
                  rw |= (REQ_FLUSH | REQ_FUA);
+                do_fsync_if_delayed = 1;
+            }
                preq->eng_state = PLOOP_E_INDEX_WB;
              get_page(page);
@@ -1221,8 +1228,8 @@ static void map_wb_complete(struct map_node * m, int err)
      plo->st.map_multi_writes++;
      top_delta->ops->map_index(top_delta, m->mn_start, &sec);
  - top_delta->io.ops->write_page(&top_delta->io, main_preq, page, sec,
-                      rw);
+ top_delta->io.ops->write_page(&top_delta->io, main_preq, page, sec, rw,
+                      do_fsync_if_delayed);
      put_page(page);
  }
diff --git a/include/linux/ploop/ploop.h b/include/linux/ploop/ploop.h
index deee8a7..b03565b 100644
--- a/include/linux/ploop/ploop.h
+++ b/include/linux/ploop/ploop.h
@@ -164,7 +164,8 @@ struct ploop_io_ops
void (*read_page)(struct ploop_io * io, struct ploop_request * preq,
                   struct page * page, sector_t sec);
void (*write_page)(struct ploop_io * io, struct ploop_request * preq,
-                  struct page * page, sector_t sec, unsigned long rw);
+                  struct page * page, sector_t sec, unsigned long rw,
+                  int do_fsync_if_delayed);
          int    (*sync_read)(struct ploop_io * io, struct page * page,


_______________________________________________
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

Reply via email to