Barrier code is broken in many ways:
Currently only ->dio_submit() handles PLOOP_REQ_FORCE_{FLUSH,FUA} correctly.
But a request can also go through ->dio_submit_alloc()->dio_submit_pad and
write_page (for indexes).
So in the case of grow_dev we have the following sequence:

E_RELOC_DATA_READ:
             ->set_bit(PLOOP_REQ_FORCE_FUA, &preq->state);
              ->delta->allocate
                 ->io->submit_alloc: dio_submit_alloc
                   ->dio_submit_pad
E_DATA_WBI : data written, time to update index
              ->delta->allocate_complete:ploop_index_update
                ->set_bit(PLOOP_REQ_FORCE_FUA, &preq->state);
                ->write_page
                ->ploop_map_wb_complete
                  ->ploop_wb_complete_post_process
                    ->set_bit(PLOOP_REQ_FORCE_FUA, &preq->state);
E_RELOC_NULLIFY:

               ->submit()

BUG#2: currently kaio write_page silently ignores REQ_FLUSH
BUG#3: io_direct:dio_submit: if fua delay is not possible we MUST tag all bios
       via REQ_FUA, not just the latest one.
This patch unifies barrier handling as follows:
- Get rid of FORCE_{FLUSH,FUA}
- Introduce DELAYED_FLUSH
- fix fua handling for dio_submit
- BUG_ON for REQ_FLUSH in kaio_page_write

This makes reloc sequence optimal:
io_direct
RELOC_S: R1, W2, WBI:FLUSH|FUA
RELOC_A: R1, W2, WBI:FLUSH|FUA, W1:NULLIFY|FUA
io_kaio
RELOC_S: R1, W2:FUA, WBI:FUA
RELOC_A: R1, W2:FUA, WBI:FUA, W1:NULLIFY|FUA

https://jira.sw.ru/browse/PSBM-47107
Signed-off-by: Dmitry Monakhov <dmonak...@openvz.org>
---
 drivers/block/ploop/dev.c       |  8 +++++---
 drivers/block/ploop/io_direct.c | 30 ++++++++++-----------------
 drivers/block/ploop/io_kaio.c   | 23 +++++++++++++--------
 drivers/block/ploop/map.c       | 45 ++++++++++++++++++++++-------------------
 include/linux/ploop/ploop.h     | 19 +++++------------
 5 files changed, 60 insertions(+), 65 deletions(-)

diff --git a/drivers/block/ploop/dev.c b/drivers/block/ploop/dev.c
index 96f7850..fbc5f2f 100644
--- a/drivers/block/ploop/dev.c
+++ b/drivers/block/ploop/dev.c
@@ -1224,6 +1224,9 @@ static void ploop_complete_request(struct ploop_request * 
preq)
 
        __TRACE("Z %p %u\n", preq, preq->req_cluster);
 
+       if (!preq->error) {
+               WARN_ON(test_bit(PLOOP_REQ_DELAYED_FLUSH, &preq->state));
+       }
        while (preq->bl.head) {
                struct bio * bio = preq->bl.head;
                preq->bl.head = bio->bi_next;
@@ -2530,9 +2533,8 @@ restart:
                top_delta = ploop_top_delta(plo);
                sbl.head = sbl.tail = preq->aux_bio;
 
-               /* Relocated data write required sync before BAT updatee */
-               set_bit(PLOOP_REQ_FORCE_FUA, &preq->state);
-
+               /* Relocated data write required sync before BAT updatee
+                * this will happen inside index_update */
                if (test_bit(PLOOP_REQ_RELOC_S, &preq->state)) {
                        preq->eng_state = PLOOP_E_DATA_WBI;
                        plo->st.bio_out++;
diff --git a/drivers/block/ploop/io_direct.c b/drivers/block/ploop/io_direct.c
index a6d83fe..303eb70 100644
--- a/drivers/block/ploop/io_direct.c
+++ b/drivers/block/ploop/io_direct.c
@@ -83,28 +83,19 @@ dio_submit(struct ploop_io *io, struct ploop_request * preq,
        int err;
        struct bio_list_walk bw;
        int preflush;
-       int postfua = 0;
+       int fua = 0;
        int write = !!(rw & REQ_WRITE);
        int bio_num;
 
        trace_submit(preq);
 
        preflush = !!(rw & REQ_FLUSH);
-
-       if (test_and_clear_bit(PLOOP_REQ_FORCE_FLUSH, &preq->state))
-               preflush = 1;
-
-       if (test_and_clear_bit(PLOOP_REQ_FORCE_FUA, &preq->state))
-               postfua = 1;
-
-       if (!postfua && ploop_req_delay_fua_possible(rw, preq)) {
-
+       fua = !!(rw & REQ_FUA);
+       if (fua && ploop_req_delay_fua_possible(rw, preq)) {
                /* Mark req that delayed flush required */
-               set_bit(PLOOP_REQ_FORCE_FLUSH, &preq->state);
-       } else if (rw & REQ_FUA) {
-               postfua = 1;
+               set_bit(PLOOP_REQ_DELAYED_FLUSH, &preq->state);
+               fua = 0;
        }
-
        rw &= ~(REQ_FLUSH | REQ_FUA);
 
 
@@ -238,8 +229,10 @@ flush_bio:
                        rw2 |= REQ_FLUSH;
                        preflush = 0;
                }
-               if (unlikely(postfua && !bl.head))
-                       rw2 |= (REQ_FUA | ((bio_num) ? REQ_FLUSH : 0));
+               /* Very unlikely, but correct.
+                * TODO: Optimize postfua via DELAY_FLUSH for any req state */
+               if (unlikely(fua))
+                       rw2 |= REQ_FUA;
 
                ploop_acc_ff_out(preq->plo, rw2 | b->bi_rw);
                submit_bio(rw2, b);
@@ -1520,15 +1513,14 @@ dio_read_page(struct ploop_io * io, struct 
ploop_request * preq,
 
 static void
 dio_write_page(struct ploop_io * io, struct ploop_request * preq,
-              struct page * page, sector_t sec, int fua)
+              struct page * page, sector_t sec, unsigned long rw)
 {
        if (!(io->files.file->f_mode & FMODE_WRITE)) {
                PLOOP_FAIL_REQUEST(preq, -EBADF);
                return;
        }
 
-       dio_io_page(io, WRITE | (fua ? REQ_FUA : 0) | REQ_SYNC,
-                   preq, page, sec);
+       dio_io_page(io, rw | WRITE | REQ_SYNC, preq, page, sec);
 }
 
 static int
diff --git a/drivers/block/ploop/io_kaio.c b/drivers/block/ploop/io_kaio.c
index de26319..c3bdd20 100644
--- a/drivers/block/ploop/io_kaio.c
+++ b/drivers/block/ploop/io_kaio.c
@@ -61,6 +61,7 @@ static void kaio_complete_io_state(struct ploop_request * 
preq)
        struct ploop_device * plo   = preq->plo;
        unsigned long flags;
        int post_fsync = 0;
+       unsigned long state = READ_ONCE(preq->state);
 
        if (preq->error || !(preq->req_rw & REQ_FUA) ||
            preq->eng_state == PLOOP_E_INDEX_READ ||
@@ -72,18 +73,23 @@ static void kaio_complete_io_state(struct ploop_request * 
preq)
        }
 
        /* Convert requested fua to fsync */
-       if (test_and_clear_bit(PLOOP_REQ_FORCE_FUA, &preq->state) ||
-           test_and_clear_bit(PLOOP_REQ_KAIO_FSYNC, &preq->state))
+       if (test_and_clear_bit(PLOOP_REQ_KAIO_FSYNC, &preq->state))
                post_fsync = 1;
 
-       if (!post_fsync &&
-           !ploop_req_delay_fua_possible(preq->req_rw, preq) &&
-           (preq->req_rw & REQ_FUA))
+       if ((preq->req_rw & REQ_FUA) &&
+           !ploop_req_delay_fua_possible(preq->req_rw, preq))
+               post_fsync = 1;
+       /* Usually we may delay fua for regular requets, but relocation
+        * requires REQ_FLUSH_FUA for index_update which is not supported
+        * by kaio_write_page(). Flush it now.
+        */
+       if (state & (PLOOP_REQ_RELOC_A_FL|PLOOP_REQ_RELOC_S_FL))
                post_fsync = 1;
 
        preq->req_rw &= ~REQ_FUA;
 
        if (post_fsync) {
+               test_and_clear_bit(PLOOP_REQ_DELAYED_FLUSH, &preq->state);
                spin_lock_irqsave(&plo->lock, flags);
                kaio_queue_fsync_req(preq);
                plo->st.bio_syncwait++;
@@ -608,12 +614,13 @@ kaio_read_page(struct ploop_io * io, struct ploop_request 
* preq,
 
 static void
 kaio_write_page(struct ploop_io * io, struct ploop_request * preq,
-                struct page * page, sector_t sec, int fua)
+                struct page * page, sector_t sec, unsigned long rw)
 {
        ploop_prepare_tracker(preq, sec);
-
+       /* This is async call , preflush is not supported */
+       BUG_ON(rw & REQ_FLUSH);
        /* No FUA in kaio, convert it to fsync */
-       if (fua)
+       if (rw & REQ_FUA)
                set_bit(PLOOP_REQ_KAIO_FSYNC, &preq->state);
 
        kaio_io_page(io, IOCB_CMD_WRITE_ITER, preq, page, sec);
diff --git a/drivers/block/ploop/map.c b/drivers/block/ploop/map.c
index 3a6365d..62f7d79 100644
--- a/drivers/block/ploop/map.c
+++ b/drivers/block/ploop/map.c
@@ -896,6 +896,8 @@ void ploop_index_update(struct ploop_request * preq)
        struct ploop_device * plo = preq->plo;
        struct map_node * m = preq->map;
        struct ploop_delta * top_delta = map_top_delta(m->parent);
+       unsigned long rw = preq->req_rw & REQ_FUA;
+       unsigned long state = READ_ONCE(preq->state);
        u32 idx;
        map_index_t blk;
        int old_level;
@@ -953,13 +955,14 @@ void ploop_index_update(struct ploop_request * preq)
        __TRACE("wbi %p %u %p\n", preq, preq->req_cluster, m);
        plo->st.map_single_writes++;
        top_delta->ops->map_index(top_delta, m->mn_start, &sec);
-       /* Relocate requires consistent writes, mark such reqs appropriately */
-       if (test_bit(PLOOP_REQ_RELOC_A, &preq->state) ||
-           test_bit(PLOOP_REQ_RELOC_S, &preq->state))
-               set_bit(PLOOP_REQ_FORCE_FUA, &preq->state);
-
-       top_delta->io.ops->write_page(&top_delta->io, preq, page, sec,
-                                     !!(preq->req_rw & REQ_FUA));
+       /* Relocate requires consistent index update */
+       if (state & (PLOOP_REQ_RELOC_A_FL|PLOOP_REQ_RELOC_S_FL))
+               rw |= REQ_FUA;
+       if (state & PLOOP_REQ_DELAYED_FLUSH_FL) {
+               rw |= REQ_FLUSH;
+               clear_bit(PLOOP_REQ_DELAYED_FLUSH, &preq->state);
+       }
+       top_delta->io.ops->write_page(&top_delta->io, preq, page, sec, rw | 
WRITE);
        put_page(page);
        return;
 
@@ -1063,7 +1066,7 @@ static void map_wb_complete_post_process(struct ploop_map 
*map,
         * (see dio_submit()). So fsync of EXT4 image doesnt help us.
         * We need to force sync of nullified blocks.
         */
-       set_bit(PLOOP_REQ_FORCE_FUA, &preq->state);
+       preq->req_rw |= REQ_FUA;
        top_delta->io.ops->submit(&top_delta->io, preq, preq->req_rw,
                                  &sbl, preq->iblock, 1<<plo->cluster_log);
 }
@@ -1074,11 +1077,12 @@ static void map_wb_complete(struct map_node * m, int 
err)
        struct ploop_delta * top_delta = map_top_delta(m->parent);
        struct list_head * cursor, * tmp;
        struct ploop_request * main_preq;
+       unsigned long rw =  0;
        struct page * page = NULL;
        int delayed = 0;
        unsigned int idx;
        sector_t sec;
-       int fua, force_fua;
+       int fua;
 
        /* First, complete processing of written back indices,
         * finally instantiate indices in mapping cache.
@@ -1149,10 +1153,10 @@ static void map_wb_complete(struct map_node * m, int 
err)
 
        main_preq = NULL;
        fua = 0;
-       force_fua = 0;
 
        list_for_each_safe(cursor, tmp, &m->io_queue) {
                struct ploop_request * preq;
+               unsigned long state;
 
                preq = list_entry(cursor, struct ploop_request, list);
 
@@ -1167,13 +1171,15 @@ static void map_wb_complete(struct map_node * m, int 
err)
                                spin_unlock_irq(&plo->lock);
                                break;
                        }
-
-                       if (preq->req_rw & REQ_FUA)
-                               fua = 1;
-
-                       if (test_bit(PLOOP_REQ_RELOC_A, &preq->state) ||
-                           test_bit(PLOOP_REQ_RELOC_S, &preq->state))
-                               force_fua = 1;
+                       state = READ_ONCE(preq->state);
+                       /* Relocate requires consistent index update */
+                       rw |= preq->req_rw & REQ_FUA;
+                       if (state & (PLOOP_REQ_RELOC_A_FL|PLOOP_REQ_RELOC_S_FL))
+                               rw |= REQ_FUA;
+                       if (state & PLOOP_REQ_DELAYED_FLUSH_FL) {
+                               rw |= REQ_FLUSH;
+                               clear_bit(PLOOP_REQ_DELAYED_FLUSH, 
&preq->state);
+                       }
 
                        preq->eng_state = PLOOP_E_INDEX_WB;
                        get_page(page);
@@ -1200,10 +1206,7 @@ static void map_wb_complete(struct map_node * m, int err)
        plo->st.map_multi_writes++;
        top_delta->ops->map_index(top_delta, m->mn_start, &sec);
 
-       if (force_fua)
-               set_bit(PLOOP_REQ_FORCE_FUA, &main_preq->state);
-
-       top_delta->io.ops->write_page(&top_delta->io, main_preq, page, sec, 
fua);
+       top_delta->io.ops->write_page(&top_delta->io, main_preq, page, sec, rw 
| WRITE);
        put_page(page);
 }
 
diff --git a/include/linux/ploop/ploop.h b/include/linux/ploop/ploop.h
index 0fba25e..5af32ba 100644
--- a/include/linux/ploop/ploop.h
+++ b/include/linux/ploop/ploop.h
@@ -157,7 +157,7 @@ struct ploop_io_ops
        void    (*read_page)(struct ploop_io * io, struct ploop_request * preq,
                             struct page * page, sector_t sec);
        void    (*write_page)(struct ploop_io * io, struct ploop_request * preq,
-                             struct page * page, sector_t sec, int fua);
+                             struct page * page, sector_t sec, unsigned long 
rw);
 
 
        int     (*sync_read)(struct ploop_io * io, struct page * page,
@@ -465,8 +465,7 @@ enum
        PLOOP_REQ_ZERO,
        PLOOP_REQ_DISCARD,
        PLOOP_REQ_RSYNC,
-       PLOOP_REQ_FORCE_FUA,    /*force fua of req write I/O by engine */
-       PLOOP_REQ_FORCE_FLUSH,  /*force flush by engine */
+       PLOOP_REQ_DELAYED_FLUSH,/* REQ_FLUSH is delayed */
        PLOOP_REQ_KAIO_FSYNC,   /*force image fsync by KAIO module */
        PLOOP_REQ_POST_SUBMIT, /* preq needs post_submit processing */
        PLOOP_REQ_PUSH_BACKUP, /* preq was ACKed by userspace push_backup */
@@ -477,6 +476,7 @@ enum
 #define PLOOP_REQ_MERGE_FL (1 << PLOOP_REQ_MERGE)
 #define PLOOP_REQ_RELOC_A_FL (1 << PLOOP_REQ_RELOC_A)
 #define PLOOP_REQ_RELOC_S_FL (1 << PLOOP_REQ_RELOC_S)
+#define PLOOP_REQ_DELAYED_FLUSH_FL (1 << PLOOP_REQ_DELAYED_FLUSH)
 #define PLOOP_REQ_DISCARD_FL (1 << PLOOP_REQ_DISCARD)
 #define PLOOP_REQ_ZERO_FL (1 << PLOOP_REQ_ZERO)
 
@@ -607,17 +607,8 @@ void ploop_preq_drop(struct ploop_device * plo, struct 
list_head *drop_list,
 static inline int ploop_req_delay_fua_possible(unsigned long rw,
        struct ploop_request *preq)
 {
-       int delay_fua = 0;
-
-       /* In case of eng_state != COMPLETE, we'll do FUA in
-        * ploop_index_update(). Otherwise, we should post
-        * fua.
-        */
-       if (rw & REQ_FUA) {
-               if (preq->eng_state != PLOOP_E_COMPLETE)
-                       delay_fua = 1;
-       }
-       return delay_fua;
+       /* data may be flushed during ploop_index_update() */
+       return  (preq->eng_state == PLOOP_E_DATA_WBI);
 }
 
 static inline void ploop_req_set_error(struct ploop_request * preq, int err)
-- 
1.8.3.1

_______________________________________________
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

Reply via email to