Maxim Patlasov <mpatla...@virtuozzo.com> writes:

> Dima,
>
> I agree with general approach of this patch, but there are some 
> (easy-to-fix) issues. See, please, inline comments below...
>
> On 06/20/2016 11:58 AM, Dmitry Monakhov wrote:
>> barrier code is broken in many ways:
>> Currently only ->dio_submit() handles PLOOP_REQ_FORCE_{FLUSH,FUA} correctly.
>> But request also can goes though ->dio_submit_alloc()->dio_submit_pad and 
>> write_page (for indexes)
>> So in case of grow_dev we have following sequance:
>>
>> E_RELOC_DATA_READ:
>>               ->set_bit(PLOOP_REQ_FORCE_FUA, &preq->state);
>>                ->delta->allocate
>>                   ->io->submit_allloc: dio_submit_alloc
>>                     ->dio_submit_pad
>> E_DATA_WBI : data written, time to update index
>>                ->delta->allocate_complete:ploop_index_update
>>                  ->set_bit(PLOOP_REQ_FORCE_FUA, &preq->state);
>>                  ->write_page
>>                  ->ploop_map_wb_complete
>>                    ->ploop_wb_complete_post_process
>>                      ->set_bit(PLOOP_REQ_FORCE_FUA, &preq->state);
>> E_RELOC_NULLIFY:
>>
>>                 ->submit()
>>
>> BUG#2: currecntly kaio write_page silently ignores REQ_FUA
>
> Sorry, I can't agree, it actually does not ignore:
I've misstyped. I ment to say REQ_FLUSH.
>
>> static void
>> kaio_write_page(struct ploop_io * io, struct ploop_request * preq,
>>          struct page * page, sector_t sec, int fua)
>> {
>>     /* No FUA in kaio, convert it to fsync */
>>     if (fua)
>>         set_bit(PLOOP_REQ_KAIO_FSYNC, &preq->state);
>
>
>> BUG#3: io_direct:dio_submit  if fua_delay is not possible we MUST tag all 
>> bios via REQ_FUA
>>         not just latest one.
>
> No need to tag *all*. See inline comments below.
>
>> This patch unify barrier handling like follows:
>> - Get rid of FORCE_{FLUSH,FUA}
>> - Introduce DELAYED_FLUSH, currecntly it supported only by io_direct
>> - fix up fua handling for dio_submit
>>
>> This makes reloc sequence optimal:
>> io_direct
>> RELOC_S: R1, W2, WBI:FLUSH|FUA
>> RELOC_A: R1, W2, WBI:FLUSH|FUA, W1:NULLIFY|FUA
>> io_kaio
>> RELOC_S: R1, W2:FUA, WBI:FUA
>> RELOC_A: R1, W2:FUA, WBI:FUA, W1:NULLIFY|FUA
>>
>> https://jira.sw.ru/browse/PSBM-47107
>> Signed-off-by: Dmitry Monakhov <dmonak...@openvz.org>
>> ---
>>   drivers/block/ploop/dev.c       |  8 +++++---
>>   drivers/block/ploop/io_direct.c | 29 +++++++++-----------------
>>   drivers/block/ploop/io_kaio.c   | 17 ++++++++++------
>>   drivers/block/ploop/map.c       | 45 
>> ++++++++++++++++++++++-------------------
>>   include/linux/ploop/ploop.h     |  8 ++++----
>>   5 files changed, 54 insertions(+), 53 deletions(-)
>>
>> diff --git a/drivers/block/ploop/dev.c b/drivers/block/ploop/dev.c
>> index 96f7850..fbc5f2f 100644
>> --- a/drivers/block/ploop/dev.c
>> +++ b/drivers/block/ploop/dev.c
>> @@ -1224,6 +1224,9 @@ static void ploop_complete_request(struct 
>> ploop_request * preq)
>>   
>>      __TRACE("Z %p %u\n", preq, preq->req_cluster);
>>   
>> +    if (!preq->error) {
>> +            WARN_ON(test_bit(PLOOP_REQ_DELAYED_FLUSH, &preq->state));
>> +    }
>>      while (preq->bl.head) {
>>              struct bio * bio = preq->bl.head;
>>              preq->bl.head = bio->bi_next;
>> @@ -2530,9 +2533,8 @@ restart:
>>              top_delta = ploop_top_delta(plo);
>>              sbl.head = sbl.tail = preq->aux_bio;
>>   
>> -            /* Relocated data write required sync before BAT updatee */
>> -            set_bit(PLOOP_REQ_FORCE_FUA, &preq->state);
>> -
>> +            /* Relocated data write required sync before BAT updatee
>> +             * this will happen inside index_update */
>>              if (test_bit(PLOOP_REQ_RELOC_S, &preq->state)) {
>>                      preq->eng_state = PLOOP_E_DATA_WBI;
>>                      plo->st.bio_out++;
>> diff --git a/drivers/block/ploop/io_direct.c 
>> b/drivers/block/ploop/io_direct.c
>> index a6d83fe..d7ecd4a 100644
>> --- a/drivers/block/ploop/io_direct.c
>> +++ b/drivers/block/ploop/io_direct.c
>> @@ -90,21 +90,12 @@ dio_submit(struct ploop_io *io, struct ploop_request * 
>> preq,
>>      trace_submit(preq);
>>   
>>      preflush = !!(rw & REQ_FLUSH);
>> -
>> -    if (test_and_clear_bit(PLOOP_REQ_FORCE_FLUSH, &preq->state))
>> -            preflush = 1;
>> -
>> -    if (test_and_clear_bit(PLOOP_REQ_FORCE_FUA, &preq->state))
>> -            postfua = 1;
>> -
>> -    if (!postfua && ploop_req_delay_fua_possible(rw, preq)) {
>> -
>> +    postfua = !!(rw & REQ_FUA);
>> +    if (ploop_req_delay_fua_possible(rw, preq)) {
>>              /* Mark req that delayed flush required */
>> -            set_bit(PLOOP_REQ_FORCE_FLUSH, &preq->state);
>> -    } else if (rw & REQ_FUA) {
>> -            postfua = 1;
>> +            set_bit(PLOOP_REQ_DELAYED_FLUSH, &preq->state);
>> +            postfua = 0;
>>      }
>
> "postfua" is a horrible name, let us see if we can get rid of it 
> completely. Also, the way how ploop_req_delay_fua_possible implemented 
> is prone to errors (see below an issue in kaio_complete_io_state). Let's 
> rework it like this:
Let it be postflush.
>
>> static inline bool ploop_req_delay_fua_possible(struct ploop_request 
>> *preq)
>> {
>>     return preq->eng_state == PLOOP_E_DATA_WBI;
>> }
>
> Then, that chunk in the dio_submit above might look as:
>
>>     /* If we can delay, mark req that delayed flush required */
>>     if ((rw & REQ_FUA) && ploop_req_delay_fua_possible(preq))
>>         set_bit(PLOOP_REQ_DELAYED_FLUSH, &preq->state);
>
>
>
>> -
>>      rw &= ~(REQ_FLUSH | REQ_FUA);
>>   
>>   
>> @@ -238,14 +229,15 @@ flush_bio:
>>                      rw2 |= REQ_FLUSH;
>>                      preflush = 0;
>>              }
>> -            if (unlikely(postfua && !bl.head))
>> -                    rw2 |= (REQ_FUA | ((bio_num) ? REQ_FLUSH : 0));
>> +            /* Very unlikely, but correct.
>> +             * TODO: Optimize postfua via DELAY_FLUSH for any req state */
>> +            if (unlikely(!postfua))
>> +                    rw2 |= REQ_FUA;
>
> If test_bit(PLOOP_REQ_DELAYED_FLUSH, &preq->state), do nothing here -- 
> that's obvious. So let's assume !test_bit(PLOOP_REQ_DELAYED_FLUSH, 
> &preq->state)...
>
> There is a call to bio_add_page in dio_submit before that loop. The 
> moment bio_add_page returned zero (i.e. OK), we know for sure which bio 
> it came from: it is bw.cur. Then it's enough to mark new bio like this:
OK
>>     bio->bi_rw |= bw.cur & REQ_FUA;
>
> Then, instead of "submit_bio(rw2, b);", use "submit_bio(rw2 | b->bi_rw, 
> b);".
>
> This way we'll mark with FUA only those of new bio-s that has at least 
> some data (pages) from original bio-s with FUA. Makes sense?
>
> Btw, dio_io_page() must be fixed similarly (get rid of postfua, mark 
> with FUA only what we really need).
>
>>   
>>              ploop_acc_ff_out(preq->plo, rw2 | b->bi_rw);
>>              submit_bio(rw2, b);
>>              bio_num++;
>>      }
>> -
>
> I liked this empty line, please keep it! ;)
>
>>      ploop_complete_io_request(preq);
>>      return;
>>   
>> @@ -1520,15 +1512,14 @@ dio_read_page(struct ploop_io * io, struct 
>> ploop_request * preq,
>>   
>>   static void
>>   dio_write_page(struct ploop_io * io, struct ploop_request * preq,
>> -           struct page * page, sector_t sec, int fua)
>> +           struct page * page, sector_t sec, unsigned long rw)
>>   {
>>      if (!(io->files.file->f_mode & FMODE_WRITE)) {
>>              PLOOP_FAIL_REQUEST(preq, -EBADF);
>>              return;
>>      }
>>   
>> -    dio_io_page(io, WRITE | (fua ? REQ_FUA : 0) | REQ_SYNC,
>> -                preq, page, sec);
>> +    dio_io_page(io, rw | WRITE | REQ_SYNC, preq, page, sec);
>>   }
>>   
>>   static int
>> diff --git a/drivers/block/ploop/io_kaio.c b/drivers/block/ploop/io_kaio.c
>> index de26319..0b93e13 100644
>> --- a/drivers/block/ploop/io_kaio.c
>> +++ b/drivers/block/ploop/io_kaio.c
>> @@ -72,13 +72,17 @@ static void kaio_complete_io_state(struct ploop_request 
>> * preq)
>>      }
>>   
>>      /* Convert requested fua to fsync */
>> -    if (test_and_clear_bit(PLOOP_REQ_FORCE_FUA, &preq->state) ||
>> +    if (test_and_clear_bit(PLOOP_REQ_DELAYED_FLUSH, &preq->state) ||
>
> io_kaio has nothing to do with DELAYED_FLUSH (only dio sets it). Hence, 
> no need to check it here.
Why not. delayed flush is possible for !reloc writes. In such cases
we may postpone data flush till index_update. So from kaio point of view
it will looks like follows: W1,WBI,FSYNC. AFAIU this is legal because
even if power failure happens we may have WBI, but not W1. In such case
index will point to zeroed data
>
>>          test_and_clear_bit(PLOOP_REQ_KAIO_FSYNC, &preq->state))
>>              post_fsync = 1;
>>   
>>      if (!post_fsync &&
>> -        !ploop_req_delay_fua_possible(preq->req_rw, preq) &&
>> -        (preq->req_rw & REQ_FUA))
>> +        !ploop_req_delay_fua_possible(preq->req_rw, preq))
>> +            post_fsync = 1;
>> +    /* Reloc requires FLUSH_FUA from ->index_update for delayed_fua
>> +       which is not supporded by ->kaio_write_page */
>> +    if (test_and_clear_bit(PLOOP_REQ_RELOC_S, &preq->state) ||
>> +        test_and_clear_bit(PLOOP_REQ_RELOC_S, &preq->state))
>>              post_fsync = 1;
>
> I'm sorry, this looks like a mess. You can't freely clear this bit, and 
> RELOC_A missed. Can we replace all this lengthy code:
CRAP. Off course I just want to test_bit, not clear. Same on me.
>
>>     /* Convert requested fua to fsync */
>>     if (test_and_clear_bit(PLOOP_REQ_DELAYED_FLUSH, &preq->state) ||
>>         test_and_clear_bit(PLOOP_REQ_KAIO_FSYNC, &preq->state))
>>         post_fsync = 1;
>>
>>     if (!post_fsync &&
>>         !ploop_req_delay_fua_possible(preq->req_rw, preq))
>>         post_fsync = 1;
>>     /* Reloc requires FLUSH_FUA from ->index_update for delayed_fua
>>        which is not supporded by ->kaio_write_page */
>>     if (test_and_clear_bit(PLOOP_REQ_RELOC_S, &preq->state) ||
>>         test_and_clear_bit(PLOOP_REQ_RELOC_S, &preq->state))
>>         post_fsync = 1;
>
> with something more brief like this:
>
>>     bool fua = preq->req_rw & REQ_FUA;
>>     unsigned long state = READ_ONCE(preq->state);
>>     bool reloc = state & (PLOOP_REQ_RELOC_A_FL|PLOOP_REQ_RELOC_S_FL);
>>
>>     if (test_and_clear_bit(PLOOP_REQ_KAIO_FSYNC, &preq->state) ||
>>         (reloc && ploop_req_delay_fua_possible(preq)) ||
>>         (fua && !ploop_req_delay_fua_possible(preq)))
>>         post_fsync = 1;
>
> Note, how it fixes a bug: if !(preq->req_rw & REQ_FUA), 
> delay_fua_possible(rw, preq) returned zero anyway. Hence we did 
> post_sync=1 even though the request has nothing to do with FUA!
>
>
>>   
>>      preq->req_rw &= ~REQ_FUA;
>> @@ -608,12 +612,13 @@ kaio_read_page(struct ploop_io * io, struct 
>> ploop_request * preq,
>>   
>>   static void
>>   kaio_write_page(struct ploop_io * io, struct ploop_request * preq,
>> -             struct page * page, sector_t sec, int fua)
>> +             struct page * page, sector_t sec, unsigned long rw)
>>   {
>>      ploop_prepare_tracker(preq, sec);
>> -
>> +    /* This is async call , preflush is not supported */
>> +    BUG_ON(rw & REQ_FLUSH);
>>      /* No FUA in kaio, convert it to fsync */
>> -    if (fua)
>> +    if (rw & REQ_FUA)
>>              set_bit(PLOOP_REQ_KAIO_FSYNC, &preq->state);
>>   
>>      kaio_io_page(io, IOCB_CMD_WRITE_ITER, preq, page, sec);
>> diff --git a/drivers/block/ploop/map.c b/drivers/block/ploop/map.c
>> index 3a6365d..de9bb32 100644
>> --- a/drivers/block/ploop/map.c
>> +++ b/drivers/block/ploop/map.c
>> @@ -896,6 +896,8 @@ void ploop_index_update(struct ploop_request * preq)
>>      struct ploop_device * plo = preq->plo;
>>      struct map_node * m = preq->map;
>>      struct ploop_delta * top_delta = map_top_delta(m->parent);
>> +    unsigned long rw = preq->req_rw & REQ_FUA;
>> +    unsigned long state = READ_ONCE(preq->state);
>>      u32 idx;
>>      map_index_t blk;
>>      int old_level;
>> @@ -953,13 +955,14 @@ void ploop_index_update(struct ploop_request * preq)
>>      __TRACE("wbi %p %u %p\n", preq, preq->req_cluster, m);
>>      plo->st.map_single_writes++;
>>      top_delta->ops->map_index(top_delta, m->mn_start, &sec);
>> -    /* Relocate requires consistent writes, mark such reqs appropriately */
>> -    if (test_bit(PLOOP_REQ_RELOC_A, &preq->state) ||
>> -        test_bit(PLOOP_REQ_RELOC_S, &preq->state))
>> -            set_bit(PLOOP_REQ_FORCE_FUA, &preq->state);
>> -
>> -    top_delta->io.ops->write_page(&top_delta->io, preq, page, sec,
>> -                                  !!(preq->req_rw & REQ_FUA));
>> +    /* Relocate requires consistent index update */
>> +    if (state & (PLOOP_REQ_RELOC_A_FL|PLOOP_REQ_RELOC_S_FL))
>> +            rw |= REQ_FUA;
>> +    if (state & PLOOP_REQ_DELAYED_FLUSH_FL)
>> +            rw |= REQ_FLUSH;
>> +    if (rw)
>> +            clear_bit(PLOOP_REQ_DELAYED_FLUSH, &preq->state);
>
> Can you please move clear_bit upward, like this:
>
>>     if (state & PLOOP_REQ_DELAYED_FLUSH_FL) {
>>         rw |= REQ_FLUSH;
>>         clear_bit(PLOOP_REQ_DELAYED_FLUSH, &preq->state);
>>     }
>
>> +    top_delta->io.ops->write_page(&top_delta->io, preq, page, sec, rw | 
>> WRITE);
>>      put_page(page);
>>      return;
>>   
>> @@ -1063,7 +1066,7 @@ static void map_wb_complete_post_process(struct 
>> ploop_map *map,
>>       * (see dio_submit()). So fsync of EXT4 image doesnt help us.
>>       * We need to force sync of nullified blocks.
>>       */
>> -    set_bit(PLOOP_REQ_FORCE_FUA, &preq->state);
>> +    preq->req_rw |= REQ_FUA;
>>      top_delta->io.ops->submit(&top_delta->io, preq, preq->req_rw,
>>                                &sbl, preq->iblock, 1<<plo->cluster_log);
>>   }
>> @@ -1074,11 +1077,12 @@ static void map_wb_complete(struct map_node * m, int 
>> err)
>>      struct ploop_delta * top_delta = map_top_delta(m->parent);
>>      struct list_head * cursor, * tmp;
>>      struct ploop_request * main_preq;
>> +    unsigned long rw =  0;
>>      struct page * page = NULL;
>>      int delayed = 0;
>>      unsigned int idx;
>>      sector_t sec;
>> -    int fua, force_fua;
>> +    int fua;
>>   
>>      /* First, complete processing of written back indices,
>>       * finally instantiate indices in mapping cache.
>> @@ -1149,10 +1153,10 @@ static void map_wb_complete(struct map_node * m, int 
>> err)
>>   
>>      main_preq = NULL;
>>      fua = 0;
>> -    force_fua = 0;
>>   
>>      list_for_each_safe(cursor, tmp, &m->io_queue) {
>>              struct ploop_request * preq;
>> +            unsigned long state;
>>   
>>              preq = list_entry(cursor, struct ploop_request, list);
>>   
>> @@ -1167,13 +1171,15 @@ static void map_wb_complete(struct map_node * m, int 
>> err)
>>                              spin_unlock_irq(&plo->lock);
>>                              break;
>>                      }
>> -
>> -                    if (preq->req_rw & REQ_FUA)
>> -                            fua = 1;
>> -
>> -                    if (test_bit(PLOOP_REQ_RELOC_A, &preq->state) ||
>> -                        test_bit(PLOOP_REQ_RELOC_S, &preq->state))
>> -                            force_fua = 1;
>> +                    state = READ_ONCE(preq->state);
>> +                    /* Relocate requires consistent index update */
>> +                    rw |= preq->req_rw & REQ_FUA;
>> +                    if (state & (PLOOP_REQ_RELOC_A_FL|PLOOP_REQ_RELOC_S_FL))
>> +                            rw |= REQ_FUA;
>> +                    if (state & PLOOP_REQ_DELAYED_FLUSH_FL)
>> +                            rw |= REQ_FLUSH;
>> +                    if (rw)
>> +                            clear_bit(PLOOP_REQ_DELAYED_FLUSH, 
>> &preq->state);
>
> The same as above -- easier to read if clear_bit is inside DELAYED_FLUSH 

> "if".
>
> Thanks,
> Maxim
>
>>   
>>                      preq->eng_state = PLOOP_E_INDEX_WB;
>>                      get_page(page);
>> @@ -1200,10 +1206,7 @@ static void map_wb_complete(struct map_node * m, int 
>> err)
>>      plo->st.map_multi_writes++;
>>      top_delta->ops->map_index(top_delta, m->mn_start, &sec);
>>   
>> -    if (force_fua)
>> -            set_bit(PLOOP_REQ_FORCE_FUA, &main_preq->state);
>> -
>> -    top_delta->io.ops->write_page(&top_delta->io, main_preq, page, sec, 
>> fua);
>> +    top_delta->io.ops->write_page(&top_delta->io, main_preq, page, sec, rw 
>> | WRITE);
>>      put_page(page);
>>   }
>>   
>> diff --git a/include/linux/ploop/ploop.h b/include/linux/ploop/ploop.h
>> index 0fba25e..2f26824 100644
>> --- a/include/linux/ploop/ploop.h
>> +++ b/include/linux/ploop/ploop.h
>> @@ -157,7 +157,7 @@ struct ploop_io_ops
>>      void    (*read_page)(struct ploop_io * io, struct ploop_request * preq,
>>                           struct page * page, sector_t sec);
>>      void    (*write_page)(struct ploop_io * io, struct ploop_request * preq,
>> -                          struct page * page, sector_t sec, int fua);
>> +                          struct page * page, sector_t sec, unsigned long 
>> rw);
>>   
>>   
>>      int     (*sync_read)(struct ploop_io * io, struct page * page,
>> @@ -465,8 +465,7 @@ enum
>>      PLOOP_REQ_ZERO,
>>      PLOOP_REQ_DISCARD,
>>      PLOOP_REQ_RSYNC,
>> -    PLOOP_REQ_FORCE_FUA,    /*force fua of req write I/O by engine */
>> -    PLOOP_REQ_FORCE_FLUSH,  /*force flush by engine */
>> +    PLOOP_REQ_DELAYED_FLUSH,/* REQ_FLUSH is delayed */
>>      PLOOP_REQ_KAIO_FSYNC,   /*force image fsync by KAIO module */
>>      PLOOP_REQ_POST_SUBMIT, /* preq needs post_submit processing */
>>      PLOOP_REQ_PUSH_BACKUP, /* preq was ACKed by userspace push_backup */
>> @@ -477,6 +476,7 @@ enum
>>   #define PLOOP_REQ_MERGE_FL (1 << PLOOP_REQ_MERGE)
>>   #define PLOOP_REQ_RELOC_A_FL (1 << PLOOP_REQ_RELOC_A)
>>   #define PLOOP_REQ_RELOC_S_FL (1 << PLOOP_REQ_RELOC_S)
>> +#define PLOOP_REQ_DELAYED_FLUSH_FL (1 << PLOOP_REQ_DELAYED_FLUSH)
>>   #define PLOOP_REQ_DISCARD_FL (1 << PLOOP_REQ_DISCARD)
>>   #define PLOOP_REQ_ZERO_FL (1 << PLOOP_REQ_ZERO)
>>   
>> @@ -614,7 +614,7 @@ static inline int ploop_req_delay_fua_possible(unsigned 
>> long rw,
>>       * fua.
>>       */
>>      if (rw & REQ_FUA) {
>> -            if (preq->eng_state != PLOOP_E_COMPLETE)
>> +            if (preq->eng_state == PLOOP_E_DATA_WBI)
>>                      delay_fua = 1;
>>      }
>>      return delay_fua;

Attachment: signature.asc
Description: PGP signature

_______________________________________________
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

Reply via email to