On 21.07.2016 10:58, Stefan Bader wrote:
> I was pointed at the thread which seems to address the same after
> I wrote most of below text. Did not want to re-write this so please
> bear with the odd layout.
> 
> https://www.redhat.com/archives/dm-devel/2016-June/msg00015.html
> 
> Zhengyuan tries to fix the problem by relocating the superblock on
> disk. But I am not sure whether there is really any guarantee about
> how __bread fills data into the buffer_head. What if there is the next
> odd arch with 128K pages?
> 
> So below is an attempt to be more generic. Still I don't feel completely
> happy with the way that a page moves (or is shared) between buffer_head
> and biovec. What I tried to outline below is to let the register functions
> allocate bio+biovec memory and use the in-memory sb_cache data to initialize
> the biovec buffer.

Any opinions here? Also adding LKML as I don't seem to get through moderation on
dm-devel.


> 
> -Stefan
> 
> 
> ---
> 
> This was seen on ppc64le with 64k page size. The problem is that
> in that case __bread returns a page where b_data is not at the
> start of the page. And I am not really sure that the way bcache
> re-purposes the page returned by __bread to be used as biovec 
> element is really a good idea.
> 
> The way it is now, I saw __bread return a 64k page where b_data
> was starting at 4k offset. Then __write_super was modifying some
> data at offset 0 but for example not writing the magic number again.
> 
> Not sure why (maybe this messes up flushes, too) but the bad data was not
> immediately written back when the devices were registered. So at that time
> bcache-super-show would report data as it was written by user-space (like
> the cache type still 0 and not 3, and claiming the cache to still bei
> detached).
> 
> But when stopping the cache and unregistering the cache set this changed
> and bcache-super-show was reporting a bad magic number (as expected).
> 
> The patch below works around this (tested with 64k and 4k pages) but I
> am not really sure this is a clean approach. My gut feeling would be that
> the bio structures should be allocated speperately (I think there is a way
> of allocating a bioset without using a pool). Then that separate page could
> be pre-initialized from the super block structure without passing around
> a page the feels more private to __bread usage...
> 
> -Stefan
> 
> 
> 
> From 3652e98359b876f3c1e6d7b9b7305e3411178296 Mon Sep 17 00:00:00 2001
> From: Stefan Bader <stefan.ba...@canonical.com>
> Date: Wed, 20 Jul 2016 12:06:27 +0200
> Subject: [PATCH] bcache: handle non 4k pages returned by __bread
> 
> On non-x86 arches pages can be bigger than 4k. Currently read_super
> returns a reference to the page used as buffer in the buffer_head
> that is returned by __bread().
> With 4k page size and a requested read this normally ends up with 
> the super block data starting at offset 0. But as seen on ppc64le
> with 64k page size, the data can start at an offset (from what I
> saw the offset was 4k).
> This causes harm later on when __write_super() maps the super
> block data at the start of the page acquired before and also
> not writes back all fields (particularly the super block magic).
> 
> Try to avoid this by also returning the potential offset within the
> sb_page from read_super() and fully initiallize the single bvec of
> the bio used for __write_super() calls. Doing that is the same as
> would have been done in bch_bio_map() which now must not be used in
> __write_super().
> 
> Signed-off-by: Stefan Bader <stefan.ba...@canonical.com>
> 
> removedebug
> 
> Signed-off-by: Stefan Bader <stefan.ba...@canonical.com>
> ---
>  drivers/md/bcache/super.c | 22 +++++++++++++++-------
>  1 file changed, 15 insertions(+), 7 deletions(-)
> 
> diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
> index e169739..3e0d2de 100644
> --- a/drivers/md/bcache/super.c
> +++ b/drivers/md/bcache/super.c
> @@ -62,7 +62,7 @@ struct workqueue_struct *bcache_wq;
>  /* Superblock */
>  
>  static const char *read_super(struct cache_sb *sb, struct block_device *bdev,
> -                           struct page **res)
> +                           struct page **res, unsigned int *off)
>  {
>       const char *err;
>       struct cache_sb *s;
> @@ -192,6 +192,7 @@ static const char *read_super(struct cache_sb *sb, struct 
> block_device *bdev,
>       err = NULL;
>  
>       get_page(bh->b_page);
> +     *off = (unsigned int) (bh->b_data - ((char *) 
> page_address(bh->b_page)));
>       *res = bh->b_page;
>  err:
>       put_bh(bh);
> @@ -202,19 +203,19 @@ static void write_bdev_super_endio(struct bio *bio)
>  {
>       struct cached_dev *dc = bio->bi_private;
>       /* XXX: error checking */
> -
>       closure_put(&dc->sb_write);
>  }
>  
>  static void __write_super(struct cache_sb *sb, struct bio *bio)
>  {
> -     struct cache_sb *out = page_address(bio->bi_io_vec[0].bv_page);
> +     struct cache_sb *out = page_address(bio->bi_io_vec[0].bv_page) +
> +                            bio->bi_io_vec[0].bv_offset;
>       unsigned i;
>  
>       bio->bi_iter.bi_sector  = SB_SECTOR;
>       bio->bi_rw              = REQ_SYNC|REQ_META;
>       bio->bi_iter.bi_size    = SB_SIZE;
> -     bch_bio_map(bio, NULL);
> +     // bch_bio_map(bio, NULL);
>  
>       out->offset             = cpu_to_le64(sb->offset);
>       out->version            = cpu_to_le64(sb->version);
> @@ -1139,6 +1140,7 @@ static int cached_dev_init(struct cached_dev *dc, 
> unsigned block_size)
>  /* Cached device - bcache superblock */
>  
>  static void register_bdev(struct cache_sb *sb, struct page *sb_page,
> +                              unsigned int sb_off,
>                                struct block_device *bdev,
>                                struct cached_dev *dc)
>  {
> @@ -1154,6 +1156,8 @@ static void register_bdev(struct cache_sb *sb, struct 
> page *sb_page,
>       dc->sb_bio.bi_max_vecs  = 1;
>       dc->sb_bio.bi_io_vec    = dc->sb_bio.bi_inline_vecs;
>       dc->sb_bio.bi_io_vec[0].bv_page = sb_page;
> +     dc->sb_bio.bi_io_vec[0].bv_len = SB_SIZE;
> +     dc->sb_bio.bi_io_vec[0].bv_offset = sb_off;
>       get_page(sb_page);
>  
>       if (cached_dev_init(dc, sb->block_size << 9))
> @@ -1839,6 +1843,7 @@ static int cache_alloc(struct cache_sb *sb, struct 
> cache *ca)
>  }
>  
>  static int register_cache(struct cache_sb *sb, struct page *sb_page,
> +                             unsigned int sb_off,
>                               struct block_device *bdev, struct cache *ca)
>  {
>       char name[BDEVNAME_SIZE];
> @@ -1853,6 +1858,8 @@ static int register_cache(struct cache_sb *sb, struct 
> page *sb_page,
>       ca->sb_bio.bi_max_vecs  = 1;
>       ca->sb_bio.bi_io_vec    = ca->sb_bio.bi_inline_vecs;
>       ca->sb_bio.bi_io_vec[0].bv_page = sb_page;
> +     ca->sb_bio.bi_io_vec[0].bv_len  = SB_SIZE;
> +     ca->sb_bio.bi_io_vec[0].bv_offset = sb_off;
>       get_page(sb_page);
>  
>       if (blk_queue_discard(bdev_get_queue(ca->bdev)))
> @@ -1936,6 +1943,7 @@ static ssize_t register_bcache(struct kobject *k, 
> struct kobj_attribute *attr,
>       struct cache_sb *sb = NULL;
>       struct block_device *bdev = NULL;
>       struct page *sb_page = NULL;
> +     unsigned int sb_off;
>  
>       if (!try_module_get(THIS_MODULE))
>               return -EBUSY;
> @@ -1967,7 +1975,7 @@ static ssize_t register_bcache(struct kobject *k, 
> struct kobj_attribute *attr,
>       if (set_blocksize(bdev, 4096))
>               goto err_close;
>  
> -     err = read_super(sb, bdev, &sb_page);
> +     err = read_super(sb, bdev, &sb_page, &sb_off);
>       if (err)
>               goto err_close;
>  
> @@ -1977,14 +1985,14 @@ static ssize_t register_bcache(struct kobject *k, 
> struct kobj_attribute *attr,
>                       goto err_close;
>  
>               mutex_lock(&bch_register_lock);
> -             register_bdev(sb, sb_page, bdev, dc);
> +             register_bdev(sb, sb_page, sb_off, bdev, dc);
>               mutex_unlock(&bch_register_lock);
>       } else {
>               struct cache *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
>               if (!ca)
>                       goto err_close;
>  
> -             if (register_cache(sb, sb_page, bdev, ca) != 0)
> +             if (register_cache(sb, sb_page, sb_off, bdev, ca) != 0)
>                       goto err_close;
>       }
>  out:
> 


Attachment: signature.asc
Description: OpenPGP digital signature

Reply via email to