[RFC PATCH 01/37] block: introduce bio_init_fields() helper

2021-01-19 Thread Chaitanya Kulkarni
There are several places in the file-system, block layer, device drivers
where struct bio members such as bdev, sector, private, end io callback,
io priority, write hints are initialized where we can use a helper
function.

This pach introduces a helper function which we use in the block lyaer
code. Subsequent patches use this function to reduce repeated code.

Signed-off-by: Chaitanya Kulkarni 
---
 block/blk-lib.c | 13 +
 include/linux/bio.h | 13 +
 2 files changed, 18 insertions(+), 8 deletions(-)

diff --git a/block/blk-lib.c b/block/blk-lib.c
index 752f9c722062..5ee488c1bcc6 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -95,8 +95,7 @@ int __blkdev_issue_discard(struct block_device *bdev, 
sector_t sector,
WARN_ON_ONCE((req_sects << 9) > UINT_MAX);
 
bio = blk_next_bio(bio, 0, gfp_mask);
-   bio->bi_iter.bi_sector = sector;
-   bio_set_dev(bio, bdev);
+   bio_init_fields(bio, bdev, sector, NULL, NULL, 0, 0);
bio_set_op_attrs(bio, op, 0);
 
bio->bi_iter.bi_size = req_sects << 9;
@@ -189,8 +188,7 @@ static int __blkdev_issue_write_same(struct block_device 
*bdev, sector_t sector,
 
while (nr_sects) {
bio = blk_next_bio(bio, 1, gfp_mask);
-   bio->bi_iter.bi_sector = sector;
-   bio_set_dev(bio, bdev);
+   bio_init_fields(bio, bdev, sector, NULL, NULL, 0, 0);
bio->bi_vcnt = 1;
bio->bi_io_vec->bv_page = page;
bio->bi_io_vec->bv_offset = 0;
@@ -265,8 +263,7 @@ static int __blkdev_issue_write_zeroes(struct block_device 
*bdev,
 
while (nr_sects) {
bio = blk_next_bio(bio, 0, gfp_mask);
-   bio->bi_iter.bi_sector = sector;
-   bio_set_dev(bio, bdev);
+   bio_init_fields(bio, bdev, sector, NULL, NULL, 0, 0);
bio->bi_opf = REQ_OP_WRITE_ZEROES;
if (flags & BLKDEV_ZERO_NOUNMAP)
bio->bi_opf |= REQ_NOUNMAP;
@@ -317,8 +314,8 @@ static int __blkdev_issue_zero_pages(struct block_device 
*bdev,
while (nr_sects != 0) {
bio = blk_next_bio(bio, __blkdev_sectors_to_bio_pages(nr_sects),
   gfp_mask);
-   bio->bi_iter.bi_sector = sector;
-   bio_set_dev(bio, bdev);
+   bio_init_fields(bio, bdev, sector, NULL, NULL, 0, 0);
+
bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
 
while (nr_sects != 0) {
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 1edda614f7ce..fbeaa5e42a5d 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -820,4 +820,17 @@ static inline void bio_set_polled(struct bio *bio, struct 
kiocb *kiocb)
bio->bi_opf |= REQ_NOWAIT;
 }
 
+static inline void bio_init_fields(struct bio *bio, struct block_device *bdev,
+  sector_t sect, void *priv,
+  bio_end_io_t *end_io,
+  unsigned short prio, unsigned short whint)
+{
+   bio_set_dev(bio, bdev);
+   bio->bi_iter.bi_sector = sect;
+   bio->bi_private = priv;
+   bio->bi_end_io = end_io;
+   bio->bi_ioprio = prio;
+   bio->bi_write_hint = whint;
+}
+
 #endif /* __LINUX_BIO_H */
-- 
2.22.1



[RFC PATCH 02/37] fs: use bio_init_fields in block_dev

2021-01-19 Thread Chaitanya Kulkarni
Signed-off-by: Chaitanya Kulkarni 
---
 fs/block_dev.c | 17 +
 1 file changed, 5 insertions(+), 12 deletions(-)

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 3e5b02f6606c..44b992976ee5 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -239,12 +239,9 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct 
iov_iter *iter,
}
 
bio_init(&bio, vecs, nr_pages);
-   bio_set_dev(&bio, bdev);
-   bio.bi_iter.bi_sector = pos >> 9;
-   bio.bi_write_hint = iocb->ki_hint;
-   bio.bi_private = current;
-   bio.bi_end_io = blkdev_bio_end_io_simple;
-   bio.bi_ioprio = iocb->ki_ioprio;
+   bio_init_fields(&bio, bdev, pos >> 9, current, blkdev_bio_end_io_simple,
+   iocb->ki_ioprio, iocb->ki_hint);
+
 
ret = bio_iov_iter_get_pages(&bio, iter);
if (unlikely(ret))
@@ -390,12 +387,8 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter 
*iter, int nr_pages)
blk_start_plug(&plug);
 
for (;;) {
-   bio_set_dev(bio, bdev);
-   bio->bi_iter.bi_sector = pos >> 9;
-   bio->bi_write_hint = iocb->ki_hint;
-   bio->bi_private = dio;
-   bio->bi_end_io = blkdev_bio_end_io;
-   bio->bi_ioprio = iocb->ki_ioprio;
+   bio_init_fields(bio, bdev, pos >> 9, dio, blkdev_bio_end_io,
+   iocb->ki_ioprio, iocb->ki_hint);
 
ret = bio_iov_iter_get_pages(bio, iter);
if (unlikely(ret)) {
-- 
2.22.1



[RFC PATCH 00/37] block: introduce bio_init_fields()

2021-01-19 Thread Chaitanya Kulkarni
Hi,

This is a *compile only RFC* which adds a generic helper to initialize
the various fields of the bio that is repeated all the places in
file-systems, block layer, and drivers.

The new helper allows callers to initialize various members such as
bdev, sector, private, end io callback, io priority, and write hints.

The objective of this RFC is to only start a discussion, this it not 
completely tested at all. ?? ?? ?? ?? ?? ?? ?? ?? ?? ?? ?? ?? ?? ?? ?? ?? ?? ?? 
?? ?? ?? ?? ?? ?? ?? ?? ?? ?? ?? ?? ?? ?? ?? ?? ?? ?? ?? ?? ?? ?? ?? ?? ?? ?? 
?? ?? ?? ?? ?? ?? ?? ?? ?? ??
Following diff shows code level benefits of this helper :-
??38 files changed, 124 insertions(+), 236 deletions(-)

-ck

Chaitanya Kulkarni (37):
  block: introduce bio_init_fields() helper
  fs: use bio_init_fields in block_dev
  btrfs: use bio_init_fields in disk-io
  btrfs: use bio_init_fields in volumes
  ext4: use bio_init_fields in page_io
  gfs2: use bio_init_fields in lops
  gfs2: use bio_init_fields in meta_io
  gfs2: use bio_init_fields in ops_fstype
  iomap: use bio_init_fields in buffered-io
  iomap: use bio_init_fields in direct-io
  jfs: use bio_init_fields in logmgr
  zonefs: use bio_init_fields in append
  drdb: use bio_init_fields in actlog
  drdb: use bio_init_fields in bitmap
  drdb: use bio_init_fields in receiver
  floppy: use bio_init_fields
  pktcdvd: use bio_init_fields
  bcache: use bio_init_fields in journal
  bcache: use bio_init_fields in super
  bcache: use bio_init_fields in writeback
  dm-bufio: use bio_init_fields
  dm-crypt: use bio_init_fields
  dm-zoned: use bio_init_fields metadata
  dm-zoned: use bio_init_fields target
  dm-zoned: use bio_init_fields
  dm log writes: use bio_init_fields
  nvmet: use bio_init_fields in bdev-ns
  target: use bio_init_fields in iblock
  btrfs: use bio_init_fields in scrub
  fs: use bio_init_fields in buffer
  eros: use bio_init_fields in data
  eros: use bio_init_fields in zdata
  jfs: use bio_init_fields in metadata
  nfs: use bio_init_fields in blocklayout
  ocfs: use bio_init_fields in heartbeat
  xfs: use bio_init_fields in xfs_buf
  xfs: use bio_init_fields in xfs_log

 block/blk-lib.c | 13 +
 drivers/block/drbd/drbd_actlog.c|  5 +
 drivers/block/drbd/drbd_bitmap.c|  5 +
 drivers/block/drbd/drbd_receiver.c  | 11 +++
 drivers/block/floppy.c  |  5 +
 drivers/block/pktcdvd.c | 12 
 drivers/md/bcache/journal.c | 21 -
 drivers/md/bcache/super.c   | 19 +--
 drivers/md/bcache/writeback.c   | 14 ++
 drivers/md/dm-bufio.c   |  5 +
 drivers/md/dm-crypt.c   |  4 +---
 drivers/md/dm-log-writes.c  | 21 ++---
 drivers/md/dm-zoned-metadata.c  | 15 +--
 drivers/md/dm-zoned-target.c|  9 +++--
 drivers/md/md.c |  6 ++
 drivers/nvme/target/io-cmd-bdev.c   |  4 +---
 drivers/target/target_core_iblock.c | 11 +++
 fs/block_dev.c  | 17 +
 fs/btrfs/disk-io.c  | 11 ---
 fs/btrfs/scrub.c|  6 ++
 fs/btrfs/volumes.c  |  4 +---
 fs/buffer.c |  7 ++-
 fs/erofs/data.c |  6 ++
 fs/erofs/zdata.c|  9 +++--
 fs/ext4/page-io.c   |  6 ++
 fs/gfs2/lops.c  |  6 ++
 fs/gfs2/meta_io.c   |  5 ++---
 fs/gfs2/ops_fstype.c|  7 ++-
 fs/iomap/buffered-io.c  |  5 ++---
 fs/iomap/direct-io.c| 15 +--
 fs/jfs/jfs_logmgr.c | 16 
 fs/jfs/jfs_metapage.c   | 16 +++-
 fs/nfs/blocklayout/blocklayout.c|  8 ++--
 fs/ocfs2/cluster/heartbeat.c|  4 +---
 fs/xfs/xfs_buf.c|  6 ++
 fs/xfs/xfs_log.c|  6 ++
 fs/zonefs/super.c   |  7 +++
 include/linux/bio.h | 13 +
 38 files changed, 124 insertions(+), 236 deletions(-)

-- 
2.22.1



Re: "bad tree block start" when trying to mount on ARM

2021-01-19 Thread Erik Jensen
On Mon, Jan 18, 2021 at 4:12 AM Erik Jensen  wrote:
>
> The offending system is indeed ARMv7 (specifically a Marvell ARMADA®
> 388), but I believe the Broadcom BCM2835 in my Raspberry Pi is
> actually ARMv6 (with hardware float support).

Using NBD, I have verified that I receive the same error when
attempting to mount the filesystem on my ARMv6 Raspberry Pi:
[ 3491.339572] BTRFS info (device dm-4): disk space caching is enabled
[ 3491.394584] BTRFS info (device dm-4): has skinny extents
[ 3492.385095] BTRFS error (device dm-4): bad tree block start, want
26207780683776 have 3395945502747707095
[ 3492.514071] BTRFS error (device dm-4): bad tree block start, want
26207780683776 have 3395945502747707095
[ 3492.553599] BTRFS warning (device dm-4): failed to read tree root
[ 3492.865368] BTRFS error (device dm-4): open_ctree failed

The Raspberry Pi is running Linux 5.4.83.

> On Mon, Jan 18, 2021 at 4:01 AM Qu Wenruo  wrote:
> >
> >
> >
> > On 2021/1/18 下午7:55, Erik Jensen wrote:
> > > On Mon, Jan 18, 2021 at 3:07 AM Qu Wenruo  wrote:
> > >> On 2021/1/18 下午6:33, Erik Jensen wrote:
> > >>> I ended up having other priorities occupying my time since 2019, and the
> > >>> "solution" of exporting the individual drives on my NAS using NBD and
> > >>> mounting them on my desktop worked, even if it wasn't pretty.
> > >>>
> > >>> However, I am currently looking into Syncthing, which I would like to
> > >>> run on the NAS directly. That would, of course, require accessing the
> > >>> filesystem directly on the NAS rather than just exporting the raw
> > >>> devices, which means circling back to this issue.
> > >>>
> > >>> After updating my NAS, I have determined that the issue still occurs
> > >>> with Linux 5.8.
> > >>>
> > >>> What's the next best step for debugging the issue? Ideally, I'd like to
> > >>> help track down the issue to find a proper fix, rather than just trying
> > >>> to bypass the issue. I wasn't sure if the suggestion to comment out
> > >>> btrfs_verify_dev_extents() was more geared toward the former or the 
> > >>> latter.
> > >>
> > >> After rewinding my memory on this case, the problem is really that the
> > >> ARM btrfs kernel is reading garbage, while X86 or ARM user space tool
> > >> works as expected.
> > >>
> > >> Can you recompile your kernel on the ARM board to add extra debugging
> > >> messages?
> > >> If possible, we can try to add some extra debug points to bombarding
> > >> your dmesg.
> > >>
> > >> Or do you have other ARM boards to test the same fs?
> > >>
> > >>
> > >> Thanks,
> > >> Qu
> > >
> > > It's pretty easy to build a kernel with custom patches applied, though
> > > the actual building takes a while, so I'd be happy to add whatever
> > > debug messages would be useful. I also have an old Raspberry Pi
> > > (original model B) I can dig out and try to get going, tomorrow. I
> > > can't hook it up to the drives directly, but I should be able to
> > > access them via NBD like I was doing from my desktop.
> >
> > RPI 1B would be a little slow but should be enough to expose the
> > problem, if the problem is for all arm builds (as long as you're also
> > using armv7 for the offending system).
> >
> > Thanks,
> > Qu
> >
> > > If I can't get
> > > that going for whatever reason, I could also try running an emulated
> > > ARM system with QEMU.
> > >
> > >>>
> > >>> On Fri, Jun 28, 2019 at 1:15 AM Qu Wenruo  > >>> > wrote:
> > >>>
> > >>>
> > >>>
> > >>>  On 2019/6/28 下午4:00, Erik Jensen wrote:
> > >>>   >> So it's either the block layer reading some wrong from the disk
> > >>>  or btrfs
> > >>>   >> layer doesn't do correct endian convert.
> > >>>   >
> > >>>   > My ARM board is running in little endian mode, so it doesn't 
> > >>> seem
> > >>>  like
> > >>>   > endianness should be an issue. (It is 32-bits versus my 
> > >>> desktop's 64,
> > >>>   > though.) I've also tried exporting the drives via NBD to my 
> > >>> x86_64
> > >>>   > system, and that worked fine, so if the problem is under btrfs, 
> > >>> it
> > >>>   > would have to be in the encryption layer, but fsck succeeding 
> > >>> on the
> > >>>   > ARM board would seem to rule that out, as well.
> > >>>   >
> > >>>   >> Would you dump the following data (X86 and ARM should output 
> > >>> the
> > >>>  same
> > >>>   >> content, thus one output is enough).
> > >>>   >> # btrfs ins dump-tree -b 17628726968320 /dev/dm-3
> > >>>   >> # btrfs ins dump-tree -b 17628727001088 /dev/dm-3
> > >>>   >
> > >>>   > Attached, and also 17628705964032, since that's the block
> > >>>  mentioned in
> > >>>   > my most recent mount attempt (see below).
> > >>>
> > >>>  The trees are completely fine.
> > >>>
> > >>>  So it should be something else causing the problem.
> > >>>
> > >>>   >
> > >>>   >> And then, for the ARM system, please apply the following diff,
> > >>>  and try
> > >>>   >> mount a

[PATCH 03/13] btrfs: Fix function description format

2021-01-19 Thread Nikolay Borisov
This fixes following W=1 warnings:

fs/btrfs/file-item.c:27: warning: Cannot understand  * @inode:  the inode we 
want to update the disk_i_size for
 on line 27 - I thought it was a doc line
fs/btrfs/file-item.c:65: warning: Cannot understand  * @inode - the inode we're 
modifying
 on line 65 - I thought it was a doc line
fs/btrfs/file-item.c:91: warning: Cannot understand  * @inode - the inode we're 
modifying
 on line 91 - I thought it was a doc line

Signed-off-by: Nikolay Borisov 
---
 fs/btrfs/file-item.c | 22 ++
 1 file changed, 14 insertions(+), 8 deletions(-)

diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 6ccfc019ad90..868b27e887b1 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -24,8 +24,10 @@
   PAGE_SIZE))
 
 /**
- * @inode - the inode we want to update the disk_i_size for
- * @new_i_size - the i_size we want to set to, 0 if we use i_size
+ * btrfs_inode_safe_disk_i_size_write
+ *
+ * @inode:  the inode we want to update the disk_i_size for
+ * @new_i_size: the i_size we want to set to, 0 if we use i_size
  *
  * With NO_HOLES set this simply sets the disk_is_size to whatever 
i_size_read()
  * returns as it is perfectly fine with a file that has holes without hole file
@@ -62,9 +64,11 @@ void btrfs_inode_safe_disk_i_size_write(struct btrfs_inode 
*inode, u64 new_i_siz
 }
 
 /**
- * @inode - the inode we're modifying
- * @start - the start file offset of the file extent we've inserted
- * @len - the logical length of the file extent item
+ * btrfs_inode_set_file_extent_range
+ *
+ * @inode:  the inode we're modifying
+ * @start:  the start file offset of the file extent we've inserted
+ * @len:  the logical length of the file extent item
  *
  * Call when we are inserting a new file extent where there was none before.
  * Does not need to call this in the case where we're replacing an existing 
file
@@ -88,9 +92,11 @@ int btrfs_inode_set_file_extent_range(struct btrfs_inode 
*inode, u64 start,
 }
 
 /**
- * @inode - the inode we're modifying
- * @start - the start file offset of the file extent we've inserted
- * @len - the logical length of the file extent item
+ * btrfs_inode_clear_file_extent_range
+ *
+ * @inode:  the inode we're modifying
+ * @start:  the start file offset of the file extent we've inserted
+ * @len:  the logical length of the file extent item
  *
  * Called when we drop a file extent, for example when we truncate.  Doesn't
  * need to be called for cases where we're replacing a file extent, like when
-- 
2.25.1



[PATCH 02/13] btrfs: Fix parameter description of btrfs_add_extent_mapping

2021-01-19 Thread Nikolay Borisov
This fixes the following compiler warnings:

fs/btrfs/extent_map.c:601: warning: Function parameter or member 'fs_info' not 
described in 'btrfs_add_extent_mapping'
fs/btrfs/extent_map.c:601: warning: Function parameter or member 'em_tree' not 
described in 'btrfs_add_extent_mapping'
fs/btrfs/extent_map.c:601: warning: Function parameter or member 'em_in' not 
described in 'btrfs_add_extent_mapping'
fs/btrfs/extent_map.c:601: warning: Function parameter or member 'start' not 
described in 'btrfs_add_extent_mapping'
fs/btrfs/extent_map.c:601: warning: Function parameter or member 'len' not 
described in 'btrfs_add_extent_mapping'

Signed-off-by: Nikolay Borisov 
---
 fs/btrfs/extent_map.c | 10 +-
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index aa1ff6f2ade9..90ae3c31e7fb 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -577,11 +577,11 @@ static noinline int merge_extent_mapping(struct 
extent_map_tree *em_tree,
 
 /**
  * btrfs_add_extent_mapping - add extent mapping into em_tree
- * @fs_info - used for tracepoint
- * @em_tree - the extent tree into which we want to insert the extent mapping
- * @em_in   - extent we are inserting
- * @start   - start of the logical range btrfs_get_extent() is requesting
- * @len - length of the logical range btrfs_get_extent() is requesting
+ * @fs_info:  used for tracepoint
+ * @em_tree:  the extent tree into which we want to insert the extent mapping
+ * @em_in:   extent we are inserting
+ * @start:start of the logical range btrfs_get_extent() is requesting
+ * @len:  length of the logical range btrfs_get_extent() is requesting
  *
  * Note that @em_in's range may be different from [start, start+len),
  * but they must be overlapped.
-- 
2.25.1



[PATCH 06/13] btrfs: Document now parameter of peek_discard_list

2021-01-19 Thread Nikolay Borisov
Fixes fs/btrfs/discard.c:203: warning: Function parameter or member 'now' not 
described in 'peek_discard_list'

Signed-off-by: Nikolay Borisov 
---
 fs/btrfs/discard.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/btrfs/discard.c b/fs/btrfs/discard.c
index 2b8383d41144..bfe53eb4c1f3 100644
--- a/fs/btrfs/discard.c
+++ b/fs/btrfs/discard.c
@@ -189,6 +189,7 @@ static struct btrfs_block_group *find_next_block_group(
  * @discard_ctl: discard control
  * @discard_state: the discard_state of the block_group after state management
  * @discard_index: the discard_index of the block_group after state management
+ * @now: Time when discard was invoked, in ns
  *
  * This wraps find_next_block_group() and sets the block_group to be in use.
  * discard_state's control flow is managed here.  Variables related to
-- 
2.25.1



[PATCH 13/13] lib/zstd: Convert constants to defines

2021-01-19 Thread Nikolay Borisov
Those constants are really used internally by zstd and including
linux/zstd.h into users results in the following warnings:

In file included from fs/btrfs/zstd.c:19:
./include/linux/zstd.h:798:21: warning: ‘ZSTD_skippableHeaderSize’ defined but 
not used [-Wunused-const-variable=]
  798 | static const size_t ZSTD_skippableHeaderSize = 8;
  | ^~~~
./include/linux/zstd.h:796:21: warning: ‘ZSTD_frameHeaderSize_max’ defined but 
not used [-Wunused-const-variable=]
  796 | static const size_t ZSTD_frameHeaderSize_max = ZSTD_FRAMEHEADERSIZE_MAX;
  | ^~~~
./include/linux/zstd.h:795:21: warning: ‘ZSTD_frameHeaderSize_min’ defined but 
not used [-Wunused-const-variable=]
  795 | static const size_t ZSTD_frameHeaderSize_min = ZSTD_FRAMEHEADERSIZE_MIN;
  | ^~~~
./include/linux/zstd.h:794:21: warning: ‘ZSTD_frameHeaderSize_prefix’ defined 
but not used [-Wunused-const-variable=]
  794 | static const size_t ZSTD_frameHeaderSize_prefix = 5;

So fix those warnings by turning the constants into defines.

Signed-off-by: Nikolay Borisov 
---
 include/linux/zstd.h | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/include/linux/zstd.h b/include/linux/zstd.h
index 249575e2485f..e87f78c9b19c 100644
--- a/include/linux/zstd.h
+++ b/include/linux/zstd.h
@@ -791,11 +791,11 @@ size_t ZSTD_DStreamOutSize(void);
 /* for static allocation */
 #define ZSTD_FRAMEHEADERSIZE_MAX 18
 #define ZSTD_FRAMEHEADERSIZE_MIN  6
-static const size_t ZSTD_frameHeaderSize_prefix = 5;
-static const size_t ZSTD_frameHeaderSize_min = ZSTD_FRAMEHEADERSIZE_MIN;
-static const size_t ZSTD_frameHeaderSize_max = ZSTD_FRAMEHEADERSIZE_MAX;
+#define ZSTD_frameHeaderSize_prefix 5
+#define ZSTD_frameHeaderSize_min ZSTD_FRAMEHEADERSIZE_MIN
+#define ZSTD_frameHeaderSize_max ZSTD_FRAMEHEADERSIZE_MAX
 /* magic number + skippable frame length */
-static const size_t ZSTD_skippableHeaderSize = 8;
+#define ZSTD_skippableHeaderSize 8
 
 
 /*-*
-- 
2.25.1



[PATCH 12/13] btrfs: Fix parameter description for functions in extent_io.c

2021-01-19 Thread Nikolay Borisov
This makes the file W=1 clean and fixes the following warnings:

fs/btrfs/extent_io.c:414: warning: Function parameter or member 'tree' not 
described in '__etree_search'
fs/btrfs/extent_io.c:414: warning: Function parameter or member 'offset' not 
described in '__etree_search'
fs/btrfs/extent_io.c:414: warning: Function parameter or member 'next_ret' not 
described in '__etree_search'
fs/btrfs/extent_io.c:414: warning: Function parameter or member 'prev_ret' not 
described in '__etree_search'
fs/btrfs/extent_io.c:414: warning: Function parameter or member 'p_ret' not 
described in '__etree_search'
fs/btrfs/extent_io.c:414: warning: Function parameter or member 'parent_ret' 
not described in '__etree_search'
fs/btrfs/extent_io.c:1607: warning: Function parameter or member 'tree' not 
described in 'find_contiguous_extent_bit'
fs/btrfs/extent_io.c:1607: warning: Function parameter or member 'start' not 
described in 'find_contiguous_extent_bit'
fs/btrfs/extent_io.c:1607: warning: Function parameter or member 'start_ret' 
not described in 'find_contiguous_extent_bit'
fs/btrfs/extent_io.c:1607: warning: Function parameter or member 'end_ret' not 
described in 'find_contiguous_extent_bit'
fs/btrfs/extent_io.c:1607: warning: Function parameter or member 'bits' not 
described in 'find_contiguous_extent_bit'
fs/btrfs/extent_io.c:1644: warning: Function parameter or member 'tree' not 
described in 'find_first_clear_extent_bit'
fs/btrfs/extent_io.c:1644: warning: Function parameter or member 'start' not 
described in 'find_first_clear_extent_bit'
fs/btrfs/extent_io.c:1644: warning: Function parameter or member 'start_ret' 
not described in 'find_first_clear_extent_bit'
fs/btrfs/extent_io.c:1644: warning: Function parameter or member 'end_ret' not 
described in 'find_first_clear_extent_bit'
fs/btrfs/extent_io.c:1644: warning: Function parameter or member 'bits' not 
described in 'find_first_clear_extent_bit'
fs/btrfs/extent_io.c:4187: warning: Function parameter or member 'epd' not 
described in 'extent_write_cache_pages'
fs/btrfs/extent_io.c:4187: warning: Excess function parameter 'data' 
description in 'extent_write_cache_pages'

Signed-off-by: Nikolay Borisov 
---
 fs/btrfs/extent_io.c | 34 +-
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 7f689ad7709c..62f892238149 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -392,13 +392,13 @@ static struct rb_node *tree_insert(struct rb_root *root,
  * __etree_search - searche @tree for an entry that contains @offset. Such
  * entry would have entry->start <= offset && entry->end >= offset.
  *
- * @tree - the tree to search
- * @offset - offset that should fall within an entry in @tree
- * @next_ret - pointer to the first entry whose range ends after @offset
- * @prev - pointer to the first entry whose range begins before @offset
- * @p_ret - pointer where new node should be anchored (used when inserting an
+ * @tree:  the tree to search
+ * @offset: offset that should fall within an entry in @tree
+ * @next_ret: pointer to the first entry whose range ends after @offset
+ * @prev_ret: pointer to the first entry whose range begins before @offset
+ * @p_ret: pointer where new node should be anchored (used when inserting an
  * entry in the tree)
- * @parent_ret - points to entry which would have been the parent of the entry,
+ * @parent_ret: points to entry which would have been the parent of the entry,
  *   containing @offset
  *
  * This function returns a pointer to the entry that contains @offset byte
@@ -1589,11 +1589,11 @@ int find_first_extent_bit(struct extent_io_tree *tree, 
u64 start,
 
 /**
  * find_contiguous_extent_bit: find a contiguous area of bits
- * @tree - io tree to check
- * @start - offset to start the search from
- * @start_ret - the first offset we found with the bits set
- * @end_ret - the final contiguous range of the bits that were set
- * @bits - bits to look for
+ * @tree: io tree to check
+ * @start: offset to start the search from
+ * @start_ret: the first offset we found with the bits set
+ * @end_ret: the final contiguous range of the bits that were set
+ * @bits: bits to look for
  *
  * set_extent_bit and clear_extent_bit can temporarily split contiguous ranges
  * to set bits appropriately, and then merge them again.  During this time it
@@ -1628,11 +1628,11 @@ int find_contiguous_extent_bit(struct extent_io_tree 
*tree, u64 start,
  * find_first_clear_extent_bit - find the first range that has @bits not set.
  * This range could start before @start.
  *
- * @tree - the tree to search
- * @start - the offset at/after which the found extent should start
- * @start_ret - records the beginning of the range
- * @end_ret - records the end of the range (inclusive)
- * @bits - the set of bits which must be unset
+ * @tree: the tree to search
+ * @start: the offset at/after which the found extent should start
+ *

[PATCH 09/13] btrfs: Document btrfs_check_shared parameters

2021-01-19 Thread Nikolay Borisov
Signed-off-by: Nikolay Borisov 
---
 fs/btrfs/backref.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index ef71aba5bc15..eca255432a59 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -1503,6 +1503,12 @@ int btrfs_find_all_roots(struct btrfs_trans_handle 
*trans,
 /**
  * btrfs_check_shared - tell us whether an extent is shared
  *
+ * @root: root inode belongs to
+ * @inum: inode number of the inode whose extent we are checking
+ * @bytenr: logical bytenr of the extent we are checking
+ * @roots: list of roots this extent is shared among
+ * @tmp: temporary list used for iteration
+ *
  * btrfs_check_shared uses the backref walking code but will short
  * circuit as soon as it finds a root or inode that doesn't match the
  * one passed in. This provides a significant performance benefit for
-- 
2.25.1



[PATCH 08/13] btrfs: Fix description format of fs_info parameter of btrfs_wait_on_delayed_iputs

2021-01-19 Thread Nikolay Borisov
Fixes fs/btrfs/inode.c:3101: warning: Function parameter or member 'fs_info' 
not described in 'btrfs_wait_on_delayed_iputs'

Signed-off-by: Nikolay Borisov 
---
 fs/btrfs/inode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 5906b4267204..b6711c207808 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3089,7 +3089,7 @@ void btrfs_run_delayed_iputs(struct btrfs_fs_info 
*fs_info)
 
 /**
  * btrfs_wait_on_delayed_iputs - wait on the delayed iputs to be done running
- * @fs_info - the fs_info for this fs
+ * @fs_info:  the fs_info for this fs
  * @return - EINTR if we were killed, 0 if nothing's pending
  *
  * This will wait on any delayed iputs that are currently running with KILLABLE
-- 
2.25.1



[PATCH 10/13] btrfs: Fix parameter description of btrfs_inode_rsv_release/btrfs_delalloc_release_space

2021-01-19 Thread Nikolay Borisov
Fixes following warnings:

fs/btrfs/delalloc-space.c:205: warning: Function parameter or member 'inode' 
not described in 'btrfs_inode_rsv_release'
fs/btrfs/delalloc-space.c:205: warning: Function parameter or member 
'qgroup_free' not described in 'btrfs_inode_rsv_release'
fs/btrfs/delalloc-space.c:472: warning: Function parameter or member 'reserved' 
not described in 'btrfs_delalloc_release_space'
fs/btrfs/delalloc-space.c:472: warning: Function parameter or member 
'qgroup_free' not described in 'btrfs_delalloc_release_space'
fs/btrfs/delalloc-space.c:472: warning: Excess function parameter 
'release_bytes' description in 'btrfs_delalloc_release_space'

Signed-off-by: Nikolay Borisov 
---
 fs/btrfs/delalloc-space.c | 7 ---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c
index bacee09b7bfd..c73e5131a2de 100644
--- a/fs/btrfs/delalloc-space.c
+++ b/fs/btrfs/delalloc-space.c
@@ -192,8 +192,8 @@ void btrfs_free_reserved_data_space(struct btrfs_inode 
*inode,
 
 /**
  * btrfs_inode_rsv_release - release any excessive reservation.
- * @inode - the inode we need to release from.
- * @qgroup_free - free or convert qgroup meta.
+ * @inode: the inode we need to release from.
+ * @qgroup_free: free or convert qgroup meta.
  *   Unlike normal operation, qgroup meta reservation needs to know if we are
  *   freeing qgroup reservation or just converting it into per-trans.  Normally
  *   @qgroup_free is true for error handling, and false for normal release.
@@ -457,9 +457,10 @@ int btrfs_delalloc_reserve_space(struct btrfs_inode *inode,
 /**
  * btrfs_delalloc_release_space - release data and metadata space for delalloc
  * @inode: inode we're releasing space for
+ * @reserved: list of changed/reserved ranges
  * @start: start position of the space already reserved
  * @len: the len of the space already reserved
- * @release_bytes: the len of the space we consumed or didn't use
+ * @qgroup_free: should qgroup reserved-space also be freed
  *
  * This function will release the metadata space that was not used and will
  * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes
-- 
2.25.1



[PATCH 11/13] btrfs: Fix parameter description in space-info.c

2021-01-19 Thread Nikolay Borisov
With these fixes space-info.c is clearn for W=1 warnings, namely the
following ones are fixed:

fs/btrfs/space-info.c:575: warning: Function parameter or member 'fs_info' not 
described in 'may_commit_transaction'
fs/btrfs/space-info.c:575: warning: Function parameter or member 'space_info' 
not described in 'may_commit_transaction'
fs/btrfs/space-info.c:1231: warning: Function parameter or member 'fs_info' not 
described in 'handle_reserve_ticket'
fs/btrfs/space-info.c:1231: warning: Function parameter or member 'space_info' 
not described in 'handle_reserve_ticket'
fs/btrfs/space-info.c:1231: warning: Function parameter or member 'ticket' not 
described in 'handle_reserve_ticket'
fs/btrfs/space-info.c:1231: warning: Function parameter or member 'flush' not 
described in 'handle_reserve_ticket'
fs/btrfs/space-info.c:1315: warning: Function parameter or member 'fs_info' not 
described in '__reserve_bytes'
fs/btrfs/space-info.c:1315: warning: Function parameter or member 'space_info' 
not described in '__reserve_bytes'
fs/btrfs/space-info.c:1315: warning: Function parameter or member 'orig_bytes' 
not described in '__reserve_bytes'
fs/btrfs/space-info.c:1315: warning: Function parameter or member 'flush' not 
described in '__reserve_bytes'
fs/btrfs/space-info.c:1427: warning: Function parameter or member 'root' not 
described in 'btrfs_reserve_metadata_bytes'
fs/btrfs/space-info.c:1427: warning: Function parameter or member 'block_rsv' 
not described in 'btrfs_reserve_metadata_bytes'
fs/btrfs/space-info.c:1427: warning: Function parameter or member 'orig_bytes' 
not described in 'btrfs_reserve_metadata_bytes'
fs/btrfs/space-info.c:1427: warning: Function parameter or member 'flush' not 
described in 'btrfs_reserve_metadata_bytes'
fs/btrfs/space-info.c:1462: warning: Function parameter or member 'fs_info' not 
described in 'btrfs_reserve_data_bytes'
fs/btrfs/space-info.c:1462: warning: Function parameter or member 'bytes' not 
described in 'btrfs_reserve_data_bytes'
fs/btrfs/space-info.c:1462: warning: Function parameter or member 'flush' not 
described in 'btrfs_reserve_data_bytes'

Signed-off-by: Nikolay Borisov 
---
 fs/btrfs/space-info.c | 35 +--
 1 file changed, 17 insertions(+), 18 deletions(-)

diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index 84fb94e78a8f..55588da66dff 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -562,9 +562,8 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info,
 
 /**
  * maybe_commit_transaction - possibly commit the transaction if its ok to
- * @root - the root we're allocating for
- * @bytes - the number of bytes we want to reserve
- * @force - force the commit
+ * @fs_info: fs context
+ * @space_info: space_info we are checking for commit, either data or metadata
  *
  * This will check to make sure that committing the transaction will actually
  * get us somewhere and then commit the transaction if it does.  Otherwise it
@@ -1216,10 +1215,10 @@ static void wait_reserve_ticket(struct btrfs_fs_info 
*fs_info,
 
 /**
  * handle_reserve_ticket - do the appropriate flushing and waiting for a ticket
- * @fs_info - the fs
- * @space_info - the space_info for the reservation
- * @ticket - the ticket for the reservation
- * @flush - how much we can flush
+ * @fs_info: the fs
+ * @space_info: the space_info for the reservation
+ * @ticket: the ticket for the reservation
+ * @flush: how much we can flush
  *
  * This does the work of figuring out how to flush for the ticket, waiting for
  * the reservation, and returning the appropriate error if there is one.
@@ -1297,10 +1296,10 @@ static inline bool is_normal_flushing(enum 
btrfs_reserve_flush_enum flush)
 
 /**
  * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
- * @root - the root we're allocating for
- * @space_info - the space info we want to allocate from
- * @orig_bytes - the number of bytes we want
- * @flush - whether or not we can flush to make our reservation
+ * @fs_info: fs context
+ * @space_info: the space info we want to allocate from
+ * @orig_bytes: the number of bytes we want
+ * @flush: whether or not we can flush to make our reservation
  *
  * This will reserve orig_bytes number of bytes from the space info associated
  * with the block_rsv.  If there is not enough space it will make an attempt to
@@ -1408,10 +1407,10 @@ static int __reserve_bytes(struct btrfs_fs_info 
*fs_info,
 
 /**
  * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
- * @root - the root we're allocating for
- * @block_rsv - the block_rsv we're allocating for
- * @orig_bytes - the number of bytes we want
- * @flush - whether or not we can flush to make our reservation
+ * @root: the root we're allocating for
+ * @block_rsv: the block_rsv we're allocating for
+ * @orig_bytes: the number of bytes we want
+ * @flush: whether or not we can flush to make our reservation
  *
  * This will reserve orig_bytes number of bytes fro

[PATCH 07/13] btrfs: Document fs_info in btrfs_rmap_block

2021-01-19 Thread Nikolay Borisov
Fixes fs/btrfs/block-group.c:1570: warning: Function parameter or member 
'fs_info' not described in 'btrfs_rmap_block'

Signed-off-by: Nikolay Borisov 
---
 fs/btrfs/block-group.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 0886e81e5540..1b71e7be04a9 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -1554,6 +1554,7 @@ static void set_avail_alloc_bits(struct btrfs_fs_info 
*fs_info, u64 flags)
 
 /**
  * btrfs_rmap_block - Map a physical disk address to a list of logical 
addresses
+ * @fs_info:   fs context
  * @chunk_start:   logical address of block group
  * @physical: physical address to map to logical addresses
  * @logical:  return array of logical addresses which map to @physical
-- 
2.25.1



[PATCH 05/13] btrfs: Improve parameter description for __btrfs_write_out_cache

2021-01-19 Thread Nikolay Borisov
Fixes following W=1 warnings:
fs/btrfs/free-space-cache.c:1317: warning: Function parameter or member 'root' 
not described in '__btrfs_write_out_cache'
fs/btrfs/free-space-cache.c:1317: warning: Function parameter or member 'inode' 
not described in '__btrfs_write_out_cache'
fs/btrfs/free-space-cache.c:1317: warning: Function parameter or member 'ctl' 
not described in '__btrfs_write_out_cache'
fs/btrfs/free-space-cache.c:1317: warning: Function parameter or member 
'block_group' not described in '__btrfs_write_out_cache'
fs/btrfs/free-space-cache.c:1317: warning: Function parameter or member 
'io_ctl' not described in '__btrfs_write_out_cache'
fs/btrfs/free-space-cache.c:1317: warning: Function parameter or member 'trans' 
not described in '__btrfs_write_out_cache'

Signed-off-by: Nikolay Borisov 
---
 fs/btrfs/free-space-cache.c | 10 ++
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index fd6ddd6b8165..3ad109114e0c 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -1300,10 +1300,12 @@ int btrfs_wait_cache_io(struct btrfs_trans_handle 
*trans,
 
 /**
  * __btrfs_write_out_cache - write out cached info to an inode
- * @root - the root the inode belongs to
- * @ctl - the free space cache we are going to write out
- * @block_group - the block_group for this cache if it belongs to a block_group
- * @trans - the trans handle
+ * @root:  the root the inode belongs to
+ * @inode: freespace inode we are writing out
+ * @ctl:  the free space cache we are going to write out
+ * @block_group:  the block_group for this cache if it belongs to a block_group
+ * @io_ctl: holds context for the io
+ * @trans:  the trans handle
  *
  * This function writes out a free space cache struct to disk for quick 
recovery
  * on mount.  This will return 0 if it was successful in writing the cache out,
-- 
2.25.1



Re: [PATCH v4 03/18] btrfs: introduce the skeleton of btrfs_subpage structure

2021-01-19 Thread David Sterba
On Tue, Jan 19, 2021 at 06:54:28AM +0800, Qu Wenruo wrote:
> On 2021/1/19 上午6:46, David Sterba wrote:
> > On Sat, Jan 16, 2021 at 03:15:18PM +0800, Qu Wenruo wrote:
> >> +  return;
> >> +
> >> +  subpage = (struct btrfs_subpage *)detach_page_private(page);
> >> +  ASSERT(subpage);
> >> +  kfree(subpage);
> >> +}
> >> diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h
> >> new file mode 100644
> >> index ..96f3b226913e
> >> --- /dev/null
> >> +++ b/fs/btrfs/subpage.h
> >> @@ -0,0 +1,31 @@
> >> +/* SPDX-License-Identifier: GPL-2.0 */
> >> +
> >> +#ifndef BTRFS_SUBPAGE_H
> >> +#define BTRFS_SUBPAGE_H
> >> +
> >> +#include 
> >> +#include "ctree.h"
> >
> > So subpage.h would pull the whole ctree.h, that's not very nice. If
> > anything, the .c could include ctree.h because there are lots of the
> > common structure and function definitions, but not the .h. This creates
> > unnecessary include dependencies.
> >
> > Any pointer type you'd need in structures could be forward declared.
> 
> Unfortunately, the main needed pointer is fs_info, and we're accessing
> it pretty frequently (mostly for sector/node size).
> 
> I don't believe forward declaration would help in this case.

I've looked at the final subpage.h and you add way too many static
inlines that don't seem to be necessary for the reasons the static
inlines are supposed to be used.


Re: [PATCH v4 03/18] btrfs: introduce the skeleton of btrfs_subpage structure

2021-01-19 Thread David Sterba
On Tue, Jan 19, 2021 at 04:51:45PM +0100, David Sterba wrote:
> On Tue, Jan 19, 2021 at 06:54:28AM +0800, Qu Wenruo wrote:
> > On 2021/1/19 上午6:46, David Sterba wrote:
> > > On Sat, Jan 16, 2021 at 03:15:18PM +0800, Qu Wenruo wrote:
> > >> +return;
> > >> +
> > >> +subpage = (struct btrfs_subpage *)detach_page_private(page);
> > >> +ASSERT(subpage);
> > >> +kfree(subpage);
> > >> +}
> > >> diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h
> > >> new file mode 100644
> > >> index ..96f3b226913e
> > >> --- /dev/null
> > >> +++ b/fs/btrfs/subpage.h
> > >> @@ -0,0 +1,31 @@
> > >> +/* SPDX-License-Identifier: GPL-2.0 */
> > >> +
> > >> +#ifndef BTRFS_SUBPAGE_H
> > >> +#define BTRFS_SUBPAGE_H
> > >> +
> > >> +#include 
> > >> +#include "ctree.h"
> > >
> > > So subpage.h would pull the whole ctree.h, that's not very nice. If
> > > anything, the .c could include ctree.h because there are lots of the
> > > common structure and function definitions, but not the .h. This creates
> > > unnecessary include dependencies.
> > >
> > > Any pointer type you'd need in structures could be forward declared.
> > 
> > Unfortunately, the main needed pointer is fs_info, and we're accessing
> > it pretty frequently (mostly for sector/node size).
> > 
> > I don't believe forward declaration would help in this case.
> 
> I've looked at the final subpage.h and you add way too many static
> inlines that don't seem to be necessary for the reasons the static
> inlines are supposed to be used.

The only file that includes subpage.h is extent_io.c, so as long as it
stays like that it's manageable. But untangling the include hell still
needs to hapen some day and new code that makes it harder worries me.


[PATCH 01/13] btrfs: Document modified paramater of add_extent_mapping

2021-01-19 Thread Nikolay Borisov
Fixes fs/btrfs/extent_map.c:399: warning: Function parameter or member
'modified' not described in 'add_extent_mapping'

Signed-off-by: Nikolay Borisov 
---
 fs/btrfs/extent_map.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index bd6229fb2b6f..aa1ff6f2ade9 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -388,6 +388,8 @@ static void extent_map_device_clear_bits(struct extent_map 
*em, unsigned bits)
  * add_extent_mapping - add new extent map to the extent tree
  * @tree:  tree to insert new map in
  * @em:map to insert
+ * @modified:  bool indicating whether the given @em should be added to the
+ * modified list, which indicates the extent needs to be logged
  *
  * Insert @em into @tree or perform a simple forward/backward merge with
  * existing mappings.  The extent_map struct passed in will be inserted
-- 
2.25.1



[PATCH 04/13] btrfs: Fix paramter description in delayed-ref.c functions

2021-01-19 Thread Nikolay Borisov
This fixes the following warnings:

fs/btrfs/delayed-ref.c:80: warning: Function parameter or member 'fs_info' not 
described in 'btrfs_delayed_refs_rsv_release'
fs/btrfs/delayed-ref.c:80: warning: Function parameter or member 'nr' not 
described in 'btrfs_delayed_refs_rsv_release'
fs/btrfs/delayed-ref.c:128: warning: Function parameter or member 'fs_info' not 
described in 'btrfs_migrate_to_delayed_refs_rsv'
fs/btrfs/delayed-ref.c:128: warning: Function parameter or member 'src' not 
described in 'btrfs_migrate_to_delayed_refs_rsv'
fs/btrfs/delayed-ref.c:128: warning: Function parameter or member 'num_bytes' 
not described in 'btrfs_migrate_to_delayed_refs_rsv'
fs/btrfs/delayed-ref.c:174: warning: Function parameter or member 'fs_info' not 
described in 'btrfs_delayed_refs_rsv_refill'
fs/btrfs/delayed-ref.c:174: warning: Function parameter or member 'flush' not 
described in 'btrfs_delayed_refs_rsv_refill'

Signed-off-by: Nikolay Borisov 
---
 fs/btrfs/delayed-ref.c | 16 +---
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 353cc2994d10..ab8c11f05dca 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -70,8 +70,9 @@ int btrfs_should_throttle_delayed_refs(struct 
btrfs_trans_handle *trans)
 
 /**
  * btrfs_delayed_refs_rsv_release - release a ref head's reservation.
- * @fs_info - the fs_info for our fs.
- * @nr - the number of items to drop.
+ *
+ * @fs_info:  the fs_info for our fs.
+ * @nr:  the number of items to drop.
  *
  * This drops the delayed ref head's count from the delayed refs rsv and frees
  * any excess reservation we had.
@@ -115,9 +116,10 @@ void btrfs_update_delayed_refs_rsv(struct 
btrfs_trans_handle *trans)
 
 /**
  * btrfs_migrate_to_delayed_refs_rsv - transfer bytes to our delayed refs rsv.
- * @fs_info - the fs info for our fs.
- * @src - the source block rsv to transfer from.
- * @num_bytes - the number of bytes to transfer.
+ *
+ * @fs_info: the fs info for our fs.
+ * @src:  the source block rsv to transfer from.
+ * @num_bytes: the number of bytes to transfer.
  *
  * This transfers up to the num_bytes amount from the src rsv to the
  * delayed_refs_rsv.  Any extra bytes are returned to the space info.
@@ -163,8 +165,8 @@ void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info 
*fs_info,
 
 /**
  * btrfs_delayed_refs_rsv_refill - refill based on our delayed refs usage.
- * @fs_info - the fs_info for our fs.
- * @flush - control how we can flush for this reservation.
+ * @fs_info: the fs_info for our fs.
+ * @flush:  control how we can flush for this reservation.
  *
  * This will refill the delayed block_rsv up to 1 items size worth of space and
  * will return -ENOSPC if we can't make the reservation.
-- 
2.25.1



Re: [RFC PATCH 00/37] block: introduce bio_init_fields()

2021-01-19 Thread Josef Bacik

On 1/19/21 12:05 AM, Chaitanya Kulkarni wrote:

Hi,

This is a *compile only RFC* which adds a generic helper to initialize
the various fields of the bio that is repeated all the places in
file-systems, block layer, and drivers.

The new helper allows callers to initialize various members such as
bdev, sector, private, end io callback, io priority, and write hints.

The objective of this RFC is to only start a discussion, this it not
completely tested at all.


It would help to know what you're trying to accomplish here.  I'd echo Mike's 
comments about how it makes it annoying to update things in the future.  In 
addition, there's so many fields that I'm not going to remember what each one is 
without having to look it up, which makes it annoying to use and to review.  If 
it's simply to make sure fields are initialized then you could add debug sanity 
checks to submit_bio().  If it's to clean up duplication, well I'd argue that 
the duplication is much clearer than positional arguments in a giant function 
call.  If you are wanting to change a particular part of the bio to be 
initialized properly, like Dennis's work to make sure the bi_blkg was 
initialized at bi_bdev set time, then a more targeted patch series with a 
specific intent will be more useful and more successful.  Thanks,


Josef


Re: [PATCH v3 1/4] btrfs: add read_policy latency

2021-01-19 Thread Josef Bacik

On 1/11/21 4:41 AM, Anand Jain wrote:

The read policy type latency routes the read IO based on the historical
average wait-time experienced by the read IOs through the individual
device. This patch obtains the historical read IO stats from the kernel
block layer and calculates its average.

Example usage:
  echo "latency" > /sys/fs/btrfs/$uuid/read_policy

Signed-off-by: Anand Jain 
---
v3: The block layer commit 0d02129e76ed (block: merge struct block_device and
 struct hd_struct) has changed the first argument in the function
 part_stat_read_all() in 5.11-rc1. So the compilation will fail. This patch
 fixes it.
 Commit log updated.

v2: Use btrfs_debug_rl() instead of btrfs_info_rl()
 It is better we have this debug until we test this on at least few
 hardwares.
 Drop the unrelated changes.
 Update change log.

v1: Drop part_stat_read_all instead use part_stat_read
 Drop inflight


  fs/btrfs/sysfs.c   |  3 ++-
  fs/btrfs/volumes.c | 38 ++
  fs/btrfs/volumes.h |  2 ++
  3 files changed, 42 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 19b9fffa2c9c..96ca7bef6357 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -915,7 +915,8 @@ static bool strmatch(const char *buffer, const char *string)
return false;
  }
  
-static const char * const btrfs_read_policy_name[] = { "pid" };

+/* Must follow the order as in enum btrfs_read_policy */
+static const char * const btrfs_read_policy_name[] = { "pid", "latency" };
  
  static ssize_t btrfs_read_policy_show(struct kobject *kobj,

  struct kobj_attribute *a, char *buf)
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index f4037a6bd926..f7a0a83d2cd4 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -14,6 +14,7 @@
  #include 
  #include 
  #include 
+#include 
  #include "misc.h"
  #include "ctree.h"
  #include "extent_map.h"
@@ -5490,6 +5491,39 @@ int btrfs_is_parity_mirror(struct btrfs_fs_info 
*fs_info, u64 logical, u64 len)
return ret;
  }
  
+static int btrfs_find_best_stripe(struct btrfs_fs_info *fs_info,

+ struct map_lookup *map, int first,
+ int num_stripe)
+{
+   u64 est_wait = 0;
+   int best_stripe = 0;
+   int index;
+
+   for (index = first; index < first + num_stripe; index++) {
+   u64 read_wait;
+   u64 avg_wait = 0;
+   unsigned long read_ios;
+   struct btrfs_device *device = map->stripes[index].dev;
+
+   read_wait = part_stat_read(device->bdev, nsecs[READ]);
+   read_ios = part_stat_read(device->bdev, ios[READ]);
+
+   if (read_wait && read_ios && read_wait >= read_ios)
+   avg_wait = div_u64(read_wait, read_ios);
+   else
+   btrfs_debug_rl(device->fs_devices->fs_info,


You can just use fs_info here, you already have it.  I'm not in love with doing 
this check every time, I'd rather cache the results somewhere.  However if we're 
read-only I can't think of a mechanism we could piggy back on, RW we could just 
do it every transaction commit.  Fix the fs_info thing and you can add


Reviewed-by: Josef Bacik 

Thanks,

Josef


Re: [PATCH RFC 4/4] btrfs: introduce new read_policy round-robin

2021-01-19 Thread Josef Bacik

On 1/11/21 4:41 AM, Anand Jain wrote:

Add round-robin read policy to route the read IO to the next device in the
round-robin order. The chunk allocation and thus the stripe-index follows
the order of free space available on devices. So to make the round-robin
effective it shall follow the devid order instead of the stripe-index
order.

Signed-off-by: Anand Jain 
--
RFC because: Provides terrible performance with the fio tests.
I am not yet sure if there is any io workload or a block layer
tuning that shall make this policy better. As of now just an
experimental patch.



Just drop this one, if we can't find a reason to use it then don't bother adding 
the code.  The other options have real world valuable uses, so stick with those. 
 Thanks,


Josef



Re: [PATCH v3 3/4] btrfs: introduce new read_policy device

2021-01-19 Thread Josef Bacik

On 1/11/21 4:41 AM, Anand Jain wrote:

Read-policy type 'device' and device flag 'read-preferred':

The read-policy type device picks the device(s) flagged as
read-preferred for reading stripes of type raid1, raid10,
raid1c3 and raid1c4.

A system might contain SSD, nvme, iscsi, or san lun, and which are all
a non-rotational device, so it is not a good idea to set the read-preferred
automatically. Instead, device read-policy along with the read-preferred
flag provides an ability to do it manually. This advanced tuning is useful
in more than one situation, for example,
  - In heterogeneous-disk volume, it provides an ability to manually choose
 the low latency disks for reading.
  - Useful for more accurate testing.
  - Avoid known problematic device from reading the chunk until it is
replaced (by marking the other good devices as read-preferred).

Note:

If the read-policy type is set to 'device', but there isn't any device
which is flagged as read-preferred, then stripe 0 is used for reading.

The device replacement won't migrate the read-preferred flag to the new
replace the target device.

As of now, this is an in-memory only feature.

It's pointless to set the read-preferred flag on the missing device, as
IOs aren't submitted to the missing device.

If there is more than one read-preferred device in a chunk, the read IO
shall go to the stripe 0 as of now.

Usage example:

Consider a typical two disks raid1.

Configure devid1 for reading.

$ echo 1 > devinfo/1/read_preferred
$ cat devinfo/1/read_preferred
1
$ cat devinfo/2/read_preferred
0

$ pwd
/sys/fs/btrfs/12345678-1234-1234-1234-123456789abc

$ cat read_policy
[pid] device
$ echo device > ./read_policy
$ cat read_policy
pid [device]

Now read IOs are sent to devid 1 (sdb).

$ echo 3 > /proc/sys/vm/drop_caches
$ md5sum /btrfs/YkZI

$ iostat -zy 1 | egrep 'sdb|sdc' (from another terminal)
sdb  50.00 40048.00 0.00  40048  0

Change the read-preferred device from devid 1 to devid 2 (sdc).

$ echo 0 > ./devinfo/1/read_preferred

[ 3343.918658] BTRFS info (device sdb): reset read preferred on devid 1 (1334)

$ echo 1 > ./devinfo/2/read_preferred

[ 3343.919876] BTRFS info (device sdb): set read preferred on devid 2 (1334)

$ echo 3 > /proc/sys/vm/drop_caches
$ md5sum /btrfs/YkZI

Further read ios are sent to devid 2 (sdc).

$ iostat -zy 1 | egrep 'sdb|sdc' (from another terminal)
sdc  49.00 40048.00 0.00  40048  0

Signed-off-by: Anand Jain 


Reviewed-by: Josef Bacik 

Thanks,

Josef


Re: [PATCH v4 08/18] btrfs: introduce helper for subpage uptodate status

2021-01-19 Thread David Sterba
On Sat, Jan 16, 2021 at 03:15:23PM +0800, Qu Wenruo wrote:
> This patch introduce the following functions to handle btrfs subpage
> uptodate status:
> - btrfs_subpage_set_uptodate()
> - btrfs_subpage_clear_uptodate()
> - btrfs_subpage_test_uptodate()
>   Those helpers can only be called when the range is ensured to be
>   inside the page.
> 
> - btrfs_page_set_uptodate()
> - btrfs_page_clear_uptodate()
> - btrfs_page_test_uptodate()
>   Those helpers can handle both regular sector size and subpage without
>   problem.
>   Although caller should still ensure that the range is inside the page.
> 
> Signed-off-by: Qu Wenruo 
> ---
>  fs/btrfs/subpage.h | 115 +
>  1 file changed, 115 insertions(+)
> 
> diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h
> index d8b34879368d..3373ef4ffec1 100644
> --- a/fs/btrfs/subpage.h
> +++ b/fs/btrfs/subpage.h
> @@ -23,6 +23,7 @@
>  struct btrfs_subpage {
>   /* Common members for both data and metadata pages */
>   spinlock_t lock;
> + u16 uptodate_bitmap;
>   union {
>   /* Structures only used by metadata */
>   bool under_alloc;
> @@ -78,4 +79,118 @@ static inline void btrfs_page_end_meta_alloc(struct 
> btrfs_fs_info *fs_info,
>  int btrfs_attach_subpage(struct btrfs_fs_info *fs_info, struct page *page);
>  void btrfs_detach_subpage(struct btrfs_fs_info *fs_info, struct page *page);
>  
> +/*
> + * Convert the [start, start + len) range into a u16 bitmap
> + *
> + * E.g. if start == page_offset() + 16K, len = 16K, we get 0x00f0.
> + */
> +static inline u16 btrfs_subpage_calc_bitmap(struct btrfs_fs_info *fs_info,

All the API functions should use const for data that are only read,
fs_info in this case at least.

> + struct page *page, u64 start, u32 len)
> +{
> + int bit_start = offset_in_page(start) >> fs_info->sectorsize_bits;
> + int nbits = len >> fs_info->sectorsize_bits;
> +
> + /* Basic checks */
> + ASSERT(PagePrivate(page) && page->private);
> + ASSERT(IS_ALIGNED(start, fs_info->sectorsize) &&
> +IS_ALIGNED(len, fs_info->sectorsize));
> +
> + /*
> +  * The range check only works for mapped page, we can
> +  * still have unampped page like dummy extent buffer pages.
> +  */
> + if (page->mapping)
> + ASSERT(page_offset(page) <= start &&
> + start + len <= page_offset(page) + PAGE_SIZE);
> + /*
> +  * Here nbits can be 16, thus can go beyond u16 range. Here we make the
> +  * first left shift to be calculated in unsigned long (u32), then
> +  * truncate the result to u16.
> +  */
> + return (u16)(((1UL << nbits) - 1) << bit_start);
> +}
> +
> +static inline void btrfs_subpage_set_uptodate(struct btrfs_fs_info *fs_info,
> + struct page *page, u64 start, u32 len)
> +{
> + struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
> + u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
> + unsigned long flags;
> +
> + spin_lock_irqsave(&subpage->lock, flags);
> + subpage->uptodate_bitmap |= tmp;
> + if (subpage->uptodate_bitmap == U16_MAX)
> + SetPageUptodate(page);
> + spin_unlock_irqrestore(&subpage->lock, flags);
> +}
> +
> +static inline void btrfs_subpage_clear_uptodate(struct btrfs_fs_info 
> *fs_info,
> + struct page *page, u64 start, u32 len)
> +{
> + struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
> + u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
> + unsigned long flags;
> +
> + spin_lock_irqsave(&subpage->lock, flags);
> + subpage->uptodate_bitmap &= ~tmp;
> + ClearPageUptodate(page);
> + spin_unlock_irqrestore(&subpage->lock, flags);
> +}
> +
> +/*
> + * Unlike set/clear which is dependent on each page status, for test all bits
> + * are tested in the same way.
> + */
> +#define DECLARE_BTRFS_SUBPAGE_TEST_OP(name)  \
> +static inline bool btrfs_subpage_test_##name(struct btrfs_fs_info *fs_info, \
> + struct page *page, u64 start, u32 len)  \
> +{\
> + struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; \
> + u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len); \
> + unsigned long flags;\
> + bool ret;   \
> + \
> + spin_lock_irqsave(&subpage->lock, flags);   \
> + ret = ((subpage->name##_bitmap & tmp) == tmp);  \
> + spin_unlock_irqrestore(&subpage->lock, flags);  \
> + return ret; \
> +}
> +DECLARE_BTRFS_SUBPAGE_TEST_OP(uptodate);
> 

Re: [PATCH v3 2/4] btrfs: introduce new device-state read_preferred

2021-01-19 Thread Josef Bacik

On 1/11/21 4:41 AM, Anand Jain wrote:

This is a preparatory patch and introduces a new device flag
'read_preferred', RW-able using sysfs interface.

Signed-off-by: Anand Jain 


Reviewed-by: Josef Bacik 

Thanks,

Josef


Re: [PATCH v4 16/18] btrfs: introduce btrfs_subpage for data inodes

2021-01-19 Thread David Sterba
On Sat, Jan 16, 2021 at 03:15:31PM +0800, Qu Wenruo wrote:
> -void set_page_extent_mapped(struct page *page)
> +int __must_check set_page_extent_mapped(struct page *page)

We're not using the __must_check, errors from such functions need to be
handled by default so I've dropped the attribute.


Re: [PATCH 01/13] btrfs: Document modified paramater of add_extent_mapping

2021-01-19 Thread Josef Bacik

On 1/19/21 7:26 AM, Nikolay Borisov wrote:

Fixes fs/btrfs/extent_map.c:399: warning: Function parameter or member
'modified' not described in 'add_extent_mapping'

Signed-off-by: Nikolay Borisov 


Subject should be 'parameter'.  Thanks,

Josef


Re: [PATCH 04/13] btrfs: Fix paramter description in delayed-ref.c functions

2021-01-19 Thread Josef Bacik

On 1/19/21 7:26 AM, Nikolay Borisov wrote:

This fixes the following warnings:



Subject should read 'parameter', thanks,

Josef


Re: [PATCH 01/13] btrfs: Document modified paramater of add_extent_mapping

2021-01-19 Thread Josef Bacik

On 1/19/21 7:26 AM, Nikolay Borisov wrote:

Fixes fs/btrfs/extent_map.c:399: warning: Function parameter or member
'modified' not described in 'add_extent_mapping'

Signed-off-by: Nikolay Borisov 
---
  fs/btrfs/extent_map.c | 2 ++
  1 file changed, 2 insertions(+)

diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index bd6229fb2b6f..aa1ff6f2ade9 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -388,6 +388,8 @@ static void extent_map_device_clear_bits(struct extent_map 
*em, unsigned bits)
   * add_extent_mapping - add new extent map to the extent tree
   * @tree: tree to insert new map in
   * @em:   map to insert
+ * @modified:  bool indicating whether the given @em should be added to the
+ * modified list, which indicates the extent needs to be logged
   *
   * Insert @em into @tree or perform a simple forward/backward merge with
   * existing mappings.  The extent_map struct passed in will be inserted



You can add

Reviewed-by: Josef Bacik 

to the series once you fix up those two spelling mistakes.  Thanks,

Josef


Re: [PATCH v4 02/18] btrfs: merge PAGE_CLEAR_DIRTY and PAGE_SET_WRITEBACK into PAGE_START_WRITEBACK

2021-01-19 Thread Josef Bacik

On 1/16/21 2:15 AM, Qu Wenruo wrote:

PAGE_CLEAR_DIRTY and PAGE_SET_WRITEBACK are two macros used in
__process_pages_contig(), to inform the function to clear page dirty and
then set page writeback.

However page write back and dirty are two conflict status (at least for
sector size == PAGE_SIZE case), this means those two macros are always
called together.

This means we can merge PAGE_CLEAR_DIRTY and PAGE_SET_WRITEBACK, into
one macro, PAGE_START_WRITEBACK.

Signed-off-by: Qu Wenruo 
---
  fs/btrfs/extent_io.c |  4 ++--
  fs/btrfs/extent_io.h | 12 ++--
  fs/btrfs/inode.c | 28 ++--
  3 files changed, 18 insertions(+), 26 deletions(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 3442f1746683..a816ba4a8537 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1970,10 +1970,10 @@ static int __process_pages_contig(struct address_space 
*mapping,
if (page_ops & PAGE_SET_PRIVATE2)
SetPagePrivate2(pages[i]);
  
-			if (page_ops & PAGE_CLEAR_DIRTY)

+   if (page_ops & PAGE_START_WRITEBACK) {
clear_page_dirty_for_io(pages[i]);
-   if (page_ops & PAGE_SET_WRITEBACK)
set_page_writeback(pages[i]);
+   }
if (page_ops & PAGE_SET_ERROR)
SetPageError(pages[i]);
if (page_ops & PAGE_END_WRITEBACK)
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 19221095c635..bedf761a0300 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -35,12 +35,12 @@ enum {
  
  /* these are flags for __process_pages_contig */

  #define PAGE_UNLOCK   (1 << 0)
-#define PAGE_CLEAR_DIRTY   (1 << 1)
-#define PAGE_SET_WRITEBACK (1 << 2)
-#define PAGE_END_WRITEBACK (1 << 3)
-#define PAGE_SET_PRIVATE2  (1 << 4)
-#define PAGE_SET_ERROR (1 << 5)
-#define PAGE_LOCK  (1 << 6)
+/* This one will clera page dirty and then set paeg writeback */

^ 
clear page

Sorry for some reason I missed this, then you can add my reviewed by from my 
previous reply.  Thanks,


Josef


Re: [PATCH v4 02/18] btrfs: merge PAGE_CLEAR_DIRTY and PAGE_SET_WRITEBACK into PAGE_START_WRITEBACK

2021-01-19 Thread Josef Bacik

On 1/16/21 2:15 AM, Qu Wenruo wrote:

PAGE_CLEAR_DIRTY and PAGE_SET_WRITEBACK are two macros used in
__process_pages_contig(), to inform the function to clear page dirty and
then set page writeback.

However page write back and dirty are two conflict status (at least for
sector size == PAGE_SIZE case), this means those two macros are always
called together.

This means we can merge PAGE_CLEAR_DIRTY and PAGE_SET_WRITEBACK, into
one macro, PAGE_START_WRITEBACK.

Signed-off-by: Qu Wenruo 


Reviewed-by: Josef Bacik 

Thanks,

Josef


Re: [PATCH v4 01/18] btrfs: update locked page dirty/writeback/error bits in __process_pages_contig()

2021-01-19 Thread Josef Bacik

On 1/16/21 2:15 AM, Qu Wenruo wrote:

When __process_pages_contig() get called for
extent_clear_unlock_delalloc(), if we hit the locked page, only Private2
bit is updated, but dirty/writeback/error bits are all skipped.

There are several call sites call extent_clear_unlock_delalloc() with
@locked_page and PAGE_CLEAR_DIRTY/PAGE_SET_WRITEBACK/PAGE_END_WRITEBACK

- cow_file_range()
- run_delalloc_nocow()
- cow_file_range_async()
   All for their error handling branches.

For those call sites, since we skip the locked page for
dirty/error/writeback bit update, the locked page will still have its
dirty bit remaining.

Thankfully, since all those call sites can only be hit with various
serious errors, it's pretty hard to hit and shouldn't affect regular
btrfs operations.

But still, we shouldn't leave the locked_page with its
dirty/error/writeback bits untouched.

Fix this by only skipping lock/unlock page operations for locked_page.

Signed-off-by: Qu Wenruo 


Except this is handled by the callers.  We clear_page_dirty_for_io() the page 
before calling btrfs_run_delalloc_range(), so we don't need the 
PAGE_CLEAR_DIRTY, it's already cleared.  The SetPageError() is handled in the 
error path for locked_page, as is the set_writeback/end_writeback.  Now I don't 
think this patch causes problems specifically, but the changelog is at least 
wrong, and I'd rather we'd skip the handling of the locked_page here and leave 
it in the proper error handling.  If you need to do this for some other reason 
that I haven't gotten to yet then you need to make that clear in the changelog, 
because as of right now I don't see why this is needed.  Thanks,


Josef


Re: [PATCH v4 04/18] btrfs: make attach_extent_buffer_page() to handle subpage case

2021-01-19 Thread Josef Bacik

On 1/16/21 2:15 AM, Qu Wenruo wrote:

For subpage case, we need to allocate new memory for each metadata page.

So we need to:
- Allow attach_extent_buffer_page() to return int
   To indicate allocation failure

- Prealloc btrfs_subpage structure for alloc_extent_buffer()
   We don't want to call memory allocation with spinlock hold, so
   do preallocation before we acquire mapping->private_lock.

- Handle subpage and regular case differently in
   attach_extent_buffer_page()
   For regular case, just do the usual thing.
   For subpage case, allocate new memory or use the preallocated memory.

For future subpage metadata, we will make more usage of radix tree to
grab extnet buffer.

Signed-off-by: Qu Wenruo 
---
  fs/btrfs/extent_io.c | 75 ++--
  fs/btrfs/subpage.h   | 17 ++
  2 files changed, 82 insertions(+), 10 deletions(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index a816ba4a8537..320731487ac0 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -24,6 +24,7 @@
  #include "rcu-string.h"
  #include "backref.h"
  #include "disk-io.h"
+#include "subpage.h"
  
  static struct kmem_cache *extent_state_cache;

  static struct kmem_cache *extent_buffer_cache;
@@ -3140,9 +3141,13 @@ static int submit_extent_page(unsigned int opf,
return ret;
  }
  
-static void attach_extent_buffer_page(struct extent_buffer *eb,

- struct page *page)
+static int attach_extent_buffer_page(struct extent_buffer *eb,
+ struct page *page,
+ struct btrfs_subpage *prealloc)
  {
+   struct btrfs_fs_info *fs_info = eb->fs_info;
+   int ret;


int ret = 0;


+
/*
 * If the page is mapped to btree inode, we should hold the private
 * lock to prevent race.
@@ -3152,10 +3157,32 @@ static void attach_extent_buffer_page(struct 
extent_buffer *eb,
if (page->mapping)
lockdep_assert_held(&page->mapping->private_lock);
  
-	if (!PagePrivate(page))

-   attach_page_private(page, eb);
-   else
-   WARN_ON(page->private != (unsigned long)eb);
+   if (fs_info->sectorsize == PAGE_SIZE) {
+   if (!PagePrivate(page))
+   attach_page_private(page, eb);
+   else
+   WARN_ON(page->private != (unsigned long)eb);
+   return 0;
+   }
+
+   /* Already mapped, just free prealloc */
+   if (PagePrivate(page)) {
+   kfree(prealloc);
+   return 0;
+   }
+
+   if (prealloc) {
+   /* Has preallocated memory for subpage */
+   spin_lock_init(&prealloc->lock);
+   attach_page_private(page, prealloc);
+   } else {
+   /* Do new allocation to attach subpage */
+   ret = btrfs_attach_subpage(fs_info, page);
+   if (ret < 0)
+   return ret;


Delete the above 2 lines.


+   }
+
+   return 0;


return ret;


  }
  
  void set_page_extent_mapped(struct page *page)

@@ -5062,21 +5089,29 @@ struct extent_buffer *btrfs_clone_extent_buffer(const 
struct extent_buffer *src)
if (new == NULL)
return NULL;
  
+	set_bit(EXTENT_BUFFER_UPTODATE, &new->bflags);

+   set_bit(EXTENT_BUFFER_UNMAPPED, &new->bflags);
+


Why are you doing this here?  It seems unrelated?  Looking at the code it 
appears there's a reason for this later, but I had to go look to make sure I 
wasn't crazy, so at the very least it needs to be done in a more relevant patch.



for (i = 0; i < num_pages; i++) {
+   int ret;
+
p = alloc_page(GFP_NOFS);
if (!p) {
btrfs_release_extent_buffer(new);
return NULL;
}
-   attach_extent_buffer_page(new, p);
+   ret = attach_extent_buffer_page(new, p, NULL);
+   if (ret < 0) {
+   put_page(p);
+   btrfs_release_extent_buffer(new);
+   return NULL;
+   }
WARN_ON(PageDirty(p));
SetPageUptodate(p);
new->pages[i] = p;
copy_page(page_address(p), page_address(src->pages[i]));
}
  
-	set_bit(EXTENT_BUFFER_UPTODATE, &new->bflags);

-   set_bit(EXTENT_BUFFER_UNMAPPED, &new->bflags);
  
  	return new;

  }
@@ -5308,12 +5343,28 @@ struct extent_buffer *alloc_extent_buffer(struct 
btrfs_fs_info *fs_info,
  
  	num_pages = num_extent_pages(eb);

for (i = 0; i < num_pages; i++, index++) {
+   struct btrfs_subpage *prealloc = NULL;
+
p = find_or_create_page(mapping, index, GFP_NOFS|__GFP_NOFAIL);
if (!p) {
exists = ERR_PTR(-ENOMEM);
goto free_eb;
}
  
+		/*

Cannot mount BTRFS filesystem following power cycle

2021-01-19 Thread Provost, Simon
Hi,
We're having an issue with our BTRFS file system.
Our systems are being shut down by cutting power. Sometimes, when the system 
gets powered on, the BTRFS partition fails to mount. System reports kernel bug 
fs/btrfs/extent-tree.c:1183. When that occurs, we're able to mount it using 
both ro and nologreplay flags, which suggests a problem replaying the log on 
startup. We were able to reproduce the problem by using a controlled power bar 
and doing automatic power cycle once every few minutes while writing data on 
the file system. The problem is very hard to reproduce but we've seen it 3 
times on 3 different systems. Our BTRFS partition uses a single volume, however 
that volume is backed by a hardware RAID. You'll find attached the output of 
the dmesg command.
Thank you for your assistance
Simon

Output of various commands. Note that those were taken while the BTRFS 
filesystem was mounted with ro and nologreplay
gvoc-81 [/]#   uname -a
Linux gvoc-81 5.4.34-0-lts #1-Alpine SMP Wed, 22 Apr 2020 19:26:07 UTC x86_64 
Linux
gvoc-81 [/]#   btrfs --version
btrfs-progs v5.4
gvoc-81 [/]#   btrfs fi show
Label: none  uuid: a895f1da-4ccc-4d5a-b17e-158072ae54ef
Total devices 1 FS bytes used 68.60GiB
devid1 size 883.98GiB used 82.07GiB path /dev/sda5

gvoc-81 [/]#   btrfs fi df /data
Data, single: total=76.01GiB, used=67.12GiB
System, DUP: total=32.00MiB, used=16.00KiB
Metadata, DUP: total=3.00GiB, used=1.48GiB
GlobalReserve, single: total=202.97MiB, used=0.00B

**
DISCLAIMER:
Privileged and/or Confidential information may be contained in this message. If 
you are not the addressee of this message, you may not copy, use or deliver 
this message to anyone. In such event, you should destroy the message and 
kindly notify the sender by reply e-mail. It is understood that opinions or 
conclusions that do not relate to the official business of the company are 
neither given nor endorsed by the company. Thank You.


dmesg.log
Description: dmesg.log


[PATCH 00/13] Make btrfs W=1 clean

2021-01-19 Thread Nikolay Borisov
Hello,

This patch series aims to fix all current warnings produced by compiling btrfs
with W=1. My hopes are with these additions W=1 can become a default build
options for btrfs. With this series applied misc-next currently produces 1
genuine warning for an unused variable:

fs/btrfs/zoned.c: In function ‘btrfs_sb_log_location_bdev’:
fs/btrfs/zoned.c:491:6: warning: variable ‘zone_size’ set but not used 
[-Wunused-but-set-variable]
  491 |  u64 zone_size;


But I haven't fixed it since it's part of the WIP zoned support.

Nikolay Borisov (13):
  btrfs: Document modified paramater of add_extent_mapping
  btrfs: Fix parameter description of btrfs_add_extent_mapping
  btrfs: Fix function description format
  btrfs: Fix paramter description in delayed-ref.c functions
  btrfs: Improve parameter description for __btrfs_write_out_cache
  btrfs: Document now parameter of peek_discard_list
  btrfs: Document fs_info in btrfs_rmap_block
  btrfs: Fix description format of fs_info parameter of
btrfs_wait_on_delayed_iputs
  btrfs: Document btrfs_check_shared parameters
  btrfs: Fix parameter description of
btrfs_inode_rsv_release/btrfs_delalloc_release_space
  btrfs: Fix parameter description in space-info.c
  btrfs: Fix parameter description for functions in extent_io.c
  lib/zstd: Convert constants to defines

 fs/btrfs/backref.c  |  6 ++
 fs/btrfs/block-group.c  |  1 +
 fs/btrfs/delalloc-space.c   |  7 ---
 fs/btrfs/delayed-ref.c  | 16 +---
 fs/btrfs/discard.c  |  1 +
 fs/btrfs/extent_io.c| 34 +-
 fs/btrfs/extent_map.c   | 12 +++-
 fs/btrfs/file-item.c| 22 ++
 fs/btrfs/free-space-cache.c | 10 ++
 fs/btrfs/inode.c|  2 +-
 fs/btrfs/space-info.c   | 35 +--
 include/linux/zstd.h|  8 
 12 files changed, 87 insertions(+), 67 deletions(-)

--
2.25.1



Re: "bad tree block start" when trying to mount on ARM

2021-01-19 Thread Erik Jensen
On Mon, Jan 18, 2021 at 9:22 PM Erik Jensen  wrote:
>
> On Mon, Jan 18, 2021 at 4:12 AM Erik Jensen  wrote:
> >
> > The offending system is indeed ARMv7 (specifically a Marvell ARMADA®
> > 388), but I believe the Broadcom BCM2835 in my Raspberry Pi is
> > actually ARMv6 (with hardware float support).
>
> Using NBD, I have verified that I receive the same error when
> attempting to mount the filesystem on my ARMv6 Raspberry Pi:
> [ 3491.339572] BTRFS info (device dm-4): disk space caching is enabled
> [ 3491.394584] BTRFS info (device dm-4): has skinny extents
> [ 3492.385095] BTRFS error (device dm-4): bad tree block start, want
> 26207780683776 have 3395945502747707095
> [ 3492.514071] BTRFS error (device dm-4): bad tree block start, want
> 26207780683776 have 3395945502747707095
> [ 3492.553599] BTRFS warning (device dm-4): failed to read tree root
> [ 3492.865368] BTRFS error (device dm-4): open_ctree failed
>
> The Raspberry Pi is running Linux 5.4.83.
>

Okay, after some more testing, ARM seems to be irrelevant, and 32-bit
is the key factor. On a whim, I booted up an i686, 5.8.14 kernel in a
VM, attached the drives via NBD, ran cryptsetup, tried to mount, and…
I got the exact same error message.


Re: [PATCH v4 04/18] btrfs: make attach_extent_buffer_page() to handle subpage case

2021-01-19 Thread David Sterba
On Tue, Jan 19, 2021 at 04:54:28PM -0500, Josef Bacik wrote:
> On 1/16/21 2:15 AM, Qu Wenruo wrote:
> > +/* For rare cases where we need to pre-allocate a btrfs_subpage structure 
> > */
> > +static inline int btrfs_alloc_subpage(struct btrfs_fs_info *fs_info,
> > + struct btrfs_subpage **ret)
> > +{
> > +   if (fs_info->sectorsize == PAGE_SIZE)
> > +   return 0;
> > +
> > +   *ret = kzalloc(sizeof(struct btrfs_subpage), GFP_NOFS);
> > +   if (!*ret)
> > +   return -ENOMEM;
> > +   return 0;
> > +}
> 
> We're allocating these for every metadata page, that deserves a dedicated 
> kmem_cache.  Thanks,

I'm not opposed to that idea but for the first implementation I'm ok
with using the default slabs. As the subpage support depends on the
filesystem, creating the cache unconditionally would waste resources and
creating it on demand would need some care. Either way I'd rather see it
in a separate patch.


Re: [PATCH 03/13] btrfs: Fix function description format

2021-01-19 Thread David Sterba
On Tue, Jan 19, 2021 at 02:26:39PM +0200, Nikolay Borisov wrote:
> This fixes following W=1 warnings:
> 
> fs/btrfs/file-item.c:27: warning: Cannot understand  * @inode:  the inode we 
> want to update the disk_i_size for
>  on line 27 - I thought it was a doc line
> fs/btrfs/file-item.c:65: warning: Cannot understand  * @inode - the inode 
> we're modifying
>  on line 65 - I thought it was a doc line
> fs/btrfs/file-item.c:91: warning: Cannot understand  * @inode - the inode 
> we're modifying
>  on line 91 - I thought it was a doc line
> 
> Signed-off-by: Nikolay Borisov 
> ---
>  fs/btrfs/file-item.c | 22 ++
>  1 file changed, 14 insertions(+), 8 deletions(-)
> 
> diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
> index 6ccfc019ad90..868b27e887b1 100644
> --- a/fs/btrfs/file-item.c
> +++ b/fs/btrfs/file-item.c
> @@ -24,8 +24,10 @@
>  PAGE_SIZE))
>  
>  /**
> - * @inode - the inode we want to update the disk_i_size for
> - * @new_i_size - the i_size we want to set to, 0 if we use i_size
> + * btrfs_inode_safe_disk_i_size_write

This is the part I don't like about kdoc but as we've talked about that,
we can use a less strict formatting and follow this scheme:

/**
 * Function summary description in one or two sentences.
 *
 * @param1:description
 * @param2:description
 *
 * Long description of the function.
 *
 * Return value semantics
 */

The kdoc checker is fine with the first line and validates the
arguments, which is what we want. The validator can be enabled at build
time as well by adding

cmd_checkdoc = $(srctree)/scripts/kernel-doc -none $<

to our Makefile.

> + *
> + * @inode:  the inode we want to update the disk_i_size for
> + * @new_i_size: the i_size we want to set to, 0 if we use i_size

Please align the descriptions


Re: [PATCH 06/13] btrfs: Document now parameter of peek_discard_list

2021-01-19 Thread David Sterba
On Tue, Jan 19, 2021 at 02:26:42PM +0200, Nikolay Borisov wrote:
> Fixes fs/btrfs/discard.c:203: warning: Function parameter or member 'now' not 
> described in 'peek_discard_list'
> 
> Signed-off-by: Nikolay Borisov 
> ---
>  fs/btrfs/discard.c | 1 +
>  1 file changed, 1 insertion(+)
> 
> diff --git a/fs/btrfs/discard.c b/fs/btrfs/discard.c
> index 2b8383d41144..bfe53eb4c1f3 100644
> --- a/fs/btrfs/discard.c
> +++ b/fs/btrfs/discard.c
> @@ -189,6 +189,7 @@ static struct btrfs_block_group *find_next_block_group(
>   * @discard_ctl: discard control
>   * @discard_state: the discard_state of the block_group after state 
> management
>   * @discard_index: the discard_index of the block_group after state 
> management
> + * @now: Time when discard was invoked, in ns

Please follow the formatting, no uppercase letter in 'Time'

>   *
>   * This wraps find_next_block_group() and sets the block_group to be in use.
>   * discard_state's control flow is managed here.  Variables related to
> -- 
> 2.25.1


Re: [PATCH 00/13] Make btrfs W=1 clean

2021-01-19 Thread David Sterba
On Tue, Jan 19, 2021 at 02:26:36PM +0200, Nikolay Borisov wrote:
> Hello,
> 
> This patch series aims to fix all current warnings produced by compiling btrfs
> with W=1. My hopes are with these additions W=1 can become a default build
> options for btrfs.

Great, I like that, some of the warnigs get caught by the build bots but
we want them caught at development time.


Re: [PATCH 07/13] btrfs: Document fs_info in btrfs_rmap_block

2021-01-19 Thread David Sterba
On Tue, Jan 19, 2021 at 02:26:43PM +0200, Nikolay Borisov wrote:
> Fixes fs/btrfs/block-group.c:1570: warning: Function parameter or member 
> 'fs_info' not described in 'btrfs_rmap_block'
> 
> Signed-off-by: Nikolay Borisov 
> ---
>  fs/btrfs/block-group.c | 1 +
>  1 file changed, 1 insertion(+)
> 
> diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
> index 0886e81e5540..1b71e7be04a9 100644
> --- a/fs/btrfs/block-group.c
> +++ b/fs/btrfs/block-group.c
> @@ -1554,6 +1554,7 @@ static void set_avail_alloc_bits(struct btrfs_fs_info 
> *fs_info, u64 flags)
>  
>  /**
>   * btrfs_rmap_block - Map a physical disk address to a list of logical 
> addresses
> + * @fs_info:   fs context

I think documenting the fs_info parameter is awkward in all cases, so
we could just put 'the filesystem' everywhere. Here 'context' could be
confusing as there aready is something called fs_context.

>   * @chunk_start:   logical address of block group
>   * @physical:   physical address to map to logical addresses
>   * @logical:return array of logical addresses which map to @physical
> -- 
> 2.25.1


Re: [RFC PATCH 00/37] block: introduce bio_init_fields()

2021-01-19 Thread Mike Snitzer
On Tue, Jan 19 2021 at 12:05am -0500,
Chaitanya Kulkarni  wrote:

> Hi,
> 
> This is a *compile only RFC* which adds a generic helper to initialize
> the various fields of the bio that is repeated all the places in
> file-systems, block layer, and drivers.
> 
> The new helper allows callers to initialize various members such as
> bdev, sector, private, end io callback, io priority, and write hints.
> 
> The objective of this RFC is to only start a discussion, this it not 
> completely tested at all.                                   
>                                                     
>                      
> Following diff shows code level benefits of this helper :-
>  38 files changed, 124 insertions(+), 236 deletions(-)


Please no... this is just obfuscation.

Adding yet another field to set would create a cascade of churn
throughout kernel (and invariably many callers won't need the new field
initialized, so you keep passing 0 for more and more fields).

Nacked-by: Mike Snitzer 



Re: [RFC][PATCH V5] btrfs: preferred_metadata: preferred device for metadata

2021-01-19 Thread Zygo Blaxell
On Sun, Jan 17, 2021 at 07:54:30PM +0100, Goffredo Baroncelli wrote:
> 
> Hi all,
> 
> This is an RFC; I wrote this patch because I find the idea interesting
> even though it adds more complication to the chunk allocator.
> 
> The basic idea is to store the metadata chunk in the fasters disks.
> The fasters disk are marked by the "preferred_metadata" flag.
> 
> BTRFS when allocate a new metadata/system chunk, selects the
> "preferred_metadata" disks, otherwise it selectes the non
> "preferred_metadata" disks. The intial patch allowed to use the other
> kind of disk in case a set is full.
> 
> This patches set is based on v5.11-rc2.
> 
> For now, the only user of this patch that I am aware is Zygo. 
> However he asked to further constraint the allocation: i.e. avoid to
> allocated metadata on a not "preferred_metadata"
> disk. So I extended the patch adding 4 modes to operate.
> 
> This is enabled passing the option "preferred_metadata=" at
> mount time. 
> 
> There are 4 modes:
> - preferred_metadata=disabled
>   The allocator is the standard one.
> 
> - preferred_metadata=soft
>   The metadata chunk are allocated on the disks marked with the
>   "preferred_metadata" flag.
>   The data chunk are allocated on the disks not marked with the
>   "preferred_metadata" flag.
>   If the space isn't enough, then it is possible to use the other kind
>   of disks.
> 
> - preferred_metadata=hard
>   The metadata chunk are allocated on the disks marked with the
>   "preferred_metadata" flag.
>   The data chunk are allocated on the disks not marked with the
>   "preferred_metadata" flag.
>   If the space isn't enough, then "no space left" error is raised. It
>   is not possible to use the other kind of disks.
> 
> - preferred_metadata=metadata
>   The metadata chunk are allocated on the disks marked with the
>   "preferred_metadata" flag.
>   For metadata, if the space isn't enough, then it is possible to use the
>   other kind of disks.
>   The data chunk are allocated on the disks not marked with the
>   "preferred_metadata" flag.
>   For data, if the space isn't enough, then "no space left" error is raised.
>   It is not possible to use the other kind of disks.

I like this form of the preferred_metadata mount option even less than
the last one.  Now we have 4 different mount option cases, and we still
can't specify some things that are possible with 4 device properties.
Ideally there's no mount option at all, and we handle everything with
properties.

e.g. if I had one fast but small NVME, one slow but large SATA SSD, and
one big spinning HDD, I can't set up the following with the mount option:

- metadata only on NVME (type 3)
- metadata preferred, data allowed on SSD (type 1)
- data only on HDD (type 2)

The relationship between NVME and SSD is "metadata" but between NVME and
HDD it is "hard", which are conflicting mount options.  I can't specify
the relationship between SSD and HDD (metadata must not overflow, but
data is allowed to overflow) with any of the mount options, because you
have not provided a mount option where data may overflow onto preferred
(fast) disks while metadata must not use slow disks.

With the 4 device types we can trivially specify this arrangement.

The sorted device lists would be:

Metadata sort order Data sort order
metadata only (3)   data only (2)
metadata preferred (1)  data preferred (0)
data preferred (0)  metadata preferred (1)
other devices (2 or other)  other devices (3 or other)

We keep 3 device counts for the first 3 sort orders.  If the number of all
preferred devices (type != 0) is zero, we just return ndevs; otherwise,
we pick the first device count that is >= mindevs.  If all the device
counts are < mindevs then we return the 3rd count (metadata only +
metadata preferred + data preferred) and the caller will find ENOSPC.

More sophisticated future implementations can alter the sort order, or
operate in entirely separate parts of btrfs, without conflicting with
this scheme.  If there is no mount option, then future implementations
can't conflict with it.

> A separate patches set is sent to extend the "btrfs property" command
> for supporting the preferred_metadata device flag. The basic usage is:
> 
> $ # set a new value
> $ sudo btrfs property set /dev/vde preferred_metadata 1
> 
> $ # get the current value
> $ sudo btrfs property get /dev/vde preferred_metadata
> devid=4, path=/dev/vde: dedicated_metadata=1
> 
> Some examples: (/dev/sd[abc] are marked as preferred_metadata,
> and /dev/sd[ef] are not)
> 
> Non striped profile: metadata->raid1, data->raid1
> The data is stored on /dev/sd[ef], metadata is stored on /dev/sd[abc].
> 
> If mode is one of "soft" or "disabled", when /dev/sd[ef] are full
> the data chunk is allocated also on /dev/sd[abc]. Otherwise -ENOSPACE
> is raised.
> 
> Striped profile: metadata->raid6, data->raid6
>

Re: [RFC PATCH 04/37] btrfs: use bio_init_fields in volumes

2021-01-19 Thread Nikolay Borisov



On 19.01.21 г. 7:05 ч., Chaitanya Kulkarni wrote:
> Signed-off-by: Chaitanya Kulkarni 
> ---
>  fs/btrfs/volumes.c | 4 +---
>  1 file changed, 1 insertion(+), 3 deletions(-)
> 
> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
> index ee086fc56c30..836167212252 100644
> --- a/fs/btrfs/volumes.c
> +++ b/fs/btrfs/volumes.c
> @@ -6371,14 +6371,12 @@ static void submit_stripe_bio(struct btrfs_bio *bbio, 
> struct bio *bio,
>  
>   bio->bi_private = bbio;
This line can be removed as ->private is initialized by bio_init_fields.

>   btrfs_io_bio(bio)->device = dev;
> - bio->bi_end_io = btrfs_end_bio;
> - bio->bi_iter.bi_sector = physical >> 9;
> + bio_init_fields(bio, dev->bdev, physical >> 9, bbio, btrfs_end_bio, 0, 
> 0);
>   btrfs_debug_in_rcu(fs_info,
>   "btrfs_map_bio: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u",
>   bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector,
>   (unsigned long)dev->bdev->bd_dev, rcu_str_deref(dev->name),
>   dev->devid, bio->bi_iter.bi_size);
> - bio_set_dev(bio, dev->bdev);
>  
>   btrfs_bio_counter_inc_noblocked(fs_info);
>  
> 


Re: [PATCH v4 03/18] btrfs: introduce the skeleton of btrfs_subpage structure

2021-01-19 Thread Qu Wenruo




On 2021/1/20 上午12:06, David Sterba wrote:

On Tue, Jan 19, 2021 at 04:51:45PM +0100, David Sterba wrote:

On Tue, Jan 19, 2021 at 06:54:28AM +0800, Qu Wenruo wrote:

On 2021/1/19 上午6:46, David Sterba wrote:

On Sat, Jan 16, 2021 at 03:15:18PM +0800, Qu Wenruo wrote:

+   return;
+
+   subpage = (struct btrfs_subpage *)detach_page_private(page);
+   ASSERT(subpage);
+   kfree(subpage);
+}
diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h
new file mode 100644
index ..96f3b226913e
--- /dev/null
+++ b/fs/btrfs/subpage.h
@@ -0,0 +1,31 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef BTRFS_SUBPAGE_H
+#define BTRFS_SUBPAGE_H
+
+#include 
+#include "ctree.h"


So subpage.h would pull the whole ctree.h, that's not very nice. If
anything, the .c could include ctree.h because there are lots of the
common structure and function definitions, but not the .h. This creates
unnecessary include dependencies.

Any pointer type you'd need in structures could be forward declared.


Unfortunately, the main needed pointer is fs_info, and we're accessing
it pretty frequently (mostly for sector/node size).

I don't believe forward declaration would help in this case.


I've looked at the final subpage.h and you add way too many static
inlines that don't seem to be necessary for the reasons the static
inlines are supposed to be used.


The only file that includes subpage.h is extent_io.c, so as long as it
stays like that it's manageable. But untangling the include hell still
needs to hapen some day and new code that makes it harder worries me.


If going through the github branch, you will see there are more files
using subpage.h:
- extent_io.c
- disk-io.c
- file.c
- inode.c
- reflink.c
- relocation.c

And furthermore, about the static inline abuse, the part really need
that static inline is the check against regular sector size, and
unfortunately, most outside callers need such check.

I can put the pure subpage callers into subpage.c, but the generic
helpers handling both cases still need that.

Thanks,
Qu


Re: [PATCH v4 04/18] btrfs: make attach_extent_buffer_page() to handle subpage case

2021-01-19 Thread Qu Wenruo



On 2021/1/20 上午5:54, Josef Bacik wrote:

On 1/16/21 2:15 AM, Qu Wenruo wrote:

For subpage case, we need to allocate new memory for each metadata page.

So we need to:
- Allow attach_extent_buffer_page() to return int
   To indicate allocation failure

- Prealloc btrfs_subpage structure for alloc_extent_buffer()
   We don't want to call memory allocation with spinlock hold, so
   do preallocation before we acquire mapping->private_lock.

- Handle subpage and regular case differently in
   attach_extent_buffer_page()
   For regular case, just do the usual thing.
   For subpage case, allocate new memory or use the preallocated memory.

For future subpage metadata, we will make more usage of radix tree to
grab extnet buffer.

Signed-off-by: Qu Wenruo 
---
  fs/btrfs/extent_io.c | 75 ++--
  fs/btrfs/subpage.h   | 17 ++
  2 files changed, 82 insertions(+), 10 deletions(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index a816ba4a8537..320731487ac0 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -24,6 +24,7 @@
  #include "rcu-string.h"
  #include "backref.h"
  #include "disk-io.h"
+#include "subpage.h"
  static struct kmem_cache *extent_state_cache;
  static struct kmem_cache *extent_buffer_cache;
@@ -3140,9 +3141,13 @@ static int submit_extent_page(unsigned int opf,
  return ret;
  }
-static void attach_extent_buffer_page(struct extent_buffer *eb,
-  struct page *page)
+static int attach_extent_buffer_page(struct extent_buffer *eb,
+  struct page *page,
+  struct btrfs_subpage *prealloc)
  {
+    struct btrfs_fs_info *fs_info = eb->fs_info;
+    int ret;


int ret = 0;


+
  /*
   * If the page is mapped to btree inode, we should hold the private
   * lock to prevent race.
@@ -3152,10 +3157,32 @@ static void attach_extent_buffer_page(struct 
extent_buffer *eb,

  if (page->mapping)
  lockdep_assert_held(&page->mapping->private_lock);
-    if (!PagePrivate(page))
-    attach_page_private(page, eb);
-    else
-    WARN_ON(page->private != (unsigned long)eb);
+    if (fs_info->sectorsize == PAGE_SIZE) {
+    if (!PagePrivate(page))
+    attach_page_private(page, eb);
+    else
+    WARN_ON(page->private != (unsigned long)eb);
+    return 0;
+    }
+
+    /* Already mapped, just free prealloc */
+    if (PagePrivate(page)) {
+    kfree(prealloc);
+    return 0;
+    }
+
+    if (prealloc) {
+    /* Has preallocated memory for subpage */
+    spin_lock_init(&prealloc->lock);
+    attach_page_private(page, prealloc);
+    } else {
+    /* Do new allocation to attach subpage */
+    ret = btrfs_attach_subpage(fs_info, page);
+    if (ret < 0)
+    return ret;


Delete the above 2 lines.


+    }
+
+    return 0;


return ret;


  }
  void set_page_extent_mapped(struct page *page)
@@ -5062,21 +5089,29 @@ struct extent_buffer 
*btrfs_clone_extent_buffer(const struct extent_buffer *src)

  if (new == NULL)
  return NULL;
+    set_bit(EXTENT_BUFFER_UPTODATE, &new->bflags);
+    set_bit(EXTENT_BUFFER_UNMAPPED, &new->bflags);
+


Why are you doing this here?  It seems unrelated?  Looking at the code 
it appears there's a reason for this later, but I had to go look to make 
sure I wasn't crazy, so at the very least it needs to be done in a more 
relevant patch.


This is to handle case where we allocated a page but failed to allocate 
subpage structure.


In that case, btrfs_release_extent_buffer() will go different routine to 
free the eb.


Without UNMAPPED bit, it just go wrong without knowing it's a unmapped eb.

This change is mostly due to the extra failure pattern introduced by the 
subpage memory allocation.





  for (i = 0; i < num_pages; i++) {
+    int ret;
+
  p = alloc_page(GFP_NOFS);
  if (!p) {
  btrfs_release_extent_buffer(new);
  return NULL;
  }
-    attach_extent_buffer_page(new, p);
+    ret = attach_extent_buffer_page(new, p, NULL);
+    if (ret < 0) {
+    put_page(p);
+    btrfs_release_extent_buffer(new);
+    return NULL;
+    }
  WARN_ON(PageDirty(p));
  SetPageUptodate(p);
  new->pages[i] = p;
  copy_page(page_address(p), page_address(src->pages[i]));
  }
-    set_bit(EXTENT_BUFFER_UPTODATE, &new->bflags);
-    set_bit(EXTENT_BUFFER_UNMAPPED, &new->bflags);
  return new;
  }
@@ -5308,12 +5343,28 @@ struct extent_buffer 
*alloc_extent_buffer(struct btrfs_fs_info *fs_info,

  num_pages = num_extent_pages(eb);
  for (i = 0; i < num_pages; i++, index++) {
+    struct btrfs_subpage *prealloc = NULL;
+
  p = find_or_create_page(mapping, index, GFP_NOFS|__GFP_NOFAIL);
  if (!p) {
  exists = ERR_PTR(-ENOMEM);
  goto free_eb;
  }
+    /*
+ * Preallo

Re: [PATCH v3 1/4] btrfs: add read_policy latency

2021-01-19 Thread Anand Jain

On 20/1/21 3:36 am, Josef Bacik wrote:

On 1/11/21 4:41 AM, Anand Jain wrote:

The read policy type latency routes the read IO based on the historical
average wait-time experienced by the read IOs through the individual
device. This patch obtains the historical read IO stats from the kernel
block layer and calculates its average.

Example usage:
  echo "latency" > /sys/fs/btrfs/$uuid/read_policy

Signed-off-by: Anand Jain 
---
v3: The block layer commit 0d02129e76ed (block: merge struct 
block_device and

 struct hd_struct) has changed the first argument in the function
 part_stat_read_all() in 5.11-rc1. So the compilation will fail. 
This patch

 fixes it.
 Commit log updated.

v2: Use btrfs_debug_rl() instead of btrfs_info_rl()
 It is better we have this debug until we test this on at least few
 hardwares.
 Drop the unrelated changes.
 Update change log.

v1: Drop part_stat_read_all instead use part_stat_read
 Drop inflight


  fs/btrfs/sysfs.c   |  3 ++-
  fs/btrfs/volumes.c | 38 ++
  fs/btrfs/volumes.h |  2 ++
  3 files changed, 42 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 19b9fffa2c9c..96ca7bef6357 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -915,7 +915,8 @@ static bool strmatch(const char *buffer, const 
char *string)

  return false;
  }
-static const char * const btrfs_read_policy_name[] = { "pid" };
+/* Must follow the order as in enum btrfs_read_policy */
+static const char * const btrfs_read_policy_name[] = { "pid", 
"latency" };

  static ssize_t btrfs_read_policy_show(struct kobject *kobj,
    struct kobj_attribute *a, char *buf)
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index f4037a6bd926..f7a0a83d2cd4 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -14,6 +14,7 @@
  #include 
  #include 
  #include 
+#include 
  #include "misc.h"
  #include "ctree.h"
  #include "extent_map.h"
@@ -5490,6 +5491,39 @@ int btrfs_is_parity_mirror(struct btrfs_fs_info 
*fs_info, u64 logical, u64 len)

  return ret;
  }
+static int btrfs_find_best_stripe(struct btrfs_fs_info *fs_info,
+  struct map_lookup *map, int first,
+  int num_stripe)
+{
+    u64 est_wait = 0;
+    int best_stripe = 0;
+    int index;
+
+    for (index = first; index < first + num_stripe; index++) {
+    u64 read_wait;
+    u64 avg_wait = 0;
+    unsigned long read_ios;
+    struct btrfs_device *device = map->stripes[index].dev;
+
+    read_wait = part_stat_read(device->bdev, nsecs[READ]);
+    read_ios = part_stat_read(device->bdev, ios[READ]);
+
+    if (read_wait && read_ios && read_wait >= read_ios)
+    avg_wait = div_u64(read_wait, read_ios);
+    else
+    btrfs_debug_rl(device->fs_devices->fs_info,


You can just use fs_info here, you already have it.  I'm not in love 
with doing this check every time, I'd rather cache the results 
somewhere.  However if we're read-only I can't think of a mechanism we 
could piggy back on, RW we could just do it every transaction commit.  
Fix the fs_info thing and you can add




 Oh. I will fix it. Thanks for the review here and in other patches.
-Anand


Reviewed-by: Josef Bacik 

Thanks,

Josef




Re: [PATCH RFC 4/4] btrfs: introduce new read_policy round-robin

2021-01-19 Thread Anand Jain

On 20/1/21 3:41 am, Josef Bacik wrote:

On 1/11/21 4:41 AM, Anand Jain wrote:
Add round-robin read policy to route the read IO to the next device in 
the

round-robin order. The chunk allocation and thus the stripe-index follows
the order of free space available on devices. So to make the round-robin
effective it shall follow the devid order instead of the stripe-index
order.

Signed-off-by: Anand Jain 
--
RFC because: Provides terrible performance with the fio tests.
I am not yet sure if there is any io workload or a block layer
tuning that shall make this policy better. As of now just an
experimental patch.



Just drop this one, if we can't find a reason to use it then don't 
bother adding the code.  The other options have real world valuable 
uses, so stick with those.  Thanks,




 Yep. I will drop this patch in the next iteration.

 The low performance is attributed to the low number of read IO
 mergers in the block layer. The consecutive blocks in my test case
 (fio random read) were read from the other disk, so the block layer
 lost the opportunity to merge the IOs.

Thanks, Anand


Josef






Re: received uuid not set btrfs send/receive

2021-01-19 Thread Anders Halman
Sorry for answering so late. I compiled a more recent (v5.9) version of 
btrfs-progs first.

The unpacking/receiving took about 2 days on the raspberry.

$ nohup btrfs receive -v -f splitter . &
$ tail -n1 nohup.out
ERROR: short read from stream: expected 49233 read 44997

thanks to your -v suggestions I got a little bit more information.

first I figured the files on the local and remote host are not the same, 
by md5sum.


So I will solve this issue first. Is there any recommended way to 
transfer btrfs subvolumes over an unstable connection?



Am 17.01.21 um 13:07 schrieb Chris Murphy:

On Sun, Jan 17, 2021 at 11:51 AM Anders Halman  wrote:

Hello,

I try to backup my laptop over an unreliable slow internet connection to
a even slower Raspberry Pi.

To bootstrap the backup I used the following:

# local
btrfs send root.send.ro | pigz | split --verbose -d -b 1G
rsync -aHAXxv --numeric-ids --partial --progress -e "ssh -T -o
Compression=no -x" x* remote-host:/mnt/backup/btrfs-backup/

# remote
cat x* > split.gz
pigz -d split.gz
btrfs receive -f split

worked nicely. But I don't understand why the "received uuid" on the
remote site in blank.
I tried it locally with smaller volumes and it worked.

I suggest using -v or -vv on the receive side to dig into why the
receive is failing. Setting the received uuid is one of the last
things performed on receive, so if it's not set it suggests the
receive isn't finished.





Re: [RFC PATCH 00/37] block: introduce bio_init_fields()

2021-01-19 Thread Chaitanya Kulkarni
On 1/18/21 21:06, Chaitanya Kulkarni wrote:
> Hi,
>
> This is a *compile only RFC* which adds a generic helper to initialize
> the various fields of the bio that is repeated all the places in
> file-systems, block layer, and drivers.
>
> The new helper allows callers to initialize various members such as
> bdev, sector, private, end io callback, io priority, and write hints.
>
> The objective of this RFC is to only start a discussion, this it not 
> completely tested at all. 
>
> Following diff shows code level benefits of this helper :-
>  38 files changed, 124 insertions(+), 236 deletions(-)
>
> -ck
Thanks for replying Mike, Josef and Christoph.

I'll move forward with Christoph's suggestion and get rid of
optional parameters which is making this API hard to use.


[PATCH v4 1/3] btrfs: add read_policy latency

2021-01-19 Thread Anand Jain
The read policy type latency routes the read IO based on the historical
average wait-time experienced by the read IOs through the individual
device. This patch obtains the historical read IO stats from the kernel
block layer and calculates its average.

Example usage:
 echo "latency" > /sys/fs/btrfs/$uuid/read_policy

Signed-off-by: Anand Jain 
Reviewed-by: Josef Bacik 
---
v4: For btrfs_debug_rl() use fs_info instead of
device->fs_devices->fs_info.

v3: The block layer commit 0d02129e76ed (block: merge struct block_device and
struct hd_struct) has changed the first argument in the function
part_stat_read_all() in 5.11-rc1. So the compilation will fail. This patch
fixes it.
Commit log updated.

v2: Use btrfs_debug_rl() instead of btrfs_info_rl()
It is better we have this debug until we test this on at least few
hardwares.
Drop the unrelated changes.
Update change log.

rfc->v1: Drop part_stat_read_all instead use part_stat_read
Drop inflight

 fs/btrfs/sysfs.c   |  3 ++-
 fs/btrfs/volumes.c | 38 ++
 fs/btrfs/volumes.h |  2 ++
 3 files changed, 42 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 4522a1c4cd08..7c0324fe97b2 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -915,7 +915,8 @@ static bool strmatch(const char *buffer, const char *string)
return false;
 }
 
-static const char * const btrfs_read_policy_name[] = { "pid" };
+/* Must follow the order as in enum btrfs_read_policy */
+static const char * const btrfs_read_policy_name[] = { "pid", "latency" };
 
 static ssize_t btrfs_read_policy_show(struct kobject *kobj,
  struct kobj_attribute *a, char *buf)
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 62d6a890fc50..f361f1c87eb6 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -14,6 +14,7 @@
 #include 
 #include 
 #include 
+#include 
 #include "misc.h"
 #include "ctree.h"
 #include "extent_map.h"
@@ -5490,6 +5491,39 @@ int btrfs_is_parity_mirror(struct btrfs_fs_info 
*fs_info, u64 logical, u64 len)
return ret;
 }
 
+static int btrfs_find_best_stripe(struct btrfs_fs_info *fs_info,
+ struct map_lookup *map, int first,
+ int num_stripe)
+{
+   u64 est_wait = 0;
+   int best_stripe = 0;
+   int index;
+
+   for (index = first; index < first + num_stripe; index++) {
+   u64 read_wait;
+   u64 avg_wait = 0;
+   unsigned long read_ios;
+   struct btrfs_device *device = map->stripes[index].dev;
+
+   read_wait = part_stat_read(device->bdev, nsecs[READ]);
+   read_ios = part_stat_read(device->bdev, ios[READ]);
+
+   if (read_wait && read_ios && read_wait >= read_ios)
+   avg_wait = div_u64(read_wait, read_ios);
+   else
+   btrfs_debug_rl(fs_info,
+   "devid: %llu avg_wait ZERO read_wait %llu read_ios %lu",
+  device->devid, read_wait, read_ios);
+
+   if (est_wait == 0 || est_wait > avg_wait) {
+   est_wait = avg_wait;
+   best_stripe = index;
+   }
+   }
+
+   return best_stripe;
+}
+
 static int find_live_mirror(struct btrfs_fs_info *fs_info,
struct map_lookup *map, int first,
int dev_replace_is_ongoing)
@@ -5519,6 +5553,10 @@ static int find_live_mirror(struct btrfs_fs_info 
*fs_info,
case BTRFS_READ_POLICY_PID:
preferred_mirror = first + (current->pid % num_stripes);
break;
+   case BTRFS_READ_POLICY_LATENCY:
+   preferred_mirror = btrfs_find_best_stripe(fs_info, map, first,
+ num_stripes);
+   break;
}
 
if (dev_replace_is_ongoing &&
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 1997a4649a66..71ba1f0e93f4 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -222,6 +222,8 @@ enum btrfs_chunk_allocation_policy {
 enum btrfs_read_policy {
/* Use process PID to choose the stripe */
BTRFS_READ_POLICY_PID,
+   /* Find and use device with the lowest latency */
+   BTRFS_READ_POLICY_LATENCY,
BTRFS_NR_READ_POLICY,
 };
 
-- 
2.28.0



[PATCH v4 0/3] btrfs: read_policy types latency, device and round-robin

2021-01-19 Thread Anand Jain
v4:
Add rb from Josef in patch 1 and 3.
In patch 1/3, use fs_info instead of device->fs_devices->fs_info.
Drop round-robin policy because my workload (fio random) shows no performance
 gains due to fewer merges at the block layer.

v3:
The block layer commit 0d02129e76ed (block: merge struct block_device and
struct hd_struct) has changed the first argument in the function
part_stat_read_all() in 5.11-rc1. So trickle down its changes in the patch 1/4.

v2:
Fixes as per review comments, as in the individual patches.

rfc->v1:
Drop the tracing patch.
Drop the factor associated with the inflight commands (because there
were too many unnecessary switches).
Few C styles fix.

-

This patchset adds read policy types latency, device, and round-robin, for the
mirrored raid profiles such as raid1, raid1c3, raid1c4, and raid10. The default
read policy remains as PID, as of now.

Read policy types:
Latency:

Latency policy routes the read IO based on the historical average
wait time experienced by the read IOs on the individual device.

Device:

With the device policy along with the read_preferred flag, you can
set the device for reading manually. Useful to test mirrors in a
deterministic way and helps advance system administrations.

Round-robin (RFC patch):

Alternates striped device in a round-robin loop for reading. To achieve
this first we put the stripes in an array, sort it by devid and pick the
next device.

Test scripts:
=

I have included a few scripts which were useful for testing.

---8<
Set latency policy on the btrfs mounted at /mnt

Usage example:
  $ readpolicyset /mnt latency

Anand Jain (3):
  btrfs: add read_policy latency
  btrfs: introduce new device-state read_preferred
  btrfs: introduce new read_policy device

 fs/btrfs/sysfs.c   | 57 ++-
 fs/btrfs/volumes.c | 60 ++
 fs/btrfs/volumes.h |  5 
 3 files changed, 121 insertions(+), 1 deletion(-)

-- 
2.28.0



[PATCH v4 3/3] btrfs: introduce new read_policy device

2021-01-19 Thread Anand Jain
Read-policy type 'device' and device flag 'read-preferred':

The read-policy type device picks the device(s) flagged as
read-preferred for reading stripes of type raid1, raid10,
raid1c3 and raid1c4.

A system might contain SSD, nvme, iscsi, or san lun, and which are all
a non-rotational device, so it is not a good idea to set the read-preferred
automatically. Instead, device read-policy along with the read-preferred
flag provides an ability to do it manually. This advanced tuning is useful
in more than one situation, for example,
 - In heterogeneous-disk volume, it provides an ability to manually choose
the low latency disks for reading.
 - Useful for more accurate testing.
 - Avoid known problematic device from reading the chunk until it is
   replaced (by marking the other good devices as read-preferred).

Note:

If the read-policy type is set to 'device', but there isn't any device
which is flagged as read-preferred, then stripe 0 is used for reading.

The device replacement won't migrate the read-preferred flag to the new
replace the target device.

As of now, this is an in-memory only feature.

It's pointless to set the read-preferred flag on the missing device, as
IOs aren't submitted to the missing device.

If there is more than one read-preferred device in a chunk, the read IO
shall go to the stripe 0 as of now.

Usage example:

Consider a typical two disks raid1.

Configure devid1 for reading.

$ echo 1 > devinfo/1/read_preferred
$ cat devinfo/1/read_preferred
1
$ cat devinfo/2/read_preferred
0

$ pwd
/sys/fs/btrfs/12345678-1234-1234-1234-123456789abc

$ cat read_policy
[pid] device
$ echo device > ./read_policy
$ cat read_policy
pid [device]

Now read IOs are sent to devid 1 (sdb).

$ echo 3 > /proc/sys/vm/drop_caches
$ md5sum /btrfs/YkZI

$ iostat -zy 1 | egrep 'sdb|sdc' (from another terminal)
sdb  50.00 40048.00 0.00  40048  0

Change the read-preferred device from devid 1 to devid 2 (sdc).

$ echo 0 > ./devinfo/1/read_preferred

[ 3343.918658] BTRFS info (device sdb): reset read preferred on devid 1 (1334)

$ echo 1 > ./devinfo/2/read_preferred

[ 3343.919876] BTRFS info (device sdb): set read preferred on devid 2 (1334)

$ echo 3 > /proc/sys/vm/drop_caches
$ md5sum /btrfs/YkZI

Further read ios are sent to devid 2 (sdc).

$ iostat -zy 1 | egrep 'sdb|sdc' (from another terminal)
sdc  49.00 40048.00 0.00  40048  0

Signed-off-by: Anand Jain 
Reviewed-by: Josef Bacik 
---
v4: add Josef rb.
v3: update the change log.
v2: -
rfc->v1: -

 fs/btrfs/sysfs.c   |  3 ++-
 fs/btrfs/volumes.c | 22 ++
 fs/btrfs/volumes.h |  2 ++
 3 files changed, 26 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 5888e15e3d14..dd1835a2a7ab 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -916,7 +916,8 @@ static bool strmatch(const char *buffer, const char *string)
 }
 
 /* Must follow the order as in enum btrfs_read_policy */
-static const char * const btrfs_read_policy_name[] = { "pid", "latency" };
+static const char * const btrfs_read_policy_name[] = { "pid", "latency",
+  "device" };
 
 static ssize_t btrfs_read_policy_show(struct kobject *kobj,
  struct kobj_attribute *a, char *buf)
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index f361f1c87eb6..d942260f8d2c 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -5524,6 +5524,25 @@ static int btrfs_find_best_stripe(struct btrfs_fs_info 
*fs_info,
return best_stripe;
 }
 
+static int btrfs_find_read_preferred(struct map_lookup *map, int first, int 
num_stripe)
+{
+   int stripe_index;
+   int last = first + num_stripe;
+
+   /*
+* If there are more than one read preferred devices, then just pick the
+* first found read preferred device as of now.
+*/
+   for (stripe_index = first; stripe_index < last; stripe_index++) {
+   if (test_bit(BTRFS_DEV_STATE_READ_PREFERRED,
+&map->stripes[stripe_index].dev->dev_state))
+   return stripe_index;
+   }
+
+   /* If there is no read preferred device then just use the first stripe 
*/
+   return first;
+}
+
 static int find_live_mirror(struct btrfs_fs_info *fs_info,
struct map_lookup *map, int first,
int dev_replace_is_ongoing)
@@ -5557,6 +5576,9 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info,
preferred_mirror = btrfs_find_best_stripe(fs_info, map, first,
  num_stripes);
break;
+   case BTRFS_READ_POLICY_DEVICE:
+   preferred_mirror = btrfs_find_read_preferred(map, first, 
num_stripes);
+   break;
}
 
if (dev_replace_is_ongoing &&
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volum

[PATCH v4 2/3] btrfs: introduce new device-state read_preferred

2021-01-19 Thread Anand Jain
This is a preparatory patch and introduces a new device flag
'read_preferred', RW-able using sysfs interface.

Signed-off-by: Anand Jain 
---
v4: -
v2: C style fixes. Drop space in between '! test_bit' and extra lines
after it.

 fs/btrfs/sysfs.c   | 53 ++
 fs/btrfs/volumes.h |  1 +
 2 files changed, 54 insertions(+)

diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 7c0324fe97b2..5888e15e3d14 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -1422,11 +1422,64 @@ static ssize_t btrfs_devinfo_writeable_show(struct 
kobject *kobj,
 }
 BTRFS_ATTR(devid, writeable, btrfs_devinfo_writeable_show);
 
+static ssize_t btrfs_devinfo_read_pref_show(struct kobject *kobj,
+   struct kobj_attribute *a, char *buf)
+{
+   int val;
+   struct btrfs_device *device = container_of(kobj, struct btrfs_device,
+  devid_kobj);
+
+   val = !!test_bit(BTRFS_DEV_STATE_READ_PREFERRED, &device->dev_state);
+
+   return snprintf(buf, PAGE_SIZE, "%d\n", val);
+}
+
+static ssize_t btrfs_devinfo_read_pref_store(struct kobject *kobj,
+struct kobj_attribute *a,
+const char *buf, size_t len)
+{
+   int ret;
+   unsigned long val;
+   struct btrfs_device *device;
+
+   ret = kstrtoul(skip_spaces(buf), 0, &val);
+   if (ret)
+   return ret;
+
+   if (val != 0 && val != 1)
+   return -EINVAL;
+
+   /*
+* lock is not required, the btrfs_device struct can't be freed while
+* its kobject btrfs_device::devid_kobj is still open.
+*/
+   device = container_of(kobj, struct btrfs_device, devid_kobj);
+
+   if (val &&
+   !test_bit(BTRFS_DEV_STATE_READ_PREFERRED, &device->dev_state)) {
+   set_bit(BTRFS_DEV_STATE_READ_PREFERRED, &device->dev_state);
+   btrfs_info(device->fs_devices->fs_info,
+  "set read preferred on devid %llu (%d)",
+  device->devid, task_pid_nr(current));
+   } else if (!val &&
+  test_bit(BTRFS_DEV_STATE_READ_PREFERRED, 
&device->dev_state)) {
+   clear_bit(BTRFS_DEV_STATE_READ_PREFERRED, &device->dev_state);
+   btrfs_info(device->fs_devices->fs_info,
+  "reset read preferred on devid %llu (%d)",
+  device->devid, task_pid_nr(current));
+   }
+
+   return len;
+}
+BTRFS_ATTR_RW(devid, read_preferred, btrfs_devinfo_read_pref_show,
+ btrfs_devinfo_read_pref_store);
+
 static struct attribute *devid_attrs[] = {
BTRFS_ATTR_PTR(devid, in_fs_metadata),
BTRFS_ATTR_PTR(devid, missing),
BTRFS_ATTR_PTR(devid, replace_target),
BTRFS_ATTR_PTR(devid, writeable),
+   BTRFS_ATTR_PTR(devid, read_preferred),
NULL
 };
 ATTRIBUTE_GROUPS(devid);
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 71ba1f0e93f4..ea786864b903 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -51,6 +51,7 @@ struct btrfs_io_geometry {
 #define BTRFS_DEV_STATE_REPLACE_TGT(3)
 #define BTRFS_DEV_STATE_FLUSH_SENT (4)
 #define BTRFS_DEV_STATE_NO_READA   (5)
+#define BTRFS_DEV_STATE_READ_PREFERRED (6)
 
 struct btrfs_zoned_device_info;
 
-- 
2.28.0