[PATCH 10/10] block: Add sysfs entry for fua support

2018-05-18 Thread Kent Overstreet
Signed-off-by: Kent Overstreet 
---
 block/blk-sysfs.c | 11 +++
 1 file changed, 11 insertions(+)

diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index cbea895a55..d6dd7d8198 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -497,6 +497,11 @@ static ssize_t queue_wc_store(struct request_queue *q, const char *page,
return count;
 }
 
+static ssize_t queue_fua_show(struct request_queue *q, char *page)
+{
+   return sprintf(page, "%u\n", test_bit(QUEUE_FLAG_FUA, &q->queue_flags));
+}
+
 static ssize_t queue_dax_show(struct request_queue *q, char *page)
 {
return queue_var_show(blk_queue_dax(q), page);
@@ -665,6 +670,11 @@ static struct queue_sysfs_entry queue_wc_entry = {
.store = queue_wc_store,
 };
 
+static struct queue_sysfs_entry queue_fua_entry = {
+   .attr = {.name = "fua", .mode = S_IRUGO },
+   .show = queue_fua_show,
+};
+
 static struct queue_sysfs_entry queue_dax_entry = {
.attr = {.name = "dax", .mode = S_IRUGO },
.show = queue_dax_show,
@@ -714,6 +724,7 @@ static struct attribute *default_attrs[] = {
&queue_random_entry.attr,
&queue_poll_entry.attr,
&queue_wc_entry.attr,
+   &queue_fua_entry.attr,
&queue_dax_entry.attr,
&queue_wb_lat_entry.attr,
&queue_poll_delay_entry.attr,
-- 
2.17.0
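
For context: this attribute reports QUEUE_FLAG_FUA, which a driver
advertises when registering its queue. A minimal sketch of the driver
side (hypothetical, not part of this patch):

	/* volatile write cache + FUA support; after this,
	 * /sys/block/<dev>/queue/fua reads back 1 */
	blk_queue_write_cache(q, true, true);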



[PATCH 08/10] block: Add warning for bi_next not NULL in bio_endio()

2018-05-18 Thread Kent Overstreet
Recently found a bug where a driver left bi_next not NULL and then
called bio_endio(); the submitter of the bio then used bio_copy_data(),
which was treating src and dst as lists of bios.

Fixed that bug by splitting out bio_list_copy_data(), but in case other
things are depending on bi_next in weird ways, add a warning to help
avoid more bugs like that in the future.
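
A minimal sketch of the buggy pattern this catches (hypothetical driver
code, not taken from the report):

	struct bio *bio = ctx->bio_chain;	/* ctx: made-up driver struct */

	ctx->bio_chain = bio->bi_next;
	/* BUG: bi_next still points into the driver's chain */
	bio_endio(bio);		/* now fires the WARN_ONCE() added below */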

Signed-off-by: Kent Overstreet 
---
 block/bio.c  | 3 +++
 block/blk-core.c | 8 +++-
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/block/bio.c b/block/bio.c
index ce8e259f9a..5c81391100 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1775,6 +1775,9 @@ void bio_endio(struct bio *bio)
if (!bio_integrity_endio(bio))
return;
 
+   if (WARN_ONCE(bio->bi_next, "driver left bi_next not NULL"))
+   bio->bi_next = NULL;
+
/*
 * Need to have a real endio function for chained bios, otherwise
 * various corner cases will break (like stacking block devices that
diff --git a/block/blk-core.c b/block/blk-core.c
index 66f24798ef..f3cf79198a 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -204,6 +204,10 @@ static void req_bio_endio(struct request *rq, struct bio *bio,
bio_advance(bio, nbytes);
 
/* don't actually finish bio if it's part of flush sequence */
+   /*
+* XXX this code looks suspicious - it's not consistent with advancing
+* req->bio in caller
+*/
if (bio->bi_iter.bi_size == 0 && !(rq->rq_flags & RQF_FLUSH_SEQ))
bio_endio(bio);
 }
@@ -2982,8 +2986,10 @@ bool blk_update_request(struct request *req, blk_status_t error,
struct bio *bio = req->bio;
unsigned bio_bytes = min(bio->bi_iter.bi_size, nr_bytes);
 
-   if (bio_bytes == bio->bi_iter.bi_size)
+   if (bio_bytes == bio->bi_iter.bi_size) {
req->bio = bio->bi_next;
+   bio->bi_next = NULL;
+   }
 
/* Completion has already been traced */
bio_clear_flag(bio, BIO_TRACE_COMPLETION);
-- 
2.17.0



[PATCH 09/10] block: Export bio check/set pages_dirty

2018-05-18 Thread Kent Overstreet
Signed-off-by: Kent Overstreet 
---
 block/bio.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/block/bio.c b/block/bio.c
index 5c81391100..6689102f5d 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1610,6 +1610,7 @@ void bio_set_pages_dirty(struct bio *bio)
set_page_dirty_lock(page);
}
 }
+EXPORT_SYMBOL_GPL(bio_set_pages_dirty);
 
 static void bio_release_pages(struct bio *bio)
 {
@@ -1693,6 +1694,7 @@ void bio_check_pages_dirty(struct bio *bio)
bio_put(bio);
}
 }
+EXPORT_SYMBOL_GPL(bio_check_pages_dirty);
 
 void generic_start_io_acct(struct request_queue *q, int rw,
   unsigned long sectors, struct hd_struct *part)
-- 
2.17.0



[PATCH 07/10] block: Add missing flush_dcache_page() call

2018-05-18 Thread Kent Overstreet
Since a bio can point to userspace pages (e.g. direct IO), this is
generally necessary.
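
This completes the usual pattern for writing to a page through a kernel
mapping, as in bio_copy_data_iter() below (simplified):

	dst_p = kmap_atomic(dst_bv.bv_page);
	memcpy(dst_p + dst_bv.bv_offset, src_p + src_bv.bv_offset, bytes);
	kunmap_atomic(dst_p);
	flush_dcache_page(dst_bv.bv_page);	/* keep aliasing D-caches coherent */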

Signed-off-by: Kent Overstreet 
---
 block/bio.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/block/bio.c b/block/bio.c
index c58544d4bc..ce8e259f9a 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -994,6 +994,8 @@ void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter,
kunmap_atomic(dst_p);
kunmap_atomic(src_p);
 
+   flush_dcache_page(dst_bv.bv_page);
+
bio_advance_iter(src, src_iter, bytes);
bio_advance_iter(dst, dst_iter, bytes);
}
-- 
2.17.0



[PATCH 06/10] block: Split out bio_list_copy_data()

2018-05-18 Thread Kent Overstreet
Found a bug (with ASAN) where we were passing a bio to bio_copy_data()
with bi_next not NULL when it should have been NULL - a driver had left
bi_next set to something after calling bio_endio().

Since the normal case is only copying single bios, split out
bio_list_copy_data() to avoid more bugs like this in the future.
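
After the split, callers look like this (sketch; the pktcdvd hunk below
is the one list user converted here):

	/* single bios - bi_next must be NULL on both: */
	bio_copy_data(dst, src);

	/* chains linked through bi_next: */
	bio_list_copy_data(pkt->w_bio, pkt->orig_bios.head);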

Signed-off-by: Kent Overstreet 
---
 block/bio.c | 83 +
 drivers/block/pktcdvd.c |  2 +-
 include/linux/bio.h |  5 ++-
 3 files changed, 55 insertions(+), 35 deletions(-)

diff --git a/block/bio.c b/block/bio.c
index d7bd765e9e..c58544d4bc 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -971,32 +971,16 @@ void bio_advance(struct bio *bio, unsigned bytes)
 }
 EXPORT_SYMBOL(bio_advance);
 
-void bio_copy_data_iter(struct bio *dst, struct bvec_iter dst_iter,
-   struct bio *src, struct bvec_iter src_iter)
+void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter,
+   struct bio *src, struct bvec_iter *src_iter)
 {
struct bio_vec src_bv, dst_bv;
void *src_p, *dst_p;
unsigned bytes;
 
-   while (1) {
-   if (!src_iter.bi_size) {
-   src = src->bi_next;
-   if (!src)
-   break;
-
-   src_iter = src->bi_iter;
-   }
-
-   if (!dst_iter.bi_size) {
-   dst = dst->bi_next;
-   if (!dst)
-   break;
-
-   dst_iter = dst->bi_iter;
-   }
-
-   src_bv = bio_iter_iovec(src, src_iter);
-   dst_bv = bio_iter_iovec(dst, dst_iter);
+   while (src_iter->bi_size && dst_iter->bi_size) {
+   src_bv = bio_iter_iovec(src, *src_iter);
+   dst_bv = bio_iter_iovec(dst, *dst_iter);
 
bytes = min(src_bv.bv_len, dst_bv.bv_len);
 
@@ -1010,31 +994,66 @@ void bio_copy_data_iter(struct bio *dst, struct bvec_iter dst_iter,
kunmap_atomic(dst_p);
kunmap_atomic(src_p);
 
-   bio_advance_iter(src, &src_iter, bytes);
-   bio_advance_iter(dst, &dst_iter, bytes);
+   bio_advance_iter(src, src_iter, bytes);
+   bio_advance_iter(dst, dst_iter, bytes);
}
 }
 EXPORT_SYMBOL(bio_copy_data_iter);
 
 /**
- * bio_copy_data - copy contents of data buffers from one chain of bios to
- * another
- * @src: source bio list
- * @dst: destination bio list
- *
- * If @src and @dst are single bios, bi_next must be NULL - otherwise, treats
- * @src and @dst as linked lists of bios.
+ * bio_copy_data - copy contents of data buffers from one bio to another
+ * @src: source bio
+ * @dst: destination bio
  *
  * Stops when it reaches the end of either @src or @dst - that is, copies
 * min(src->bi_size, dst->bi_size) bytes.
  */
 void bio_copy_data(struct bio *dst, struct bio *src)
 {
-   bio_copy_data_iter(dst, dst->bi_iter,
-  src, src->bi_iter);
+   struct bvec_iter src_iter = src->bi_iter;
+   struct bvec_iter dst_iter = dst->bi_iter;
+
+   bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
 }
 EXPORT_SYMBOL(bio_copy_data);
 
+/**
+ * bio_list_copy_data - copy contents of data buffers from one chain of bios to
+ * another
+ * @src: source bio list
+ * @dst: destination bio list
+ *
+ * Stops when it reaches the end of either the @src list or @dst list - that is,
+ * copies min(src->bi_size, dst->bi_size) bytes (or the equivalent for lists of
+ * bios).
+ */
+void bio_list_copy_data(struct bio *dst, struct bio *src)
+{
+   struct bvec_iter src_iter = src->bi_iter;
+   struct bvec_iter dst_iter = dst->bi_iter;
+
+   while (1) {
+   if (!src_iter.bi_size) {
+   src = src->bi_next;
+   if (!src)
+   break;
+
+   src_iter = src->bi_iter;
+   }
+
+   if (!dst_iter.bi_size) {
+   dst = dst->bi_next;
+   if (!dst)
+   break;
+
+   dst_iter = dst->bi_iter;
+   }
+
+   bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
+   }
+}
+EXPORT_SYMBOL(bio_list_copy_data);
+
 struct bio_map_data {
int is_our_pages;
struct iov_iter iter;
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index c61d20c9f3..00ea788b17 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -1285,7 +1285,7 @@ static void pkt_start_write(struct pktcdvd_device *pd, struct packet_data *pkt)
 * Fill-in bvec with data from orig_bios.
 */
spin_lock(&pkt->lock);
-   bio_copy_data(pkt->w_bio, pkt->orig_bios.head);
+   bio_list_copy_data(pkt->w_bio, pkt->orig_bios.head);
 
pkt_set_state(pkt, P

[PATCH 05/10] block: Add bio_copy_data_iter(), zero_fill_bio_iter()

2018-05-18 Thread Kent Overstreet
Add versions that take bvec_iter args instead of using bio->bi_iter - to
be used by bcachefs.
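
The iter variants let a caller work on a sub-range without touching
bio->bi_iter. A small sketch (hypothetical caller):

	struct bvec_iter iter = bio->bi_iter;

	bio_advance_iter(bio, &iter, 512);	/* skip the first sector */
	zero_fill_bio_iter(bio, iter);		/* zero the remainder */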

Signed-off-by: Kent Overstreet 
---
 block/bio.c | 44 
 include/linux/bio.h | 18 +++---
 2 files changed, 39 insertions(+), 23 deletions(-)

diff --git a/block/bio.c b/block/bio.c
index b7cdad6fc4..d7bd765e9e 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -530,20 +530,20 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, unsigned int nr_iovecs,
 }
 EXPORT_SYMBOL(bio_alloc_bioset);
 
-void zero_fill_bio(struct bio *bio)
+void zero_fill_bio_iter(struct bio *bio, struct bvec_iter start)
 {
unsigned long flags;
struct bio_vec bv;
struct bvec_iter iter;
 
-   bio_for_each_segment(bv, bio, iter) {
+   __bio_for_each_segment(bv, bio, iter, start) {
char *data = bvec_kmap_irq(&bv, &flags);
memset(data, 0, bv.bv_len);
flush_dcache_page(bv.bv_page);
bvec_kunmap_irq(data, &flags);
}
 }
-EXPORT_SYMBOL(zero_fill_bio);
+EXPORT_SYMBOL(zero_fill_bio_iter);
 
 /**
  * bio_put - release a reference to a bio
@@ -971,28 +971,13 @@ void bio_advance(struct bio *bio, unsigned bytes)
 }
 EXPORT_SYMBOL(bio_advance);
 
-/**
- * bio_copy_data - copy contents of data buffers from one chain of bios to
- * another
- * @src: source bio list
- * @dst: destination bio list
- *
- * If @src and @dst are single bios, bi_next must be NULL - otherwise, treats
- * @src and @dst as linked lists of bios.
- *
- * Stops when it reaches the end of either @src or @dst - that is, copies
- * min(src->bi_size, dst->bi_size) bytes (or the equivalent for lists of bios).
- */
-void bio_copy_data(struct bio *dst, struct bio *src)
+void bio_copy_data_iter(struct bio *dst, struct bvec_iter dst_iter,
+   struct bio *src, struct bvec_iter src_iter)
 {
-   struct bvec_iter src_iter, dst_iter;
struct bio_vec src_bv, dst_bv;
void *src_p, *dst_p;
unsigned bytes;
 
-   src_iter = src->bi_iter;
-   dst_iter = dst->bi_iter;
-
while (1) {
if (!src_iter.bi_size) {
src = src->bi_next;
@@ -1029,6 +1014,25 @@ void bio_copy_data(struct bio *dst, struct bio *src)
bio_advance_iter(dst, &dst_iter, bytes);
}
 }
+EXPORT_SYMBOL(bio_copy_data_iter);
+
+/**
+ * bio_copy_data - copy contents of data buffers from one chain of bios to
+ * another
+ * @src: source bio list
+ * @dst: destination bio list
+ *
+ * If @src and @dst are single bios, bi_next must be NULL - otherwise, treats
+ * @src and @dst as linked lists of bios.
+ *
+ * Stops when it reaches the end of either @src or @dst - that is, copies
+ * min(src->bi_size, dst->bi_size) bytes (or the equivalent for lists of bios).
+ */
+void bio_copy_data(struct bio *dst, struct bio *src)
+{
+   bio_copy_data_iter(dst, dst->bi_iter,
+  src, src->bi_iter);
+}
 EXPORT_SYMBOL(bio_copy_data);
 
 struct bio_map_data {
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 91b02520e2..5a6ee955a8 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -67,8 +67,12 @@
 
 #define bio_multiple_segments(bio) \
((bio)->bi_iter.bi_size != bio_iovec(bio).bv_len)
-#define bio_sectors(bio)   ((bio)->bi_iter.bi_size >> 9)
-#define bio_end_sector(bio)((bio)->bi_iter.bi_sector + bio_sectors((bio)))
+
+#define bvec_iter_sectors(iter)((iter).bi_size >> 9)
+#define bvec_iter_end_sector(iter) ((iter).bi_sector + bvec_iter_sectors((iter)))
+
+#define bio_sectors(bio)   bvec_iter_sectors((bio)->bi_iter)
+#define bio_end_sector(bio)bvec_iter_end_sector((bio)->bi_iter)
 
 /*
  * Return the data direction, READ or WRITE.
@@ -501,6 +505,8 @@ static inline void bio_flush_dcache_pages(struct bio *bi)
 }
 #endif
 
+extern void bio_copy_data_iter(struct bio *dst, struct bvec_iter dst_iter,
+  struct bio *src, struct bvec_iter src_iter);
 extern void bio_copy_data(struct bio *dst, struct bio *src);
 extern void bio_free_pages(struct bio *bio);
 
@@ -509,7 +515,13 @@ extern struct bio *bio_copy_user_iov(struct request_queue *,
 struct iov_iter *,
 gfp_t);
 extern int bio_uncopy_user(struct bio *);
-void zero_fill_bio(struct bio *bio);
+void zero_fill_bio_iter(struct bio *bio, struct bvec_iter iter);
+
+static inline void zero_fill_bio(struct bio *bio)
+{
+   zero_fill_bio_iter(bio, bio->bi_iter);
+}
+
 extern struct bio_vec *bvec_alloc(gfp_t, int, unsigned long *, mempool_t *);
 extern void bvec_free(mempool_t *, struct bio_vec *, unsigned int);
 extern unsigned int bvec_nr_vecs(unsigned short idx);
-- 
2.17.0



[PATCH 04/10] block: Use bioset_init() for fs_bio_set

2018-05-18 Thread Kent Overstreet
Minor optimization - remove a pointer indirection when using fs_bio_set.

Signed-off-by: Kent Overstreet 
---
 block/bio.c | 7 +++
 block/blk-core.c| 2 +-
 drivers/target/target_core_iblock.c | 2 +-
 include/linux/bio.h | 4 ++--
 4 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/block/bio.c b/block/bio.c
index 980befd919..b7cdad6fc4 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -53,7 +53,7 @@ static struct biovec_slab bvec_slabs[BVEC_POOL_NR] __read_mostly = {
  * fs_bio_set is the bio_set containing bio and iovec memory pools used by
  * IO code that does not need private memory pools.
  */
-struct bio_set *fs_bio_set;
+struct bio_set fs_bio_set;
 EXPORT_SYMBOL(fs_bio_set);
 
 /*
@@ -2055,11 +2055,10 @@ static int __init init_bio(void)
bio_integrity_init();
biovec_init_slabs();
 
-   fs_bio_set = bioset_create(BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
-   if (!fs_bio_set)
+   if (bioset_init(&fs_bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS))
panic("bio: can't allocate bios\n");
 
-   if (bioset_integrity_create(fs_bio_set, BIO_POOL_SIZE))
+   if (bioset_integrity_create(&fs_bio_set, BIO_POOL_SIZE))
panic("bio: can't create integrity pool\n");
 
return 0;
diff --git a/block/blk-core.c b/block/blk-core.c
index 6d82c4f7fa..66f24798ef 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -3409,7 +3409,7 @@ int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
struct bio *bio, *bio_src;
 
if (!bs)
-   bs = fs_bio_set;
+   bs = &fs_bio_set;
 
__rq_for_each_bio(bio_src, rq_src) {
bio = bio_clone_fast(bio_src, gfp_mask, bs);
diff --git a/drivers/target/target_core_iblock.c b/drivers/target/target_core_iblock.c
index 07c814c426..c969c01c7c 100644
--- a/drivers/target/target_core_iblock.c
+++ b/drivers/target/target_core_iblock.c
@@ -164,7 +164,7 @@ static int iblock_configure_device(struct se_device *dev)
goto out_blkdev_put;
}
pr_debug("IBLOCK setup BIP bs->bio_integrity_pool: 
%p\n",
-bs->bio_integrity_pool);
+&bs->bio_integrity_pool);
}
dev->dev_attrib.hw_pi_prot_type = dev->dev_attrib.pi_prot_type;
}
diff --git a/include/linux/bio.h b/include/linux/bio.h
index fa3cf94a50..91b02520e2 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -423,11 +423,11 @@ extern void __bio_clone_fast(struct bio *, struct bio *);
 extern struct bio *bio_clone_fast(struct bio *, gfp_t, struct bio_set *);
 extern struct bio *bio_clone_bioset(struct bio *, gfp_t, struct bio_set *bs);
 
-extern struct bio_set *fs_bio_set;
+extern struct bio_set fs_bio_set;
 
 static inline struct bio *bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs)
 {
-   return bio_alloc_bioset(gfp_mask, nr_iovecs, fs_bio_set);
+   return bio_alloc_bioset(gfp_mask, nr_iovecs, &fs_bio_set);
 }
 
 static inline struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned int nr_iovecs)
-- 
2.17.0



[PATCH 03/10] block: Add bioset_init()/bioset_exit()

2018-05-18 Thread Kent Overstreet
Similarly to mempool_init()/mempool_exit(), take a pointer indirection
out of allocation/freeing by allowing biosets to be embedded in other
structs.
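
Usage sketch (hypothetical driver struct, error handling elided):

	struct my_dev {
		struct bio_set	bs;	/* embedded, was: struct bio_set *bs */
	};

	if (bioset_init(&d->bs, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS))
		return -ENOMEM;
	/* ... */
	bioset_exit(&d->bs);	/* also safe on a kzalloc()ed, never-initialized set */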

Signed-off-by: Kent Overstreet 
---
 block/bio.c | 93 +++--
 include/linux/bio.h |  2 +
 2 files changed, 67 insertions(+), 28 deletions(-)

diff --git a/block/bio.c b/block/bio.c
index 360e9bcea5..980befd919 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1856,21 +1856,83 @@ int biovec_init_pool(mempool_t *pool, int pool_entries)
return mempool_init_slab_pool(pool, pool_entries, bp->slab);
 }
 
-void bioset_free(struct bio_set *bs)
+/*
+ * bioset_exit - exit a bioset initialized with bioset_init()
+ *
+ * May be called on a zeroed but uninitialized bioset (i.e. allocated with
+ * kzalloc()).
+ */
+void bioset_exit(struct bio_set *bs)
 {
if (bs->rescue_workqueue)
destroy_workqueue(bs->rescue_workqueue);
+   bs->rescue_workqueue = NULL;
 
mempool_exit(&bs->bio_pool);
mempool_exit(&bs->bvec_pool);
 
bioset_integrity_free(bs);
-   bio_put_slab(bs);
+   if (bs->bio_slab)
+   bio_put_slab(bs);
+   bs->bio_slab = NULL;
+}
+EXPORT_SYMBOL(bioset_exit);
 
+void bioset_free(struct bio_set *bs)
+{
+   bioset_exit(bs);
kfree(bs);
 }
 EXPORT_SYMBOL(bioset_free);
 
+/**
+ * bioset_init - Initialize a bio_set
+ * @pool_size: Number of bio and bio_vecs to cache in the mempool
+ * @front_pad: Number of bytes to allocate in front of the returned bio
+ * @flags: Flags to modify behavior, currently %BIOSET_NEED_BVECS
+ *  and %BIOSET_NEED_RESCUER
+ *
+ * Similar to bioset_create(), but initializes a passed-in bioset instead of
+ * separately allocating it.
+ */
+int bioset_init(struct bio_set *bs,
+   unsigned int pool_size,
+   unsigned int front_pad,
+   int flags)
+{
+   unsigned int back_pad = BIO_INLINE_VECS * sizeof(struct bio_vec);
+
+   bs->front_pad = front_pad;
+
+   spin_lock_init(&bs->rescue_lock);
+   bio_list_init(&bs->rescue_list);
+   INIT_WORK(&bs->rescue_work, bio_alloc_rescue);
+
+   bs->bio_slab = bio_find_or_create_slab(front_pad + back_pad);
+   if (!bs->bio_slab)
+   return -ENOMEM;
+
+   if (mempool_init_slab_pool(&bs->bio_pool, pool_size, bs->bio_slab))
+   goto bad;
+
+   if ((flags & BIOSET_NEED_BVECS) &&
+   biovec_init_pool(&bs->bvec_pool, pool_size))
+   goto bad;
+
+   if (!(flags & BIOSET_NEED_RESCUER))
+   return 0;
+
+   bs->rescue_workqueue = alloc_workqueue("bioset", WQ_MEM_RECLAIM, 0);
+   if (!bs->rescue_workqueue)
+   goto bad;
+
+   return 0;
+bad:
+   bioset_exit(bs);
+   return -ENOMEM;
+}
+EXPORT_SYMBOL(bioset_init);
+
 /**
  * bioset_create  - Create a bio_set
  * @pool_size: Number of bio and bio_vecs to cache in the mempool
@@ -1895,43 +1957,18 @@ struct bio_set *bioset_create(unsigned int pool_size,
  unsigned int front_pad,
  int flags)
 {
-   unsigned int back_pad = BIO_INLINE_VECS * sizeof(struct bio_vec);
struct bio_set *bs;
 
bs = kzalloc(sizeof(*bs), GFP_KERNEL);
if (!bs)
return NULL;
 
-   bs->front_pad = front_pad;
-
-   spin_lock_init(&bs->rescue_lock);
-   bio_list_init(&bs->rescue_list);
-   INIT_WORK(&bs->rescue_work, bio_alloc_rescue);
-
-   bs->bio_slab = bio_find_or_create_slab(front_pad + back_pad);
-   if (!bs->bio_slab) {
+   if (bioset_init(bs, pool_size, front_pad, flags)) {
kfree(bs);
return NULL;
}
 
-   if (mempool_init_slab_pool(&bs->bio_pool, pool_size, bs->bio_slab))
-   goto bad;
-
-   if ((flags & BIOSET_NEED_BVECS) &&
-   biovec_init_pool(&bs->bvec_pool, pool_size))
-   goto bad;
-
-   if (!(flags & BIOSET_NEED_RESCUER))
-   return bs;
-
-   bs->rescue_workqueue = alloc_workqueue("bioset", WQ_MEM_RECLAIM, 0);
-   if (!bs->rescue_workqueue)
-   goto bad;
-
return bs;
-bad:
-   bioset_free(bs);
-   return NULL;
 }
 EXPORT_SYMBOL(bioset_create);
 
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 720f7261d0..fa3cf94a50 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -406,6 +406,8 @@ static inline struct bio *bio_next_split(struct bio *bio, int sectors,
return bio_split(bio, sectors, gfp, bs);
 }
 
+extern int bioset_init(struct bio_set *, unsigned int, unsigned int, int flags);
+extern void bioset_exit(struct bio_set *);
 extern struct bio_set *bioset_create(unsigned int, unsigned int, int flags);
 enum {
BIOSET_NEED_BVECS = BIT(0),
-- 
2.17.0



[PATCH 01/10] mempool: Add mempool_init()/mempool_exit()

2018-05-18 Thread Kent Overstreet
Allows mempools to be embedded in other structs, getting rid of a
pointer indirection from allocation fastpaths.

mempool_exit() is safe to call on an uninitialized but zeroed mempool.
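
Usage sketch (hypothetical struct and slab cache):

	struct my_cache {
		mempool_t	pool;	/* embedded, was: mempool_t *pool */
	};

	if (mempool_init_slab_pool(&c->pool, 16, my_slab))
		return -ENOMEM;
	/* ... */
	mempool_exit(&c->pool);	/* safe on a zeroed, never-initialized pool */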

Signed-off-by: Kent Overstreet 
---
 include/linux/mempool.h |  34 +
 mm/mempool.c| 108 ++--
 2 files changed, 115 insertions(+), 27 deletions(-)

diff --git a/include/linux/mempool.h b/include/linux/mempool.h
index b51f5c430c..0c964ac107 100644
--- a/include/linux/mempool.h
+++ b/include/linux/mempool.h
@@ -25,6 +25,18 @@ typedef struct mempool_s {
wait_queue_head_t wait;
 } mempool_t;
 
+static inline bool mempool_initialized(mempool_t *pool)
+{
+   return pool->elements != NULL;
+}
+
+void mempool_exit(mempool_t *pool);
+int mempool_init_node(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn,
+ mempool_free_t *free_fn, void *pool_data,
+ gfp_t gfp_mask, int node_id);
+int mempool_init(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn,
+mempool_free_t *free_fn, void *pool_data);
+
 extern mempool_t *mempool_create(int min_nr, mempool_alloc_t *alloc_fn,
mempool_free_t *free_fn, void *pool_data);
 extern mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn,
@@ -43,6 +55,14 @@ extern void mempool_free(void *element, mempool_t *pool);
  */
 void *mempool_alloc_slab(gfp_t gfp_mask, void *pool_data);
 void mempool_free_slab(void *element, void *pool_data);
+
+static inline int
+mempool_init_slab_pool(mempool_t *pool, int min_nr, struct kmem_cache *kc)
+{
+   return mempool_init(pool, min_nr, mempool_alloc_slab,
+   mempool_free_slab, (void *) kc);
+}
+
 static inline mempool_t *
 mempool_create_slab_pool(int min_nr, struct kmem_cache *kc)
 {
@@ -56,6 +76,13 @@ mempool_create_slab_pool(int min_nr, struct kmem_cache *kc)
  */
 void *mempool_kmalloc(gfp_t gfp_mask, void *pool_data);
 void mempool_kfree(void *element, void *pool_data);
+
+static inline int mempool_init_kmalloc_pool(mempool_t *pool, int min_nr, size_t size)
+{
+   return mempool_init(pool, min_nr, mempool_kmalloc,
+   mempool_kfree, (void *) size);
+}
+
 static inline mempool_t *mempool_create_kmalloc_pool(int min_nr, size_t size)
 {
return mempool_create(min_nr, mempool_kmalloc, mempool_kfree,
@@ -68,6 +95,13 @@ static inline mempool_t *mempool_create_kmalloc_pool(int min_nr, size_t size)
  */
 void *mempool_alloc_pages(gfp_t gfp_mask, void *pool_data);
 void mempool_free_pages(void *element, void *pool_data);
+
+static inline int mempool_init_page_pool(mempool_t *pool, int min_nr, int order)
+{
+   return mempool_init(pool, min_nr, mempool_alloc_pages,
+   mempool_free_pages, (void *)(long)order);
+}
+
 static inline mempool_t *mempool_create_page_pool(int min_nr, int order)
 {
return mempool_create(min_nr, mempool_alloc_pages, mempool_free_pages,
diff --git a/mm/mempool.c b/mm/mempool.c
index 5c9dce3471..df90ace400 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -137,6 +137,28 @@ static void *remove_element(mempool_t *pool, gfp_t flags)
return element;
 }
 
+/**
+ * mempool_exit - exit a mempool initialized with mempool_init()
+ * @pool:  pointer to the memory pool which was initialized with
+ * mempool_init().
+ *
+ * Free all reserved elements in @pool.  This function
+ * only sleeps if the free_fn() function sleeps.
+ *
+ * May be called on a zeroed but uninitialized mempool (i.e. allocated with
+ * kzalloc()).
+ */
+void mempool_exit(mempool_t *pool)
+{
+   while (pool->curr_nr) {
+   void *element = remove_element(pool, GFP_KERNEL);
+   pool->free(element, pool->pool_data);
+   }
+   kfree(pool->elements);
+   pool->elements = NULL;
+}
+EXPORT_SYMBOL(mempool_exit);
+
 /**
  * mempool_destroy - deallocate a memory pool
  * @pool:  pointer to the memory pool which was allocated via
@@ -150,15 +172,65 @@ void mempool_destroy(mempool_t *pool)
if (unlikely(!pool))
return;
 
-   while (pool->curr_nr) {
-   void *element = remove_element(pool, GFP_KERNEL);
-   pool->free(element, pool->pool_data);
-   }
-   kfree(pool->elements);
+   mempool_exit(pool);
kfree(pool);
 }
 EXPORT_SYMBOL(mempool_destroy);
 
+int mempool_init_node(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn,
+ mempool_free_t *free_fn, void *pool_data,
+ gfp_t gfp_mask, int node_id)
+{
+   spin_lock_init(&pool->lock);
+   pool->min_nr= min_nr;
+   pool->pool_data = pool_data;
+   pool->alloc = alloc_fn;
+   pool->free  = free_fn;
+   init_waitqueue_head(&pool->wait);
+
+   pool->elements = kmalloc_array_node(min_nr, sizeof(void *),
+   gfp_mas

[PATCH 02/10] block: Convert bio_set to mempool_init()

2018-05-18 Thread Kent Overstreet
Minor performance improvement by getting rid of pointer indirections
from allocation/freeing fastpaths.

Signed-off-by: Kent Overstreet 
---
 block/bio-integrity.c | 29 ++---
 block/bio.c   | 36 +---
 include/linux/bio.h   | 10 +-
 3 files changed, 36 insertions(+), 39 deletions(-)

diff --git a/block/bio-integrity.c b/block/bio-integrity.c
index 9cfdd6c83b..add7c7c853 100644
--- a/block/bio-integrity.c
+++ b/block/bio-integrity.c
@@ -56,12 +56,12 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio,
struct bio_set *bs = bio->bi_pool;
unsigned inline_vecs;
 
-   if (!bs || !bs->bio_integrity_pool) {
+   if (!bs || !mempool_initialized(&bs->bio_integrity_pool)) {
bip = kmalloc(sizeof(struct bio_integrity_payload) +
  sizeof(struct bio_vec) * nr_vecs, gfp_mask);
inline_vecs = nr_vecs;
} else {
-   bip = mempool_alloc(bs->bio_integrity_pool, gfp_mask);
+   bip = mempool_alloc(&bs->bio_integrity_pool, gfp_mask);
inline_vecs = BIP_INLINE_VECS;
}
 
@@ -74,7 +74,7 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio,
unsigned long idx = 0;
 
bip->bip_vec = bvec_alloc(gfp_mask, nr_vecs, &idx,
- bs->bvec_integrity_pool);
+ &bs->bvec_integrity_pool);
if (!bip->bip_vec)
goto err;
bip->bip_max_vcnt = bvec_nr_vecs(idx);
@@ -90,7 +90,7 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio,
 
return bip;
 err:
-   mempool_free(bip, bs->bio_integrity_pool);
+   mempool_free(bip, &bs->bio_integrity_pool);
return ERR_PTR(-ENOMEM);
 }
 EXPORT_SYMBOL(bio_integrity_alloc);
@@ -111,10 +111,10 @@ static void bio_integrity_free(struct bio *bio)
kfree(page_address(bip->bip_vec->bv_page) +
  bip->bip_vec->bv_offset);
 
-   if (bs && bs->bio_integrity_pool) {
-   bvec_free(bs->bvec_integrity_pool, bip->bip_vec, bip->bip_slab);
+   if (bs && mempool_initialized(&bs->bio_integrity_pool)) {
+   bvec_free(&bs->bvec_integrity_pool, bip->bip_vec, bip->bip_slab);
 
-   mempool_free(bip, bs->bio_integrity_pool);
+   mempool_free(bip, &bs->bio_integrity_pool);
} else {
kfree(bip);
}
@@ -465,16 +465,15 @@ EXPORT_SYMBOL(bio_integrity_clone);
 
 int bioset_integrity_create(struct bio_set *bs, int pool_size)
 {
-   if (bs->bio_integrity_pool)
+   if (mempool_initialized(&bs->bio_integrity_pool))
return 0;
 
-   bs->bio_integrity_pool = mempool_create_slab_pool(pool_size, bip_slab);
-   if (!bs->bio_integrity_pool)
+   if (mempool_init_slab_pool(&bs->bio_integrity_pool,
+  pool_size, bip_slab))
return -1;
 
-   bs->bvec_integrity_pool = biovec_create_pool(pool_size);
-   if (!bs->bvec_integrity_pool) {
-   mempool_destroy(bs->bio_integrity_pool);
+   if (biovec_init_pool(&bs->bvec_integrity_pool, pool_size)) {
+   mempool_exit(&bs->bio_integrity_pool);
return -1;
}
 
@@ -484,8 +483,8 @@ EXPORT_SYMBOL(bioset_integrity_create);
 
 void bioset_integrity_free(struct bio_set *bs)
 {
-   mempool_destroy(bs->bio_integrity_pool);
-   mempool_destroy(bs->bvec_integrity_pool);
+   mempool_exit(&bs->bio_integrity_pool);
+   mempool_exit(&bs->bvec_integrity_pool);
 }
 EXPORT_SYMBOL(bioset_integrity_free);
 
diff --git a/block/bio.c b/block/bio.c
index e1708db482..360e9bcea5 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -254,7 +254,7 @@ static void bio_free(struct bio *bio)
bio_uninit(bio);
 
if (bs) {
-   bvec_free(bs->bvec_pool, bio->bi_io_vec, BVEC_POOL_IDX(bio));
+   bvec_free(&bs->bvec_pool, bio->bi_io_vec, BVEC_POOL_IDX(bio));
 
/*
 * If we have front padding, adjust the bio pointer before freeing
@@ -262,7 +262,7 @@ static void bio_free(struct bio *bio)
p = bio;
p -= bs->front_pad;
 
-   mempool_free(p, bs->bio_pool);
+   mempool_free(p, &bs->bio_pool);
} else {
/* Bio was allocated by bio_kmalloc() */
kfree(bio);
@@ -454,7 +454,8 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, unsigned int nr_iovecs,
inline_vecs = nr_iovecs;
} else {
/* should not use nobvec bioset for nr_iovecs > 0 */
-   if (WARN_ON_ONCE(!bs->bvec_pool && nr_iovecs > 0))
+   if (WARN_ON_ONCE(!mempool_initialized(&bs->bvec_pool) &&
+nr_iovecs > 0))
return NULL;
 

Re: [PATCH 02/10] block: Convert bio_set to mempool_init()

2018-05-18 Thread Johannes Thumshirn
On Fri, May 18, 2018 at 03:49:01AM -0400, Kent Overstreet wrote:
> Minor performance improvement by getting rid of pointer indirections
> from allocation/freeing fastpaths.

Reviewed-by: Johannes Thumshirn 

Although I'd prefer numbers in the changelog when claiming a
performance improvement.

-- 
Johannes Thumshirn                                          Storage
jthumsh...@suse.de                                +49 911 74053 689
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: Felix Imendörffer, Jane Smithard, Graham Norton
HRB 21284 (AG Nürnberg)
Key fingerprint = EC38 9CAB C2C4 F25D 8600 D0D0 0393 969D 2D76 0850


Re: [PATCH 01/10] mempool: Add mempool_init()/mempool_exit()

2018-05-18 Thread Johannes Thumshirn
Looks good,
Reviewed-by: Johannes Thumshirn 


Re: [PATCH 03/10] block: Add bioset_init()/bioset_exit()

2018-05-18 Thread Johannes Thumshirn
Looks good,
Reviewed-by: Johannes Thumshirn 


Re: [PATCH 08/10] block: Add warning for bi_next not NULL in bio_endio()

2018-05-18 Thread Johannes Thumshirn
Looks good,
Reviewed-by: Johannes Thumshirn 


Re: [PATCH 00/10] Misc block layer patches for bcachefs

2018-05-18 Thread Kent Overstreet
On Thu, May 17, 2018 at 08:54:57PM +, Bart Van Assche wrote:
> On Tue, 2018-05-08 at 21:33 -0400, Kent Overstreet wrote:
> > [ ... ]
> 
> Hello Kent,
> 
> With Jens' latest for-next branch I hit the kernel warning shown below. Can
> you have a look?

Any hints on how to reproduce it?

> Thanks,
> 
> Bart.
> 
> 
> ==
> BUG: KASAN: use-after-free in bio_advance+0x110/0x1b0
> Read of size 4 at addr 880156c5e6d0 by task ksoftirqd/10/72
> 
> CPU: 10 PID: 72 Comm: ksoftirqd/10 Tainted: GW 
> 4.17.0-rc4-dbg+ #5
> Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 
> 1.0.0-prebuilt.qemu-project.org 04/01/2014
> Call Trace:
> dump_stack+0x9a/0xeb
> print_address_description+0x65/0x270
> kasan_report+0x232/0x350
> bio_advance+0x110/0x1b0
> blk_update_request+0x9d/0x5a0
> scsi_end_request+0x4c/0x300 [scsi_mod]
> scsi_io_completion+0x71e/0xa40 [scsi_mod]
> __blk_mq_complete_request+0x143/0x220
> srp_recv_done+0x454/0x1100 [ib_srp]
> __ib_process_cq+0x9a/0xf0 [ib_core]
> ib_poll_handler+0x2d/0x90 [ib_core]
> irq_poll_softirq+0xe5/0x1e0
> __do_softirq+0x112/0x5f0
> run_ksoftirqd+0x29/0x50
> smpboot_thread_fn+0x30f/0x410
> kthread+0x1b2/0x1d0
> ret_from_fork+0x24/0x30
> 
> Allocated by task 1356:
> kasan_kmalloc+0xa0/0xd0
> kmem_cache_alloc+0xed/0x320
> mempool_alloc+0xc6/0x210
> bio_alloc_bioset+0x128/0x2d0
> submit_bh_wbc+0x95/0x2d0
> __block_write_full_page+0x2a6/0x5c0
> __writepage+0x37/0x80
> write_cache_pages+0x305/0x7c0
> generic_writepages+0xb9/0x110
> do_writepages+0x96/0x180
> __filemap_fdatawrite_range+0x162/0x1b0
> file_write_and_wait_range+0x4d/0xb0
> blkdev_fsync+0x3c/0x70
> do_fsync+0x33/0x60
> __x64_sys_fsync+0x18/0x20
> do_syscall_64+0x6d/0x220
> entry_SYSCALL_64_after_hwframe+0x49/0xbe
> 
> Freed by task 72:
> __kasan_slab_free+0x130/0x180
> kmem_cache_free+0xcd/0x380
> blk_update_request+0xc4/0x5a0
> blk_update_request+0xc4/0x5a0
> scsi_end_request+0x4c/0x300 [scsi_mod]
> scsi_io_completion+0x71e/0xa40 [scsi_mod]
> __blk_mq_complete_request+0x143/0x220
> srp_recv_done+0x454/0x1100 [ib_srp]
> __ib_process_cq+0x9a/0xf0 [ib_core]
> ib_poll_handler+0x2d/0x90 [ib_core]
> irq_poll_softirq+0xe5/0x1e0
> __do_softirq+0x112/0x5f0
> 
> The buggy address belongs to the object at 880156c5e640
> which belongs to the cache bio-0 of size 200
> The buggy address is located 144 bytes inside of
> 200-byte region [880156c5e640, 880156c5e708)
> The buggy address belongs to the page:
> page:ea00055b1780 count:1 mapcount:0 mapping: index:0x0 
> compound_mapcount: 0
> ib_srpt:srpt_zerolength_write: ib_srpt 10.196.159.179-24: queued zerolength 
> write
> flags: 0x80008100(slab|head)
> raw: 80008100   000100190019
> raw: ea000543a800 00020002 88015a8f3a00 
> ib_srpt:srpt_zerolength_write: ib_srpt 10.196.159.179-22: queued zerolength 
> write
> page dumped because: kasan: bad access detected
> ib_srpt:srpt_zerolength_write: ib_srpt 10.196.159.179-20: queued zerolength 
> write
> 
> Memory state around the buggy address:
> ib_srpt:srpt_zerolength_write: ib_srpt 10.196.159.179-18: queued zerolength 
> write
> 880156c5e580: 00 00 00 00 00 00 00 00 00 fc fc fc fc fc fc fc
> ib_srpt:srpt_zerolength_write_done: ib_srpt 10.196.159.179-24 wc->status 5
> 880156c5e600: fc fc fc fc fc fc fc fc fb fb fb fb fb fb fb fb
> ib_srpt:srpt_zerolength_write_done: ib_srpt 10.196.159.179-22 wc->status 5
> >880156c5e680: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
> ib_srpt:srpt_zerolength_write_done: ib_srpt 10.196.159.179-20 wc->status 5
> ^
> 880156c5e700: fb fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
> ib_srpt:srpt_zerolength_write_done: ib_srpt 10.196.159.179-18 wc->status 5
> 880156c5e780: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
> ib_srpt:srpt_release_channel_work: ib_srpt 10.196.159.179-24
> ==
> 
> (gdb) list *(bio_advance+0x110)
> 0x81450090 is in bio_advance (./include/linux/bvec.h:82).
> 77  iter->bi_size = 0;
> 78  return false;
> 79  }
> 80
> 81  while (bytes) {
> 82  unsigned iter_len = bvec_iter_len(bv, *iter);
> 83  unsigned len = min(bytes, iter_len);
> 84
> 85  bytes -= len;
> 86  iter->bi_size -= len;
> 
> 
> 
> 
> 
> 


Re: [PATCH] Bsg referencing parent device

2018-05-18 Thread Christoph Hellwig
The idea looks pretty reasonable, but once that is done we can get rid of
the ->release callback entirely and just handle it in the callers.
Something like the untested patch below:

diff --git a/block/bsg-lib.c b/block/bsg-lib.c
index fc2e5ff2c4b9..9419def8c017 100644
--- a/block/bsg-lib.c
+++ b/block/bsg-lib.c
@@ -303,11 +303,9 @@ static void bsg_exit_rq(struct request_queue *q, struct request *req)
  * @name: device to give bsg device
  * @job_fn: bsg job handler
  * @dd_job_size: size of LLD data needed for each job
- * @release: @dev release function
  */
 struct request_queue *bsg_setup_queue(struct device *dev, const char *name,
-   bsg_job_fn *job_fn, int dd_job_size,
-   void (*release)(struct device *))
+   bsg_job_fn *job_fn, int dd_job_size)
 {
struct request_queue *q;
int ret;
@@ -331,7 +329,7 @@ struct request_queue *bsg_setup_queue(struct device *dev, const char *name,
blk_queue_softirq_done(q, bsg_softirq_done);
blk_queue_rq_timeout(q, BLK_DEFAULT_SG_TIMEOUT);
 
-   ret = bsg_register_queue(q, dev, name, &bsg_transport_ops, release);
+   ret = bsg_register_queue(q, dev, name, &bsg_transport_ops);
if (ret) {
printk(KERN_ERR "%s: bsg interface failed to "
   "initialize - register queue\n", dev->kobj.name);
diff --git a/block/bsg.c b/block/bsg.c
index defa06c11858..fe1e5632e5d1 100644
--- a/block/bsg.c
+++ b/block/bsg.c
@@ -650,18 +650,6 @@ static struct bsg_device *bsg_alloc_device(void)
return bd;
 }
 
-static void bsg_kref_release_function(struct kref *kref)
-{
-   struct bsg_class_device *bcd =
-   container_of(kref, struct bsg_class_device, ref);
-   struct device *parent = bcd->parent;
-
-   if (bcd->release)
-   bcd->release(bcd->parent);
-
-   put_device(parent);
-}
-
 static int bsg_put_device(struct bsg_device *bd)
 {
int ret = 0, do_free;
@@ -694,7 +682,6 @@ static int bsg_put_device(struct bsg_device *bd)
 
kfree(bd);
 out:
-   kref_put(&q->bsg_dev.ref, bsg_kref_release_function);
if (do_free)
blk_put_queue(q);
return ret;
@@ -760,8 +747,6 @@ static struct bsg_device *bsg_get_device(struct inode *inode, struct file *file)
 */
mutex_lock(&bsg_mutex);
bcd = idr_find(&bsg_minor_idr, iminor(inode));
-   if (bcd)
-   kref_get(&bcd->ref);
mutex_unlock(&bsg_mutex);
 
if (!bcd)
@@ -772,8 +757,6 @@ static struct bsg_device *bsg_get_device(struct inode *inode, struct file *file)
return bd;
 
bd = bsg_add_device(inode, bcd->queue, file);
-   if (IS_ERR(bd))
-   kref_put(&bcd->ref, bsg_kref_release_function);
 
return bd;
 }
@@ -913,25 +896,17 @@ void bsg_unregister_queue(struct request_queue *q)
sysfs_remove_link(&q->kobj, "bsg");
device_unregister(bcd->class_dev);
bcd->class_dev = NULL;
-   kref_put(&bcd->ref, bsg_kref_release_function);
mutex_unlock(&bsg_mutex);
 }
 EXPORT_SYMBOL_GPL(bsg_unregister_queue);
 
 int bsg_register_queue(struct request_queue *q, struct device *parent,
-   const char *name, const struct bsg_ops *ops,
-   void (*release)(struct device *))
+   const char *name, const struct bsg_ops *ops)
 {
struct bsg_class_device *bcd;
dev_t dev;
int ret;
struct device *class_dev = NULL;
-   const char *devname;
-
-   if (name)
-   devname = name;
-   else
-   devname = dev_name(parent);
 
/*
 * we need a proper transport to send commands, not a stacked device
@@ -955,15 +930,12 @@ int bsg_register_queue(struct request_queue *q, struct device *parent,
 
bcd->minor = ret;
bcd->queue = q;
-   bcd->parent = get_device(parent);
-   bcd->release = release;
bcd->ops = ops;
-   kref_init(&bcd->ref);
dev = MKDEV(bsg_major, bcd->minor);
-   class_dev = device_create(bsg_class, parent, dev, NULL, "%s", devname);
+   class_dev = device_create(bsg_class, parent, dev, NULL, "%s", name);
if (IS_ERR(class_dev)) {
ret = PTR_ERR(class_dev);
-   goto put_dev;
+   goto idr_remove;
}
bcd->class_dev = class_dev;
 
@@ -978,8 +950,7 @@ int bsg_register_queue(struct request_queue *q, struct device *parent,
 
 unregister_class_dev:
device_unregister(class_dev);
-put_dev:
-   put_device(parent);
+idr_remove:
idr_remove(&bsg_minor_idr, bcd->minor);
 unlock:
mutex_unlock(&bsg_mutex);
@@ -993,7 +964,7 @@ int bsg_scsi_register_queue(struct request_queue *q, struct device *parent)
return -EINVAL;
}
 
-   return bsg_register_queue(q, parent, NULL, &bsg_scsi_ops, NULL);
+   return bsg_register_queue(q, parent, dev_name(parent), &bsg_scsi_ops);
 }
 EXPORT_

[PATCH v2 08/26] ibtrs: client: statistics functions

2018-05-18 Thread Roman Pen
This introduces the set of functions used on the client side to account
statistics of RDMA data sent/received, the number of IOs inflight,
latency, cpu migrations, etc.  Almost all statistics are collected
using percpu variables.
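
The shape of the percpu scheme, simplified from the code below:

	/* update path: lock-free, only the local CPU's counters */
	s = this_cpu_ptr(stats->pcpu_stats);
	s->rdma.failover_cnt++;

	/* read path: sum the per-CPU counters on demand */
	for_each_possible_cpu(cpu)
		sum += per_cpu_ptr(stats->pcpu_stats, cpu)->wc_comp.total_cnt;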

Signed-off-by: Roman Pen 
Signed-off-by: Danil Kipnis 
Cc: Jack Wang 
---
 drivers/infiniband/ulp/ibtrs/ibtrs-clt-stats.c | 455 +
 1 file changed, 455 insertions(+)
 create mode 100644 drivers/infiniband/ulp/ibtrs/ibtrs-clt-stats.c

diff --git a/drivers/infiniband/ulp/ibtrs/ibtrs-clt-stats.c b/drivers/infiniband/ulp/ibtrs/ibtrs-clt-stats.c
new file mode 100644
index ..af2ed05d2900
--- /dev/null
+++ b/drivers/infiniband/ulp/ibtrs/ibtrs-clt-stats.c
@@ -0,0 +1,455 @@
+/*
+ * InfiniBand Transport Layer
+ *
+ * Copyright (c) 2014 - 2017 ProfitBricks GmbH. All rights reserved.
+ * Authors: Fabian Holler 
+ *  Jack Wang 
+ *  Kleber Souza 
+ *  Danil Kipnis 
+ *  Roman Penyaev 
+ *  Milind Dumbare 
+ *
+ * Copyright (c) 2017 - 2018 ProfitBricks GmbH. All rights reserved.
+ * Authors: Danil Kipnis 
+ *  Roman Penyaev 
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#undef pr_fmt
+#define pr_fmt(fmt) KBUILD_MODNAME " L" __stringify(__LINE__) ": " fmt
+
+#include "ibtrs-clt.h"
+
+static inline int ibtrs_clt_ms_to_id(unsigned long ms)
+{
+   int id = ms ? ilog2(ms) - MIN_LOG_LAT + 1 : 0;
+
+   return clamp(id, 0, LOG_LAT_SZ - 1);
+}
+
+void ibtrs_clt_update_rdma_lat(struct ibtrs_clt_stats *stats, bool read,
+  unsigned long ms)
+{
+   struct ibtrs_clt_stats_pcpu *s;
+   int id;
+
+   id = ibtrs_clt_ms_to_id(ms);
+   s = this_cpu_ptr(stats->pcpu_stats);
+   if (read) {
+   s->rdma_lat_distr[id].read++;
+   if (s->rdma_lat_max.read < ms)
+   s->rdma_lat_max.read = ms;
+   } else {
+   s->rdma_lat_distr[id].write++;
+   if (s->rdma_lat_max.write < ms)
+   s->rdma_lat_max.write = ms;
+   }
+}
+
+void ibtrs_clt_decrease_inflight(struct ibtrs_clt_stats *stats)
+{
+   atomic_dec(&stats->inflight);
+}
+
+void ibtrs_clt_update_wc_stats(struct ibtrs_clt_con *con)
+{
+   struct ibtrs_clt_sess *sess = to_clt_sess(con->c.sess);
+   struct ibtrs_clt_stats *stats = &sess->stats;
+   struct ibtrs_clt_stats_pcpu *s;
+   int cpu;
+
+   cpu = raw_smp_processor_id();
+   s = this_cpu_ptr(stats->pcpu_stats);
+   s->wc_comp.cnt++;
+   s->wc_comp.total_cnt++;
+   if (unlikely(con->cpu != cpu)) {
+   s->cpu_migr.to++;
+
+   /* Careful here, override s pointer */
+   s = per_cpu_ptr(stats->pcpu_stats, con->cpu);
+   atomic_inc(&s->cpu_migr.from);
+   }
+}
+
+void ibtrs_clt_inc_failover_cnt(struct ibtrs_clt_stats *stats)
+{
+   struct ibtrs_clt_stats_pcpu *s;
+
+   s = this_cpu_ptr(stats->pcpu_stats);
+   s->rdma.failover_cnt++;
+}
+
+static inline u32 ibtrs_clt_stats_get_avg_wc_cnt(struct ibtrs_clt_stats *stats)
+{
+   u32 cnt = 0;
+   u64 sum = 0;
+   int cpu;
+
+   for_each_possible_cpu(cpu) {
+   struct ibtrs_clt_stats_pcpu *s;
+
+   s = per_cpu_ptr(stats->pcpu_stats, cpu);
+   sum += s->wc_comp.total_cnt;
+   cnt += s->wc_comp.cnt;
+   }
+
+   return cnt ? sum / cnt : 0;
+}
+
+int ibtrs_clt_stats_wc_completion_to_str(struct ibtrs_clt_stats *stats,
+char *buf, size_t len)
+{
+   return scnprintf(buf, len, "%u\n",
+ibtrs_clt_stats_get_avg_wc_cnt(stats));
+}
+
+ssize_t ibtrs_clt_stats_rdma_lat_distr_to_str(struct ibtrs_clt_stats *stats,
+ char *page, size_t len)
+{
+   struct ibtrs_clt_stats_rdma_lat res[LOG_LAT_SZ];
+   struct ibtrs_clt_stats_rdma_lat max;
+   struct ibtrs_clt_stats_pcpu *s;
+
+   ssize_t cnt = 0;
+   int i, cpu;
+
+   max.write = 0;
+   max.read = 0;
+   for_each_possible_cpu(cpu) {
+   s = per_cpu_ptr(stats->pcpu_stats, cpu);
+
+   if (max.write < s->rdma_lat_max.write)
+   max.write = s->rdma_lat_max.write;
+   if (max.read < s->rdma_lat_max.read)
+   max.read = s->rdma_lat_ma

[PATCH v2 07/26] ibtrs: client: main functionality

2018-05-18 Thread Roman Pen
This is the main functionality of the ibtrs-client module, which manages
a set of RDMA connections for each IBTRS session and does multipathing,
load balancing and failover of RDMA requests.

Signed-off-by: Roman Pen 
Signed-off-by: Danil Kipnis 
Cc: Jack Wang 
---
 drivers/infiniband/ulp/ibtrs/ibtrs-clt.c | 2818 ++
 1 file changed, 2818 insertions(+)
 create mode 100644 drivers/infiniband/ulp/ibtrs/ibtrs-clt.c

diff --git a/drivers/infiniband/ulp/ibtrs/ibtrs-clt.c b/drivers/infiniband/ulp/ibtrs/ibtrs-clt.c
new file mode 100644
index ..0983f0939b19
--- /dev/null
+++ b/drivers/infiniband/ulp/ibtrs/ibtrs-clt.c
@@ -0,0 +1,2818 @@
+/*
+ * InfiniBand Transport Layer
+ *
+ * Copyright (c) 2014 - 2017 ProfitBricks GmbH. All rights reserved.
+ * Authors: Fabian Holler 
+ *  Jack Wang 
+ *  Kleber Souza 
+ *  Danil Kipnis 
+ *  Roman Penyaev 
+ *  Milind Dumbare 
+ *
+ * Copyright (c) 2017 - 2018 ProfitBricks GmbH. All rights reserved.
+ * Authors: Danil Kipnis 
+ *  Roman Penyaev 
+ *  Swapnil Ingle 
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#undef pr_fmt
+#define pr_fmt(fmt) KBUILD_MODNAME " L" __stringify(__LINE__) ": " fmt
+
+#include 
+#include 
+
+#include "ibtrs-clt.h"
+#include "ibtrs-log.h"
+
+#define MAX_SEGMENTS 31
+#define IBTRS_CONNECT_TIMEOUT_MS 5000
+
+MODULE_AUTHOR("ib...@profitbricks.com");
+MODULE_DESCRIPTION("IBTRS Client");
+MODULE_VERSION(IBTRS_VER_STRING);
+MODULE_LICENSE("GPL");
+
+static ushort nr_cons_per_session;
+module_param(nr_cons_per_session, ushort, 0444);
+MODULE_PARM_DESC(nr_cons_per_session, "Number of connections per session."
+" (default: nr_cpu_ids)");
+
+static int retry_cnt = 7;
+module_param_named(retry_cnt, retry_cnt, int, 0644);
+MODULE_PARM_DESC(retry_cnt, "Number of times to send the message if the"
+" remote side didn't respond with Ack or Nack (default: 7,"
+" min: " __stringify(MIN_RTR_CNT) ", max: "
+__stringify(MAX_RTR_CNT) ")");
+
+static int __read_mostly noreg_cnt = 0;
+module_param_named(noreg_cnt, noreg_cnt, int, 0444);
+MODULE_PARM_DESC(noreg_cnt, "Max number of SG entries when MR registration "
+"does not happen (default: 0)");
+
+static const struct ibtrs_ib_dev_pool_ops dev_pool_ops;
+static struct ibtrs_ib_dev_pool dev_pool = {
+   .ops = &dev_pool_ops
+};
+static struct workqueue_struct *ibtrs_wq;
+static struct class *ibtrs_dev_class;
+
+static void ibtrs_rdma_error_recovery(struct ibtrs_clt_con *con);
+static int ibtrs_clt_rdma_cm_handler(struct rdma_cm_id *cm_id,
+struct rdma_cm_event *ev);
+static void ibtrs_clt_rdma_done(struct ib_cq *cq, struct ib_wc *wc);
+static void complete_rdma_req(struct ibtrs_clt_io_req *req, int errno,
+ bool notify, bool can_wait);
+static int ibtrs_clt_write_req(struct ibtrs_clt_io_req *req);
+static int ibtrs_clt_read_req(struct ibtrs_clt_io_req *req);
+
+bool ibtrs_clt_sess_is_connected(const struct ibtrs_clt_sess *sess)
+{
+   return sess->state == IBTRS_CLT_CONNECTED;
+}
+
+static inline bool ibtrs_clt_is_connected(const struct ibtrs_clt *clt)
+{
+   struct ibtrs_clt_sess *sess;
+   bool connected = false;
+
+   rcu_read_lock();
+   list_for_each_entry_rcu(sess, &clt->paths_list, s.entry)
+   connected |= ibtrs_clt_sess_is_connected(sess);
+   rcu_read_unlock();
+
+   return connected;
+}
+
+static inline struct ibtrs_tag *
+__ibtrs_get_tag(struct ibtrs_clt *clt, enum ibtrs_clt_con_type con_type)
+{
+   size_t max_depth = clt->queue_depth;
+   struct ibtrs_tag *tag;
+   int cpu, bit;
+
+   cpu = get_cpu();
+   do {
+   bit = find_first_zero_bit(clt->tags_map, max_depth);
+   if (unlikely(bit >= max_depth)) {
+   put_cpu();
+   return NULL;
+   }
+
+   } while (unlikely(test_and_set_bit_lock(bit, clt->tags_map)));
+   put_cpu();
+
+   tag = GET_TAG(clt, bit);
+   WARN_ON(tag->mem_id != bit);
+   tag->cpu_id = cpu;
+   tag->con_type = con_type;
+
+   return tag;
+}
+
+static inline void __ibtrs_put_tag(struct ibtrs_clt *clt,
+  struct ibtrs_tag *tag)
+{
+   clear_bit_unlock(tag->mem

[PATCH v2 10/26] ibtrs: server: private header with server structs and functions

2018-05-18 Thread Roman Pen
This header describes the main structs and functions used by the
ibtrs-server module, mainly for accepting IBTRS sessions,
creating/destroying sysfs entries, and accounting statistics on the
server side.

Signed-off-by: Roman Pen 
Signed-off-by: Danil Kipnis 
Cc: Jack Wang 
---
 drivers/infiniband/ulp/ibtrs/ibtrs-srv.h | 175 +++
 1 file changed, 175 insertions(+)
 create mode 100644 drivers/infiniband/ulp/ibtrs/ibtrs-srv.h

diff --git a/drivers/infiniband/ulp/ibtrs/ibtrs-srv.h b/drivers/infiniband/ulp/ibtrs/ibtrs-srv.h
new file mode 100644
index ..8193d568e67e
--- /dev/null
+++ b/drivers/infiniband/ulp/ibtrs/ibtrs-srv.h
@@ -0,0 +1,175 @@
+/*
+ * InfiniBand Transport Layer
+ *
+ * Copyright (c) 2014 - 2017 ProfitBricks GmbH. All rights reserved.
+ * Authors: Fabian Holler 
+ *  Jack Wang 
+ *  Kleber Souza 
+ *  Danil Kipnis 
+ *  Roman Penyaev 
+ *  Milind Dumbare 
+ *
+ * Copyright (c) 2017 - 2018 ProfitBricks GmbH. All rights reserved.
+ * Authors: Danil Kipnis 
+ *  Roman Penyaev 
+ *  Swapnil Ingle 
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef IBTRS_SRV_H
+#define IBTRS_SRV_H
+
+#include 
+#include 
+#include "ibtrs-pri.h"
+
+/**
+ * enum ibtrs_srv_state - Server states.
+ */
+enum ibtrs_srv_state {
+   IBTRS_SRV_CONNECTING,
+   IBTRS_SRV_CONNECTED,
+   IBTRS_SRV_CLOSING,
+   IBTRS_SRV_CLOSED,
+};
+
+static inline const char *ibtrs_srv_state_str(enum ibtrs_srv_state state)
+{
+   switch (state) {
+   case IBTRS_SRV_CONNECTING:
+   return "IBTRS_SRV_CONNECTING";
+   case IBTRS_SRV_CONNECTED:
+   return "IBTRS_SRV_CONNECTED";
+   case IBTRS_SRV_CLOSING:
+   return "IBTRS_SRV_CLOSING";
+   case IBTRS_SRV_CLOSED:
+   return "IBTRS_SRV_CLOSED";
+   default:
+   return "UNKNOWN";
+   }
+}
+
+struct ibtrs_stats_wc_comp {
+   atomic64_t  calls;
+   atomic64_t  total_wc_cnt;
+};
+
+struct ibtrs_srv_stats_rdma_stats {
+   struct {
+   atomic64_t  cnt;
+   atomic64_t  size_total;
+   } dir[2];
+};
+
+struct ibtrs_srv_stats {
+   struct ibtrs_srv_stats_rdma_stats   rdma_stats;
+   atomic_tapm_cnt;
+   struct ibtrs_stats_wc_comp  wc_comp;
+};
+
+struct ibtrs_srv_con {
+   struct ibtrs_conc;
+   atomic_twr_cnt;
+};
+
+struct ibtrs_srv_op {
+   struct ibtrs_srv_con*con;
+   u32 msg_id;
+   u8  dir;
+   struct ibtrs_msg_rdma_read  *rd_msg;
+   struct ib_rdma_wr   *tx_wr;
+   struct ib_sge   *tx_sg;
+};
+
+struct ibtrs_srv_mr {
+   struct ib_mr*mr;
+   struct sg_table sgt;
+};
+
+struct ibtrs_srv_sess {
+   struct ibtrs_sess   s;
+   struct ibtrs_srv*srv;
+   struct work_struct  close_work;
+   enum ibtrs_srv_statestate;
+   spinlock_t  state_lock;
+   int cur_cq_vector;
+   struct ibtrs_srv_op **ops_ids;
+   atomic_tids_inflight;
+   wait_queue_head_t   ids_waitq;
+   struct ibtrs_srv_mr *mrs;
+   unsigned intmrs_num;
+   dma_addr_t  *dma_addr;
+   boolestablished;
+   unsigned intmem_bits;
+   struct kobject  kobj;
+   struct kobject  kobj_stats;
+   struct ibtrs_srv_stats  stats;
+};
+
+struct ibtrs_srv {
+   struct list_headpaths_list;
+   int paths_up;
+   struct mutexpaths_ev_mutex;
+   size_t  paths_num;
+   struct mutexpaths_mutex;
+   uuid_t  paths_uuid;
+   refcount_t  refcount;
+   struct ibtrs_srv_ctx*ctx;
+   struct list_headctx_list;
+   void*priv;
+   size_t  queue_depth;
+   struct page **chunks;
+   struct device   dev;
+   unsigneddev_ref;
+   struct kobject  kobj_paths;
+};
+
+struct ibtrs_srv_ctx {
+   rdma_ev_fn *rdma_ev;
+   link_ev_fn *link_ev;
+

[PATCH v2 13/26] ibtrs: server: sysfs interface functions

2018-05-18 Thread Roman Pen
This is the sysfs interface to IBTRS sessions on the server side:

  /sys/devices/virtual/ibtrs-server/<session-name>/
*** IBTRS session accepted from a client peer
|
|- paths/<path-name>/
   *** established paths from a client in a session
   |
   |- disconnect
   |  *** disconnect path
   |
   |- hca_name
   |  *** HCA name
   |
   |- hca_port
   |  *** HCA port
   |
   |- stats/
  *** current path statistics
  |
  |- rdma
  |- reset_all
  |- wc_completions

Signed-off-by: Roman Pen 
Signed-off-by: Danil Kipnis 
Cc: Jack Wang 
---
 drivers/infiniband/ulp/ibtrs/ibtrs-srv-sysfs.c | 271 +
 1 file changed, 271 insertions(+)
 create mode 100644 drivers/infiniband/ulp/ibtrs/ibtrs-srv-sysfs.c

diff --git a/drivers/infiniband/ulp/ibtrs/ibtrs-srv-sysfs.c b/drivers/infiniband/ulp/ibtrs/ibtrs-srv-sysfs.c
new file mode 100644
index ..96d9d9f08e0e
--- /dev/null
+++ b/drivers/infiniband/ulp/ibtrs/ibtrs-srv-sysfs.c
@@ -0,0 +1,271 @@
+/*
+ * InfiniBand Transport Layer
+ *
+ * Copyright (c) 2014 - 2017 ProfitBricks GmbH. All rights reserved.
+ * Authors: Fabian Holler 
+ *  Jack Wang 
+ *  Kleber Souza 
+ *  Danil Kipnis 
+ *  Roman Penyaev 
+ *  Milind Dumbare 
+ *
+ * Copyright (c) 2017 - 2018 ProfitBricks GmbH. All rights reserved.
+ * Authors: Danil Kipnis 
+ *  Roman Penyaev 
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#undef pr_fmt
+#define pr_fmt(fmt) KBUILD_MODNAME " L" __stringify(__LINE__) ": " fmt
+
+#include "ibtrs-pri.h"
+#include "ibtrs-srv.h"
+#include "ibtrs-log.h"
+
+extern struct class *ibtrs_dev_class;
+
+static struct kobj_type ktype = {
+   .sysfs_ops  = &kobj_sysfs_ops,
+};
+
+static ssize_t ibtrs_srv_disconnect_show(struct kobject *kobj,
+struct kobj_attribute *attr,
+char *page)
+{
+   return scnprintf(page, PAGE_SIZE, "Usage: echo 1 > %s\n",
+attr->attr.name);
+}
+
+static ssize_t ibtrs_srv_disconnect_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+   struct ibtrs_srv_sess *sess;
+   char str[MAXHOSTNAMELEN];
+
+   sess = container_of(kobj, struct ibtrs_srv_sess, kobj);
+   if (!sysfs_streq(buf, "1")) {
+   ibtrs_err(sess, "%s: invalid value: '%s'\n",
+ attr->attr.name, buf);
+   return -EINVAL;
+   }
+
+   sockaddr_to_str((struct sockaddr *)&sess->s.dst_addr, str, sizeof(str));
+
+   ibtrs_info(sess, "disconnect for path %s requested\n", str);
+   ibtrs_srv_queue_close(sess);
+
+   return count;
+}
+
+static struct kobj_attribute ibtrs_srv_disconnect_attr =
+   __ATTR(disconnect, 0644,
+  ibtrs_srv_disconnect_show, ibtrs_srv_disconnect_store);
+
+static ssize_t ibtrs_srv_hca_port_show(struct kobject *kobj,
+  struct kobj_attribute *attr,
+  char *page)
+{
+   struct ibtrs_srv_sess *sess;
+   struct ibtrs_con *usr_con;
+
+   sess = container_of(kobj, typeof(*sess), kobj);
+   usr_con = sess->s.con[0];
+
+   return scnprintf(page, PAGE_SIZE, "%u\n",
+usr_con->cm_id->port_num);
+}
+
+static struct kobj_attribute ibtrs_srv_hca_port_attr =
+   __ATTR(hca_port, 0444, ibtrs_srv_hca_port_show, NULL);
+
+static ssize_t ibtrs_srv_hca_name_show(struct kobject *kobj,
+  struct kobj_attribute *attr,
+  char *page)
+{
+   struct ibtrs_srv_sess *sess;
+
+   sess = container_of(kobj, struct ibtrs_srv_sess, kobj);
+
+   return scnprintf(page, PAGE_SIZE, "%s\n",
+sess->s.dev->ib_dev->name);
+}
+
+static struct kobj_attribute ibtrs_srv_hca_name_attr =
+   __ATTR(hca_name, 0444, ibtrs_srv_hca_name_show, NULL);
+
+static struct attribute *ibtrs_srv_sess_attrs[] = {
+   &ibtrs_srv_hca_name_attr.attr,
+   &ibtrs_srv_hca_port_attr.attr,
+   &ibtrs_srv_disconnect_attr.attr,
+   NULL,
+};
+
+static struct attribute_group ibtrs_srv_sess_attr_group = {
+   .attrs = ibtrs_s

[PATCH v2 12/26] ibtrs: server: statistics functions

2018-05-18 Thread Roman Pen
This introduces a set of functions used on the server side to account
statistics of RDMA data sent/received.
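
As a usage sketch (the helpers themselves are defined in the diff below;
the exact call site is an assumption about the server hot path),
accounting an RDMA transfer boils down to a couple of atomic updates:

  /* Assumed call site: completion of an RDMA transfer of `len` bytes,
   * where `dir` is READ or WRITE (the indices used by the stats code). */
  static void on_rdma_done(struct ibtrs_srv_sess *sess, size_t len, int dir)
  {
        ibtrs_srv_update_rdma_stats(&sess->stats, len, dir);
        ibtrs_srv_update_wc_stats(&sess->stats);
  }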

Signed-off-by: Roman Pen 
Signed-off-by: Danil Kipnis 
Cc: Jack Wang 
---
 drivers/infiniband/ulp/ibtrs/ibtrs-srv-stats.c | 110 +
 1 file changed, 110 insertions(+)
 create mode 100644 drivers/infiniband/ulp/ibtrs/ibtrs-srv-stats.c

diff --git a/drivers/infiniband/ulp/ibtrs/ibtrs-srv-stats.c 
b/drivers/infiniband/ulp/ibtrs/ibtrs-srv-stats.c
new file mode 100644
index ..5933cfc03f95
--- /dev/null
+++ b/drivers/infiniband/ulp/ibtrs/ibtrs-srv-stats.c
@@ -0,0 +1,110 @@
+/*
+ * InfiniBand Transport Layer
+ *
+ * Copyright (c) 2014 - 2017 ProfitBricks GmbH. All rights reserved.
+ * Authors: Fabian Holler 
+ *  Jack Wang 
+ *  Kleber Souza 
+ *  Danil Kipnis 
+ *  Roman Penyaev 
+ *  Milind Dumbare 
+ *
+ * Copyright (c) 2017 - 2018 ProfitBricks GmbH. All rights reserved.
+ * Authors: Danil Kipnis 
+ *  Roman Penyaev 
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see .
+ */
+
+#undef pr_fmt
+#define pr_fmt(fmt) KBUILD_MODNAME " L" __stringify(__LINE__) ": " fmt
+
+#include "ibtrs-srv.h"
+
+void ibtrs_srv_update_rdma_stats(struct ibtrs_srv_stats *s,
+size_t size, int d)
+{
+   atomic64_inc(&s->rdma_stats.dir[d].cnt);
+   atomic64_add(size, &s->rdma_stats.dir[d].size_total);
+}
+
+void ibtrs_srv_update_wc_stats(struct ibtrs_srv_stats *s)
+{
+   atomic64_inc(&s->wc_comp.calls);
+   atomic64_inc(&s->wc_comp.total_wc_cnt);
+}
+
+int ibtrs_srv_reset_rdma_stats(struct ibtrs_srv_stats *stats, bool enable)
+{
+   if (enable) {
+   struct ibtrs_srv_stats_rdma_stats *r = &stats->rdma_stats;
+
+   memset(r, 0, sizeof(*r));
+   return 0;
+   }
+
+   return -EINVAL;
+}
+
+ssize_t ibtrs_srv_stats_rdma_to_str(struct ibtrs_srv_stats *stats,
+   char *page, size_t len)
+{
+   struct ibtrs_srv_stats_rdma_stats *r = &stats->rdma_stats;
+   struct ibtrs_srv_sess *sess;
+
+   sess = container_of(stats, typeof(*sess), stats);
+
+   return scnprintf(page, len, "%lld %lld %lld %lld %u\n",
+(s64)atomic64_read(&r->dir[READ].cnt),
+(s64)atomic64_read(&r->dir[READ].size_total),
+(s64)atomic64_read(&r->dir[WRITE].cnt),
+(s64)atomic64_read(&r->dir[WRITE].size_total),
+atomic_read(&sess->ids_inflight));
+}
+
+int ibtrs_srv_reset_wc_completion_stats(struct ibtrs_srv_stats *stats,
+   bool enable)
+{
+   if (enable) {
+   memset(&stats->wc_comp, 0, sizeof(stats->wc_comp));
+   return 0;
+   }
+
+   return -EINVAL;
+}
+
+int ibtrs_srv_stats_wc_completion_to_str(struct ibtrs_srv_stats *stats,
+char *buf, size_t len)
+{
+   return scnprintf(buf, len, "%lld %lld\n",
+   (s64)atomic64_read(&stats->wc_comp.total_wc_cnt),
+   (s64)atomic64_read(&stats->wc_comp.calls));
+}
+
+ssize_t ibtrs_srv_reset_all_help(struct ibtrs_srv_stats *stats,
+char *page, size_t len)
+{
+   return scnprintf(page, len, "echo 1 to reset all statistics\n");
+}
+
+int ibtrs_srv_reset_all_stats(struct ibtrs_srv_stats *stats, bool enable)
+{
+   if (enable) {
+   ibtrs_srv_reset_wc_completion_stats(stats, enable);
+   ibtrs_srv_reset_rdma_stats(stats, enable);
+   return 0;
+   }
+
+   return -EINVAL;
+}
-- 
2.13.1



[PATCH v2 16/26] ibnbd: private headers with IBNBD protocol structs and helpers

2018-05-18 Thread Roman Pen
These are common private headers with IBNBD protocol structures,
logging, sysfs and other helper functions, which are used on
both client and server sides.
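
For instance, the logging helpers in ibnbd-log.h dispatch on the device
type at compile time, so the same macro works in both modules (a minimal
sketch, assuming valid client and server device pointers are at hand):

  /* One macro, two unrelated struct types: __builtin_types_compatible_p()
   * plus __builtin_choose_expr() pick the right format at compile time. */
  static void log_example(struct ibnbd_clt_dev *clt_dev,
                          struct ibnbd_srv_sess_dev *srv_dev)
  {
        ibnbd_err(clt_dev, "I/O error: %d\n", -EIO); /* client-side format */
        ibnbd_err(srv_dev, "I/O error: %d\n", -EIO); /* server-side format */
  }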

Signed-off-by: Roman Pen 
Signed-off-by: Danil Kipnis 
Cc: Jack Wang 
---
 drivers/block/ibnbd/ibnbd-log.h   |  71 
 drivers/block/ibnbd/ibnbd-proto.h | 364 ++
 2 files changed, 435 insertions(+)
 create mode 100644 drivers/block/ibnbd/ibnbd-log.h
 create mode 100644 drivers/block/ibnbd/ibnbd-proto.h

diff --git a/drivers/block/ibnbd/ibnbd-log.h b/drivers/block/ibnbd/ibnbd-log.h
new file mode 100644
index ..489343a61171
--- /dev/null
+++ b/drivers/block/ibnbd/ibnbd-log.h
@@ -0,0 +1,71 @@
+/*
+ * InfiniBand Network Block Driver
+ *
+ * Copyright (c) 2014 - 2017 ProfitBricks GmbH. All rights reserved.
+ * Authors: Fabian Holler 
+ *  Jack Wang 
+ *  Kleber Souza 
+ *  Danil Kipnis 
+ *  Roman Penyaev 
+ *  Milind Dumbare 
+ *
+ * Copyright (c) 2017 - 2018 ProfitBricks GmbH. All rights reserved.
+ * Authors: Danil Kipnis 
+ *  Roman Penyaev 
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see .
+ */
+
+#ifndef IBNBD_LOG_H
+#define IBNBD_LOG_H
+
+#include "ibnbd-clt.h"
+#include "ibnbd-srv.h"
+
+#define ibnbd_diskname(dev) ({ \
+   struct gendisk *gd = ((struct ibnbd_clt_dev *)dev)->gd; \
+   gd ? gd->disk_name : "<no dev>";\
+})
+
+void unknown_type(void);
+
+#define ibnbd_log(fn, dev, fmt, ...) ({				\
+   __builtin_choose_expr(  \
+   __builtin_types_compatible_p(   \
+   typeof(dev), struct ibnbd_clt_dev *),   \
+   fn("<%s@%s> %s: " fmt, (dev)->pathname, \
+  (dev)->sess->sessname, ibnbd_diskname(dev),  \
+  ##__VA_ARGS__),  \
+   __builtin_choose_expr(  \
+   __builtin_types_compatible_p(typeof(dev),   \
+   struct ibnbd_srv_sess_dev *),   \
+   fn("<%s@%s>: " fmt, (dev)->pathname,\
+  (dev)->sess->sessname, ##__VA_ARGS__),	\
+   unknown_type()));   \
+})
+
+#define ibnbd_err(dev, fmt, ...)   \
+   ibnbd_log(pr_err, dev, fmt, ##__VA_ARGS__)
+#define ibnbd_err_rl(dev, fmt, ...)\
+   ibnbd_log(pr_err_ratelimited, dev, fmt, ##__VA_ARGS__)
+#define ibnbd_wrn(dev, fmt, ...)   \
+   ibnbd_log(pr_warn, dev, fmt, ##__VA_ARGS__)
+#define ibnbd_wrn_rl(dev, fmt, ...) \
+   ibnbd_log(pr_warn_ratelimited, dev, fmt, ##__VA_ARGS__)
+#define ibnbd_info(dev, fmt, ...) \
+   ibnbd_log(pr_info, dev, fmt, ##__VA_ARGS__)
+#define ibnbd_info_rl(dev, fmt, ...) \
+   ibnbd_log(pr_info_ratelimited, dev, fmt, ##__VA_ARGS__)
+
+#endif /* IBNBD_LOG_H */
diff --git a/drivers/block/ibnbd/ibnbd-proto.h 
b/drivers/block/ibnbd/ibnbd-proto.h
new file mode 100644
index ..050d3fa4c1bf
--- /dev/null
+++ b/drivers/block/ibnbd/ibnbd-proto.h
@@ -0,0 +1,364 @@
+/*
+ * InfiniBand Network Block Driver
+ *
+ * Copyright (c) 2014 - 2017 ProfitBricks GmbH. All rights reserved.
+ * Authors: Fabian Holler 
+ *  Jack Wang 
+ *  Kleber Souza 
+ *  Danil Kipnis 
+ *  Roman Penyaev 
+ *  Milind Dumbare 
+ *
+ * Copyright (c) 2017 - 2018 ProfitBricks GmbH. All rights reserved.
+ * Authors: Danil Kipnis 
+ *  Roman Penyaev 
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see .
+ */
+
+#ifnde

[PATCH v2 11/26] ibtrs: server: main functionality

2018-05-18 Thread Roman Pen
This is the main functionality of the ibtrs-server module, which accepts
a set of RDMA connections (a so-called IBTRS session), creates/destroys
sysfs entries associated with an IBTRS session and notifies the upper
layer (the user of the IBTRS API) about RDMA requests or link events.
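
The server-side API itself is declared in ibtrs-srv.h, which is not part
of this diff. Purely as a hypothetical sketch of the notification flow
described above -- the enum values and callback signature here are
assumptions about that header, not the real API:

  /* Hypothetical link-event handler of an IBTRS user: set up or tear
   * down per-session upper-layer state as sessions come and go. */
  static int srv_link_ev(struct ibtrs_srv *ibtrs,
                         enum ibtrs_srv_link_ev ev, void *priv)
  {
        switch (ev) {
        case IBTRS_SRV_LINK_EV_CONNECTED:
                /* allocate upper-layer per-session state */
                return 0;
        case IBTRS_SRV_LINK_EV_DISCONNECTED:
                /* tear down upper-layer per-session state */
                return 0;
        default:
                return -EINVAL;
        }
  }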

Signed-off-by: Roman Pen 
Signed-off-by: Danil Kipnis 
Cc: Jack Wang 
---
 drivers/infiniband/ulp/ibtrs/ibtrs-srv.c | 1981 ++
 1 file changed, 1981 insertions(+)
 create mode 100644 drivers/infiniband/ulp/ibtrs/ibtrs-srv.c

diff --git a/drivers/infiniband/ulp/ibtrs/ibtrs-srv.c 
b/drivers/infiniband/ulp/ibtrs/ibtrs-srv.c
new file mode 100644
index ..d57fa6af5a5c
--- /dev/null
+++ b/drivers/infiniband/ulp/ibtrs/ibtrs-srv.c
@@ -0,0 +1,1981 @@
+/*
+ * InfiniBand Transport Layer
+ *
+ * Copyright (c) 2014 - 2017 ProfitBricks GmbH. All rights reserved.
+ * Authors: Fabian Holler 
+ *  Jack Wang 
+ *  Kleber Souza 
+ *  Danil Kipnis 
+ *  Roman Penyaev 
+ *  Milind Dumbare 
+ *
+ * Copyright (c) 2017 - 2018 ProfitBricks GmbH. All rights reserved.
+ * Authors: Danil Kipnis 
+ *  Roman Penyaev 
+ *  Swapnil Ingle 
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see .
+ */
+
+#undef pr_fmt
+#define pr_fmt(fmt) KBUILD_MODNAME " L" __stringify(__LINE__) ": " fmt
+
+#include 
+#include 
+
+#include "ibtrs-srv.h"
+#include "ibtrs-log.h"
+
+MODULE_AUTHOR("ib...@profitbricks.com");
+MODULE_DESCRIPTION("IBTRS Server");
+MODULE_VERSION(IBTRS_VER_STRING);
+MODULE_LICENSE("GPL");
+
+/* Must be power of 2, see mask from mr->page_size in ib_sg_to_pages() */
+#define DEFAULT_MAX_CHUNK_SIZE (128 << 10)
+#define DEFAULT_SESS_QUEUE_DEPTH 512
+#define MAX_HDR_SIZE PAGE_SIZE
+#define MAX_SG_COUNT ((MAX_HDR_SIZE - sizeof(struct ibtrs_msg_rdma_read)) \
+ / sizeof(struct ibtrs_sg_desc))
+
+/* We guarantee to serve at least 10 paths */
+#define CHUNK_POOL_SZ 10
+
+static struct ibtrs_ib_dev_pool dev_pool;
+static mempool_t *chunk_pool;
+struct class *ibtrs_dev_class;
+
+static int retry_count = 7;
+static int __read_mostly max_chunk_size = DEFAULT_MAX_CHUNK_SIZE;
+static int __read_mostly sess_queue_depth = DEFAULT_SESS_QUEUE_DEPTH;
+
+module_param_named(max_chunk_size, max_chunk_size, int, 0444);
+MODULE_PARM_DESC(max_chunk_size,
+"Max size for each IO request, when change the unit is in byte"
+" (default: " __stringify(DEFAULT_MAX_CHUNK_SIZE_KB) "KB)");
+
+module_param_named(sess_queue_depth, sess_queue_depth, int, 0444);
+MODULE_PARM_DESC(sess_queue_depth,
+"Number of buffers for pending I/O requests to allocate"
+" per session. Maximum: " __stringify(MAX_SESS_QUEUE_DEPTH)
+" (default: " __stringify(DEFAULT_SESS_QUEUE_DEPTH) ")");
+
+static int retry_count_set(const char *val, const struct kernel_param *kp)
+{
+   int err, ival;
+
+   err = kstrtoint(val, 0, &ival);
+   if (err)
+   return err;
+
+   if (ival < MIN_RTR_CNT || ival > MAX_RTR_CNT) {
+   pr_err("Invalid retry count value %d, has to be"
+  " > %d, < %d\n", ival, MIN_RTR_CNT, MAX_RTR_CNT);
+   return -EINVAL;
+   }
+
+   retry_count = ival;
+   pr_info("QP retry count changed to %d\n", ival);
+
+   return 0;
+}
+
+static const struct kernel_param_ops retry_count_ops = {
+   .set= retry_count_set,
+   .get= param_get_int,
+};
+module_param_cb(retry_count, &retry_count_ops, &retry_count, 0644);
+
+MODULE_PARM_DESC(retry_count, "Number of times to send the message if the"
+" remote side didn't respond with Ack or Nack (default: 7,"
+" min: " __stringify(MIN_RTR_CNT) ", max: "
+__stringify(MAX_RTR_CNT) ")");
+
+static char cq_affinity_list[256] = "";
+static cpumask_t cq_affinity_mask = { CPU_BITS_ALL };
+
+static void init_cq_affinity(void)
+{
+   sprintf(cq_affinity_list, "0-%d", nr_cpu_ids - 1);
+}
+
+static int cq_affinity_list_set(const char *val, const struct kernel_param *kp)
+{
+   int ret = 0, len = strlen(val);
+   cpumask_var_t new_value;
+
+   if (!strlen(cq_affinity_list))
+   init_cq_affinity();
+
+   if (len >= sizeof(cq_affinity_list))
+   return -EINVAL;
+   if (!alloc_cpumask_var(&new_value, GFP_KERNEL))
+

[PATCH v2 15/26] ibtrs: a bit of documentation

2018-05-18 Thread Roman Pen
README with description of major sysfs entries.

Signed-off-by: Roman Pen 
Signed-off-by: Danil Kipnis 
Cc: Jack Wang 
---
 drivers/infiniband/ulp/ibtrs/README | 358 
 1 file changed, 358 insertions(+)
 create mode 100644 drivers/infiniband/ulp/ibtrs/README

diff --git a/drivers/infiniband/ulp/ibtrs/README 
b/drivers/infiniband/ulp/ibtrs/README
new file mode 100644
index ..010a93b02d9c
--- /dev/null
+++ b/drivers/infiniband/ulp/ibtrs/README
@@ -0,0 +1,358 @@
+****************************
+InfiniBand Transport (IBTRS)
+****************************
+
+IBTRS (InfiniBand Transport) is a reliable high-speed transport library
+which provides support for establishing an optimal number of connections
+between client and server machines using RDMA (InfiniBand, RoCE, iWarp)
+transport. It is optimized to transfer (read/write) IO blocks.
+
+In its core interface it follows the BIO semantics of providing the
+possibility to either write data from an sg list to the remote side
+or to request ("read") data transfer from the remote side into a given
+sg list.
+
+IBTRS provides I/O fail-over and load-balancing capabilities by using
+multipath I/O (see "add_path" and "mp_policy" configuration entries).
+
+IBTRS is used by the IBNBD (Infiniband Network Block Device) modules.
+
+==
+Client Sysfs Interface
+==
+
+This chapter describes only the most important files of sysfs interface
+on client side.
+
+Entries under /sys/devices/virtual/ibtrs-client/
+
+
+When a user of the IBTRS API creates a new session, a directory entry with
+the name of that session is created.
+
+Entries under /sys/devices/virtual/ibtrs-client/<session-name>/
+===
+
+add_path (RW)
+-
+
+Adds a new path (connection) to an existing session. Expected format is the
+following:
+
+  <[source addr,]destination addr>
+
+  *addr ::= [ ip:<ipv4|ipv6> | gid:<gid> ]
+
+max_reconnect_attempts (RW)
+---
+
+Maximum number of reconnect attempts the client should make before giving up
+after the connection breaks unexpectedly.
+
+mp_policy (RW)
+--
+
+Multipath policy specifies which path should be selected on each IO:
+
+   round-robin (0):
+   select the path in a per-CPU round-robin manner.
+
+   min-inflight (1):
+   select the path with the minimum number of inflight requests.
+
+Entries under /sys/devices/virtual/ibtrs-client/<session-name>/paths/
+=
+
+
+Each path belonging to a given session is listed here by its destination
+address. When a new path is added to a session by writing to the "add_path"
+entry, a directory with the corresponding destination address is created.
+
+Entries under /sys/devices/virtual/ibtrs-client/<session-name>/paths/<dst-addr>/
+=
+
+state (R)
+-
+
+Contains "connected" if the session is connected to the peer and fully
+functional.  Otherwise the file contains "disconnected"
+
+reconnect (RW)
+--
+
+Write "1" to the file in order to reconnect the path.
+Operation is blocking and returns 0 if reconnect was successful.
+
+disconnect (RW)
+---
+
+Write "1" to the file in order to disconnect the path.
+Operation blocks until IBTRS path is disconnected.
+
+remove_path (RW)
+
+
+Write "1" to the file in order to disconnected and remove the path
+from the session.  Operation blocks until the path is disconnected
+and removed from the session.
+
+Entries under /sys/devices/virtual/ibtrs-client/<session-name>/paths/<dst-addr>/stats/
+===
+
+Write "0" to any file in that directory to reset corresponding statistics.
+
+reset_all (RW)
+--
+
+Reading returns the usage help; writing "0" clears all the statistics.
+
+sg_entries (RW)
+---
+
+Data to be transferred via RDMA is passed to IBTRS as a scatter-gather
+list. A scatter-gather list can contain multiple entries.
+Scatter-gather lists with fewer entries require less processing power
+and can therefore be transferred faster. The file sg_entries outputs a
+per-CPU distribution table for the number of entries in the
+scatter-gather lists that were passed to the IBTRS API function
+ibtrs_clt_request (READ or WRITE).
+
+cpu_migration (RW)
+--
+
+IBTRS expects that each HCA IRQ is pinned to a separate CPU. If that's
+not the case, an I/O response could be processed on a different CPU
+than the one where the request was originally submitted.  This file shows
+how many interrupts were generated on a non-expected CPU.
+"from:" is the CPU on which the IRQ was expected, but not generated.
+"to:" is the CPU on which the IRQ was generated, but not expected.
+
+reconnects (RW)
+---
+
+Contains two unsigned int values: the first one records the number of
+successful reconnects in the p

[PATCH v2 25/26] ibnbd: a bit of documentation

2018-05-18 Thread Roman Pen
README with description of major sysfs entries.

Signed-off-by: Roman Pen 
Signed-off-by: Danil Kipnis 
Cc: Jack Wang 
---
 drivers/block/ibnbd/README | 299 +
 1 file changed, 299 insertions(+)
 create mode 100644 drivers/block/ibnbd/README

diff --git a/drivers/block/ibnbd/README b/drivers/block/ibnbd/README
new file mode 100644
index ..bbaddd02c1c5
--- /dev/null
+++ b/drivers/block/ibnbd/README
@@ -0,0 +1,299 @@
+***
+Infiniband Network Block Device (IBNBD)
+***
+
+Introduction
+
+
+IBNBD (InfiniBand Network Block Device) is a pair of kernel modules
+(client and server) that allow for remote access of a block device on
+the server over IBTRS protocol using the RDMA (InfiniBand, RoCE, iWarp)
+transport. After being mapped, the remote block devices can be accessed
+on the client side as local block devices.
+
+I/O is transferred between client and server by the IBTRS transport
+modules. The administration of the IBNBD and IBTRS modules is done via
+sysfs entries.
+
+Requirements
+
+
+  IBTRS kernel modules
+
+Quick Start
+---
+
+Server side:
+  # modprobe ibnbd_server
+
+Client side:
+  # modprobe ibnbd_client
+  # echo "sessname=blya path=ip:10.50.100.66 device_path=/dev/ram0" > \
+/sys/devices/virtual/ibnbd-client/ctl/map_device
+
+  Where "sessname=" is a session name, a string to identify the session
+  on client and on server sides; "path=" is a destination IP address or
+  a pair of a source and a destination IPs, separated by comma.  Multiple
+  "path=" options can be specified in order to use multipath  (see IBTRS
+  description for details); "device_path=" is the block device to be
+  mapped from the server side. After the session to the server machine is
+  established, the mapped device will appear on the client side under
+  /dev/ibnbd.
+
+
+==
+Client Sysfs Interface
+==
+
+All sysfs files that are not read-only provide the usage information on read:
+
+Example:
+  # cat /sys/devices/virtual/ibnbd-client/ctl/map_device
+
+  > Usage: echo "sessname=<name> path=<[srcaddr,]dstaddr>
+  > [path=<[srcaddr,]dstaddr>] device_path=<full path on server side>
+  > [access_mode=<ro|rw|migration>]
+  > [io_mode=<blockio|fileio>]" > map_device
+  >
+  > addr ::= [ ip:<ipv4> | ip:<ipv6> | gid:<gid> ]
+
+Entries under /sys/devices/virtual/ibnbd-client/ctl/
+===
+
+map_device (RW)
+---
+
+Expected format is the following:
+
+sessname=<name>
+path=<[srcaddr,]dstaddr> [path=<[srcaddr,]dstaddr> ...]
+device_path=<full path on server side>
+[access_mode=<ro|rw|migration>]
+[io_mode=<blockio|fileio>]
+
+Where:
+
+sessname: accepts a string not bigger than 256 chars, which identifies
+  a given session on the client and on the server.
+  E.g. "clt_hostname-srv_hostname" could be a natural choice.
+
+path: describes a connection between the client and the server by
+  specifying destination and, when required, the source address.
+  The addresses are to be provided in the following format:
+
+ip:<IPv4 address>
+ip:<IPv6 address>
+gid:<GID>
+
+  for example:
+
+  path=ip:10.0.0.66
+ The single addr is treated as the destination.
+ The connection will be established to this
+ server from any client IP address.
+
+  path=ip:10.0.0.66,ip:10.0.1.66
+ First addr is the source address and the second
+ is the destination.
+
+  If multiple "path=" options are specified, multiple connections
+  will be established and data will be sent according to
+  the selected multipath policy (see the IBTRS mp_policy sysfs entry
+  description).
+
+device_path: Path to the block device on the server side. The path is
+ specified relative to the directory on the server side configured in the
+ 'dev_search_path' module parameter of the ibnbd_server.
+ The ibnbd_server prepends the <device_path> received from the client
+ with <dev_search_path> and tries to open the
+ <dev_search_path>/<device_path> block device.  On success,
+ a /dev/ibnbd<N> device file, a /sys/block/ibnbd<N>/ibnbd_client/
+ directory and an entry in /sys/devices/virtual/ibnbd-client/ctl/devices
+ will be created.
+
+ If 'dev_search_path' contains '%SESSNAME%', then each session can
+ have a different device namespace, e.g. if the server was configured
+ with "dev_search_path=/run/ibnbd-devs/%SESSNAME%" and the client
+ passed "sessname=blya device_path=sda", then the server
+ will try to open: /run/ibnbd-devs/blya/sda.
+
+access_mode: the access_mode parameter specifies if the device is to be
+ mapped as "ro" read-only or "rw" read-write. The server allows
+ a device to be exported in rw mode only once. The "migration"
+ access mode has to be specified if a second mapping i

[PATCH v2 19/26] ibnbd: client: sysfs interface functions

2018-05-18 Thread Roman Pen
This is the sysfs interface to IBNBD block devices on client side:

  /sys/devices/virtual/ibnbd-client/ctl/
|- map_device
|  *** maps remote device
|
|- devices/
   *** all mapped devices

  /sys/block/ibnbd<N>/ibnbd_client/
|- unmap_device
|  *** unmaps device
|
|- state
|  *** device state
|
|- session
|  *** session name
|
|- mapping_path
   *** path of the dev that was mapped on server
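
As a rough sketch of how one of these read-only entries could be
implemented (the device-lookup helper below is hypothetical; the session
field names come from ibnbd-clt.h in a later patch):

  /* Sketch: "session" attribute returning the session name of a device. */
  static ssize_t session_show(struct device *dev,
                              struct device_attribute *attr, char *page)
  {
        /* dev_to_ibnbd_clt_dev() is a hypothetical lookup helper. */
        struct ibnbd_clt_dev *clt_dev = dev_to_ibnbd_clt_dev(dev);

        return scnprintf(page, PAGE_SIZE, "%s\n", clt_dev->sess->sessname);
  }
  static DEVICE_ATTR_RO(session);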

Signed-off-by: Roman Pen 
Signed-off-by: Danil Kipnis 
Cc: Jack Wang 
---
 drivers/block/ibnbd/ibnbd-clt-sysfs.c | 675 ++
 1 file changed, 675 insertions(+)
 create mode 100644 drivers/block/ibnbd/ibnbd-clt-sysfs.c

diff --git a/drivers/block/ibnbd/ibnbd-clt-sysfs.c 
b/drivers/block/ibnbd/ibnbd-clt-sysfs.c
new file mode 100644
index ..ca3e59b28c54
--- /dev/null
+++ b/drivers/block/ibnbd/ibnbd-clt-sysfs.c
@@ -0,0 +1,675 @@
+/*
+ * InfiniBand Network Block Driver
+ *
+ * Copyright (c) 2014 - 2017 ProfitBricks GmbH. All rights reserved.
+ * Authors: Fabian Holler 
+ *  Jack Wang 
+ *  Kleber Souza 
+ *  Danil Kipnis 
+ *  Roman Penyaev 
+ *  Milind Dumbare 
+ *
+ * Copyright (c) 2017 - 2018 ProfitBricks GmbH. All rights reserved.
+ * Authors: Danil Kipnis 
+ *  Roman Penyaev 
+ *  Swapnil Ingle 
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see .
+ */
+
+#undef pr_fmt
+#define pr_fmt(fmt) KBUILD_MODNAME " L" __stringify(__LINE__) ": " fmt
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "ibnbd-clt.h"
+
+static struct device *ibnbd_dev;
+static struct class *ibnbd_dev_class;
+static struct kobject *ibnbd_devs_kobj;
+
+enum {
+   IBNBD_OPT_ERR   = 0,
+   IBNBD_OPT_PATH  = 1 << 0,
+   IBNBD_OPT_DEV_PATH  = 1 << 1,
+   IBNBD_OPT_ACCESS_MODE   = 1 << 3,
+   IBNBD_OPT_IO_MODE   = 1 << 5,
+   IBNBD_OPT_SESSNAME  = 1 << 6,
+};
+
+static unsigned int ibnbd_opt_mandatory[] = {
+   IBNBD_OPT_PATH,
+   IBNBD_OPT_DEV_PATH,
+   IBNBD_OPT_SESSNAME,
+};
+
+static const match_table_t ibnbd_opt_tokens = {
+   {   IBNBD_OPT_PATH, "path=%s"   },
+   {   IBNBD_OPT_DEV_PATH, "device_path=%s"},
+   {   IBNBD_OPT_ACCESS_MODE,  "access_mode=%s"},
+   {   IBNBD_OPT_IO_MODE,  "io_mode=%s"},
+   {   IBNBD_OPT_SESSNAME, "sessname=%s"   },
+   {   IBNBD_OPT_ERR,  NULL},
+};
+
+/* remove all newline characters from the string */
+static void strip(char *s)
+{
+   char *p = s;
+
+   while (*s != '\0') {
+   if (*s != '\n')
+   *p++ = *s++;
+   else
+   ++s;
+   }
+   *p = '\0';
+}
+
+static int ibnbd_clt_parse_map_options(const char *buf,
+  char *sessname,
+  struct ibtrs_addr *paths,
+  size_t *path_cnt,
+  size_t max_path_cnt,
+  char *pathname,
+  enum ibnbd_access_mode *access_mode,
+  enum ibnbd_io_mode *io_mode)
+{
+   char *options, *sep_opt;
+   char *p;
+   substring_t args[MAX_OPT_ARGS];
+   int opt_mask = 0;
+   int token;
+   int ret = -EINVAL;
+   int i;
+   int p_cnt = 0;
+
+   options = kstrdup(buf, GFP_KERNEL);
+   if (!options)
+   return -ENOMEM;
+
+   sep_opt = strstrip(options);
+   strip(sep_opt);
+   while ((p = strsep(&sep_opt, " ")) != NULL) {
+   if (!*p)
+   continue;
+
+   token = match_token(p, ibnbd_opt_tokens, args);
+   opt_mask |= token;
+
+   switch (token) {
+   case IBNBD_OPT_SESSNAME:
+   p = match_strdup(args);
+   if (!p) {
+   ret = -ENOMEM;
+   goto out;
+   }
+   if (strlen(p) > NAME_MAX) {
+   pr_err("map_device: sessname too long\n");
+   r

[PATCH v2 17/26] ibnbd: client: private header with client structs and functions

2018-05-18 Thread Roman Pen
This header describes main structs and functions used by ibnbd-client
module, mainly for managing IBNBD sessions and mapped block devices,
creating and destroying sysfs entries.

Signed-off-by: Roman Pen 
Signed-off-by: Danil Kipnis 
Cc: Jack Wang 
---
 drivers/block/ibnbd/ibnbd-clt.h | 172 
 1 file changed, 172 insertions(+)
 create mode 100644 drivers/block/ibnbd/ibnbd-clt.h

diff --git a/drivers/block/ibnbd/ibnbd-clt.h b/drivers/block/ibnbd/ibnbd-clt.h
new file mode 100644
index ..c5f6f08ec338
--- /dev/null
+++ b/drivers/block/ibnbd/ibnbd-clt.h
@@ -0,0 +1,172 @@
+/*
+ * InfiniBand Network Block Driver
+ *
+ * Copyright (c) 2014 - 2017 ProfitBricks GmbH. All rights reserved.
+ * Authors: Fabian Holler 
+ *  Jack Wang 
+ *  Kleber Souza 
+ *  Danil Kipnis 
+ *  Roman Penyaev 
+ *  Milind Dumbare 
+ *
+ * Copyright (c) 2017 - 2018 ProfitBricks GmbH. All rights reserved.
+ * Authors: Danil Kipnis 
+ *  Roman Penyaev 
+ *  Swapnil Ingle 
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see .
+ */
+
+#ifndef IBNBD_CLT_H
+#define IBNBD_CLT_H
+
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "ibtrs.h"
+#include "ibnbd-proto.h"
+#include "ibnbd-log.h"
+
+#define BMAX_SEGMENTS 31
+#define RECONNECT_DELAY 30
+#define MAX_RECONNECTS -1
+
+enum ibnbd_clt_dev_state {
+   DEV_STATE_INIT,
+   DEV_STATE_MAPPED,
+   DEV_STATE_MAPPED_DISCONNECTED,
+   DEV_STATE_UNMAPPED,
+};
+
+struct ibnbd_iu_comp {
+   wait_queue_head_t wait;
+   int errno;
+};
+
+struct ibnbd_iu {
+   union {
+   struct request *rq; /* for block io */
+   void *buf; /* for user messages */
+   };
+   struct ibtrs_tag*tag;
+   union {
+   /* use to send msg associated with a dev */
+   struct ibnbd_clt_dev *dev;
+   /* use to send msg associated with a sess */
+   struct ibnbd_clt_session *sess;
+   };
+   blk_status_tstatus;
+   struct scatterlist  sglist[BMAX_SEGMENTS];
+   struct work_struct  work;
+   int errno;
+   struct ibnbd_iu_comp*comp;
+};
+
+struct ibnbd_cpu_qlist {
+   struct list_headrequeue_list;
+   spinlock_t  requeue_lock;
+   unsigned intcpu;
+};
+
+struct ibnbd_clt_session {
+   struct list_headlist;
+   struct ibtrs_clt*ibtrs;
+   wait_queue_head_t   ibtrs_waitq;
+   boolibtrs_ready;
+   struct ibnbd_cpu_qlist  __percpu
+   *cpu_queues;
+   DECLARE_BITMAP(cpu_queues_bm, NR_CPUS);
+   int __percpu*cpu_rr; /* per-cpu var for CPU round-robin */
+   atomic_tbusy;
+   int queue_depth;
+   u32 max_io_size;
+   struct blk_mq_tag_set   tag_set;
+   struct mutexlock; /* protects state and devs_list */
+   struct list_headdevs_list; /* list of struct ibnbd_clt_dev */
+   refcount_t  refcount;
+   charsessname[NAME_MAX];
+   u8  ver; /* protocol version */
+};
+
+/* Submission queues. */
+struct ibnbd_queue {
+   struct list_headrequeue_list;
+   unsigned long   in_list;
+   struct ibnbd_clt_dev*dev;
+   struct blk_mq_hw_ctx*hctx;
+};
+
+struct ibnbd_clt_dev {
+   struct ibnbd_clt_session*sess;
+   struct request_queue*queue;
+   struct ibnbd_queue  *hw_queues;
+   u32 device_id;
+   /* local Idr index - used to track minor number allocations. */
+   u32 clt_device_id;
+   struct mutexlock;
+   enum ibnbd_clt_dev_statedev_state;
+   enum ibnbd_io_mode  io_mode; /* user requested */
+   enum ibnbd_io_mode  remote_io_mode; /* server really used */
+   charpathname[NAME_MAX];
+   enum ibnbd_access_mode  access_mode;
+   boolread_only;
+   boolrotational;
+   u32 max_hw_sectors;
+   u32 max_write_same_sectors;
+   u32 max

[PATCH v2 14/26] ibtrs: include client and server modules into kernel compilation

2018-05-18 Thread Roman Pen
Add IBTRS Makefile, Kconfig and also corresponding lines into upper
layer infiniband/ulp files.

Signed-off-by: Roman Pen 
Signed-off-by: Danil Kipnis 
Cc: Jack Wang 
---
 drivers/infiniband/Kconfig|  1 +
 drivers/infiniband/ulp/Makefile   |  1 +
 drivers/infiniband/ulp/ibtrs/Kconfig  | 20 
 drivers/infiniband/ulp/ibtrs/Makefile | 15 +++
 4 files changed, 37 insertions(+)
 create mode 100644 drivers/infiniband/ulp/ibtrs/Kconfig
 create mode 100644 drivers/infiniband/ulp/ibtrs/Makefile

diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig
index ee270e065ba9..787bd286fb08 100644
--- a/drivers/infiniband/Kconfig
+++ b/drivers/infiniband/Kconfig
@@ -94,6 +94,7 @@ source "drivers/infiniband/ulp/srpt/Kconfig"
 
 source "drivers/infiniband/ulp/iser/Kconfig"
 source "drivers/infiniband/ulp/isert/Kconfig"
+source "drivers/infiniband/ulp/ibtrs/Kconfig"
 
 source "drivers/infiniband/ulp/opa_vnic/Kconfig"
 source "drivers/infiniband/sw/rdmavt/Kconfig"
diff --git a/drivers/infiniband/ulp/Makefile b/drivers/infiniband/ulp/Makefile
index 437813c7b481..1c4f10dc8d49 100644
--- a/drivers/infiniband/ulp/Makefile
+++ b/drivers/infiniband/ulp/Makefile
@@ -5,3 +5,4 @@ obj-$(CONFIG_INFINIBAND_SRPT)   += srpt/
 obj-$(CONFIG_INFINIBAND_ISER)  += iser/
 obj-$(CONFIG_INFINIBAND_ISERT) += isert/
 obj-$(CONFIG_INFINIBAND_OPA_VNIC)  += opa_vnic/
+obj-$(CONFIG_INFINIBAND_IBTRS) += ibtrs/
diff --git a/drivers/infiniband/ulp/ibtrs/Kconfig 
b/drivers/infiniband/ulp/ibtrs/Kconfig
new file mode 100644
index ..eaeb8f3f6b4e
--- /dev/null
+++ b/drivers/infiniband/ulp/ibtrs/Kconfig
@@ -0,0 +1,20 @@
+config INFINIBAND_IBTRS
+   tristate
+   depends on INFINIBAND_ADDR_TRANS
+
+config INFINIBAND_IBTRS_CLIENT
+   tristate "IBTRS client module"
+   depends on INFINIBAND_ADDR_TRANS
+   select INFINIBAND_IBTRS
+   help
+ IBTRS client allows for simplified data transfer and connection
+ establishment over RDMA (InfiniBand, RoCE, iWarp). Uses BIO-like
+ READ/WRITE semantics and provides multipath capabilities.
+
+config INFINIBAND_IBTRS_SERVER
+   tristate "IBTRS server module"
+   depends on INFINIBAND_ADDR_TRANS
+   select INFINIBAND_IBTRS
+   help
+ The IBTRS server module processes connection and IO requests received
+ from the IBTRS client module.
diff --git a/drivers/infiniband/ulp/ibtrs/Makefile 
b/drivers/infiniband/ulp/ibtrs/Makefile
new file mode 100644
index ..e6ea858745ad
--- /dev/null
+++ b/drivers/infiniband/ulp/ibtrs/Makefile
@@ -0,0 +1,15 @@
+ibtrs-client-y := ibtrs-clt.o \
+ ibtrs-clt-stats.o \
+ ibtrs-clt-sysfs.o
+
+ibtrs-server-y := ibtrs-srv.o \
+ ibtrs-srv-stats.o \
+ ibtrs-srv-sysfs.o
+
+ibtrs-core-y := ibtrs.o
+
+obj-$(CONFIG_INFINIBAND_IBTRS)+= ibtrs-core.o
+obj-$(CONFIG_INFINIBAND_IBTRS_CLIENT) += ibtrs-client.o
+obj-$(CONFIG_INFINIBAND_IBTRS_SERVER) += ibtrs-server.o
+
+-include $(src)/compat/compat.mk
-- 
2.13.1



[PATCH v2 22/26] ibnbd: server: functionality for IO submission to file or block dev

2018-05-18 Thread Roman Pen
This provides helper functions for IO submission to a file or a block device.
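
A hedged usage sketch, based on the ibnbd_dev_open() signature in the
diff below (the io completion callback and bio_set arguments are
assumptions about the caller):

  static struct ibnbd_dev *open_example(struct bio_set *bs,
                                        ibnbd_dev_io_fn io_cb)
  {
        struct ibnbd_dev *dev;

        /* Open a device in block-IO mode; fileio would pass IBNBD_FILEIO. */
        dev = ibnbd_dev_open("/dev/ram0", FMODE_READ | FMODE_WRITE,
                             IBNBD_BLOCKIO, bs, io_cb);
        if (IS_ERR(dev))
                pr_err("opening device failed, err: %ld\n", PTR_ERR(dev));
        return dev;
  }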

Signed-off-by: Roman Pen 
Signed-off-by: Danil Kipnis 
Cc: Jack Wang 
---
 drivers/block/ibnbd/ibnbd-srv-dev.c | 410 
 drivers/block/ibnbd/ibnbd-srv-dev.h | 149 +
 2 files changed, 559 insertions(+)
 create mode 100644 drivers/block/ibnbd/ibnbd-srv-dev.c
 create mode 100644 drivers/block/ibnbd/ibnbd-srv-dev.h

diff --git a/drivers/block/ibnbd/ibnbd-srv-dev.c 
b/drivers/block/ibnbd/ibnbd-srv-dev.c
new file mode 100644
index ..a5894849b9d5
--- /dev/null
+++ b/drivers/block/ibnbd/ibnbd-srv-dev.c
@@ -0,0 +1,410 @@
+/*
+ * InfiniBand Network Block Driver
+ *
+ * Copyright (c) 2014 - 2017 ProfitBricks GmbH. All rights reserved.
+ * Authors: Fabian Holler 
+ *  Jack Wang 
+ *  Kleber Souza 
+ *  Danil Kipnis 
+ *  Roman Penyaev 
+ *  Milind Dumbare 
+ *
+ * Copyright (c) 2017 - 2018 ProfitBricks GmbH. All rights reserved.
+ * Authors: Danil Kipnis 
+ *  Roman Penyaev 
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see .
+ */
+
+#undef pr_fmt
+#define pr_fmt(fmt) KBUILD_MODNAME " L" __stringify(__LINE__) ": " fmt
+
+#include "ibnbd-srv-dev.h"
+#include "ibnbd-log.h"
+
+#define IBNBD_DEV_MAX_FILEIO_ACTIVE_WORKERS 0
+
+struct ibnbd_dev_file_io_work {
+   struct ibnbd_dev*dev;
+   void*priv;
+
+   sector_tsector;
+   void*data;
+   size_t  len;
+   size_t  bi_size;
+   enum ibnbd_io_flags flags;
+
+   struct work_struct  work;
+};
+
+struct ibnbd_dev_blk_io {
+   struct ibnbd_dev *dev;
+   void *priv;
+};
+
+static struct workqueue_struct *fileio_wq;
+
+int ibnbd_dev_init(void)
+{
+   fileio_wq = alloc_workqueue("%s", WQ_UNBOUND,
+   IBNBD_DEV_MAX_FILEIO_ACTIVE_WORKERS,
+   "ibnbd_server_fileio_wq");
+   if (!fileio_wq)
+   return -ENOMEM;
+
+   return 0;
+}
+
+void ibnbd_dev_destroy(void)
+{
+   destroy_workqueue(fileio_wq);
+}
+
+static inline struct block_device *ibnbd_dev_open_bdev(const char *path,
+  fmode_t flags)
+{
+   return blkdev_get_by_path(path, flags, THIS_MODULE);
+}
+
+static int ibnbd_dev_blk_open(struct ibnbd_dev *dev, const char *path,
+ fmode_t flags)
+{
+   dev->bdev = ibnbd_dev_open_bdev(path, flags);
+   return PTR_ERR_OR_ZERO(dev->bdev);
+}
+
+static int ibnbd_dev_vfs_open(struct ibnbd_dev *dev, const char *path,
+ fmode_t flags)
+{
+   int oflags = O_DSYNC; /* enable write-through */
+
+   if (flags & FMODE_WRITE)
+   oflags |= O_RDWR;
+   else if (flags & FMODE_READ)
+   oflags |= O_RDONLY;
+   else
+   return -EINVAL;
+
+   dev->file = filp_open(path, oflags, 0);
+   return PTR_ERR_OR_ZERO(dev->file);
+}
+
+struct ibnbd_dev *ibnbd_dev_open(const char *path, fmode_t flags,
+enum ibnbd_io_mode mode, struct bio_set *bs,
+ibnbd_dev_io_fn io_cb)
+{
+   struct ibnbd_dev *dev;
+   int ret;
+
+   dev = kzalloc(sizeof(*dev), GFP_KERNEL);
+   if (!dev)
+   return ERR_PTR(-ENOMEM);
+
+   if (mode == IBNBD_BLOCKIO) {
+   dev->blk_open_flags = flags;
+   ret = ibnbd_dev_blk_open(dev, path, dev->blk_open_flags);
+   if (ret)
+   goto err;
+   } else if (mode == IBNBD_FILEIO) {
+   dev->blk_open_flags = FMODE_READ;
+   ret = ibnbd_dev_blk_open(dev, path, dev->blk_open_flags);
+   if (ret)
+   goto err;
+
+   ret = ibnbd_dev_vfs_open(dev, path, flags);
+   if (ret)
+   goto blk_put;
+   }
+
+   dev->blk_open_flags = flags;
+   dev->mode   = mode;
+   dev->io_cb  = io_cb;
+   bdevname(dev->bdev, dev->name);
+   dev->ibd_bio_set= bs;
+
+   return dev;
+
+blk_put:
+   blkdev_put(dev->bdev, dev->blk_open_flags);
+err:
+   kfree(dev);
+   return ERR_PTR(ret);
+}
+
+void ibnbd_dev_close(struct

[PATCH v2 09/26] ibtrs: client: sysfs interface functions

2018-05-18 Thread Roman Pen
This is the sysfs interface to IBTRS sessions on client side:

  /sys/devices/virtual/ibtrs-client/<session-name>/
*** IBTRS session created by ibtrs_clt_open() API call
|
|- max_reconnect_attempts
|  *** number of reconnect attempts for session
|
|- add_path
|  *** adds another connection path into IBTRS session
|
|- paths/<dst-addr>/
   *** established paths to server in a session
   |
   |- disconnect
   |  *** disconnect path
   |
   |- reconnect
   |  *** reconnect path
   |
   |- remove_path
   |  *** remove current path
   |
   |- state
   |  *** retrieve current path state
   |
   |- hca_port
   |  *** HCA port number
   |
   |- hca_name
   |  *** HCA name
   |
   |- stats/
  *** current path statistics
  |
  |- cpu_migration
  |- rdma
  |- rdma_lat
  |- reconnects
  |- reset_all
  |- sg_entries
  |- wc_completions

Signed-off-by: Roman Pen 
Signed-off-by: Danil Kipnis 
Cc: Jack Wang 
---
 drivers/infiniband/ulp/ibtrs/ibtrs-clt-sysfs.c | 482 +
 1 file changed, 482 insertions(+)
 create mode 100644 drivers/infiniband/ulp/ibtrs/ibtrs-clt-sysfs.c

diff --git a/drivers/infiniband/ulp/ibtrs/ibtrs-clt-sysfs.c 
b/drivers/infiniband/ulp/ibtrs/ibtrs-clt-sysfs.c
new file mode 100644
index ..c185bbc4fd5c
--- /dev/null
+++ b/drivers/infiniband/ulp/ibtrs/ibtrs-clt-sysfs.c
@@ -0,0 +1,482 @@
+/*
+ * InfiniBand Transport Layer
+ *
+ * Copyright (c) 2014 - 2017 ProfitBricks GmbH. All rights reserved.
+ * Authors: Fabian Holler 
+ *  Jack Wang 
+ *  Kleber Souza 
+ *  Danil Kipnis 
+ *  Roman Penyaev 
+ *  Milind Dumbare 
+ *
+ * Copyright (c) 2017 - 2018 ProfitBricks GmbH. All rights reserved.
+ * Authors: Danil Kipnis 
+ *  Roman Penyaev 
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see .
+ */
+
+#undef pr_fmt
+#define pr_fmt(fmt) KBUILD_MODNAME " L" __stringify(__LINE__) ": " fmt
+
+#include "ibtrs-pri.h"
+#include "ibtrs-clt.h"
+#include "ibtrs-log.h"
+
+#define MIN_MAX_RECONN_ATT -1
+#define MAX_MAX_RECONN_ATT 
+
+static struct kobj_type ktype = {
+   .sysfs_ops = &kobj_sysfs_ops,
+};
+
+static ssize_t max_reconnect_attempts_show(struct device *dev,
+  struct device_attribute *attr,
+  char *page)
+{
+   struct ibtrs_clt *clt;
+
+   clt = container_of(dev, struct ibtrs_clt, dev);
+
+   return sprintf(page, "%d\n", ibtrs_clt_get_max_reconnect_attempts(clt));
+}
+
+static ssize_t max_reconnect_attempts_store(struct device *dev,
+   struct device_attribute *attr,
+   const char *buf,
+   size_t count)
+{
+   struct ibtrs_clt *clt;
+   int value;
+   int ret;
+
+   clt = container_of(dev, struct ibtrs_clt, dev);
+
+   ret = kstrtoint(buf, 10, &value);
+   if (unlikely(ret)) {
+   ibtrs_err(clt, "%s: failed to convert string '%s' to int\n",
+ attr->attr.name, buf);
+   return ret;
+   }
+   if (unlikely(value > MAX_MAX_RECONN_ATT ||
+value < MIN_MAX_RECONN_ATT)) {
+   ibtrs_err(clt, "%s: invalid range"
+ " (provided: '%s', accepted: min: %d, max: %d)\n",
+ attr->attr.name, buf, MIN_MAX_RECONN_ATT,
+ MAX_MAX_RECONN_ATT);
+   return -EINVAL;
+   }
+   ibtrs_clt_set_max_reconnect_attempts(clt, value);
+
+   return count;
+}
+
+static DEVICE_ATTR_RW(max_reconnect_attempts);
+
+static ssize_t mpath_policy_show(struct device *dev,
+struct device_attribute *attr,
+char *page)
+{
+   struct ibtrs_clt *clt;
+
+   clt = container_of(dev, struct ibtrs_clt, dev);
+
+   switch (clt->mp_policy) {
+   case MP_POLICY_RR:
+   return sprintf(page, "round-robin (RR: %d)\n", clt->mp_policy);
+   case MP_POLICY_MIN_INFLIGHT:
+   return sprintf(page, "min-inflight (MI: %d)\n", clt->mp_policy);
+   default:
+   return sprintf(page, "Unknown (%d)\n", cl

[PATCH v2 21/26] ibnbd: server: main functionality

2018-05-18 Thread Roman Pen
This is the main functionality of the ibnbd-server module, which handles
IBTRS events and IBNBD protocol requests, like map (open) or unmap (close)
device.  The server side is also responsible for processing incoming IBTRS
IO requests and forwarding them to the local mapped devices.
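
For illustration, the %SESSNAME% substitution described by the
dev_search_path module parameter (see the diff below) can be thought of
as follows -- this helper is hypothetical, not the code from the patch:

  /* Hypothetical: build "<dev_search_path>/<device_path>", replacing
   * %SESSNAME% with the session name when present. */
  static int build_dev_path(char *buf, size_t len,
                            const char *sessname, const char *dev_path)
  {
        const char *p = strstr(dev_search_path, "%SESSNAME%");

        if (!p)
                return snprintf(buf, len, "%s/%s", dev_search_path, dev_path);

        /* e.g. "/run/ibnbd-devs/%SESSNAME%" + "blya" + "sda"
         *   -> "/run/ibnbd-devs/blya/sda" */
        return snprintf(buf, len, "%.*s%s%s/%s",
                        (int)(p - dev_search_path), dev_search_path,
                        sessname, p + strlen("%SESSNAME%"), dev_path);
  }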

Signed-off-by: Roman Pen 
Signed-off-by: Danil Kipnis 
Cc: Jack Wang 
---
 drivers/block/ibnbd/ibnbd-srv.c | 922 
 1 file changed, 922 insertions(+)
 create mode 100644 drivers/block/ibnbd/ibnbd-srv.c

diff --git a/drivers/block/ibnbd/ibnbd-srv.c b/drivers/block/ibnbd/ibnbd-srv.c
new file mode 100644
index ..a42a9191dad9
--- /dev/null
+++ b/drivers/block/ibnbd/ibnbd-srv.c
@@ -0,0 +1,922 @@
+/*
+ * InfiniBand Network Block Driver
+ *
+ * Copyright (c) 2014 - 2017 ProfitBricks GmbH. All rights reserved.
+ * Authors: Fabian Holler 
+ *  Jack Wang 
+ *  Kleber Souza 
+ *  Danil Kipnis 
+ *  Roman Penyaev 
+ *  Milind Dumbare 
+ *
+ * Copyright (c) 2017 - 2018 ProfitBricks GmbH. All rights reserved.
+ * Authors: Danil Kipnis 
+ *  Roman Penyaev 
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see .
+ */
+
+#undef pr_fmt
+#define pr_fmt(fmt) KBUILD_MODNAME " L" __stringify(__LINE__) ": " fmt
+
+#include 
+#include 
+
+#include "ibnbd-srv.h"
+#include "ibnbd-srv-dev.h"
+
+MODULE_AUTHOR("ib...@profitbricks.com");
+MODULE_VERSION(IBNBD_VER_STRING);
+MODULE_DESCRIPTION("InfiniBand Network Block Device Server");
+MODULE_LICENSE("GPL");
+
+#define DEFAULT_DEV_SEARCH_PATH "/"
+
+static char dev_search_path[PATH_MAX] = DEFAULT_DEV_SEARCH_PATH;
+
+static int dev_search_path_set(const char *val, const struct kernel_param *kp)
+{
+   char *dup;
+
+   if (strlen(val) >= sizeof(dev_search_path))
+   return -EINVAL;
+
+   dup = kstrdup(val, GFP_KERNEL);
+   if (!dup)
+   return -ENOMEM;
+
+   if (strlen(dup) && dup[strlen(dup) - 1] == '\n')
+   dup[strlen(dup) - 1] = '\0';
+
+   strlcpy(dev_search_path, dup, sizeof(dev_search_path));
+
+   kfree(dup);
+   pr_info("dev_search_path changed to '%s'\n", dev_search_path);
+
+   return 0;
+}
+
+static struct kparam_string dev_search_path_kparam_str = {
+   .maxlen = sizeof(dev_search_path),
+   .string = dev_search_path
+};
+
+static const struct kernel_param_ops dev_search_path_ops = {
+   .set= dev_search_path_set,
+   .get= param_get_string,
+};
+
+module_param_cb(dev_search_path, &dev_search_path_ops,
+   &dev_search_path_kparam_str, 0444);
+MODULE_PARM_DESC(dev_search_path, "Sets the dev_search_path."
+" When a device is mapped this path is prepended to the"
+" device path from the map device operation.  If %SESSNAME%"
+" is specified in the path, then the device will be searched"
+" for in a per-session namespace."
+" (default: " DEFAULT_DEV_SEARCH_PATH ")");
+
+static int def_io_mode = IBNBD_BLOCKIO;
+module_param(def_io_mode, int, 0444);
+MODULE_PARM_DESC(def_io_mode, "By default, export devices in"
+" blockio(" __stringify(_IBNBD_BLOCKIO) ") or"
+" fileio(" __stringify(_IBNBD_FILEIO) ") mode."
+" (default: " __stringify(_IBNBD_BLOCKIO) " (blockio))");
+
+static DEFINE_MUTEX(sess_lock);
+static DEFINE_SPINLOCK(dev_lock);
+
+static LIST_HEAD(sess_list);
+static LIST_HEAD(dev_list);
+
+struct ibnbd_io_private {
+   struct ibtrs_srv_op *id;
+   struct ibnbd_srv_sess_dev   *sess_dev;
+};
+
+static void ibnbd_sess_dev_release(struct kref *kref)
+{
+   struct ibnbd_srv_sess_dev *sess_dev;
+
+   sess_dev = container_of(kref, struct ibnbd_srv_sess_dev, kref);
+   complete(sess_dev->destroy_comp);
+}
+
+static inline void ibnbd_put_sess_dev(struct ibnbd_srv_sess_dev *sess_dev)
+{
+   kref_put(&sess_dev->kref, ibnbd_sess_dev_release);
+}
+
+static void ibnbd_endio(void *priv, int error)
+{
+   struct ibnbd_io_private *ibnbd_priv = priv;
+   struct ibnbd_srv_sess_dev *sess_dev = ibnbd_priv->sess_dev;
+
+   ibnbd_put_sess_dev(sess_dev);
+
+   ibtrs_srv_resp_rdma(ibnbd_priv->id, error);
+
+   kfree(priv);
+}
+
+static struct ibnbd_srv_sess_dev *
+ibnbd_get_sess_dev(int dev_id, struct ibnbd_srv_session *srv_sess)
+{
+   struct ibnbd_srv_sess_dev *sess_dev;
+   int ret = 0;
+
+   read_lock(&srv_sess->index_loc

[PATCH v2 26/26] MAINTAINERS: Add maintainer for IBNBD/IBTRS modules

2018-05-18 Thread Roman Pen
Signed-off-by: Roman Pen 
Cc: Danil Kipnis 
Cc: Jack Wang 
---
 MAINTAINERS | 14 ++
 1 file changed, 14 insertions(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index 92be777d060a..e5a001bd0f05 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -6786,6 +6786,20 @@ IBM ServeRAID RAID DRIVER
 S: Orphan
 F: drivers/scsi/ips.*
 
+IBNBD BLOCK DRIVERS
+M: IBNBD/IBTRS Storage Team 
+L: linux-block@vger.kernel.org
+S: Maintained
+T: git git://github.com/profitbricks/ibnbd.git
+F: drivers/block/ibnbd/
+
+IBTRS TRANSPORT DRIVERS
+M: IBNBD/IBTRS Storage Team 
+L: linux-r...@vger.kernel.org
+S: Maintained
+T: git git://github.com/profitbricks/ibnbd.git
+F: drivers/infiniband/ulp/ibtrs/
+
 ICH LPC AND GPIO DRIVER
 M: Peter Tyser 
 S: Maintained
-- 
2.13.1



[PATCH v2 24/26] ibnbd: include client and server modules into kernel compilation

2018-05-18 Thread Roman Pen
Add IBNBD Makefile, Kconfig and also corresponding lines into upper
block layer files.

Signed-off-by: Roman Pen 
Signed-off-by: Danil Kipnis 
Cc: Jack Wang 
---
 drivers/block/Kconfig|  2 ++
 drivers/block/Makefile   |  1 +
 drivers/block/ibnbd/Kconfig  | 22 ++
 drivers/block/ibnbd/Makefile | 13 +
 4 files changed, 38 insertions(+)
 create mode 100644 drivers/block/ibnbd/Kconfig
 create mode 100644 drivers/block/ibnbd/Makefile

diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index ad9b687a236a..d8c1590411c8 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -481,4 +481,6 @@ config BLK_DEV_RSXX
  To compile this driver as a module, choose M here: the
  module will be called rsxx.
 
+source "drivers/block/ibnbd/Kconfig"
+
 endif # BLK_DEV
diff --git a/drivers/block/Makefile b/drivers/block/Makefile
index dc061158b403..65346a1d0b1a 100644
--- a/drivers/block/Makefile
+++ b/drivers/block/Makefile
@@ -38,6 +38,7 @@ obj-$(CONFIG_BLK_DEV_PCIESSD_MTIP32XX)+= mtip32xx/
 obj-$(CONFIG_BLK_DEV_RSXX) += rsxx/
 obj-$(CONFIG_BLK_DEV_NULL_BLK) += null_blk.o
 obj-$(CONFIG_ZRAM) += zram/
+obj-$(CONFIG_BLK_DEV_IBNBD)+= ibnbd/
 
 skd-y  := skd_main.o
 swim_mod-y := swim.o swim_asm.o
diff --git a/drivers/block/ibnbd/Kconfig b/drivers/block/ibnbd/Kconfig
new file mode 100644
index ..b381c6c084d2
--- /dev/null
+++ b/drivers/block/ibnbd/Kconfig
@@ -0,0 +1,22 @@
+config BLK_DEV_IBNBD
+   bool
+
+config BLK_DEV_IBNBD_CLIENT
+   tristate "Network block device driver on top of IBTRS transport"
+   depends on INFINIBAND_IBTRS_CLIENT
+   select BLK_DEV_IBNBD
+   help
+ IBNBD client allows for mapping of remote block devices over the
+ IBTRS protocol from a target system where the IBNBD server is running.
+
+ If unsure, say N.
+
+config BLK_DEV_IBNBD_SERVER
+   tristate "Network block device over RDMA Infiniband server support"
+   depends on INFINIBAND_IBTRS_SERVER
+   select BLK_DEV_IBNBD
+   help
+ IBNBD server allows for exporting local block devices to a remote
+ client over the IBTRS protocol.
+
+ If unsure, say N.
diff --git a/drivers/block/ibnbd/Makefile b/drivers/block/ibnbd/Makefile
new file mode 100644
index ..5f20e72e0633
--- /dev/null
+++ b/drivers/block/ibnbd/Makefile
@@ -0,0 +1,13 @@
+ccflags-y := -Idrivers/infiniband/ulp/ibtrs
+
+ibnbd-client-y := ibnbd-clt.o \
+ ibnbd-clt-sysfs.o
+
+ibnbd-server-y := ibnbd-srv.o \
+ ibnbd-srv-dev.o \
+ ibnbd-srv-sysfs.o
+
+obj-$(CONFIG_BLK_DEV_IBNBD_CLIENT) += ibnbd-client.o
+obj-$(CONFIG_BLK_DEV_IBNBD_SERVER) += ibnbd-server.o
+
+-include $(src)/compat/compat.mk
-- 
2.13.1



[PATCH v2 20/26] ibnbd: server: private header with server structs and functions

2018-05-18 Thread Roman Pen
This header describes main structs and functions used by ibnbd-server
module, namely structs for managing sessions from different clients
and mapped (opened) devices.
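
The kref/completion pair in ibnbd_srv_sess_dev (below) implements the
usual "wait until the last I/O reference is gone" pattern. An assumed
destroy-side flow -- the release callback itself lives in ibnbd-srv.c:

  /* Assumed flow: drop our reference and wait for inflight users;
   * ibnbd_sess_dev_release() completes destroy_comp on the last put. */
  static void destroy_sess_dev_example(struct ibnbd_srv_sess_dev *sess_dev)
  {
        DECLARE_COMPLETION_ONSTACK(dc);

        sess_dev->destroy_comp = &dc;
        kref_put(&sess_dev->kref, ibnbd_sess_dev_release);
        wait_for_completion(&dc);
        kfree(sess_dev);
  }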

Signed-off-by: Roman Pen 
Signed-off-by: Danil Kipnis 
Cc: Jack Wang 
---
 drivers/block/ibnbd/ibnbd-srv.h | 100 
 1 file changed, 100 insertions(+)
 create mode 100644 drivers/block/ibnbd/ibnbd-srv.h

diff --git a/drivers/block/ibnbd/ibnbd-srv.h b/drivers/block/ibnbd/ibnbd-srv.h
new file mode 100644
index ..191a1650bc1d
--- /dev/null
+++ b/drivers/block/ibnbd/ibnbd-srv.h
@@ -0,0 +1,100 @@
+/*
+ * InfiniBand Network Block Driver
+ *
+ * Copyright (c) 2014 - 2017 ProfitBricks GmbH. All rights reserved.
+ * Authors: Fabian Holler 
+ *  Jack Wang 
+ *  Kleber Souza 
+ *  Danil Kipnis 
+ *  Roman Penyaev 
+ *  Milind Dumbare 
+ *
+ * Copyright (c) 2017 - 2018 ProfitBricks GmbH. All rights reserved.
+ * Authors: Danil Kipnis 
+ *  Roman Penyaev 
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see .
+ */
+
+#ifndef IBNBD_SRV_H
+#define IBNBD_SRV_H
+
+#include 
+#include 
+#include 
+
+#include "ibtrs.h"
+#include "ibnbd-proto.h"
+#include "ibnbd-log.h"
+
+struct ibnbd_srv_session {
+   /* Entry inside global sess_list */
+   struct list_headlist;
+   struct ibtrs_srv*ibtrs;
+   charsessname[NAME_MAX];
+   int queue_depth;
+   struct bio_set  *sess_bio_set;
+
+   rwlock_tindex_lock cacheline_aligned;
+   struct idr  index_idr;
+   /* List of struct ibnbd_srv_sess_dev */
+   struct list_headsess_dev_list;
+   struct mutexlock;
+   u8  ver;
+};
+
+struct ibnbd_srv_dev {
+   /* Entry inside global dev_list */
+   struct list_headlist;
+   struct kobject  dev_kobj;
+   struct kobject  dev_sessions_kobj;
+   struct kref kref;
+   charid[NAME_MAX];
+   /* List of ibnbd_srv_sess_dev structs */
+   struct list_headsess_dev_list;
+   struct mutexlock;
+   int open_write_cnt;
+   enum ibnbd_io_mode  mode;
+};
+
+/* Structure which binds N devices and N sessions */
+struct ibnbd_srv_sess_dev {
+   /* Entry inside ibnbd_srv_dev struct */
+   struct list_headdev_list;
+   /* Entry inside ibnbd_srv_session struct */
+   struct list_headsess_list;
+   struct ibnbd_dev*ibnbd_dev;
+   struct ibnbd_srv_session*sess;
+   struct ibnbd_srv_dev*dev;
+   struct kobject  kobj;
+   struct completion   *sysfs_release_compl;
+   u32 device_id;
+   fmode_t open_flags;
+   struct kref kref;
+   struct completion   *destroy_comp;
+   charpathname[NAME_MAX];
+};
+
+/* ibnbd-srv-sysfs.c */
+
+int ibnbd_srv_create_dev_sysfs(struct ibnbd_srv_dev *dev,
+  struct block_device *bdev,
+  const char *dir_name);
+void ibnbd_srv_destroy_dev_sysfs(struct ibnbd_srv_dev *dev);
+int ibnbd_srv_create_dev_session_sysfs(struct ibnbd_srv_sess_dev *sess_dev);
+void ibnbd_srv_destroy_dev_session_sysfs(struct ibnbd_srv_sess_dev *sess_dev);
+int ibnbd_srv_create_sysfs_files(void);
+void ibnbd_srv_destroy_sysfs_files(void);
+
+#endif /* IBNBD_SRV_H */
-- 
2.13.1



[PATCH v2 23/26] ibnbd: server: sysfs interface functions

2018-05-18 Thread Roman Pen
This is the sysfs interface to IBNBD mapped devices on server side:

  /sys/devices/virtual/ibnbd-server/ctl/devices/<device name>/
|- block_dev
|  *** link pointing to the corresponding block device sysfs entry
|
|- sessions/<session name>/
|  *** sessions directory
   |
   |- read_only
   |  *** whether the device is mapped read-only
   |
   |- mapping_path
  *** relative device path provided by the client during mapping

Signed-off-by: Roman Pen 
Signed-off-by: Danil Kipnis 
Cc: Jack Wang 
---
 drivers/block/ibnbd/ibnbd-srv-sysfs.c | 242 ++
 1 file changed, 242 insertions(+)
 create mode 100644 drivers/block/ibnbd/ibnbd-srv-sysfs.c

diff --git a/drivers/block/ibnbd/ibnbd-srv-sysfs.c 
b/drivers/block/ibnbd/ibnbd-srv-sysfs.c
new file mode 100644
index ..5bf77cdb09c8
--- /dev/null
+++ b/drivers/block/ibnbd/ibnbd-srv-sysfs.c
@@ -0,0 +1,242 @@
+/*
+ * InfiniBand Network Block Driver
+ *
+ * Copyright (c) 2014 - 2017 ProfitBricks GmbH. All rights reserved.
+ * Authors: Fabian Holler 
+ *  Jack Wang 
+ *  Kleber Souza 
+ *  Danil Kipnis 
+ *  Roman Penyaev 
+ *  Milind Dumbare 
+ *
+ * Copyright (c) 2017 - 2018 ProfitBricks GmbH. All rights reserved.
+ * Authors: Danil Kipnis 
+ *  Roman Penyaev 
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see .
+ */
+
+#undef pr_fmt
+#define pr_fmt(fmt) KBUILD_MODNAME " L" __stringify(__LINE__) ": " fmt
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "ibnbd-srv.h"
+
+static struct device *ibnbd_dev;
+static struct class *ibnbd_dev_class;
+static struct kobject *ibnbd_devs_kobj;
+
+static struct attribute *ibnbd_srv_default_dev_attrs[] = {
+   NULL,
+};
+
+static struct attribute_group ibnbd_srv_default_dev_attr_group = {
+   .attrs = ibnbd_srv_default_dev_attrs,
+};
+
+static struct kobj_type ktype = {
+   .sysfs_ops  = &kobj_sysfs_ops,
+};
+
+int ibnbd_srv_create_dev_sysfs(struct ibnbd_srv_dev *dev,
+  struct block_device *bdev,
+  const char *dir_name)
+{
+   struct kobject *bdev_kobj;
+   int ret;
+
+   ret = kobject_init_and_add(&dev->dev_kobj, &ktype,
+  ibnbd_devs_kobj, dir_name);
+   if (ret)
+   return ret;
+
+   ret = kobject_init_and_add(&dev->dev_sessions_kobj,
+  &ktype,
+  &dev->dev_kobj, "sessions");
+   if (ret)
+   goto err;
+
+   ret = sysfs_create_group(&dev->dev_kobj,
+&ibnbd_srv_default_dev_attr_group);
+   if (ret)
+   goto err2;
+
+   bdev_kobj = &disk_to_dev(bdev->bd_disk)->kobj;
+   ret = sysfs_create_link(&dev->dev_kobj, bdev_kobj, "block_dev");
+   if (ret)
+   goto err3;
+
+   return 0;
+
+err3:
+   sysfs_remove_group(&dev->dev_kobj,
+  &ibnbd_srv_default_dev_attr_group);
+err2:
+   kobject_del(&dev->dev_sessions_kobj);
+   kobject_put(&dev->dev_sessions_kobj);
+err:
+   kobject_del(&dev->dev_kobj);
+   kobject_put(&dev->dev_kobj);
+   return ret;
+}
+
+void ibnbd_srv_destroy_dev_sysfs(struct ibnbd_srv_dev *dev)
+{
+   sysfs_remove_link(&dev->dev_kobj, "block_dev");
+   sysfs_remove_group(&dev->dev_kobj, &ibnbd_srv_default_dev_attr_group);
+   kobject_del(&dev->dev_sessions_kobj);
+   kobject_put(&dev->dev_sessions_kobj);
+   kobject_del(&dev->dev_kobj);
+   kobject_put(&dev->dev_kobj);
+}
+
+static ssize_t ibnbd_srv_dev_session_ro_show(struct kobject *kobj,
+struct kobj_attribute *attr,
+char *page)
+{
+   struct ibnbd_srv_sess_dev *sess_dev;
+
+   sess_dev = container_of(kobj, struct ibnbd_srv_sess_dev, kobj);
+
+   return scnprintf(page, PAGE_SIZE, "%s\n",
+(sess_dev->open_flags & FMODE_WRITE) ? "0" : "1");
+}
+
+static struct kobj_attribute ibnbd_srv_dev_session_ro_attr =
+   __ATTR(read_only, 0444,
+  ibnbd_srv_dev_session_ro_show,
+  NULL);
+
+static ssize_t
+ibnbd_srv_dev_session_mapping_path_show(struct kobject *kobj,
+   struct kobj_attribu
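
The message is truncated here in the archive. Judging from the read_only
handler above, a plausible shape for the rest of this show callback is
the sketch below; the 'pathname' field name is a guess, not taken from
the patch:

static ssize_t
ibnbd_srv_dev_session_mapping_path_show(struct kobject *kobj,
					struct kobj_attribute *attr,
					char *page)
{
	struct ibnbd_srv_sess_dev *sess_dev;

	sess_dev = container_of(kobj, struct ibnbd_srv_sess_dev, kobj);

	/* relative path supplied by the client at map time (field assumed) */
	return scnprintf(page, PAGE_SIZE, "%s\n", sess_dev->pathname);
}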

[PATCH v2 18/26] ibnbd: client: main functionality

2018-05-18 Thread Roman Pen
This is the main functionality of the ibnbd-client module, which
provides the interface to map a remote device as a local block device
/dev/ibnbd and feeds IBTRS with IO requests.

Signed-off-by: Roman Pen 
Signed-off-by: Danil Kipnis 
Cc: Jack Wang 
---
 drivers/block/ibnbd/ibnbd-clt.c | 1819 +++
 1 file changed, 1819 insertions(+)
 create mode 100644 drivers/block/ibnbd/ibnbd-clt.c

diff --git a/drivers/block/ibnbd/ibnbd-clt.c b/drivers/block/ibnbd/ibnbd-clt.c
new file mode 100644
index ..06524e33e19f
--- /dev/null
+++ b/drivers/block/ibnbd/ibnbd-clt.c
@@ -0,0 +1,1819 @@
+/*
+ * InfiniBand Network Block Driver
+ *
+ * Copyright (c) 2014 - 2017 ProfitBricks GmbH. All rights reserved.
+ * Authors: Fabian Holler 
+ *  Jack Wang 
+ *  Kleber Souza 
+ *  Danil Kipnis 
+ *  Roman Penyaev 
+ *  Milind Dumbare 
+ *
+ * Copyright (c) 2017 - 2018 ProfitBricks GmbH. All rights reserved.
+ * Authors: Danil Kipnis 
+ *  Roman Penyaev 
+ *  Swapnil Ingle 
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see .
+ */
+
+#undef pr_fmt
+#define pr_fmt(fmt) KBUILD_MODNAME " L" __stringify(__LINE__) ": " fmt
+
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "ibnbd-clt.h"
+
+MODULE_AUTHOR("ib...@profitbricks.com");
+MODULE_DESCRIPTION("InfiniBand Network Block Device Client");
+MODULE_VERSION(IBNBD_VER_STRING);
+MODULE_LICENSE("GPL");
+
+/*
+ * This is for closing devices when unloading the module:
+ * we might be closing a lot (>256) of devices in parallel
+ * and it is better not to use the system_wq.
+ */
+static struct workqueue_struct *unload_wq;
+static int ibnbd_client_major;
+static DEFINE_IDA(index_ida);
+static DEFINE_MUTEX(ida_lock);
+static DEFINE_MUTEX(sess_lock);
+static LIST_HEAD(sess_list);
+
+static bool softirq_enable;
+module_param(softirq_enable, bool, 0444);
+MODULE_PARM_DESC(softirq_enable, "finish request in softirq_fn."
+" (default: 0)");
+/*
+ * Maximum number of partitions an instance can have.
+ * 6 bits = 64 minors = 63 partitions (one minor is used for the device itself)
+ */
+#define IBNBD_PART_BITS		6
+#define KERNEL_SECTOR_SIZE  512
+
+static inline bool ibnbd_clt_get_sess(struct ibnbd_clt_session *sess)
+{
+   return refcount_inc_not_zero(&sess->refcount);
+}
+
+static void free_sess(struct ibnbd_clt_session *sess);
+
+static void ibnbd_clt_put_sess(struct ibnbd_clt_session *sess)
+{
+   might_sleep();
+
+   if (refcount_dec_and_test(&sess->refcount))
+   free_sess(sess);
+}
+
+static inline bool ibnbd_clt_dev_is_mapped(struct ibnbd_clt_dev *dev)
+{
+   return dev->dev_state == DEV_STATE_MAPPED;
+}
+
+static void ibnbd_clt_put_dev(struct ibnbd_clt_dev *dev)
+{
+   might_sleep();
+
+   if (refcount_dec_and_test(&dev->refcount)) {
+   mutex_lock(&ida_lock);
+   ida_simple_remove(&index_ida, dev->clt_device_id);
+   mutex_unlock(&ida_lock);
+   kfree(dev->hw_queues);
+   ibnbd_clt_put_sess(dev->sess);
+   kfree(dev);
+   }
+}
+
+static inline bool ibnbd_clt_get_dev(struct ibnbd_clt_dev *dev)
+{
+   return refcount_inc_not_zero(&dev->refcount);
+}
+
+static int ibnbd_clt_set_dev_attr(struct ibnbd_clt_dev *dev,
+ const struct ibnbd_msg_open_rsp *rsp)
+{
+   struct ibnbd_clt_session *sess = dev->sess;
+
+   if (unlikely(!rsp->logical_block_size))
+   return -EINVAL;
+
+   dev->device_id  = le32_to_cpu(rsp->device_id);
+   dev->nsectors   = le64_to_cpu(rsp->nsectors);
+   dev->logical_block_size = le16_to_cpu(rsp->logical_block_size);
+   dev->physical_block_size= le16_to_cpu(rsp->physical_block_size);
+   dev->max_write_same_sectors = le32_to_cpu(rsp->max_write_same_sectors);
+   dev->max_discard_sectors= le32_to_cpu(rsp->max_discard_sectors);
+   dev->discard_granularity= le32_to_cpu(rsp->discard_granularity);
+   dev->discard_alignment  = le32_to_cpu(rsp->discard_alignment);
+   dev->secure_discard = le16_to_cpu(rsp->secure_discard);
+   dev->rotational = rsp->rotational;
+   dev->remote_io_mode = rsp->io_mode;
+
+   dev->max_hw_sectors = sess->max_io_size / dev->logical_block_size;

[PATCH v2 04/26] ibtrs: private headers with IBTRS protocol structs and helpers

2018-05-18 Thread Roman Pen
These are common private headers with IBTRS protocol structures,
logging, sysfs and other helper functions, which are used on
both client and server sides.

Signed-off-by: Roman Pen 
Signed-off-by: Danil Kipnis 
Cc: Jack Wang 
---
 drivers/infiniband/ulp/ibtrs/ibtrs-log.h |  91 ++
 drivers/infiniband/ulp/ibtrs/ibtrs-pri.h | 459 +++
 2 files changed, 550 insertions(+)
 create mode 100644 drivers/infiniband/ulp/ibtrs/ibtrs-log.h
 create mode 100644 drivers/infiniband/ulp/ibtrs/ibtrs-pri.h

diff --git a/drivers/infiniband/ulp/ibtrs/ibtrs-log.h b/drivers/infiniband/ulp/ibtrs/ibtrs-log.h
new file mode 100644
index ..f56257eabdee
--- /dev/null
+++ b/drivers/infiniband/ulp/ibtrs/ibtrs-log.h
@@ -0,0 +1,91 @@
+/*
+ * InfiniBand Transport Layer
+ *
+ * Copyright (c) 2014 - 2017 ProfitBricks GmbH. All rights reserved.
+ * Authors: Fabian Holler 
+ *  Jack Wang 
+ *  Kleber Souza 
+ *  Danil Kipnis 
+ *  Roman Penyaev 
+ *  Milind Dumbare 
+ *
+ * Copyright (c) 2017 - 2018 ProfitBricks GmbH. All rights reserved.
+ * Authors: Danil Kipnis 
+ *  Roman Penyaev 
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see .
+ */
+
+#ifndef IBTRS_LOG_H
+#define IBTRS_LOG_H
+
+#define P1 )
+#define P2 ))
+#define P3 )))
+#define P4 ))))
+#define P(N) P ## N
+
+#define CAT(a, ...) PRIMITIVE_CAT(a, __VA_ARGS__)
+#define PRIMITIVE_CAT(a, ...) a ## __VA_ARGS__
+
+#define LIST(...)  \
+   __VA_ARGS__,\
+   ({ unknown_type(); NULL; }) \
+   CAT(P, COUNT_ARGS(__VA_ARGS__)) \
+
+#define EMPTY()
+#define DEFER(id) id EMPTY()
+
+#define _CASE(obj, type, member)   \
+   __builtin_choose_expr(  \
+   __builtin_types_compatible_p(   \
+   typeof(obj), type), \
+   ((type)obj)->member
+#define CASE(o, t, m) DEFER(_CASE)(o,t,m)
+
+/*
+ * Below we define retrieving of sessname from common IBTRS types.
+ * Client or server related types have to be defined by special
+ * TYPES_TO_SESSNAME macro.
+ */
+
+void unknown_type(void);
+
+#ifndef TYPES_TO_SESSNAME
+#define TYPES_TO_SESSNAME(...) ({ unknown_type(); NULL; })
+#endif
+
+#define ibtrs_prefix(obj)  \
+   _CASE(obj, struct ibtrs_con *,  sess->sessname),\
+   _CASE(obj, struct ibtrs_sess *, sessname),  \
+   TYPES_TO_SESSNAME(obj)  \
+   ))
+
+#define ibtrs_log(fn, obj, fmt, ...)   \
+   fn("<%s>: " fmt, ibtrs_prefix(obj), ##__VA_ARGS__)
+
+#define ibtrs_err(obj, fmt, ...)   \
+   ibtrs_log(pr_err, obj, fmt, ##__VA_ARGS__)
+#define ibtrs_err_rl(obj, fmt, ...)\
+   ibtrs_log(pr_err_ratelimited, obj, fmt, ##__VA_ARGS__)
+#define ibtrs_wrn(obj, fmt, ...)   \
+   ibtrs_log(pr_warn, obj, fmt, ##__VA_ARGS__)
+#define ibtrs_wrn_rl(obj, fmt, ...) \
+   ibtrs_log(pr_warn_ratelimited, obj, fmt, ##__VA_ARGS__)
+#define ibtrs_info(obj, fmt, ...) \
+   ibtrs_log(pr_info, obj, fmt, ##__VA_ARGS__)
+#define ibtrs_info_rl(obj, fmt, ...) \
+   ibtrs_log(pr_info_ratelimited, obj, fmt, ##__VA_ARGS__)
+
+#endif /* IBTRS_LOG_H */
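
As a usage illustration (an editor's sketch, not part of the patch):
once a module defines TYPES_TO_SESSNAME for its own types, the wrappers
prefix every message with the session name resolved at compile time:

static void example(struct ibtrs_con *con)
{
	/* ibtrs_prefix() picks sess->sessname for a struct ibtrs_con * */
	ibtrs_err(con, "posting receive failed: %d\n", -EIO);
	/* logs: "sessname: posting receive failed: -5" */
}
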
diff --git a/drivers/infiniband/ulp/ibtrs/ibtrs-pri.h b/drivers/infiniband/ulp/ibtrs/ibtrs-pri.h
new file mode 100644
index ..40647f066840
--- /dev/null
+++ b/drivers/infiniband/ulp/ibtrs/ibtrs-pri.h
@@ -0,0 +1,459 @@
+/*
+ * InfiniBand Transport Layer
+ *
+ * Copyright (c) 2014 - 2017 ProfitBricks GmbH. All rights reserved.
+ * Authors: Fabian Holler 
+ *  Jack Wang 
+ *  Kleber Souza 
+ *  Danil Kipnis 
+ *  Roman Penyaev 
+ *  Milind Dumbare 
+ *
+ * Copyright (c) 2017 - 2018 ProfitBricks GmbH. All rights reserved.
+ * Authors: Danil Kipnis 
+ *  Roman Penyaev 
+ *  Swapnil Ingle 
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope

[PATCH v2 03/26] ibtrs: public interface header to establish RDMA connections

2018-05-18 Thread Roman Pen
Introduce public header which provides set of API functions to
establish RDMA connections from client to server machine using
IBTRS protocol, which manages RDMA connections for each session,
does multipathing and load balancing.

Main functions for client (active) side:

 ibtrs_clt_open() - Creates set of RDMA connections incapsulated
in IBTRS session and returns pointer on IBTRS
session object.
 ibtrs_clt_close() - Closes RDMA connections associated with IBTRS
 session.
 ibtrs_clt_request() - Requests zero-copy RDMA transfer to/from
   server.

Main functions for server (passive) side:

 ibtrs_srv_open() - Starts listening for IBTRS clients on specified
port and invokes IBTRS callbacks for incoming
RDMA requests or link events.
 ibtrs_srv_close() - Closes IBTRS server context.

Signed-off-by: Roman Pen 
Signed-off-by: Danil Kipnis 
Cc: Jack Wang 
---
 drivers/infiniband/ulp/ibtrs/ibtrs.h | 324 +++
 1 file changed, 324 insertions(+)
 create mode 100644 drivers/infiniband/ulp/ibtrs/ibtrs.h

diff --git a/drivers/infiniband/ulp/ibtrs/ibtrs.h b/drivers/infiniband/ulp/ibtrs/ibtrs.h
new file mode 100644
index ..08325e39a41e
--- /dev/null
+++ b/drivers/infiniband/ulp/ibtrs/ibtrs.h
@@ -0,0 +1,324 @@
+/*
+ * InfiniBand Transport Layer
+ *
+ * Copyright (c) 2014 - 2017 ProfitBricks GmbH. All rights reserved.
+ * Authors: Fabian Holler 
+ *  Jack Wang 
+ *  Kleber Souza 
+ *  Danil Kipnis 
+ *  Roman Penyaev 
+ *  Milind Dumbare 
+ *
+ * Copyright (c) 2017 - 2018 ProfitBricks GmbH. All rights reserved.
+ * Authors: Danil Kipnis 
+ *  Roman Penyaev 
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see .
+ */
+
+#ifndef IBTRS_H
+#define IBTRS_H
+
+#include 
+#include 
+
+struct ibtrs_tag;
+struct ibtrs_clt;
+struct ibtrs_srv_ctx;
+struct ibtrs_srv;
+struct ibtrs_srv_op;
+
+/*
+ * Here goes IBTRS client API
+ */
+
+/**
+ * enum ibtrs_clt_link_ev - Events about connectivity state of a client
+ * @IBTRS_CLT_LINK_EV_RECONNECTED:	Client was reconnected.
+ * @IBTRS_CLT_LINK_EV_DISCONNECTED:	Client was disconnected.
+ */
+enum ibtrs_clt_link_ev {
+   IBTRS_CLT_LINK_EV_RECONNECTED,
+   IBTRS_CLT_LINK_EV_DISCONNECTED,
+};
+
+/**
+ * Source and destination address of a path to be established
+ */
+struct ibtrs_addr {
+   struct sockaddr_storage *src;
+   struct sockaddr_storage *dst;
+};
+
+typedef void (link_clt_ev_fn)(void *priv, enum ibtrs_clt_link_ev ev);
+/**
+ * ibtrs_clt_open() - Open a session to an IBTRS server
+ * @priv:	User supplied private data, passed back to @link_ev on
+ *		connection state change events
+ * @link_ev:	Event notification callback for connection state changes
+ * @sessname:	Name of the session
+ * @paths:	Paths to be established, defined by their src and dst addresses
+ * @path_cnt:	Number of elements in the @paths array
+ * @port:	Port to be used by the IBTRS session
+ * @pdu_sz:	Size of extra payload which can be accessed after tag allocation
+ * @reconnect_delay_sec: Time between reconnect tries
+ * @max_segments: Max. number of segments per IO request
+ * @max_reconnect_attempts: Number of times to reconnect on error before giving
+ *			    up, 0 for disabled, -1 for forever
+ *
+ * Starts session establishment with the ibtrs_server. The function can block
+ * up to ~2000ms until it returns.
+ *
+ * Return: a valid session pointer on success, an ERR_PTR otherwise.
+ */
+struct ibtrs_clt *ibtrs_clt_open(void *priv, link_clt_ev_fn *link_ev,
+const char *sessname,
+const struct ibtrs_addr *paths,
+size_t path_cnt, short port,
+size_t pdu_sz, u8 reconnect_delay_sec,
+u16 max_segments,
+s16 max_reconnect_attempts);
+
+/**
+ * ibtrs_clt_close() - Close a session
+ * @sess: Session handler, is freed on return
+ */
+void ibtrs_clt_close(struct ibtrs_clt *sess);
+
+/**
+ * ibtrs_tag_from_pd

[PATCH v2 00/26] InfiniBand Transport (IBTRS) and Network Block Device (IBNBD)

2018-05-18 Thread Roman Pen
Hi all,

This is v2 of the series, which introduces the IBNBD/IBTRS modules.

This cover letter is split into three parts:

1. Introduction, which almost repeats everything from previous cover
   letters.
2. Changelog.
3. Performance measurements on linux-4.17.0-rc2 and on two different
   Mellanox cards: ConnectX-2 and ConnectX-3 and CPUs: Intel and AMD.


 Introduction
 ------------

IBTRS (InfiniBand Transport) is a reliable high speed transport library
which allows for establishing connection between client and server
machines via RDMA. It is optimized to transfer (read/write) IO blocks
in the sense that it follows the BIO semantics of providing the
possibility to either write data from a scatter-gather list to the
remote side or to request ("read") data transfer from the remote side
into a given set of buffers.

IBTRS is multipath capable and provides I/O fail-over and load-balancing
functionality, i.e. in IBTRS terminology, an IBTRS path is a set of RDMA
CMs and a particular path is selected according to the load-balancing policy.

IBNBD (InfiniBand Network Block Device) is a pair of kernel modules
(client and server) that allow for remote access of a block device on
the server over IBTRS protocol. After being mapped, the remote block
devices can be accessed on the client side as local block devices.
Internally IBNBD uses IBTRS as an RDMA transport library.

Why?

   - IBNBD/IBTRS is developed in order to map thin provisioned volumes,
 thus the internal protocol is simple.
   - IBTRS was developed as an independent RDMA transport library, which
 supports fail-over and load-balancing policies using multipath, thus
 it can be used for other IO needs, not only block devices.
   - IBNBD/IBTRS is faster than NVME over RDMA.
 Old comparison results:
 https://www.spinics.net/lists/linux-rdma/msg48799.html
 New comparison results: see performance measurements section below.

Key features of IBTRS transport library and IBNBD block device:

o High throughput and low latency due to:
   - Only two RDMA messages per IO.
   - IMM InfiniBand messages on responses to reduce round trip latency.
   - Simplified memory management: memory allocation happens once on
 server side when IBTRS session is established.

o IO fail-over and load-balancing by using multipath.  According to
  our test loads, an additional path brings ~20% more bandwidth.

o Simple configuration of IBNBD:
   - Server side is completely passive: volumes do not need to be
 explicitly exported.
   - Only IB port GID and device path needed on client side to map
 a block device.
   - A device is remapped automatically, e.g. after a storage reboot.

Commits for kernel can be found here:
   https://github.com/profitbricks/ibnbd/commits/linux-4.17-rc2

The out-of-tree modules are here:
   https://github.com/profitbricks/ibnbd/

Vault 2017 presentation:
   
http://events.linuxfoundation.org/sites/events/files/slides/IBNBD-Vault-2017.pdf


 Changelog
 ---------

v2:
  o IBNBD:
 - No legacy request IO mode, only MQ is left.

  o IBTRS:
 - No FMR registration, only FR is left.

 - By default memory is always registered for the sake of security,
   i.e. by default no pd is created with IB_PD_UNSAFE_GLOBAL_RKEY.

 - Server side (target) always does memory registration and exchanges
   MR DMA addresses with the client for direct writes from the client side.

 - Client side (initiator) has a `noreg_cnt` module option, which
   specifies the scatter-gather count starting from which read IO
   should be registered.  By default 0 is set, i.e. memory is always
   registered for read IOs.  (The IBTRS protocol does not require
   registration for writes, which always go directly to server memory.)

 - Proper DMA sync with ib_dma_sync_single_for_(cpu|device) calls.

 - Do signalled IB_WR_LOCAL_INV.

 - Avoid open-coding of string conversion to IPv4/6 sockaddr;
   inet_pton_with_scope() is used instead.

 - Introduced block device namespaces configuration on the server side
   (target) to avoid a security gap in untrusted environments, where a
   client could map a block device that does not belong to it.
   When device namespaces are enabled on the server side, the server
   opens the device using the client's session name in the device path,
   where the session name is a random token, e.g. a GUID.  If the server
   is configured to find device namespaces in the folder
   /run/ibnbd-guid/, then a request to map device 'sda1' from a client
   with session 'A' (or any token) is resolved to the path
   /run/ibnbd-guid/A/sda1.

 - README is extended with description of IBTRS and IBNBD protocol,
   e.g. how IB IMM field is used to acknowledge IO requests or
   heartbeats.

 - IBTRS/IBNBD client and server modules are registered as devices in
   the kernel in order to have all sysfs configuration entries under

[PATCH v2 02/26] sysfs: export sysfs_remove_file_self()

2018-05-18 Thread Roman Pen
Function is going to be used in transport over RDMA module
in subsequent patches.

Signed-off-by: Roman Pen 
Cc: Tejun Heo 
Cc: linux-ker...@vger.kernel.org
---
 fs/sysfs/file.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 5c13f29bfcdb..ff7443ac2aa7 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -444,6 +444,7 @@ bool sysfs_remove_file_self(struct kobject *kobj, const struct attribute *attr)
kernfs_put(kn);
return ret;
 }
+EXPORT_SYMBOL_GPL(sysfs_remove_file_self);
 
 void sysfs_remove_files(struct kobject *kobj, const struct attribute **ptr)
 {
-- 
2.13.1
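
For reference, the pattern this export enables is an attribute whose
store handler removes its own sysfs file before dropping the object,
avoiding a deadlock with readers of that file. A minimal sketch, with
illustrative names:

static ssize_t remove_store(struct kobject *kobj, struct kobj_attribute *attr,
			    const char *buf, size_t count)
{
	/*
	 * sysfs_remove_file_self() returns true for exactly one caller,
	 * which may then safely tear the object down.
	 */
	if (sysfs_remove_file_self(kobj, &attr->attr))
		kobject_put(kobj);
	return count;
}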



[PATCH v2 01/26] rculist: introduce list_next_or_null_rr_rcu()

2018-05-18 Thread Roman Pen
Function is going to be used in transport over RDMA module
in subsequent patches.

The function returns the next element in round-robin fashion,
i.e. the head will be skipped.  NULL will be returned if the list
is observed as empty.

Signed-off-by: Roman Pen 
Cc: Paul E. McKenney 
Cc: linux-ker...@vger.kernel.org
---
 include/linux/rculist.h | 19 +++
 1 file changed, 19 insertions(+)

diff --git a/include/linux/rculist.h b/include/linux/rculist.h
index 127f534fec94..b0840d5ab25a 100644
--- a/include/linux/rculist.h
+++ b/include/linux/rculist.h
@@ -339,6 +339,25 @@ static inline void list_splice_tail_init_rcu(struct list_head *list,
 })
 
 /**
+ * list_next_or_null_rr_rcu - get next list element in round-robin fashion.
+ * @head:  the head for the list.
+ * @ptr:the list head to take the next element from.
+ * @type:   the type of the struct this is embedded in.
+ * @memb:   the name of the list_head within the struct.
+ *
+ * Next element returned in round-robin fashion, i.e. head will be skipped,
+ * but if list is observed as empty, NULL will be returned.
+ *
+ * This primitive may safely run concurrently with the _rcu list-mutation
+ * primitives such as list_add_rcu() as long as it's guarded by rcu_read_lock().
+ */
+#define list_next_or_null_rr_rcu(head, ptr, type, memb) \
+({ \
+   list_next_or_null_rcu(head, ptr, type, memb) ?: \
+   list_next_or_null_rcu(head, READ_ONCE((ptr)->next), type, memb); \
+})
+
+/**
  * list_for_each_entry_rcu -   iterate over rcu list of given type
  * @pos:   the type * to use as a loop cursor.
  * @head:  the head for your list.
-- 
2.13.1
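
A usage sketch of the new helper: picking the next path of a session in
round-robin order under RCU. The struct and field names here are
illustrative, not taken from the patch:

	struct clt_path *next;

	rcu_read_lock();
	next = list_next_or_null_rr_rcu(&sess->paths_list, &cur->entry,
					struct clt_path, entry);
	if (next)
		do_io(next);	/* the list head itself is never returned */
	rcu_read_unlock();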



[PATCH v2 06/26] ibtrs: client: private header with client structs and functions

2018-05-18 Thread Roman Pen
This header describes main structs and functions used by ibtrs-client
module, mainly for managing IBTRS sessions, creating/destroying sysfs
entries, accounting statistics on client side.

Signed-off-by: Roman Pen 
Signed-off-by: Danil Kipnis 
Cc: Jack Wang 
---
 drivers/infiniband/ulp/ibtrs/ibtrs-clt.h | 315 +++
 1 file changed, 315 insertions(+)
 create mode 100644 drivers/infiniband/ulp/ibtrs/ibtrs-clt.h

diff --git a/drivers/infiniband/ulp/ibtrs/ibtrs-clt.h b/drivers/infiniband/ulp/ibtrs/ibtrs-clt.h
new file mode 100644
index ..0323da91ca01
--- /dev/null
+++ b/drivers/infiniband/ulp/ibtrs/ibtrs-clt.h
@@ -0,0 +1,315 @@
+/*
+ * InfiniBand Transport Layer
+ *
+ * Copyright (c) 2014 - 2017 ProfitBricks GmbH. All rights reserved.
+ * Authors: Fabian Holler 
+ *  Jack Wang 
+ *  Kleber Souza 
+ *  Danil Kipnis 
+ *  Roman Penyaev 
+ *  Milind Dumbare 
+ *
+ * Copyright (c) 2017 - 2018 ProfitBricks GmbH. All rights reserved.
+ * Authors: Danil Kipnis 
+ *  Roman Penyaev 
+ *  Swapnil Ingle 
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see .
+ */
+
+#ifndef IBTRS_CLT_H
+#define IBTRS_CLT_H
+
+#include 
+#include "ibtrs-pri.h"
+
+/**
+ * enum ibtrs_clt_state - Client states.
+ */
+enum ibtrs_clt_state {
+   IBTRS_CLT_CONNECTING,
+   IBTRS_CLT_CONNECTING_ERR,
+   IBTRS_CLT_RECONNECTING,
+   IBTRS_CLT_CONNECTED,
+   IBTRS_CLT_CLOSING,
+   IBTRS_CLT_CLOSED,
+   IBTRS_CLT_DEAD,
+};
+
+static inline const char *ibtrs_clt_state_str(enum ibtrs_clt_state state)
+{
+   switch (state) {
+   case IBTRS_CLT_CONNECTING:
+   return "IBTRS_CLT_CONNECTING";
+   case IBTRS_CLT_CONNECTING_ERR:
+   return "IBTRS_CLT_CONNECTING_ERR";
+   case IBTRS_CLT_RECONNECTING:
+   return "IBTRS_CLT_RECONNECTING";
+   case IBTRS_CLT_CONNECTED:
+   return "IBTRS_CLT_CONNECTED";
+   case IBTRS_CLT_CLOSING:
+   return "IBTRS_CLT_CLOSING";
+   case IBTRS_CLT_CLOSED:
+   return "IBTRS_CLT_CLOSED";
+   case IBTRS_CLT_DEAD:
+   return "IBTRS_CLT_DEAD";
+   default:
+   return "UNKNOWN";
+   }
+}
+
+enum ibtrs_mp_policy {
+   MP_POLICY_RR,
+   MP_POLICY_MIN_INFLIGHT,
+};
+
+struct ibtrs_clt_stats_reconnects {
+   int successful_cnt;
+   int fail_cnt;
+};
+
+struct ibtrs_clt_stats_wc_comp {
+   u32 cnt;
+   u64 total_cnt;
+};
+
+struct ibtrs_clt_stats_cpu_migr {
+   atomic_t from;
+   int to;
+};
+
+struct ibtrs_clt_stats_rdma {
+   struct {
+   u64 cnt;
+   u64 size_total;
+   } dir[2];
+
+   u64 failover_cnt;
+};
+
+struct ibtrs_clt_stats_rdma_lat {
+   u64 read;
+   u64 write;
+};
+
+#define MIN_LOG_SG 2
+#define MAX_LOG_SG 5
+#define MAX_LIN_SG BIT(MIN_LOG_SG)
+#define SG_DISTR_SZ (MAX_LOG_SG - MIN_LOG_SG + MAX_LIN_SG + 2)
+
+#define MAX_LOG_LAT 16
+#define MIN_LOG_LAT 0
+#define LOG_LAT_SZ (MAX_LOG_LAT - MIN_LOG_LAT + 2)
+
+struct ibtrs_clt_stats_pcpu {
+   struct ibtrs_clt_stats_cpu_migr cpu_migr;
+   struct ibtrs_clt_stats_rdma rdma;
+   u64 sg_list_total;
+   u64 sg_list_distr[SG_DISTR_SZ];
+   struct ibtrs_clt_stats_rdma_lat rdma_lat_distr[LOG_LAT_SZ];
+   struct ibtrs_clt_stats_rdma_lat rdma_lat_max;
+   struct ibtrs_clt_stats_wc_comp  wc_comp;
+};
+
+struct ibtrs_clt_stats {
+   boolenable_rdma_lat;
+   struct ibtrs_clt_stats_pcpu__percpu *pcpu_stats;
+   struct ibtrs_clt_stats_reconnects   reconnects;
+   atomic_tinflight;
+};
+
+struct ibtrs_clt_con {
+   struct ibtrs_conc;
+   unsignedcpu;
+   atomic_tio_cnt;
+   int cm_err;
+};
+
+/**
+ * ibtrs_tag - tags the memory allocation for future RDMA operation
+ */
+struct ibtrs_tag {
+   enum ibtrs_clt_con_type con_type;
+   unsigned int cpu_id;
+   unsigned int mem_id;
+   unsigned int mem_off;
+};
+
+struct ibtrs_clt_io_req {
+   struct list_headlist;
+   struct ibtrs_iu *iu;
+   struct scatterlist  *sglist; /*

[PATCH v2 05/26] ibtrs: core: lib functions shared between client and server modules

2018-05-18 Thread Roman Pen
This is a set of library functions existing as a ibtrs-core module,
used by client and server modules.

Mainly these functions wrap IB and RDMA calls and provide a bit higher
abstraction for implementing of IBTRS protocol on client or server
sides.

Signed-off-by: Roman Pen 
Signed-off-by: Danil Kipnis 
Cc: Jack Wang 
---
 drivers/infiniband/ulp/ibtrs/ibtrs.c | 609 +++
 1 file changed, 609 insertions(+)
 create mode 100644 drivers/infiniband/ulp/ibtrs/ibtrs.c

diff --git a/drivers/infiniband/ulp/ibtrs/ibtrs.c b/drivers/infiniband/ulp/ibtrs/ibtrs.c
new file mode 100644
index ..39a933fe528e
--- /dev/null
+++ b/drivers/infiniband/ulp/ibtrs/ibtrs.c
@@ -0,0 +1,609 @@
+/*
+ * InfiniBand Transport Layer
+ *
+ * Copyright (c) 2014 - 2017 ProfitBricks GmbH. All rights reserved.
+ * Authors: Fabian Holler 
+ *  Jack Wang 
+ *  Kleber Souza 
+ *  Danil Kipnis 
+ *  Roman Penyaev 
+ *  Milind Dumbare 
+ *
+ * Copyright (c) 2017 - 2018 ProfitBricks GmbH. All rights reserved.
+ * Authors: Danil Kipnis 
+ *  Roman Penyaev 
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see .
+ */
+
+#undef pr_fmt
+#define pr_fmt(fmt) KBUILD_MODNAME " L" __stringify(__LINE__) ": " fmt
+
+#include 
+#include 
+
+#include "ibtrs-pri.h"
+#include "ibtrs-log.h"
+
+MODULE_AUTHOR("ib...@profitbricks.com");
+MODULE_DESCRIPTION("IBTRS Core");
+MODULE_VERSION(IBTRS_VER_STRING);
+MODULE_LICENSE("GPL");
+
+struct ibtrs_iu *ibtrs_iu_alloc(u32 tag, size_t size, gfp_t gfp_mask,
+   struct ib_device *dma_dev,
+   enum dma_data_direction direction,
+   void (*done)(struct ib_cq *cq,
+struct ib_wc *wc))
+{
+   struct ibtrs_iu *iu;
+
+   iu = kmalloc(sizeof(*iu), gfp_mask);
+   if (unlikely(!iu))
+   return NULL;
+
+   iu->buf = kzalloc(size, gfp_mask);
+   if (unlikely(!iu->buf))
+   goto err1;
+
+   iu->dma_addr = ib_dma_map_single(dma_dev, iu->buf, size, direction);
+   if (unlikely(ib_dma_mapping_error(dma_dev, iu->dma_addr)))
+   goto err2;
+
+   iu->cqe.done  = done;
+   iu->size  = size;
+   iu->direction = direction;
+   iu->tag   = tag;
+
+   return iu;
+
+err2:
+   kfree(iu->buf);
+err1:
+   kfree(iu);
+
+   return NULL;
+}
+EXPORT_SYMBOL_GPL(ibtrs_iu_alloc);
+
+void ibtrs_iu_free(struct ibtrs_iu *iu, enum dma_data_direction dir,
+  struct ib_device *ibdev)
+{
+   if (!iu)
+   return;
+
+   ib_dma_unmap_single(ibdev, iu->dma_addr, iu->size, dir);
+   kfree(iu->buf);
+   kfree(iu);
+}
+EXPORT_SYMBOL_GPL(ibtrs_iu_free);
+
+int ibtrs_iu_post_recv(struct ibtrs_con *con, struct ibtrs_iu *iu)
+{
+   struct ibtrs_sess *sess = con->sess;
+   struct ib_recv_wr wr, *bad_wr;
+   struct ib_sge list;
+
+   list.addr   = iu->dma_addr;
+   list.length = iu->size;
+   list.lkey   = sess->dev->ib_pd->local_dma_lkey;
+
+   if (WARN_ON(list.length == 0)) {
+   ibtrs_wrn(con, "Posting receive work request failed,"
+ " sg list is empty\n");
+   return -EINVAL;
+   }
+
+   wr.next= NULL;
+   wr.wr_cqe  = &iu->cqe;
+   wr.sg_list = &list;
+   wr.num_sge = 1;
+
+   return ib_post_recv(con->qp, &wr, &bad_wr);
+}
+EXPORT_SYMBOL_GPL(ibtrs_iu_post_recv);
+
+int ibtrs_post_recv_empty(struct ibtrs_con *con, struct ib_cqe *cqe)
+{
+   struct ib_recv_wr wr, *bad_wr;
+
+   wr.next= NULL;
+   wr.wr_cqe  = cqe;
+   wr.sg_list = NULL;
+   wr.num_sge = 0;
+
+   return ib_post_recv(con->qp, &wr, &bad_wr);
+}
+EXPORT_SYMBOL_GPL(ibtrs_post_recv_empty);
+
+int ibtrs_post_recv_empty_x2(struct ibtrs_con *con, struct ib_cqe *cqe)
+{
+   struct ib_recv_wr wr_arr[2], *wr, *bad_wr;
+   int i;
+
+   memset(wr_arr, 0, sizeof(wr_arr));
+   for (i = 0; i < ARRAY_SIZE(wr_arr); i++) {
+   wr = &wr_arr[i];
+   wr->wr_cqe  = cqe;
+   if (i)
+   /* Chain backwards */
+   wr->next = &wr_arr[i - 1];
+   }
+
+   return ib_post_recv(con->qp, wr, &bad_wr);
+}
+EXPORT_SYMBOL_GPL(ibtrs_post_recv_empty_x2);
+
+int ibtrs_iu_
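
The message is truncated above. As a usage illustration of the helpers
just shown, a typical pairing allocates an IU, posts it for receive, and
frees it on error. This is a sketch: 'rsp_done' and 'sess->dev->ib_dev'
are assumptions, not identifiers from the patch.

static void rsp_done(struct ib_cq *cq, struct ib_wc *wc)
{
	/* completion handling elided */
}

static int post_one_recv(struct ibtrs_con *con, struct ibtrs_sess *sess)
{
	struct ibtrs_iu *iu;
	int err;

	iu = ibtrs_iu_alloc(0, 4096, GFP_KERNEL, sess->dev->ib_dev,
			    DMA_FROM_DEVICE, rsp_done);
	if (unlikely(!iu))
		return -ENOMEM;

	err = ibtrs_iu_post_recv(con, iu);
	if (unlikely(err))
		ibtrs_iu_free(iu, DMA_FROM_DEVICE, sess->dev->ib_dev);
	return err;
}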

blk-mq: make sure that correct hctx->dispatch_from is set

2018-05-18 Thread 胡海
Author: huhai 
Date:   Fri May 18 17:09:56 2018 +0800

blk-mq: make sure that correct hctx->dispatch_from is set

When the number of hardware queues is changed, the drivers will call
blk_mq_update_nr_hw_queues() to remap hardware queues, and then
the ctx mapped on hctx will also change, but the current code forgets to
make sure that correct hctx->dispatch_from is set, and hctx->dispatch_from
may point to a ctx that does not belong to the current hctx.

Signed-off-by: huhai 

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 2545081..55d8a3d 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2214,6 +2214,8 @@ static void blk_mq_map_swqueue(struct request_queue *q)
hctx->tags = set->tags[i];
WARN_ON(!hctx->tags);
 
+   hctx->dispatch_from = NULL;
+
/*
 * Set the map size to the number of mapped software queues.
 * This is more accurate and more efficient than looping

Re: [PATCH V6 11/11] nvme: pci: support nested EH

2018-05-18 Thread Keith Busch
On Fri, May 18, 2018 at 08:20:05AM +0800, Ming Lei wrote:
> What I think block/011 is helpful is that it can trigger IO timeout
> during reset, which can be triggered in reality too.

As I mentioned earlier, there is nothing wrong with the spirit of
the test. What's wrong with it is the misguided implemention.

Do you understand why it ever passes? The success happens when the
enabling part of the loop happens to coincide with the driver's enabling,
creating the pci_dev->enable_cnt > 1, making subsequent disable parts
of the loop do absolutely nothing; the exact same as the one-liner
(non-serious) patch I sent to defeat the test.

A better way to induce the timeout is:

  # setpci -s  4.w=0:6

This will halt the device without messing with the kernel structures,
just like a real device failure would. (Offset 4 is the PCI command
register; mask 6 clears the memory space enable and bus master bits,
so MMIO reads return all ones and DMA stops.)


Re: blk-mq: make sure that correct hctx->dispatch_from is set

2018-05-18 Thread Ming Lei
On Fri, May 18, 2018 at 9:42 PM, 胡海  wrote:
> Author: huhai 
> Date:   Fri May 18 17:09:56 2018 +0800
>
> blk-mq: make sure that correct hctx->dispatch_from is set
>
> When the number of hardware queues is changed, the drivers will call
> blk_mq_update_nr_hw_queues() to remap hardware queues, and then
> the ctx mapped on hctx will also change, but the current code forgets to
> make sure that correct hctx->dispatch_from is set, and hctx->dispatch_from
> may point to a ctx that does not belong to the current hctx.
>
> Signed-off-by: huhai 
>
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index 2545081..55d8a3d 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -2214,6 +2214,8 @@ static void blk_mq_map_swqueue(struct request_queue *q)
> hctx->tags = set->tags[i];
> WARN_ON(!hctx->tags);
>
> +   hctx->dispatch_from = NULL;
> +
> /*
>  * Set the map size to the number of mapped software queues.
>  * This is more accurate and more efficient than looping

Good catch,

Reviewed-by: Ming Lei 

Thanks,
Ming Lei


Re: blk-mq: make sure that correct hctx->dispatch_from is set

2018-05-18 Thread Jens Axboe
On 5/18/18 7:42 AM, 胡海 wrote:
> Author: huhai 
> Date:   Fri May 18 17:09:56 2018 +0800
> 
> blk-mq: make sure that correct hctx->dispatch_from is set
> 
> When the number of hardware queues is changed, the drivers will call
> blk_mq_update_nr_hw_queues() to remap hardware queues, and then
> the ctx mapped on hctx will also change, but the current code forgets to
> make sure that correct hctx->dispatch_from is set, and hctx->dispatch_from
> may point to a ctx that does not belong to the current hctx.

Looks good, thanks. One minor note for future patches - for cases like this,
when the patch fixes an issue with a specific commit, add a fixes line.
For this one, it would be:

Fixes: b347689ffbca ("blk-mq-sched: improve dispatching from sw queue")

-- 
Jens Axboe



Re: blk-mq: make sure that correct hctx->dispatch_from is set

2018-05-18 Thread Jens Axboe
On 5/18/18 8:27 AM, Jens Axboe wrote:
> On 5/18/18 7:42 AM, 胡海 wrote:
>> Author: huhai 
>> Date:   Fri May 18 17:09:56 2018 +0800
>>
>> blk-mq: make sure that correct hctx->dispatch_from is set
>> 
>> When the number of hardware queues is changed, the drivers will call
>> blk_mq_update_nr_hw_queues() to remap hardware queues, and then
>> the ctx mapped on hctx will also change, but the current code forgets to
>> make sure that correct hctx->dispatch_from is set, and 
>> hctx->dispatch_from
>> may point to a ctx that does not belong to the current hctx.
> 
> Looks good, thanks. One minor note for future patches - for cases like this,
> when the patch fixes an issue with a specific commit, add a fixes line.
> For this one, it would be:
> 
> Fixes: b347689ffbca ("blk-mq-sched: improve dispatching from sw queue")

Two more notes... Your patches are still coming through as base64 encoded,
they should just be plain text.

Finally, I think the below is much clearer, since that's the loop where
we clear any existing hctx context.


diff --git a/block/blk-mq.c b/block/blk-mq.c
index 6c6aef44badd..4cbfd784e837 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2358,6 +2358,7 @@ static void blk_mq_map_swqueue(struct request_queue *q)
queue_for_each_hw_ctx(q, hctx, i) {
cpumask_clear(hctx->cpumask);
hctx->nr_ctx = 0;
+   hctx->dispatch_from = NULL;
}
 
/*

-- 
Jens Axboe



Re: blk-mq: make sure that correct hctx->dispatch_from is set

2018-05-18 Thread huhai
Yes, it is more readable

Finally, thank you for reminding me. Next time I'll change gmail to submit 
patch.

 
 
 
-- Original --
From:  "Jens Axboe";
Date:  Fri, May 18, 2018 10:31 PM
To:  "胡海";
Cc:  "ming.lei"; 
"linux-block";
Subject:  Re: blk-mq: make sure that correct hctx->dispatch_from is set
 
On 5/18/18 8:27 AM, Jens Axboe wrote:
> On 5/18/18 7:42 AM, 胡海 wrote:
>> Author: huhai 
>> Date:   Fri May 18 17:09:56 2018 +0800
>>
>> blk-mq: make sure that correct hctx->dispatch_from is set
>> 
>> When the number of hardware queues is changed, the drivers will call
>> blk_mq_update_nr_hw_queues() to remap hardware queues, and then
>> the ctx mapped on hctx will also change, but the current code forgets to
>> make sure that correct hctx->dispatch_from is set, and 
>> hctx->dispatch_from
>> may point to a ctx that does not belong to the current hctx.
> 
> Looks good, thanks. One minor note for future patches - for cases like this,
> when the patch fixes an issue with a specific commit, add a fixes line.
> For this one, it would be:
> 
> Fixes: b347689ffbca ("blk-mq-sched: improve dispatching from sw queue")

Two more notes... Your patches are still coming through as base64 encoded,
they should just be plain text.

Finally, I think the below is much clearer, since that's the loop where
we clear any existing hctx context.


diff --git a/block/blk-mq.c b/block/blk-mq.c
index 6c6aef44badd..4cbfd784e837 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2358,6 +2358,7 @@ static void blk_mq_map_swqueue(struct request_queue *q)
queue_for_each_hw_ctx(q, hctx, i) {
cpumask_clear(hctx->cpumask);
hctx->nr_ctx = 0;
+   hctx->dispatch_from = NULL;
}
 
/*

-- 
Jens Axboe

Re: blk-mq: make sure that correct hctx->dispatch_from is set

2018-05-18 Thread Jens Axboe
On 5/18/18 8:46 AM, huhai wrote:
> Yes, it is more readable

Final version:

http://git.kernel.dk/cgit/linux-block/commit/?h=for-4.18/block&id=d416c92c5d6229b33f37f0f75e52194081ccbcc4

> Finally, thank you for reminding me. Next time I'll change gmail to submit 
> patch.

Not sure gmail can ever really work. You should not top-post replies to
postings either. If you want to experiment with getting a mailer setup
and whether or not it does the right thing, feel free to send a patch
to my email privately, and I can let you know if the end result is
as it should be.

-- 
Jens Axboe



[GIT PULL] Single block fix for 4.17-rc6

2018-05-18 Thread Jens Axboe
Hi Linus,

Single fix this time, from Coly, fixing a failure case when
CONFIG_DEBUGFS isn't enabled.

Please pull!


  git://git.kernel.dk/linux-block.git tags/for-linus-20180518



Coly Li (1):
  bcache: return 0 from bch_debug_init() if CONFIG_DEBUG_FS=n

 drivers/md/bcache/debug.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

-- 
Jens Axboe



Re: [PATCH v4 0/3] AIO add per-command iopriority

2018-05-18 Thread Adam Manzanares


On 5/17/18 7:41 PM, Jens Axboe wrote:
> On 5/17/18 2:38 PM, adam.manzana...@wdc.com wrote:
>> From: Adam Manzanares 
>>
>> This is the per-I/O equivalent of the ioprio_set system call.
>> See the following link for performance implications on a SATA HDD:
>> https://lkml.org/lkml/2016/12/6/495
>>
>> First patch factors ioprio_check_cap function out of ioprio_set system call 
>> to
>> also be used by the aio ioprio interface.
>>
>> Second patch converts kiocb ki_hint field to a u16 to avoid kiocb bloat.
>>
>> Third patch passes ioprio hint from aio iocb to kiocb and enables block_dev
>> usage of the per I/O ioprio feature.
>>
>> v2: merge patches
>>  use IOCB_FLAG_IOPRIO
>>  validate intended use with IOCB_IOPRIO
>>  add linux-api and linux-block to cc
>>
>> v3: add ioprio_check_cap function
>>  convert kiocb ki_hint to u16
>>  use ioprio_check_cap when adding ioprio to kiocb in aio.c
>>
>> v4: handle IOCB_IOPRIO in aio_prep_rw
>>  note patch 3 depends on patch 1 in commit msg
>>
>> Adam Manzanares (3):
>>block: add ioprio_check_cap function
>>fs: Convert kiocb rw_hint from enum to u16
>>fs: Add aio iopriority support for block_dev
>>
>>   block/ioprio.c   | 22 --
>>   fs/aio.c | 16 
>>   fs/block_dev.c   |  2 ++
>>   include/linux/fs.h   | 17 +++--
>>   include/linux/ioprio.h   |  2 ++
>>   include/uapi/linux/aio_abi.h |  1 +
>>   6 files changed, 52 insertions(+), 8 deletions(-)
> 
> This looks fine to me now. I can pick up #1 for 4.18 - and 2+3 as well,
> unless someone else wants to take them.

Great, thanks Jens.

> 

Re: [PATCH v2 02/26] sysfs: export sysfs_remove_file_self()

2018-05-18 Thread Tejun Heo
On Fri, May 18, 2018 at 03:03:49PM +0200, Roman Pen wrote:
> Function is going to be used in transport over RDMA module
> in subsequent patches.
> 
> Signed-off-by: Roman Pen 
> Cc: Tejun Heo 
> Cc: linux-ker...@vger.kernel.org

Acked-by: Tejun Heo 

Please feel free to apply with other patches.

Thanks.

-- 
tejun


Re: blk-mq: make sure that correct hctx->dispatch_from is set

2018-05-18 Thread Bart Van Assche
On Fri, 2018-05-18 at 22:46 +0800, huhai wrote:
> Yes, it is more readable
> 
> Finally, thank you for reminding me. Next time I'll change gmail to submit 
> patch.

Hello Huhai,

Please have a look at Documentation/process/email-clients.rst.

Thanks,

Bart.


Re: [PATCH 00/10] Misc block layer patches for bcachefs

2018-05-18 Thread Bart Van Assche
On Fri, 2018-05-18 at 05:06 -0400, Kent Overstreet wrote:
> On Thu, May 17, 2018 at 08:54:57PM +, Bart Van Assche wrote:
> > With Jens' latest for-next branch I hit the kernel warning shown below. Can
> > you have a look?
> 
> Any hints on how to reproduce it?

Sure. This is how I triggered it:
* Clone https://github.com/bvanassche/srp-test.
* Follow the instructions in README.md.
* Run srp-test/run_tests -c -r 10

Thanks,



Re: [PATCH v4 3/3] fs: Add aio iopriority support for block_dev

2018-05-18 Thread Jens Axboe
On 5/17/18 2:38 PM, adam.manzana...@wdc.com wrote:
> From: Adam Manzanares 
> 
> This is the per-I/O equivalent of the ioprio_set system call.
> 
> When IOCB_FLAG_IOPRIO is set on the iocb aio_flags field, then we set the
> newly added kiocb ki_ioprio field to the value in the iocb aio_reqprio field.
> 
> When a bio is created for an aio request by the block dev we set the priority
> value of the bio to the user supplied value.
> 
> This patch depends on block: add ioprio_check_cap function

Actually, one comment on this one:

> diff --git a/fs/aio.c b/fs/aio.c
> index f3eae5d5771b..ff3107aa82d5 100644
> --- a/fs/aio.c
> +++ b/fs/aio.c
> @@ -1451,6 +1451,22 @@ static int aio_prep_rw(struct kiocb *req, struct iocb 
> *iocb)
>   if (iocb->aio_flags & IOCB_FLAG_RESFD)
>   req->ki_flags |= IOCB_EVENTFD;
>   req->ki_hint = file_write_hint(req->ki_filp);
> + if (iocb->aio_flags & IOCB_FLAG_IOPRIO) {
> + /*
> +  * If the IOCB_FLAG_IOPRIO flag of aio_flags is set, then
> +  * aio_reqprio is interpreted as an I/O scheduling
> +  * class and priority.
> +  */
> + ret = ioprio_check_cap(iocb->aio_reqprio);
> + if (ret) {
> + pr_debug("aio ioprio check cap error\n");
> + return -EINVAL;
> + }
> +
> + req->ki_ioprio = iocb->aio_reqprio;
> + req->ki_flags |= IOCB_IOPRIO;
> + }

Do we really need IOCB_IOPRIO? All zeroes is no priority set anyway,
so we should be able to get by with just setting ->ki_ioprio to either
the priority, or 0.

> diff --git a/fs/block_dev.c b/fs/block_dev.c
> index 7ec920e27065..970bef79caa6 100644
> --- a/fs/block_dev.c
> +++ b/fs/block_dev.c
> @@ -355,6 +355,8 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter 
> *iter, int nr_pages)
>   bio->bi_write_hint = iocb->ki_hint;
>   bio->bi_private = dio;
>   bio->bi_end_io = blkdev_bio_end_io;
> + if (iocb->ki_flags & IOCB_IOPRIO)
> + bio->bi_ioprio = iocb->ki_ioprio;

And then this assignment can just happen unconditionally.
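
Put together, the suggested simplification would look roughly like this
(a sketch of the review comments, not code from the posted series):

	/* aio_prep_rw(): no IOCB_IOPRIO flag needed */
	if (iocb->aio_flags & IOCB_FLAG_IOPRIO) {
		ret = ioprio_check_cap(iocb->aio_reqprio);
		if (ret)
			return -EINVAL;
		req->ki_ioprio = iocb->aio_reqprio;
	} else {
		req->ki_ioprio = 0;
	}

	/* __blkdev_direct_IO(): the assignment becomes unconditional */
	bio->bi_ioprio = iocb->ki_ioprio;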

-- 
Jens Axboe



Re: blk-mq: make sure that correct hctx->dispatch_from is set

2018-05-18 Thread Jens Axboe
On 5/18/18 9:10 AM, Bart Van Assche wrote:
> On Fri, 2018-05-18 at 22:46 +0800, huhai wrote:
>> Yes, it is more readable
>>
>> Finally, thank you for reminding me. Next time I'll change gmail to submit 
>> patch.
> 
> Hello Huhai,
> 
> Please have a look at Documentation/process/email-clients.rst.

Yeah, I did point at that one too.

For sending out patches, I would strongly recommend just using git send-email.
It works fine with gmail, that's what I always use.

$ cat ~/.gitconfig
[sendemail]
from = Jens Axboe 
smtpserver = smtp.gmail.com
smtpuser = ax...@kernel.dk
smtpencryption = tls
smtppass = 
smtpserverport = 587
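
With that in place, a series can then be sent with, for example:

$ git send-email --to=linux-block@vger.kernel.org 000*.patch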

-- 
Jens Axboe



Re: [PATCH v4 3/3] fs: Add aio iopriority support for block_dev

2018-05-18 Thread Adam Manzanares


On 5/18/18 8:14 AM, Jens Axboe wrote:
> On 5/17/18 2:38 PM, adam.manzana...@wdc.com wrote:
>> From: Adam Manzanares 
>>
>> This is the per-I/O equivalent of the ioprio_set system call.
>>
>> When IOCB_FLAG_IOPRIO is set on the iocb aio_flags field, then we set the
>> newly added kiocb ki_ioprio field to the value in the iocb aio_reqprio field.
>>
>> When a bio is created for an aio request by the block dev we set the priority
>> value of the bio to the user supplied value.
>>
>> This patch depends on block: add ioprio_check_cap function
> 
> Actually, one comment on this one:
> 
>> diff --git a/fs/aio.c b/fs/aio.c
>> index f3eae5d5771b..ff3107aa82d5 100644
>> --- a/fs/aio.c
>> +++ b/fs/aio.c
>> @@ -1451,6 +1451,22 @@ static int aio_prep_rw(struct kiocb *req, struct iocb 
>> *iocb)
>>  if (iocb->aio_flags & IOCB_FLAG_RESFD)
>>  req->ki_flags |= IOCB_EVENTFD;
>>  req->ki_hint = file_write_hint(req->ki_filp);
>> +if (iocb->aio_flags & IOCB_FLAG_IOPRIO) {
>> +/*
>> + * If the IOCB_FLAG_IOPRIO flag of aio_flags is set, then
>> + * aio_reqprio is interpreted as an I/O scheduling
>> + * class and priority.
>> + */
>> +ret = ioprio_check_cap(iocb->aio_reqprio);
>> +if (ret) {
>> +pr_debug("aio ioprio check cap error\n");
>> +return -EINVAL;
>> +}
>> +
>> +req->ki_ioprio = iocb->aio_reqprio;
>> +req->ki_flags |= IOCB_IOPRIO;
>> +}
> 
> Do we really need IOCB_IOPRIO? All zeroes is no priority set anyway,
> so we should be able to get by with just setting ->ki_ioprio to either
> the priority, or 0.
> 
>> diff --git a/fs/block_dev.c b/fs/block_dev.c
>> index 7ec920e27065..970bef79caa6 100644
>> --- a/fs/block_dev.c
>> +++ b/fs/block_dev.c
>> @@ -355,6 +355,8 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter 
>> *iter, int nr_pages)
>>  bio->bi_write_hint = iocb->ki_hint;
>>  bio->bi_private = dio;
>>  bio->bi_end_io = blkdev_bio_end_io;
>> +if (iocb->ki_flags & IOCB_IOPRIO)
>> +bio->bi_ioprio = iocb->ki_ioprio;
> 
> And then this assignment can just happen unconditionally.

That is a cleaner way of guaranteeing the ioprio set on the kiocb is 
only set when the user intends to use the ioprio from the iocb.

I'll resend the series.


> 

Re: [PATCH v4 1/3] block: add ioprio_check_cap function

2018-05-18 Thread Christoph Hellwig
On Thu, May 17, 2018 at 01:38:01PM -0700, adam.manzana...@wdc.com wrote:
> From: Adam Manzanares 
> 
> Aio per command iopriority support introduces a second interface between
> userland and the kernel capable of passing iopriority. The aio interface also
> needs the ability to verify that the submitting context has sufficient
> priviledges to submit IOPRIO_RT commands. This patch creates the
> ioprio_check_cap function to be used by the ioprio_set system call and also by
> the aio interface.
> 
> Signed-off-by: Adam Manzanares 

Looks fine,

Reviewed-by: Christoph Hellwig 


Re: [PATCH v4 2/3] fs: Convert kiocb rw_hint from enum to u16

2018-05-18 Thread Christoph Hellwig
> +/* ki_hint changed from enum to u16, make sure rw_hint fits into u16 */

I don't think this comment is very useful.

> +static inline u16 ki_hint_valid(enum rw_hint hint)

I'd call this ki_hint_validate.

> +{
> + if (hint > MAX_KI_HINT)
> + return 0;
> +
> + return hint;

Nit: kill the empty line.
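
Folding both nits in, the helper would read (a sketch of the suggested
revision):

static inline u16 ki_hint_validate(enum rw_hint hint)
{
	if (hint > MAX_KI_HINT)
		return 0;
	return hint;
}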


Re: [PATCH v4 3/3] fs: Add aio iopriority support for block_dev

2018-05-18 Thread Christoph Hellwig
Looks fine, although I'd split it into an aio and a block_dev patch.

Also please wire this up for the fs/iomap.c direct I/O code; it should
be essentially the same snippet as in the block_dev.c code.
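
A rough sketch of that wiring, mirroring the block_dev.c hunk quoted
earlier in the thread; the surrounding bio setup and the dio->iocb field
layout in fs/iomap.c are assumptions, not taken from this series:

	/* in iomap's dio bio setup, next to the existing ki_hint copy */
	bio->bi_write_hint = dio->iocb->ki_hint;
	bio->bi_ioprio = dio->iocb->ki_ioprio;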


Re: [PATCH 02/10] block: Convert bio_set to mempool_init()

2018-05-18 Thread Christoph Hellwig
On Tue, May 08, 2018 at 09:33:50PM -0400, Kent Overstreet wrote:
> Minor performance improvement by getting rid of pointer indirections
> from allocation/freeing fastpaths.

Can you please also send along a conversion for the remaining
few bioset_create users?  It would be rather silly to keep two
almost-identical interfaces around for just about two handfuls
of users.


Re: [PATCH 02/10] block: Convert bio_set to mempool_init()

2018-05-18 Thread Christoph Hellwig
On Fri, May 18, 2018 at 09:20:28AM -0700, Christoph Hellwig wrote:
> On Tue, May 08, 2018 at 09:33:50PM -0400, Kent Overstreet wrote:
> > Minor performance improvement by getting rid of pointer indirections
> > from allocation/freeing fastpaths.
> 
> Can you please also send a long conversion for the remaining
> few bioset_create users?  It would be rather silly to keep two
> almost the same interfaces around for just about two hand full
> of users.

This comment was ment in reply to the next patch, sorry.


Re: [PATCH 00/10] Misc block layer patches for bcachefs

2018-05-18 Thread Christoph Hellwig
On Fri, May 11, 2018 at 03:13:38PM -0600, Jens Axboe wrote:
> Looked over the series, and looks like both good cleanups and optimizations.
> If we can get the mempool patch sorted, I can apply this for 4.18.

FYI, I agree on the actual cleanups and optimization, but we really
shouldn't add new functions or even just exports without the code
using them.  I think it is enough if we can collect ACKs on them, but
there is no point in merging them yet.  Especially as I'd really like to see
the users for some of them first.


Re: [PATCH V6 11/11] nvme: pci: support nested EH

2018-05-18 Thread Keith Busch
On Thu, May 17, 2018 at 04:23:45PM +0200, Johannes Thumshirn wrote:
> > Agreed. Alternatively possibly call the driver's reset_preparei/done
> > callbacks.
> 
> Exactly, but as long as we can issue the reset via sysfs the test-case
> is still valid.

I disagree the test case is valid. The test writes '0' to the
pci-sysfs 'enable', but the driver also disables the pci device as part
of resetting, which is a perfectly reasonable thing for a driver to do.

If the timing of the test's loop happens to write '0' right after the
driver disabled the device that it owns, a 'write error' on that sysfs
write occurs, and blktests then incorrectly claims the test failed.


Re: [PATCH 00/10] Misc block layer patches for bcachefs

2018-05-18 Thread Jens Axboe
On 5/18/18 10:23 AM, Christoph Hellwig wrote:
> On Fri, May 11, 2018 at 03:13:38PM -0600, Jens Axboe wrote:
>> Looked over the series, and looks like both good cleanups and optimizations.
>> If we can get the mempool patch sorted, I can apply this for 4.18.
> 
> FYI, I agree on the actual cleanups and optimization, but we really
> shouldn't add new functions or even just exports without the code
> using them.  I think it is enough if we can collect ACKs on them, but
> there is no point in using them.  Especially as I'd really like to see
> the users for some of them first.

I certainly agree on that in general, but at the same time it makes the
expected submission of bcachefs not having to carry a number of
(essentially) unrelated patches. I'm assuming the likelihood of bcachefs
being submitted soonish is high, hence we won't have exports that don't
have in-kernel users in the longer term.

-- 
Jens Axboe



[PATCH 4/6] nvme: Allow reset from CONNECTING state

2018-05-18 Thread Keith Busch
A failed connection may be retryable. This patch allows the connecting
state to initiate a reset so that it may try to connect again.

Signed-off-by: Keith Busch 
---
 drivers/nvme/host/core.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 34d7731f1419..bccc92206fba 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -293,6 +293,7 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
case NVME_CTRL_NEW:
case NVME_CTRL_LIVE:
case NVME_CTRL_ADMIN_ONLY:
+   case NVME_CTRL_CONNECTING:
changed = true;
/* FALLTHRU */
default:
-- 
2.14.3



[PATCH 6/6] nvme-pci: Rate limit the nvme timeout warnings

2018-05-18 Thread Keith Busch
The block layer's timeout handling currently refuses to let the driver
complete commands outside the timeout callback once blk-mq decides they've
expired. If a device breaks, this could potentially create many thousands
of timed out commands. There's nothing of value to be gleaned from
observing each of those messages, so this patch adds a ratelimit on them.

Signed-off-by: Keith Busch 
---
 drivers/nvme/host/pci.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index ddfeb186d129..e4b91c246e36 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -1251,7 +1251,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
 * returned to the driver, or if this is the admin queue.
 */
if (!nvmeq->qid || iod->aborted) {
-   dev_warn(dev->ctrl.device,
+   dev_warn_ratelimited(dev->ctrl.device,
 "I/O %d QID %d timeout, reset controller\n",
 req->tag, nvmeq->qid);
nvme_dev_disable(dev, false);
-- 
2.14.3



[PATCH 5/6] nvme-pci: Attempt reset retry for IO failures

2018-05-18 Thread Keith Busch
If the reset failed due to a non-fatal error, this patch will attempt
to reset the controller again, with a maximum of 4 attempts.

Since the failed reset case has changed purpose, this patch provides a
more appropriate name and warning message for the reset failure.

Signed-off-by: Keith Busch 
---
 drivers/nvme/host/pci.c | 26 +++---
 1 file changed, 23 insertions(+), 3 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 6a7cbc631d92..ddfeb186d129 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -37,6 +37,8 @@
 
 #define SGES_PER_PAGE  (PAGE_SIZE / sizeof(struct nvme_sgl_desc))
 
+#define MAX_RESET_FAILURES 4
+
 static int use_threaded_interrupts;
 module_param(use_threaded_interrupts, int, 0);
 
@@ -101,6 +103,8 @@ struct nvme_dev {
struct completion ioq_wait;
bool queues_froze;
 
+   int reset_failures;
+
/* shadow doorbell buffer support: */
u32 *dbbuf_dbs;
dma_addr_t dbbuf_dbs_dma_addr;
@@ -2307,9 +2311,23 @@ static void nvme_pci_free_ctrl(struct nvme_ctrl *ctrl)
kfree(dev);
 }
 
-static void nvme_remove_dead_ctrl(struct nvme_dev *dev, int status)
+static void nvme_reset_failure(struct nvme_dev *dev, int status)
 {
-   dev_warn(dev->ctrl.device, "Removing after probe failure status: %d\n", status);
+   dev->reset_failures++;
+   dev_warn(dev->ctrl.device, "Reset failure status: %d, failures:%d\n",
+   status, dev->reset_failures);
+
+   /* IO and Interrupted Call may indicate a retryable error */
+   switch (status) {
+   case -EIO:
+   case -EINTR:
+   if (dev->reset_failures < MAX_RESET_FAILURES &&
+   !nvme_reset_ctrl(&dev->ctrl))
+   return;
+   break;
+   default:
+   break;
+   }
 
nvme_get_ctrl(&dev->ctrl);
nvme_dev_disable(dev, false);
@@ -2410,14 +2428,16 @@ static void nvme_reset_work(struct work_struct *work)
if (!nvme_change_ctrl_state(&dev->ctrl, new_state)) {
dev_warn(dev->ctrl.device,
"failed to mark controller state %d\n", new_state);
+   result = -ENODEV;
goto out;
}
 
+   dev->reset_failures = 0;
nvme_start_ctrl(&dev->ctrl);
return;
 
  out:
-   nvme_remove_dead_ctrl(dev, result);
+   nvme_reset_failure(dev, result);
 }
 
 static void nvme_remove_dead_ctrl_work(struct work_struct *work)
-- 
2.14.3



[PATCH 3/6] nvme: Move all IO out of controller reset

2018-05-18 Thread Keith Busch
IOs may be retryable, so don't wait for them in the reset path. These
commands may trigger a reset if they expire without a completion,
placing them on the requeue list. Waiting for these would then deadlock
the reset handler.

To fix the theoretical deadlock, this patch unblocks IO submission from
the reset_work as before, but moves the waiting to the IO safe scan_work
so that the reset_work may proceed to completion. Since the unfreezing
happens in the controller LIVE state, the nvme device has to track if
the queues were frozen now to prevent incorrect freeze depths.

This patch is also renaming the function 'nvme_dev_add' to a
more appropriate name that describes what it's actually doing:
nvme_alloc_io_tags.

Signed-off-by: Keith Busch 
---
 drivers/nvme/host/core.c |  3 +++
 drivers/nvme/host/nvme.h |  1 +
 drivers/nvme/host/pci.c  | 46 +-
 3 files changed, 37 insertions(+), 13 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 1de68b56b318..34d7731f1419 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -214,6 +214,7 @@ static inline bool nvme_req_needs_retry(struct request *req)
if (blk_noretry_request(req))
return false;
if (nvme_req(req)->status & NVME_SC_DNR)
+
return false;
if (nvme_req(req)->retries >= nvme_max_retries)
return false;
@@ -3177,6 +3178,8 @@ static void nvme_scan_work(struct work_struct *work)
struct nvme_id_ctrl *id;
unsigned nn;
 
+   if (ctrl->ops->update_hw_ctx)
+   ctrl->ops->update_hw_ctx(ctrl);
if (ctrl->state != NVME_CTRL_LIVE)
return;
 
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index c15c2ee7f61a..230c5424b197 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -320,6 +320,7 @@ struct nvme_ctrl_ops {
int (*get_address)(struct nvme_ctrl *ctrl, char *buf, int size);
int (*reinit_request)(void *data, struct request *rq);
void (*stop_ctrl)(struct nvme_ctrl *ctrl);
+   void (*update_hw_ctx)(struct nvme_ctrl *ctrl);
 };
 
 #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 2bd9d84f58d0..6a7cbc631d92 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -99,6 +99,7 @@ struct nvme_dev {
u32 cmbloc;
struct nvme_ctrl ctrl;
struct completion ioq_wait;
+   bool queues_froze;
 
/* shadow doorbell buffer support: */
u32 *dbbuf_dbs;
@@ -2065,10 +2066,33 @@ static void nvme_disable_io_queues(struct nvme_dev *dev)
}
 }
 
+static void nvme_pci_update_hw_ctx(struct nvme_ctrl *ctrl)
+{
+   struct nvme_dev *dev = to_nvme_dev(ctrl);
+   bool unfreeze;
+
+   mutex_lock(&dev->shutdown_lock);
+   unfreeze = dev->queues_froze;
+   mutex_unlock(&dev->shutdown_lock);
+
+   if (unfreeze)
+   nvme_wait_freeze(&dev->ctrl);
+
+   blk_mq_update_nr_hw_queues(ctrl->tagset, dev->online_queues - 1);
+   nvme_free_queues(dev, dev->online_queues);
+
+   if (unfreeze)
+   nvme_unfreeze(&dev->ctrl);
+
+   mutex_lock(&dev->shutdown_lock);
+   dev->queues_froze = false;
+   mutex_unlock(&dev->shutdown_lock);
+}
+
 /*
  * return error value only when tagset allocation failed
  */
-static int nvme_dev_add(struct nvme_dev *dev)
+static int nvme_alloc_io_tags(struct nvme_dev *dev)
 {
int ret;
 
@@ -2097,10 +2121,7 @@ static int nvme_dev_add(struct nvme_dev *dev)
 
nvme_dbbuf_set(dev);
} else {
-   blk_mq_update_nr_hw_queues(&dev->tagset, dev->online_queues - 1);
-
-   /* Free previously allocated queues that are no longer usable */
-   nvme_free_queues(dev, dev->online_queues);
+   nvme_start_queues(&dev->ctrl);
}
 
return 0;
@@ -2201,7 +2222,10 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
 dev->ctrl.state == NVME_CTRL_RESETTING)) {
u32 csts = readl(dev->bar + NVME_REG_CSTS);
 
-   nvme_start_freeze(&dev->ctrl);
+   if (!dev->queues_froze) {
+   nvme_start_freeze(&dev->ctrl);
+   dev->queues_froze = true;
+   }
dead = !!((csts & NVME_CSTS_CFS) || !(csts & NVME_CSTS_RDY) ||
pci_channel_offline(pdev) || !pci_is_enabled(pdev));
}
@@ -2375,13 +2399,8 @@ static void nvme_reset_work(struct work_struct *work)
nvme_kill_queues(&dev->ctrl);
nvme_remove_namespaces(&dev->ctrl);
new_state = NVME_CTRL_ADMIN_ONLY;
-   } else {
-   nvme_start_queues(&dev->ctrl);
-   nvme_wait_freeze(&dev->ctrl);
-   /* hit this only when allocate tagset fails */
-   if (nvme_dev_add(dev))
-  

[PATCH 1/6] nvme: Sync request queues on reset

2018-05-18 Thread Keith Busch
This patch fixes races that occur with simultaneous controller
resets by synchronizing request queues prior to initializing the
controller. Without this, a thread may attempt to disable a controller
at the same time as we're trying to enable it.
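
The fix is purely an ordering guarantee: flush any pending timeout work,
which may itself try to disable the controller, before enabling it
again. In outline (illustrative names, not the actual call chain):

#include <stdio.h>

static void disable_ctrl(void) { puts("disable controller"); }
static void sync_queues(void) { puts("flush pending timeout work"); }
static void enable_ctrl(void) { puts("enable controller"); }

static void reset_work(void)
{
	disable_ctrl();		/* quiesce the current instance */
	sync_queues();		/* after this point no stale timeout
				 * handler can disable the controller
				 * behind our back */
	enable_ctrl();		/* now safe to bring it up */
}

int main(void)
{
	reset_work();
	return 0;
}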

Signed-off-by: Keith Busch 
---
 drivers/nvme/host/core.c | 21 +++--
 drivers/nvme/host/nvme.h |  1 +
 drivers/nvme/host/pci.c  |  1 +
 3 files changed, 21 insertions(+), 2 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 99b857e5a7a9..1de68b56b318 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -3471,6 +3471,12 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
 }
 EXPORT_SYMBOL_GPL(nvme_init_ctrl);
 
+static void nvme_start_queue(struct nvme_ns *ns)
+{
+   blk_mq_unquiesce_queue(ns->queue);
+   blk_mq_kick_requeue_list(ns->queue);
+}
+
 /**
  * nvme_kill_queues(): Ends all namespace queues
  * @ctrl: the dead controller that needs to end
@@ -3499,7 +3505,7 @@ void nvme_kill_queues(struct nvme_ctrl *ctrl)
blk_set_queue_dying(ns->queue);
 
/* Forcibly unquiesce queues to avoid blocking dispatch */
-   blk_mq_unquiesce_queue(ns->queue);
+   nvme_start_queue(ns);
}
up_read(&ctrl->namespaces_rwsem);
 }
@@ -3569,11 +3575,22 @@ void nvme_start_queues(struct nvme_ctrl *ctrl)
 
down_read(&ctrl->namespaces_rwsem);
list_for_each_entry(ns, &ctrl->namespaces, list)
-   blk_mq_unquiesce_queue(ns->queue);
+   nvme_start_queue(ns);
up_read(&ctrl->namespaces_rwsem);
 }
 EXPORT_SYMBOL_GPL(nvme_start_queues);
 
+void nvme_sync_queues(struct nvme_ctrl *ctrl)
+{
+   struct nvme_ns *ns;
+
+   down_read(&ctrl->namespaces_rwsem);
+   list_for_each_entry(ns, &ctrl->namespaces, list)
+   blk_sync_queue(ns->queue);
+   up_read(&ctrl->namespaces_rwsem);
+}
+EXPORT_SYMBOL_GPL(nvme_sync_queues);
+
 int nvme_reinit_tagset(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set)
 {
if (!ctrl->ops->reinit_request)
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 17d2f7cf3fed..c15c2ee7f61a 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -407,6 +407,7 @@ int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len,
 void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status,
union nvme_result *res);
 
+void nvme_sync_queues(struct nvme_ctrl *ctrl);
 void nvme_stop_queues(struct nvme_ctrl *ctrl);
 void nvme_start_queues(struct nvme_ctrl *ctrl);
 void nvme_kill_queues(struct nvme_ctrl *ctrl);
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 17a0190bd88f..8da63402d474 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -2312,6 +2312,7 @@ static void nvme_reset_work(struct work_struct *work)
 */
if (dev->ctrl.ctrl_config & NVME_CC_ENABLE)
nvme_dev_disable(dev, false);
+   nvme_sync_queues(&dev->ctrl);
 
/*
 * Introduce CONNECTING state from nvme-fc/rdma transports to mark the
-- 
2.14.3



[PATCH 2/6] nvme-pci: Fix queue freeze criteria on reset

2018-05-18 Thread Keith Busch
The driver had been relying on the pci_dev to maintain the state of
the pci device to know when starting a freeze would be appropriate. The
blktests test block/011, however, shows that users may alter the state
of the pci_dev out from under drivers and break the criteria we had
been using.

This patch uses the private nvme controller struct to track the
enabling/disabling state. Since we now rely on that, the reset path
unconditionally disables the device. This is necessary anyway on a
controller-failure reset, was already being done during admin bring-up,
and is harmless to do a second time.
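
The key point is gating the freeze on state the driver itself owns
rather than on pci_dev state that userspace can flip. A condensed
sketch (CC_ENABLE and the state values are simplified stand-ins):

#include <stdbool.h>

#define CC_ENABLE 0x1
enum ctrl_state { LIVE, RESETTING, DEAD };

struct ctrl { unsigned config; enum ctrl_state state; };

/* Unlike pci_is_enabled(), nothing outside the driver mutates this. */
static bool should_freeze(const struct ctrl *c)
{
	return (c->config & CC_ENABLE) &&
	       (c->state == LIVE || c->state == RESETTING);
}

int main(void)
{
	struct ctrl c = { CC_ENABLE, RESETTING };

	return should_freeze(&c) ? 0 : 1;
}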

Signed-off-by: Keith Busch 
---
 drivers/nvme/host/pci.c | 18 --
 1 file changed, 8 insertions(+), 10 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 8da63402d474..2bd9d84f58d0 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -2196,24 +2196,22 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
struct pci_dev *pdev = to_pci_dev(dev->dev);
 
mutex_lock(&dev->shutdown_lock);
-   if (pci_is_enabled(pdev)) {
+   if (dev->ctrl.ctrl_config & NVME_CC_ENABLE &&
+   (dev->ctrl.state == NVME_CTRL_LIVE ||
+dev->ctrl.state == NVME_CTRL_RESETTING)) {
u32 csts = readl(dev->bar + NVME_REG_CSTS);
 
-   if (dev->ctrl.state == NVME_CTRL_LIVE ||
-   dev->ctrl.state == NVME_CTRL_RESETTING)
-   nvme_start_freeze(&dev->ctrl);
+   nvme_start_freeze(&dev->ctrl);
dead = !!((csts & NVME_CSTS_CFS) || !(csts & NVME_CSTS_RDY) ||
-   pdev->error_state  != pci_channel_io_normal);
+   pci_channel_offline(pdev) || !pci_is_enabled(pdev));
}
 
/*
 * Give the controller a chance to complete all entered requests if
 * doing a safe shutdown.
 */
-   if (!dead) {
-   if (shutdown)
-   nvme_wait_freeze_timeout(&dev->ctrl, NVME_IO_TIMEOUT);
-   }
+   if (!dead && shutdown)
+   nvme_wait_freeze_timeout(&dev->ctrl, NVME_IO_TIMEOUT);
 
nvme_stop_queues(&dev->ctrl);
 
@@ -2227,8 +2225,8 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
if (dev->host_mem_descs)
nvme_set_host_mem(dev, 0);
nvme_disable_io_queues(dev);
-   nvme_disable_admin_queue(dev, shutdown);
}
+   nvme_disable_admin_queue(dev, shutdown);
for (i = dev->ctrl.queue_count - 1; i >= 0; i--)
nvme_suspend_queue(&dev->queues[i]);
 
-- 
2.14.3



[PATCH 02/34] fs: factor out a __generic_write_end helper

2018-05-18 Thread Christoph Hellwig
Factor out the bits of the buffer.c based write_end implementations that
don't know about buffer_heads so that they can be reused by other
implementations.

Signed-off-by: Christoph Hellwig 
---
 fs/buffer.c   | 67 +++
 fs/internal.h |  2 ++
 2 files changed, 37 insertions(+), 32 deletions(-)

diff --git a/fs/buffer.c b/fs/buffer.c
index 249b83fafe48..bd964b2ad99a 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2076,6 +2076,40 @@ int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len,
 }
 EXPORT_SYMBOL(block_write_begin);
 
+int __generic_write_end(struct inode *inode, loff_t pos, unsigned copied,
+   struct page *page)
+{
+   loff_t old_size = inode->i_size;
+   bool i_size_changed = false;
+
+   /*
+* No need to use i_size_read() here, the i_size cannot change under us
+* because we hold i_rwsem.
+*
+* But it's important to update i_size while still holding page lock:
+* page writeout could otherwise come in and zero beyond i_size.
+*/
+   if (pos + copied > inode->i_size) {
+   i_size_write(inode, pos + copied);
+   i_size_changed = true;
+   }
+
+   unlock_page(page);
+   put_page(page);
+
+   if (old_size < pos)
+   pagecache_isize_extended(inode, old_size, pos);
+   /*
+* Don't mark the inode dirty under page lock. First, it unnecessarily
+* makes the holding time of page lock longer. Second, it forces lock
+* ordering of page lock and transaction start for journaling
+* filesystems.
+*/
+   if (i_size_changed)
+   mark_inode_dirty(inode);
+   return copied;
+}
+
 int block_write_end(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
struct page *page, void *fsdata)
@@ -2116,39 +2150,8 @@ int generic_write_end(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
struct page *page, void *fsdata)
 {
-   struct inode *inode = mapping->host;
-   loff_t old_size = inode->i_size;
-   int i_size_changed = 0;
-
copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
-
-   /*
-* No need to use i_size_read() here, the i_size
-* cannot change under us because we hold i_mutex.
-*
-* But it's important to update i_size while still holding page lock:
-* page writeout could otherwise come in and zero beyond i_size.
-*/
-   if (pos+copied > inode->i_size) {
-   i_size_write(inode, pos+copied);
-   i_size_changed = 1;
-   }
-
-   unlock_page(page);
-   put_page(page);
-
-   if (old_size < pos)
-   pagecache_isize_extended(inode, old_size, pos);
-   /*
-* Don't mark the inode dirty under page lock. First, it unnecessarily
-* makes the holding time of page lock longer. Second, it forces lock
-* ordering of page lock and transaction start for journaling
-* filesystems.
-*/
-   if (i_size_changed)
-   mark_inode_dirty(inode);
-
-   return copied;
+   return __generic_write_end(mapping->host, pos, copied, page);
 }
 EXPORT_SYMBOL(generic_write_end);
 
diff --git a/fs/internal.h b/fs/internal.h
index e08972db0303..b955232d3d49 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -43,6 +43,8 @@ static inline int __sync_blockdev(struct block_device *bdev, int wait)
 extern void guard_bio_eod(int rw, struct bio *bio);
 extern int __block_write_begin_int(struct page *page, loff_t pos, unsigned len,
get_block_t *get_block, struct iomap *iomap);
+int __generic_write_end(struct inode *inode, loff_t pos, unsigned copied,
+   struct page *page);
 
 /*
  * char_dev.c
-- 
2.17.0



[PATCH 03/34] fs: move page_cache_seek_hole_data to iomap.c

2018-05-18 Thread Christoph Hellwig
This function is only used by the iomap code, depends on being called
from it, and will soon stop poking into buffer head internals.

Signed-off-by: Christoph Hellwig 
---
 fs/buffer.c | 114 ---
 fs/iomap.c  | 116 
 include/linux/buffer_head.h |   2 -
 3 files changed, 116 insertions(+), 116 deletions(-)

diff --git a/fs/buffer.c b/fs/buffer.c
index bd964b2ad99a..aba2a948b235 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -3430,120 +3430,6 @@ int bh_submit_read(struct buffer_head *bh)
 }
 EXPORT_SYMBOL(bh_submit_read);
 
-/*
- * Seek for SEEK_DATA / SEEK_HOLE within @page, starting at @lastoff.
- *
- * Returns the offset within the file on success, and -ENOENT otherwise.
- */
-static loff_t
-page_seek_hole_data(struct page *page, loff_t lastoff, int whence)
-{
-   loff_t offset = page_offset(page);
-   struct buffer_head *bh, *head;
-   bool seek_data = whence == SEEK_DATA;
-
-   if (lastoff < offset)
-   lastoff = offset;
-
-   bh = head = page_buffers(page);
-   do {
-   offset += bh->b_size;
-   if (lastoff >= offset)
-   continue;
-
-   /*
-* Unwritten extents that have data in the page cache covering
-* them can be identified by the BH_Unwritten state flag.
-* Pages with multiple buffers might have a mix of holes, data
-* and unwritten extents - any buffer with valid data in it
-* should have BH_Uptodate flag set on it.
-*/
-
-   if ((buffer_unwritten(bh) || buffer_uptodate(bh)) == seek_data)
-   return lastoff;
-
-   lastoff = offset;
-   } while ((bh = bh->b_this_page) != head);
-   return -ENOENT;
-}
-
-/*
- * Seek for SEEK_DATA / SEEK_HOLE in the page cache.
- *
- * Within unwritten extents, the page cache determines which parts are holes
- * and which are data: unwritten and uptodate buffer heads count as data;
- * everything else counts as a hole.
- *
- * Returns the resulting offset on successs, and -ENOENT otherwise.
- */
-loff_t
-page_cache_seek_hole_data(struct inode *inode, loff_t offset, loff_t length,
- int whence)
-{
-   pgoff_t index = offset >> PAGE_SHIFT;
-   pgoff_t end = DIV_ROUND_UP(offset + length, PAGE_SIZE);
-   loff_t lastoff = offset;
-   struct pagevec pvec;
-
-   if (length <= 0)
-   return -ENOENT;
-
-   pagevec_init(&pvec);
-
-   do {
-   unsigned nr_pages, i;
-
-   nr_pages = pagevec_lookup_range(&pvec, inode->i_mapping, &index,
-   end - 1);
-   if (nr_pages == 0)
-   break;
-
-   for (i = 0; i < nr_pages; i++) {
-   struct page *page = pvec.pages[i];
-
-   /*
-* At this point, the page may be truncated or
-* invalidated (changing page->mapping to NULL), or
-* even swizzled back from swapper_space to tmpfs file
-* mapping.  However, page->index will not change
-* because we have a reference on the page.
- *
-* If current page offset is beyond where we've ended,
-* we've found a hole.
- */
-   if (whence == SEEK_HOLE &&
-   lastoff < page_offset(page))
-   goto check_range;
-
-   lock_page(page);
-   if (likely(page->mapping == inode->i_mapping) &&
-   page_has_buffers(page)) {
-   lastoff = page_seek_hole_data(page, lastoff, whence);
-   if (lastoff >= 0) {
-   unlock_page(page);
-   goto check_range;
-   }
-   }
-   unlock_page(page);
-   lastoff = page_offset(page) + PAGE_SIZE;
-   }
-   pagevec_release(&pvec);
-   } while (index < end);
-
-   /* When no page at lastoff and we are not done, we found a hole. */
-   if (whence != SEEK_HOLE)
-   goto not_found;
-
-check_range:
-   if (lastoff < offset + length)
-   goto out;
-not_found:
-   lastoff = -ENOENT;
-out:
-   pagevec_release(&pvec);
-   return lastoff;
-}
-
 void __init buffer_init(void)
 {
unsigned long nrpages;
diff --git a/fs/iomap.c b/fs/iomap.c
index f2456d0d8ddd..4a01d2f4e8e9 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -20,6 +20,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -588,6 +5

[PATCH 09/34] iomap: inline data should be an iomap type, not a flag

2018-05-18 Thread Christoph Hellwig
Inline data is fundamentally different from our normal mapped case in that
it doesn't even have a block address.  So instead of having a flag for
it, it should be an entirely separate iomap range type.

Signed-off-by: Christoph Hellwig 
---
 fs/ext4/inline.c  |  4 ++--
 fs/gfs2/bmap.c|  3 +--
 fs/iomap.c| 21 -
 include/linux/iomap.h |  2 +-
 4 files changed, 16 insertions(+), 14 deletions(-)

diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index 70cf4c7b268a..e1f00891ef95 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -1835,8 +1835,8 @@ int ext4_inline_data_iomap(struct inode *inode, struct iomap *iomap)
iomap->offset = 0;
iomap->length = min_t(loff_t, ext4_get_inline_size(inode),
  i_size_read(inode));
-   iomap->type = 0;
-   iomap->flags = IOMAP_F_DATA_INLINE;
+   iomap->type = IOMAP_INLINE;
+   iomap->flags = 0;
 
 out:
up_read(&EXT4_I(inode)->xattr_sem);
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 278ed0869c3c..cbeedd3cfb36 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -680,8 +680,7 @@ static void gfs2_stuffed_iomap(struct inode *inode, struct iomap *iomap)
  sizeof(struct gfs2_dinode);
iomap->offset = 0;
iomap->length = i_size_read(inode);
-   iomap->type = IOMAP_MAPPED;
-   iomap->flags = IOMAP_F_DATA_INLINE;
+   iomap->type = IOMAP_INLINE;
 }
 
 /**
diff --git a/fs/iomap.c b/fs/iomap.c
index 0fecd5789d7b..a859e15d7bec 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -503,10 +503,13 @@ static int iomap_to_fiemap(struct fiemap_extent_info *fi,
case IOMAP_DELALLOC:
flags |= FIEMAP_EXTENT_DELALLOC | FIEMAP_EXTENT_UNKNOWN;
break;
+   case IOMAP_MAPPED:
+   break;
case IOMAP_UNWRITTEN:
flags |= FIEMAP_EXTENT_UNWRITTEN;
break;
-   case IOMAP_MAPPED:
+   case IOMAP_INLINE:
+   flags |= FIEMAP_EXTENT_DATA_INLINE;
break;
}
 
@@ -514,8 +517,6 @@ static int iomap_to_fiemap(struct fiemap_extent_info *fi,
flags |= FIEMAP_EXTENT_MERGED;
if (iomap->flags & IOMAP_F_SHARED)
flags |= FIEMAP_EXTENT_SHARED;
-   if (iomap->flags & IOMAP_F_DATA_INLINE)
-   flags |= FIEMAP_EXTENT_DATA_INLINE;
 
return fiemap_fill_next_extent(fi, iomap->offset,
iomap->addr != IOMAP_NULL_ADDR ? iomap->addr : 0,
@@ -1326,14 +1327,16 @@ static loff_t iomap_swapfile_activate_actor(struct inode *inode, loff_t pos,
struct iomap_swapfile_info *isi = data;
int error;
 
-   /* No inline data. */
-   if (iomap->flags & IOMAP_F_DATA_INLINE) {
+   switch (iomap->type) {
+   case IOMAP_MAPPED:
+   case IOMAP_UNWRITTEN:
+   /* Only real or unwritten extents. */
+   break;
+   case IOMAP_INLINE:
+   /* No inline data. */
pr_err("swapon: file is inline\n");
return -EINVAL;
-   }
-
-   /* Only real or unwritten extents. */
-   if (iomap->type != IOMAP_MAPPED && iomap->type != IOMAP_UNWRITTEN) {
+   default:
pr_err("swapon: file has unallocated extents\n");
return -EINVAL;
}
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 4bd87294219a..8f7095fc514e 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -18,6 +18,7 @@ struct vm_fault;
 #define IOMAP_DELALLOC 0x02/* delayed allocation blocks */
 #define IOMAP_MAPPED   0x03/* blocks allocated at @addr */
+#define IOMAP_UNWRITTEN0x04/* blocks allocated at @addr in unwritten state */
+#define IOMAP_INLINE   0x05/* data inline in the inode */
 
 /*
  * Flags for all iomap mappings:
@@ -34,7 +35,6 @@ struct vm_fault;
  */
 #define IOMAP_F_MERGED 0x10/* contains multiple blocks/extents */
 #define IOMAP_F_SHARED 0x20/* block shared with another file */
-#define IOMAP_F_DATA_INLINE0x40/* data inline in the inode */
 
 /*
  * Magic value for addr:
-- 
2.17.0



[PATCH 04/34] fs: remove the buffer_unwritten check in page_seek_hole_data

2018-05-18 Thread Christoph Hellwig
We only call into this function through the iomap iterators, so we already
know the buffer is unwritten.  In addition, we always require the
uptodate flag, which is ORed with the result anyway.

Signed-off-by: Christoph Hellwig 
---
 fs/iomap.c | 13 -
 1 file changed, 4 insertions(+), 9 deletions(-)

diff --git a/fs/iomap.c b/fs/iomap.c
index 4a01d2f4e8e9..bef5e91d40bf 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -611,14 +611,9 @@ page_seek_hole_data(struct page *page, loff_t lastoff, int whence)
continue;
 
/*
-* Unwritten extents that have data in the page cache covering
-* them can be identified by the BH_Unwritten state flag.
-* Pages with multiple buffers might have a mix of holes, data
-* and unwritten extents - any buffer with valid data in it
-* should have BH_Uptodate flag set on it.
+* Any buffer with valid data in it should have BH_Uptodate set.
 */
-
-   if ((buffer_unwritten(bh) || buffer_uptodate(bh)) == seek_data)
+   if (buffer_uptodate(bh) == seek_data)
return lastoff;
 
lastoff = offset;
@@ -630,8 +625,8 @@ page_seek_hole_data(struct page *page, loff_t lastoff, int whence)
  * Seek for SEEK_DATA / SEEK_HOLE in the page cache.
  *
  * Within unwritten extents, the page cache determines which parts are holes
- * and which are data: unwritten and uptodate buffer heads count as data;
- * everything else counts as a hole.
+ * and which are data: uptodate buffer heads count as data; everything else
+ * counts as a hole.
  *
  * Returns the resulting offset on successs, and -ENOENT otherwise.
  */
-- 
2.17.0



[PATCH 13/34] iomap: add a iomap_sector helper

2018-05-18 Thread Christoph Hellwig
Factor the repeated calculation of the on-disk sector for a given logical
block into a little helper.
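
Worked through once: an extent that starts at file offset 0x4000 and
lives at disk byte address 0x10000 maps file position 0x5000 to disk
byte 0x11000, i.e. 512-byte sector 0x88. A standalone sketch of the
same arithmetic:

#include <stdint.h>
#include <stdio.h>

#define SECTOR_SHIFT 9	/* 512-byte sectors */

/* addr: disk byte address of the extent, offset: file position where
 * the extent starts, pos: file position being mapped. */
static uint64_t sector_for(uint64_t addr, uint64_t offset, uint64_t pos)
{
	return (addr + pos - offset) >> SECTOR_SHIFT;
}

int main(void)
{
	printf("%#llx\n",	/* prints 0x88 */
	       (unsigned long long)sector_for(0x10000, 0x4000, 0x5000));
	return 0;
}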

Signed-off-by: Christoph Hellwig 
---
 fs/iomap.c | 19 ++-
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/fs/iomap.c b/fs/iomap.c
index 6427627a247f..44259eadb69d 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -97,6 +97,12 @@ iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags,
return written ? written : ret;
 }
 
+static sector_t
+iomap_sector(struct iomap *iomap, loff_t pos)
+{
+   return (iomap->addr + pos - iomap->offset) >> SECTOR_SHIFT;
+}
+
 static void
 iomap_write_failed(struct inode *inode, loff_t pos, unsigned len)
 {
@@ -354,11 +360,8 @@ static int iomap_zero(struct inode *inode, loff_t pos, unsigned offset,
 static int iomap_dax_zero(loff_t pos, unsigned offset, unsigned bytes,
struct iomap *iomap)
 {
-   sector_t sector = (iomap->addr +
-  (pos & PAGE_MASK) - iomap->offset) >> 9;
-
-   return __dax_zero_page_range(iomap->bdev, iomap->dax_dev, sector,
-   offset, bytes);
+   return __dax_zero_page_range(iomap->bdev, iomap->dax_dev,
+   iomap_sector(iomap, pos & PAGE_MASK), offset, bytes);
 }
 
 static loff_t
@@ -951,8 +954,7 @@ iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos,
 
bio = bio_alloc(GFP_KERNEL, 1);
bio_set_dev(bio, iomap->bdev);
-   bio->bi_iter.bi_sector =
-   (iomap->addr + pos - iomap->offset) >> 9;
+   bio->bi_iter.bi_sector = iomap_sector(iomap, pos);
bio->bi_private = dio;
bio->bi_end_io = iomap_dio_bio_end_io;
 
@@ -1046,8 +1048,7 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length,
 
bio = bio_alloc(GFP_KERNEL, nr_pages);
bio_set_dev(bio, iomap->bdev);
-   bio->bi_iter.bi_sector =
-   (iomap->addr + pos - iomap->offset) >> 9;
+   bio->bi_iter.bi_sector = iomap_sector(iomap, pos);
bio->bi_write_hint = dio->iocb->ki_hint;
bio->bi_private = dio;
bio->bi_end_io = iomap_dio_bio_end_io;
-- 
2.17.0



[PATCH 07/34] mm: return an unsigned int from __do_page_cache_readahead

2018-05-18 Thread Christoph Hellwig
We never return an error, so switch to returning an unsigned int.  Most
callers already did implicit casts to an unsigned type, and the one that
didn't can be simplified now.

Suggested-by: Matthew Wilcox 
Signed-off-by: Christoph Hellwig 
---
 mm/internal.h  |  2 +-
 mm/readahead.c | 15 +--
 2 files changed, 6 insertions(+), 11 deletions(-)

diff --git a/mm/internal.h b/mm/internal.h
index 62d8c34e63d5..954003ac766a 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -53,7 +53,7 @@ void unmap_page_range(struct mmu_gather *tlb,
 unsigned long addr, unsigned long end,
 struct zap_details *details);
 
-extern int __do_page_cache_readahead(struct address_space *mapping,
+extern unsigned int __do_page_cache_readahead(struct address_space *mapping,
struct file *filp, pgoff_t offset, unsigned long nr_to_read,
unsigned long lookahead_size);
 
diff --git a/mm/readahead.c b/mm/readahead.c
index 16d0cb1e2616..fa4d4b767130 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -147,16 +147,16 @@ static int read_pages(struct address_space *mapping, struct file *filp,
  *
  * Returns the number of pages requested, or the maximum amount of I/O allowed.
  */
-int __do_page_cache_readahead(struct address_space *mapping, struct file *filp,
-   pgoff_t offset, unsigned long nr_to_read,
-   unsigned long lookahead_size)
+unsigned int __do_page_cache_readahead(struct address_space *mapping,
+   struct file *filp, pgoff_t offset, unsigned long nr_to_read,
+   unsigned long lookahead_size)
 {
struct inode *inode = mapping->host;
struct page *page;
unsigned long end_index;/* The last page we want to read */
LIST_HEAD(page_pool);
int page_idx;
-   int nr_pages = 0;
+   unsigned int nr_pages = 0;
loff_t isize = i_size_read(inode);
gfp_t gfp_mask = readahead_gfp_mask(mapping);
 
@@ -223,16 +223,11 @@ int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
max_pages = max_t(unsigned long, bdi->io_pages, ra->ra_pages);
nr_to_read = min(nr_to_read, max_pages);
while (nr_to_read) {
-   int err;
-
unsigned long this_chunk = (2 * 1024 * 1024) / PAGE_SIZE;
 
if (this_chunk > nr_to_read)
this_chunk = nr_to_read;
-   err = __do_page_cache_readahead(mapping, filp,
-   offset, this_chunk, 0);
-   if (err < 0)
-   return err;
+   __do_page_cache_readahead(mapping, filp, offset, this_chunk, 0);
 
offset += this_chunk;
nr_to_read -= this_chunk;
-- 
2.17.0



[PATCH 08/34] mm: split ->readpages calls to avoid non-contiguous pages lists

2018-05-18 Thread Christoph Hellwig
That way file systems don't have to go hunting for non-contiguous pages
and work around them.  It also kicks off I/O earlier, allowing it to
finish earlier and reduce latency.
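
The rule is simply: whenever a page is already cached, submit whatever
contiguous run has accumulated and start a new one. A self-contained
model of the loop shape (the cached[] input is made up):

#include <stdio.h>

int main(void)
{
	/* 1 = page already in the cache, 0 = needs reading */
	static const int cached[] = { 0, 0, 1, 0, 0, 0, 1, 1, 0 };
	int n = sizeof(cached) / sizeof(cached[0]);
	int i, nr = 0, start = 0;

	for (i = 0; i < n; i++) {
		if (cached[i]) {
			if (nr)	/* kick off the contiguous batch early */
				printf("read pages %d..%d\n",
				       start, start + nr - 1);
			nr = 0;
			continue;
		}
		if (!nr)
			start = i;
		nr++;
	}
	if (nr)
		printf("read pages %d..%d\n", start, start + nr - 1);
	return 0;
}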

Signed-off-by: Christoph Hellwig 
---
 mm/readahead.c | 12 +++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/mm/readahead.c b/mm/readahead.c
index fa4d4b767130..044ab0c137cc 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -177,8 +177,18 @@ unsigned int __do_page_cache_readahead(struct address_space *mapping,
rcu_read_lock();
page = radix_tree_lookup(&mapping->i_pages, page_offset);
rcu_read_unlock();
-   if (page && !radix_tree_exceptional_entry(page))
+   if (page && !radix_tree_exceptional_entry(page)) {
+   /*
+* Page already present?  Kick off the current batch of
+* contiguous pages before continuing with the next
+* batch.
+*/
+   if (nr_pages)
+   read_pages(mapping, filp, &page_pool, nr_pages,
+   gfp_mask);
+   nr_pages = 0;
continue;
+   }
 
page = __page_cache_alloc(gfp_mask);
if (!page)
-- 
2.17.0



[PATCH 01/34] block: add a lower-level bio_add_page interface

2018-05-18 Thread Christoph Hellwig
For the upcoming removal of buffer heads in XFS we need to keep track of
the number of outstanding writeback requests per page.  For this we need
to know if bio_add_page merged a region with the previous bvec or not.
Instead of adding additional arguments this refactors bio_add_page to
be implemented using three lower level helpers which users like XFS can
use directly if they care about the merge decisions.
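
As a toy model of the resulting calling convention (the struct and
helpers below are simplified stand-ins, not the block-layer API), a
caller can learn whether each page merged into the previous bvec:

#include <stdbool.h>
#include <stdio.h>

struct toy_bio { int vcnt, max, last_end; };

static bool try_merge(struct toy_bio *b, int off, int len)
{
	if (b->vcnt && off == b->last_end) {	/* contiguous with last bvec */
		b->last_end += len;
		return true;
	}
	return false;
}

static void add_segment(struct toy_bio *b, int off, int len)
{
	b->vcnt++;			/* caller checked b->vcnt < b->max */
	b->last_end = off + len;
}

int main(void)
{
	struct toy_bio b = { .max = 4 };
	int offs[] = { 0, 512, 2048, 2560 }, i;

	for (i = 0; i < 4; i++)
		if (!try_merge(&b, offs[i], 512))
			add_segment(&b, offs[i], 512);
	printf("%d segments\n", b.vcnt);	/* 2: [0,1024) and [2048,3072) */
	return 0;
}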

Signed-off-by: Christoph Hellwig 
---
 block/bio.c | 96 +
 include/linux/bio.h |  9 +
 2 files changed, 72 insertions(+), 33 deletions(-)

diff --git a/block/bio.c b/block/bio.c
index 53e0f0a1ed94..fdf635d42bbd 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -773,7 +773,7 @@ int bio_add_pc_page(struct request_queue *q, struct bio *bio, struct page
return 0;
}
 
-   if (bio->bi_vcnt >= bio->bi_max_vecs)
+   if (bio_full(bio))
return 0;
 
/*
@@ -821,52 +821,82 @@ int bio_add_pc_page(struct request_queue *q, struct bio *bio, struct page
 EXPORT_SYMBOL(bio_add_pc_page);
 
 /**
- * bio_add_page-   attempt to add page to bio
- * @bio: destination bio
- * @page: page to add
- * @len: vec entry length
- * @offset: vec entry offset
+ * __bio_try_merge_page - try appending data to an existing bvec.
+ * @bio: destination bio
+ * @page: page to add
+ * @len: length of the data to add
+ * @off: offset of the data in @page
  *
- * Attempt to add a page to the bio_vec maplist. This will only fail
- * if either bio->bi_vcnt == bio->bi_max_vecs or it's a cloned bio.
+ * Try to add the data at @page + @off to the last bvec of @bio.  This is a
+ * a useful optimisation for file systems with a block size smaller than the
+ * page size.
+ *
+ * Return %true on success or %false on failure.
  */
-int bio_add_page(struct bio *bio, struct page *page,
-unsigned int len, unsigned int offset)
+bool __bio_try_merge_page(struct bio *bio, struct page *page,
+   unsigned int len, unsigned int off)
 {
-   struct bio_vec *bv;
-
-   /*
-* cloned bio must not modify vec list
-*/
if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
-   return 0;
+   return false;
 
-   /*
-* For filesystems with a blocksize smaller than the pagesize
-* we will often be called with the same page as last time and
-* a consecutive offset.  Optimize this special case.
-*/
if (bio->bi_vcnt > 0) {
-   bv = &bio->bi_io_vec[bio->bi_vcnt - 1];
+   struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1];
 
-   if (page == bv->bv_page &&
-   offset == bv->bv_offset + bv->bv_len) {
+   if (page == bv->bv_page && off == bv->bv_offset + bv->bv_len) {
bv->bv_len += len;
-   goto done;
+   bio->bi_iter.bi_size += len;
+   return true;
}
}
+   return false;
+}
+EXPORT_SYMBOL_GPL(__bio_try_merge_page);
 
-   if (bio->bi_vcnt >= bio->bi_max_vecs)
-   return 0;
+/**
+ * __bio_add_page - add page to a bio in a new segment
+ * @bio: destination bio
+ * @page: page to add
+ * @len: length of the data to add
+ * @off: offset of the data in @page
+ *
+ * Add the data at @page + @off to @bio as a new bvec.  The caller must ensure
+ * that @bio has space for another bvec.
+ */
+void __bio_add_page(struct bio *bio, struct page *page,
+   unsigned int len, unsigned int off)
+{
+   struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt];
 
-   bv  = &bio->bi_io_vec[bio->bi_vcnt];
-   bv->bv_page = page;
-   bv->bv_len  = len;
-   bv->bv_offset   = offset;
+   WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED));
+   WARN_ON_ONCE(bio_full(bio));
+
+   bv->bv_page = page;
+   bv->bv_offset = off;
+   bv->bv_len = len;
 
-   bio->bi_vcnt++;
-done:
bio->bi_iter.bi_size += len;
+   bio->bi_vcnt++;
+}
+EXPORT_SYMBOL_GPL(__bio_add_page);
+
+/**
+ * bio_add_page-   attempt to add page to bio
+ * @bio: destination bio
+ * @page: page to add
+ * @len: vec entry length
+ * @offset: vec entry offset
+ *
+ * Attempt to add a page to the bio_vec maplist. This will only fail
+ * if either bio->bi_vcnt == bio->bi_max_vecs or it's a cloned bio.
+ */
+int bio_add_page(struct bio *bio, struct page *page,
+unsigned int len, unsigned int offset)
+{
+   if (!__bio_try_merge_page(bio, page, len, offset)) {
+   if (bio_full(bio))
+   return 0;
+   __bio_add_page(bio, page, len, offset);
+   }
return len;
 }
 EXPORT_SYMBOL(bio_add_page);
diff --git a/include/linux/bio.h b/include/linux/bio.h
index ce547a25e8ae..3e73c8bc25ea 100644
--- a/include/linux/bio.h
+++ b/in

buffered I/O without buffer heads in xfs and iomap v2

2018-05-18 Thread Christoph Hellwig
Hi all,

this series adds support for buffered I/O without buffer heads to
the iomap and XFS code.

For now this series only contains support for block size == PAGE_SIZE,
with the 4k support split into a separate series.


A git tree is available at:

git://git.infradead.org/users/hch/xfs.git xfs-iomap-read.2

Gitweb:


http://git.infradead.org/users/hch/xfs.git/shortlog/refs/heads/xfs-iomap-read.2

Changes since v1:
 - fix the iomap_readpages error handling
 - use unsigned file offsets in a few places to avoid arithmetic overflows
 - allocate a iomap_page in iomap_page_mkwrite to fix generic/095
 - improve a few comments
 - add more asserts
 - warn about truncated block numbers from ->bmap
 - new patch to change the __do_page_cache_readahead return value to
   unsigned int
 - remove an incorrectly added empty line
 - make inline data an explicit iomap type instead of a flag
 - add a IOMAP_F_BUFFER_HEAD flag to force use of buffers heads for gfs2,
   and keep the basic buffer head infrastructure around for now.


[PATCH 10/34] iomap: fix the comment describing IOMAP_NOWAIT

2018-05-18 Thread Christoph Hellwig
Signed-off-by: Christoph Hellwig 
---
 include/linux/iomap.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 8f7095fc514e..13d19b4c29a9 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -59,7 +59,7 @@ struct iomap {
 #define IOMAP_REPORT   (1 << 2) /* report extent status, e.g. FIEMAP */
 #define IOMAP_FAULT(1 << 3) /* mapping for page fault */
 #define IOMAP_DIRECT   (1 << 4) /* direct I/O */
-#define IOMAP_NOWAIT   (1 << 5) /* Don't wait for writeback */
+#define IOMAP_NOWAIT   (1 << 5) /* do not block */
 
 struct iomap_ops {
/*
-- 
2.17.0



[PATCH 06/34] mm: give the 'ret' variable a better name __do_page_cache_readahead

2018-05-18 Thread Christoph Hellwig
It counts the number of pages acted on, so name it nr_pages to make that
obvious.

Signed-off-by: Christoph Hellwig 
---
 mm/readahead.c | 10 +-
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/mm/readahead.c b/mm/readahead.c
index 539bbb6c1fad..16d0cb1e2616 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -156,7 +156,7 @@ int __do_page_cache_readahead(struct address_space *mapping, struct file *filp,
unsigned long end_index;/* The last page we want to read */
LIST_HEAD(page_pool);
int page_idx;
-   int ret = 0;
+   int nr_pages = 0;
loff_t isize = i_size_read(inode);
gfp_t gfp_mask = readahead_gfp_mask(mapping);
 
@@ -187,7 +187,7 @@ int __do_page_cache_readahead(struct address_space *mapping, struct file *filp,
list_add(&page->lru, &page_pool);
if (page_idx == nr_to_read - lookahead_size)
SetPageReadahead(page);
-   ret++;
+   nr_pages++;
}
 
/*
@@ -195,11 +195,11 @@ int __do_page_cache_readahead(struct address_space *mapping, struct file *filp,
 * uptodate then the caller will launch readpage again, and
 * will then handle the error.
 */
-   if (ret)
-   read_pages(mapping, filp, &page_pool, ret, gfp_mask);
+   if (nr_pages)
+   read_pages(mapping, filp, &page_pool, nr_pages, gfp_mask);
BUG_ON(!list_empty(&page_pool));
 out:
-   return ret;
+   return nr_pages;
 }
 
 /*
-- 
2.17.0



[PATCH 17/34] xfs: use iomap_bmap

2018-05-18 Thread Christoph Hellwig
Switch to the iomap based bmap implementation to get rid of one of the
last users of xfs_get_blocks.

Signed-off-by: Christoph Hellwig 
---
 fs/xfs/xfs_aops.c | 9 +++--
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 80de476cecf8..56e405572909 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -1378,10 +1378,9 @@ xfs_vm_bmap(
struct address_space*mapping,
sector_tblock)
 {
-   struct inode*inode = (struct inode *)mapping->host;
-   struct xfs_inode*ip = XFS_I(inode);
+   struct xfs_inode*ip = XFS_I(mapping->host);
 
-   trace_xfs_vm_bmap(XFS_I(inode));
+   trace_xfs_vm_bmap(ip);
 
/*
 * The swap code (ab-)uses ->bmap to get a block mapping and then
@@ -1394,9 +1393,7 @@ xfs_vm_bmap(
 */
if (xfs_is_reflink_inode(ip) || XFS_IS_REALTIME_INODE(ip))
return 0;
-
-   filemap_write_and_wait(mapping);
-   return generic_block_bmap(mapping, block, xfs_get_blocks);
+   return iomap_bmap(mapping, block, &xfs_iomap_ops);
 }
 
 STATIC int
-- 
2.17.0



[PATCH 15/34] iomap: add an iomap-based readpage and readpages implementation

2018-05-18 Thread Christoph Hellwig
Simply use iomap_apply to iterate over the file and submit a bio for
each non-uptodate but mapped region and zero everything else.  Note that
as-is this cannot be used for file systems with a blocksize smaller than
the page size, but that support will be added later.
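
The per-page decision is whether real I/O is needed at all: holes and
ranges past EOF are satisfied by zeroing in memory. A reduced sketch of
the patch's iomap_block_needs_zeroing() predicate (constants
simplified):

#include <stdbool.h>
#include <stdio.h>

enum { HOLE, MAPPED };

static bool needs_zeroing(int type, long pos, long isize)
{
	return type != MAPPED || pos > isize;
}

int main(void)
{
	long isize = 6000;	/* i_size of the file */

	printf("%d %d %d\n",
	       needs_zeroing(HOLE, 0, isize),		/* 1: hole -> zero */
	       needs_zeroing(MAPPED, 4096, isize),	/* 0: mapped -> read */
	       needs_zeroing(MAPPED, 8192, isize));	/* 1: past EOF -> zero */
	return 0;
}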

Signed-off-by: Christoph Hellwig 
---
 fs/iomap.c| 200 +-
 include/linux/iomap.h |   4 +
 2 files changed, 203 insertions(+), 1 deletion(-)

diff --git a/fs/iomap.c b/fs/iomap.c
index 7c1b071d115c..821671af2618 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -1,6 +1,6 @@
 /*
  * Copyright (C) 2010 Red Hat, Inc.
- * Copyright (c) 2016 Christoph Hellwig.
+ * Copyright (c) 2016-2018 Christoph Hellwig.
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms and conditions of the GNU General Public License,
@@ -18,6 +18,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -103,6 +104,203 @@ iomap_sector(struct iomap *iomap, loff_t pos)
return (iomap->addr + pos - iomap->offset) >> SECTOR_SHIFT;
 }
 
+static inline bool
+iomap_block_needs_zeroing(struct inode *inode, loff_t pos, struct iomap *iomap)
+{
+   return iomap->type != IOMAP_MAPPED || pos > i_size_read(inode);
+}
+
+static void
+iomap_read_end_io(struct bio *bio)
+{
+   int error = blk_status_to_errno(bio->bi_status);
+   struct bio_vec *bvec;
+   int i;
+
+   bio_for_each_segment_all(bvec, bio, i)
+   page_endio(bvec->bv_page, false, error);
+   bio_put(bio);
+}
+
+static struct bio *
+iomap_read_bio_alloc(struct iomap *iomap, sector_t sector, loff_t length)
+{
+   int nr_vecs = (length + PAGE_SIZE - 1) >> PAGE_SHIFT;
+   struct bio *bio = bio_alloc(GFP_NOFS, min(BIO_MAX_PAGES, nr_vecs));
+
+   bio->bi_opf = REQ_OP_READ;
+   bio->bi_iter.bi_sector = sector;
+   bio_set_dev(bio, iomap->bdev);
+   bio->bi_end_io = iomap_read_end_io;
+   return bio;
+}
+
+struct iomap_readpage_ctx {
+   struct page *cur_page;
+   boolcur_page_in_bio;
+   struct bio  *bio;
+   struct list_head*pages;
+};
+
+static loff_t
+iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
+   struct iomap *iomap)
+{
+   struct iomap_readpage_ctx *ctx = data;
+   struct page *page = ctx->cur_page;
+   unsigned poff = pos & (PAGE_SIZE - 1);
+   unsigned plen = min_t(loff_t, PAGE_SIZE - poff, length);
+   bool is_contig = false;
+   sector_t sector;
+
+   /* we don't support blocksize < PAGE_SIZE quite yet: */
+   WARN_ON_ONCE(pos != page_offset(page));
+   WARN_ON_ONCE(plen != PAGE_SIZE);
+
+   if (iomap_block_needs_zeroing(inode, pos, iomap)) {
+   zero_user(page, poff, plen);
+   SetPageUptodate(page);
+   goto done;
+   }
+
+   ctx->cur_page_in_bio = true;
+
+   /*
+* Try to merge into a previous segment if we can.
+*/
+   sector = iomap_sector(iomap, pos);
+   if (ctx->bio && bio_end_sector(ctx->bio) == sector) {
+   if (__bio_try_merge_page(ctx->bio, page, plen, poff))
+   goto done;
+   is_contig = true;
+   }
+
+   if (!ctx->bio || !is_contig || bio_full(ctx->bio)) {
+   if (ctx->bio)
+   submit_bio(ctx->bio);
+   ctx->bio = iomap_read_bio_alloc(iomap, sector, length);
+   }
+
+   __bio_add_page(ctx->bio, page, plen, poff);
+done:
+   return plen;
+}
+
+int
+iomap_readpage(struct page *page, const struct iomap_ops *ops)
+{
+   struct iomap_readpage_ctx ctx = { .cur_page = page };
+   struct inode *inode = page->mapping->host;
+   unsigned poff;
+   loff_t ret;
+
+   WARN_ON_ONCE(page_has_buffers(page));
+
+   for (poff = 0; poff < PAGE_SIZE; poff += ret) {
+   ret = iomap_apply(inode, page_offset(page) + poff,
+   PAGE_SIZE - poff, 0, ops, &ctx,
+   iomap_readpage_actor);
+   if (ret <= 0) {
+   WARN_ON_ONCE(ret == 0);
+   SetPageError(page);
+   break;
+   }
+   }
+
+   if (ctx.bio) {
+   submit_bio(ctx.bio);
+   WARN_ON_ONCE(!ctx.cur_page_in_bio);
+   } else {
+   WARN_ON_ONCE(ctx.cur_page_in_bio);
+   unlock_page(page);
+   }
+   return 0;
+}
+EXPORT_SYMBOL_GPL(iomap_readpage);
+
+static struct page *
+iomap_next_page(struct inode *inode, struct list_head *pages, loff_t pos,
+   loff_t length, loff_t *done)
+{
+   while (!list_empty(pages)) {
+   struct page *page = lru_to_page(pages);
+
+   if (page_offset(page) >= (u64)pos + length)
+   break;
+
+   list_del(&page->lru);
+  

[PATCH 21/34] xfs: move locking into xfs_bmap_punch_delalloc_range

2018-05-18 Thread Christoph Hellwig
Both callers want the same locking, so do it only once.

Signed-off-by: Christoph Hellwig 
---
 fs/xfs/xfs_aops.c  | 2 --
 fs/xfs/xfs_bmap_util.c | 7 ---
 fs/xfs/xfs_iomap.c | 3 ---
 3 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index f2333e351e07..5dd09e83c81c 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -761,10 +761,8 @@ xfs_aops_discard_page(
"page discard on page "PTR_FMT", inode 0x%llx, offset %llu.",
page, ip->i_ino, offset);
 
-   xfs_ilock(ip, XFS_ILOCK_EXCL);
error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
PAGE_SIZE / i_blocksize(inode));
-   xfs_iunlock(ip, XFS_ILOCK_EXCL);
if (error && !XFS_FORCED_SHUTDOWN(mp))
xfs_alert(mp, "page discard unable to remove delalloc mapping.");
 out_invalidate:
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index c009bdf9fdce..1a55fc06f917 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -711,12 +711,11 @@ xfs_bmap_punch_delalloc_range(
struct xfs_iext_cursor  icur;
int error = 0;
 
-   ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
-
+   xfs_ilock(ip, XFS_ILOCK_EXCL);
if (!(ifp->if_flags & XFS_IFEXTENTS)) {
error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK);
if (error)
-   return error;
+   goto out_unlock;
}
 
if (!xfs_iext_lookup_extent(ip, ifp, start_fsb, &icur, &got))
@@ -738,6 +737,8 @@ xfs_bmap_punch_delalloc_range(
break;
} while (xfs_iext_next_extent(ifp, &icur, &got));
 
+out_unlock:
+   xfs_iunlock(ip, XFS_ILOCK_EXCL);
return error;
 }
 
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index da6d1995e460..f949f0dd7382 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -1203,11 +1203,8 @@ xfs_file_iomap_end_delalloc(
truncate_pagecache_range(VFS_I(ip), XFS_FSB_TO_B(mp, start_fsb),
 XFS_FSB_TO_B(mp, end_fsb) - 1);
 
-   xfs_ilock(ip, XFS_ILOCK_EXCL);
error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
   end_fsb - start_fsb);
-   xfs_iunlock(ip, XFS_ILOCK_EXCL);
-
if (error && !XFS_FORCED_SHUTDOWN(mp)) {
xfs_alert(mp, "%s: unable to clean up ino %lld",
__func__, ip->i_ino);
-- 
2.17.0



[PATCH 18/34] xfs: use iomap for blocksize == PAGE_SIZE readpage and readpages

2018-05-18 Thread Christoph Hellwig
For file systems with a block size that equals the page size we never do
partial reads, so we can use the buffer_head-less iomap versions of
readpage and readpages without conflicting with the buffer_head structures
created later in write_begin.

Signed-off-by: Christoph Hellwig 
---
 fs/xfs/xfs_aops.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 56e405572909..c631c457b444 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -1402,6 +1402,8 @@ xfs_vm_readpage(
struct page *page)
 {
trace_xfs_vm_readpage(page->mapping->host, 1);
+   if (i_blocksize(page->mapping->host) == PAGE_SIZE)
+   return iomap_readpage(page, &xfs_iomap_ops);
return mpage_readpage(page, xfs_get_blocks);
 }
 
@@ -1413,6 +1415,8 @@ xfs_vm_readpages(
unsignednr_pages)
 {
trace_xfs_vm_readpages(mapping->host, nr_pages);
+   if (i_blocksize(mapping->host) == PAGE_SIZE)
+   return iomap_readpages(mapping, pages, nr_pages, &xfs_iomap_ops);
return mpage_readpages(mapping, pages, nr_pages, xfs_get_blocks);
 }
 
-- 
2.17.0



[PATCH 11/34] iomap: move IOMAP_F_BOUNDARY to gfs2

2018-05-18 Thread Christoph Hellwig
Just define a range of fs specific flags and use that in gfs2 instead of
exposing this internal flag globally.

Signed-off-by: Christoph Hellwig 
---
 fs/gfs2/bmap.c| 8 +---
 include/linux/iomap.h | 9 +++--
 2 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index cbeedd3cfb36..8efa6297e19c 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -683,6 +683,8 @@ static void gfs2_stuffed_iomap(struct inode *inode, struct iomap *iomap)
iomap->type = IOMAP_INLINE;
 }
 
+#define IOMAP_F_GFS2_BOUNDARY IOMAP_F_PRIVATE
+
 /**
  * gfs2_iomap_begin - Map blocks from an inode to disk blocks
  * @inode: The inode
@@ -774,7 +776,7 @@ int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
bh = mp.mp_bh[ip->i_height - 1];
len = gfs2_extent_length(bh->b_data, bh->b_size, ptr, lend - lblock, &eob);
if (eob)
-   iomap->flags |= IOMAP_F_BOUNDARY;
+   iomap->flags |= IOMAP_F_GFS2_BOUNDARY;
iomap->length = (u64)len << inode->i_blkbits;
 
 out_release:
@@ -846,12 +848,12 @@ int gfs2_block_map(struct inode *inode, sector_t lblock,
 
if (iomap.length > bh_map->b_size) {
iomap.length = bh_map->b_size;
-   iomap.flags &= ~IOMAP_F_BOUNDARY;
+   iomap.flags &= ~IOMAP_F_GFS2_BOUNDARY;
}
if (iomap.addr != IOMAP_NULL_ADDR)
map_bh(bh_map, inode->i_sb, iomap.addr >> inode->i_blkbits);
bh_map->b_size = iomap.length;
-   if (iomap.flags & IOMAP_F_BOUNDARY)
+   if (iomap.flags & IOMAP_F_GFS2_BOUNDARY)
set_buffer_boundary(bh_map);
if (iomap.flags & IOMAP_F_NEW)
set_buffer_new(bh_map);
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 13d19b4c29a9..819e0cd2a950 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -27,8 +27,7 @@ struct vm_fault;
  * written data and requires fdatasync to commit them to persistent storage.
  */
 #define IOMAP_F_NEW0x01/* blocks have been newly allocated */
-#define IOMAP_F_BOUNDARY   0x02/* mapping ends at metadata boundary */
-#define IOMAP_F_DIRTY  0x04/* uncommitted metadata */
+#define IOMAP_F_DIRTY  0x02/* uncommitted metadata */
 
 /*
  * Flags that only need to be reported for IOMAP_REPORT requests:
@@ -36,6 +35,12 @@ struct vm_fault;
 #define IOMAP_F_MERGED 0x10/* contains multiple blocks/extents */
 #define IOMAP_F_SHARED 0x20/* block shared with another file */
 
+/*
+ * Flags from 0x1000 up are for file system specific usage:
+ */
+#define IOMAP_F_PRIVATE0x1000
+
+
 /*
  * Magic value for addr:
  */
-- 
2.17.0



[PATCH 25/34] xfs: remove xfs_reflink_trim_irec_to_next_cow

2018-05-18 Thread Christoph Hellwig
In the only caller we just did a lookup in the COW extent tree for
the same offset.  Reuse that result and save a lookup, as well as
shortening the ilock hold time.

Signed-off-by: Christoph Hellwig 
---
 fs/xfs/xfs_aops.c| 25 +
 fs/xfs/xfs_reflink.c | 33 -
 fs/xfs/xfs_reflink.h |  2 --
 3 files changed, 17 insertions(+), 43 deletions(-)

diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index a4b4a7037deb..354d26d66c12 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -383,11 +383,12 @@ xfs_map_blocks(
struct xfs_inode*ip = XFS_I(inode);
struct xfs_mount*mp = ip->i_mount;
ssize_t count = i_blocksize(inode);
-   xfs_fileoff_t   offset_fsb, end_fsb;
+   xfs_fileoff_t   offset_fsb, end_fsb, cow_fsb = 0;
int whichfork = XFS_DATA_FORK;
struct xfs_iext_cursor  icur;
int error = 0;
int nimaps = 1;
+   boolcow_valid = false;
 
if (XFS_FORCED_SHUTDOWN(mp))
return -EIO;
@@ -407,8 +408,11 @@ xfs_map_blocks(
 * it directly instead of looking up anything in the data fork.
 */
if (xfs_is_reflink_inode(ip) &&
-   xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, imap) &&
-   imap->br_startoff <= offset_fsb) {
+   xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, imap)) {
+   cow_fsb = imap->br_startoff;
+   cow_valid = true;
+   }
+   if (cow_valid && cow_fsb <= offset_fsb) {
xfs_iunlock(ip, XFS_ILOCK_SHARED);
/*
 * Truncate can race with writeback since writeback doesn't
@@ -430,6 +434,10 @@ xfs_map_blocks(
 
error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb,
imap, &nimaps, XFS_BMAPI_ENTIRE);
+   xfs_iunlock(ip, XFS_ILOCK_SHARED);
+   if (error)
+   return error;
+
if (!nimaps) {
/*
 * Lookup returns no match? Beyond eof? regardless,
@@ -451,16 +459,17 @@ xfs_map_blocks(
 * is a pending CoW reservation before the end of this extent,
 * so that we pick up the COW extents in the next iteration.
 */
-   xfs_reflink_trim_irec_to_next_cow(ip, offset_fsb, imap);
+   if (cow_valid &&
+   cow_fsb < imap->br_startoff + imap->br_blockcount) {
+   imap->br_blockcount = cow_fsb - imap->br_startoff;
+   trace_xfs_reflink_trim_irec(ip, imap);
+   }
+
if (imap->br_state == XFS_EXT_UNWRITTEN)
*type = XFS_IO_UNWRITTEN;
else
*type = XFS_IO_OVERWRITE;
}
-   xfs_iunlock(ip, XFS_ILOCK_SHARED);
-   if (error)
-   return error;
-
 done:
switch (*type) {
case XFS_IO_HOLE:
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index 8e5eb8e70c89..ff76bc56ff3d 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -484,39 +484,6 @@ xfs_reflink_allocate_cow(
return error;
 }
 
-/*
- * Trim an extent to end at the next CoW reservation past offset_fsb.
- */
-void
-xfs_reflink_trim_irec_to_next_cow(
-   struct xfs_inode*ip,
-   xfs_fileoff_t   offset_fsb,
-   struct xfs_bmbt_irec*imap)
-{
-   struct xfs_ifork*ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
-   struct xfs_bmbt_irecgot;
-   struct xfs_iext_cursor  icur;
-
-   if (!xfs_is_reflink_inode(ip))
-   return;
-
-   /* Find the extent in the CoW fork. */
-   if (!xfs_iext_lookup_extent(ip, ifp, offset_fsb, &icur, &got))
-   return;
-
-   /* This is the extent before; try sliding up one. */
-   if (got.br_startoff < offset_fsb) {
-   if (!xfs_iext_next_extent(ifp, &icur, &got))
-   return;
-   }
-
-   if (got.br_startoff >= imap->br_startoff + imap->br_blockcount)
-   return;
-
-   imap->br_blockcount = got.br_startoff - imap->br_startoff;
-   trace_xfs_reflink_trim_irec(ip, imap);
-}
-
 /*
  * Cancel CoW reservations for some block range of an inode.
  *
diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h
index 15a456492667..e8d4d50c629f 100644
--- a/fs/xfs/xfs_reflink.h
+++ b/fs/xfs/xfs_reflink.h
@@ -32,8 +32,6 @@ extern int xfs_reflink_allocate_cow(struct xfs_inode *ip,
struct xfs_bmbt_irec *imap, bool *shared, uint *lockmode);
 extern int xfs_reflink_convert_cow(struct xfs_inode *ip, xfs_off_t offset,
xfs_off_t count);
-extern void xfs_reflink_trim_irec_to_next_cow(struct xfs_inode *ip,
-   xfs_fileoff_t offset_fsb, struct xfs_bmbt_ir

[PATCH 16/34] iomap: add initial support for writes without buffer heads

2018-05-18 Thread Christoph Hellwig
For now just limited to blocksize == PAGE_SIZE, where we can simply read
in the full page in write begin, and just set the whole page dirty after
copying data into it.  This code is enabled by default and XFS will now
be fed pages without buffer heads in ->writepage and ->writepages.

If a file system sets the IOMAP_F_BUFFER_HEAD flag on the iomap the old
path will still be used, this both helps the transition in XFS and
prepares for the gfs2 migration to the iomap infrastructure.
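
Reduced to its three cases, the new write_begin path decides along
these lines (a schematic, not the fs/iomap.c code itself):

#include <stdbool.h>
#include <stdio.h>

static const char *write_begin_case(bool page_uptodate, bool needs_zeroing)
{
	if (page_uptodate)
		return "nothing to do, copy straight into the page";
	if (needs_zeroing)	/* hole, or range beyond EOF */
		return "zero the parts the copy will not overwrite";
	return "read the full page synchronously, then mark it uptodate";
}

int main(void)
{
	printf("%s\n", write_begin_case(false, false));
	return 0;
}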

Signed-off-by: Christoph Hellwig 
---
 fs/iomap.c| 132 ++
 fs/xfs/xfs_iomap.c|   6 +-
 include/linux/iomap.h |   2 +
 3 files changed, 127 insertions(+), 13 deletions(-)

diff --git a/fs/iomap.c b/fs/iomap.c
index 821671af2618..cd4c563db80a 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -314,6 +314,58 @@ iomap_write_failed(struct inode *inode, loff_t pos, unsigned len)
truncate_pagecache_range(inode, max(pos, i_size), pos + len);
 }
 
+static int
+iomap_read_page_sync(struct inode *inode, loff_t block_start, struct page *page,
+   unsigned poff, unsigned plen, struct iomap *iomap)
+{
+   struct bio_vec bvec;
+   struct bio bio;
+   int ret;
+
+   bio_init(&bio, &bvec, 1);
+   bio.bi_opf = REQ_OP_READ;
+   bio.bi_iter.bi_sector = iomap_sector(iomap, block_start);
+   bio_set_dev(&bio, iomap->bdev);
+   __bio_add_page(&bio, page, plen, poff);
+   ret = submit_bio_wait(&bio);
+   if (ret < 0 && iomap_block_needs_zeroing(inode, block_start, iomap))
+   zero_user(page, poff, plen);
+   return ret;
+}
+
+static int
+__iomap_write_begin(struct inode *inode, loff_t pos, unsigned len,
+   struct page *page, struct iomap *iomap)
+{
+   loff_t block_size = i_blocksize(inode);
+   loff_t block_start = pos & ~(block_size - 1);
+   loff_t block_end = (pos + len + block_size - 1) & ~(block_size - 1);
+   unsigned poff = block_start & (PAGE_SIZE - 1);
+   unsigned plen = min_t(loff_t, PAGE_SIZE - poff, block_end - block_start);
+   int status;
+
+   WARN_ON_ONCE(i_blocksize(inode) < PAGE_SIZE);
+
+   if (PageUptodate(page))
+   return 0;
+
+   if (iomap_block_needs_zeroing(inode, block_start, iomap)) {
+   unsigned from = pos & (PAGE_SIZE - 1), to = from + len;
+   unsigned pend = poff + plen;
+
+   if (poff < from || pend > to)
+   zero_user_segments(page, poff, from, to, pend);
+   } else {
+   status = iomap_read_page_sync(inode, block_start, page,
+   poff, plen, iomap);
+   if (status < 0)
+   return status;
+   SetPageUptodate(page);
+   }
+
+   return 0;
+}
+
 static int
iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags,
struct page **pagep, struct iomap *iomap)
@@ -331,7 +383,10 @@ iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags,
if (!page)
return -ENOMEM;
 
-   status = __block_write_begin_int(page, pos, len, NULL, iomap);
+   if (iomap->flags & IOMAP_F_BUFFER_HEAD)
+   status = __block_write_begin_int(page, pos, len, NULL, iomap);
+   else
+   status = __iomap_write_begin(inode, pos, len, page, iomap);
if (unlikely(status)) {
unlock_page(page);
put_page(page);
@@ -344,14 +399,63 @@ iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags,
return status;
 }
 
+int
+iomap_set_page_dirty(struct page *page)
+{
+   struct address_space *mapping = page_mapping(page);
+   int newly_dirty;
+
+   if (unlikely(!mapping))
+   return !TestSetPageDirty(page);
+
+   /*
+* Lock out page->mem_cgroup migration to keep PageDirty
+* synchronized with per-memcg dirty page counters.
+*/
+   lock_page_memcg(page);
+   newly_dirty = !TestSetPageDirty(page);
+   if (newly_dirty)
+   __set_page_dirty(page, mapping, 0);
+   unlock_page_memcg(page);
+
+   if (newly_dirty)
+   __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
+   return newly_dirty;
+}
+EXPORT_SYMBOL_GPL(iomap_set_page_dirty);
+
+static int
+__iomap_write_end(struct inode *inode, loff_t pos, unsigned len,
+   unsigned copied, struct page *page, struct iomap *iomap)
+{
+   unsigned start = pos & (PAGE_SIZE - 1);
+
+   if (unlikely(copied < len)) {
+   /* see block_write_end() for an explanation */
+   if (!PageUptodate(page))
+   copied = 0;
+   if (iomap_block_needs_zeroing(inode, pos, iomap))
+   zero_user(page, start + copied, len - copied);
+   }
+
+   flush_dcache_page(page);
+   SetPageUptodate(page);
+   iomap_set_page_dirty(p

[PATCH 27/34] xfs: don't clear imap_valid for a non-uptodate buffers

2018-05-18 Thread Christoph Hellwig
Finding a buffer that isn't uptodate doesn't invalidate the mapping for
any given block.  The last_sector check will already take care of starting
another ioend as soon as we find any non-uptodate buffer, and if the current
mapping doesn't include the next uptodate buffer the xfs_imap_valid check
will take care of it.

Signed-off-by: Christoph Hellwig 
---
 fs/xfs/xfs_aops.c | 5 +
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index b1dee2171194..82fd08c29f7f 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -859,15 +859,12 @@ xfs_writepage_map(
break;
 
/*
-* Block does not contain valid data, skip it, mark the current
-* map as invalid because we have a discontiguity. This ensures
-* we put subsequent writeable buffers into a new ioend.
+* Block does not contain valid data, skip it.
 */
if (!buffer_uptodate(bh)) {
if (PageUptodate(page))
ASSERT(buffer_mapped(bh));
uptodate = false;
-   wpc->imap_valid = false;
continue;
}
 
-- 
2.17.0



[PATCH 28/34] xfs: remove the imap_valid flag

2018-05-18 Thread Christoph Hellwig
Simplify the way we check for a valid imap - we know we have a valid
mapping after xfs_map_blocks returned successfully, and we know we can
call xfs_imap_valid on any imap, as it will always fail on a
zero-initialized map.

Signed-off-by: Christoph Hellwig 
---
 fs/xfs/xfs_aops.c | 11 ++-
 1 file changed, 2 insertions(+), 9 deletions(-)

diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 82fd08c29f7f..f01c1dd737ec 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -42,7 +42,6 @@
  */
 struct xfs_writepage_ctx {
struct xfs_bmbt_irecimap;
-   boolimap_valid;
unsigned intio_type;
struct xfs_ioend*ioend;
sector_tlast_block;
@@ -868,10 +867,6 @@ xfs_writepage_map(
continue;
}
 
-   /* Check to see if current map spans this file offset */
-   if (wpc->imap_valid)
-   wpc->imap_valid = xfs_imap_valid(inode, &wpc->imap,
-file_offset);
/*
 * If we don't have a valid map, now it's time to get a new one
 * for this offset.  This will convert delayed allocations
@@ -879,16 +874,14 @@ xfs_writepage_map(
 * a valid map, it means we landed in a hole and we skip the
 * block.
 */
-   if (!wpc->imap_valid) {
+   if (!xfs_imap_valid(inode, &wpc->imap, file_offset)) {
error = xfs_map_blocks(inode, file_offset, &wpc->imap,
 &wpc->io_type);
if (error)
goto out;
-   wpc->imap_valid = xfs_imap_valid(inode, &wpc->imap,
-file_offset);
}
 
-   if (!wpc->imap_valid || wpc->io_type == XFS_IO_HOLE) {
+   if (wpc->io_type == XFS_IO_HOLE) {
/*
 * set_page_dirty dirties all buffers in a page, independent
 * of their state.  The dirty state however is entirely
-- 
2.17.0



[PATCH 1/2] iomap: add support for sub-pagesize buffered I/O without buffer heads

2018-05-18 Thread Christoph Hellwig
After already supporting a simple implementation of buffered writes for
the blocksize == PAGE_SIZE case in the last commit, this adds full support
even for smaller block sizes.  There are three bits of per-block
information in the buffer_head structure that really matter for the iomap
read and write path:

 - uptodate status (BH_uptodate)
 - marked as currently under read I/O (BH_Async_Read)
 - marked as currently under write I/O (BH_Async_Write)

Instead of having new per-block structures this now adds a per-page
structure called struct iomap_page to track this information in a slightly
different form:

 - a bitmap for the per-block uptodate status.  For the worst case of a
   64k page size system this bitmap needs to contain 128 bits.  For the
   typical 4k page size case it only needs 8 bits, although we still
   need a full unsigned long due to the way the atomic bitmap API works.
 - two atomic_t counters are used to track the outstanding read and write
   counts (see the sketch below)
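
In code, the per-page structure described above looks roughly like this
(a sketch following this patch's include/linux/iomap.h hunk, with
SECTOR_SIZE == 512 matching the worst-case sizing above):

	struct iomap_page {
		atomic_t	read_count;	/* read I/Os in flight */
		atomic_t	write_count;	/* writeback I/Os in flight */
		DECLARE_BITMAP(uptodate, PAGE_SIZE / SECTOR_SIZE);
	};

	static inline struct iomap_page *to_iomap_page(struct page *page)
	{
		if (page_has_private(page))
			return (struct iomap_page *)page_private(page);
		return NULL;
	}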

There is quite a bit of boilerplate code as the buffered I/O path uses
various helper methods, but the actual code is very straightforward.

In this commit the code can't actually be used yet, as we need to
switch from the old implementation to the new one together with the
XFS writeback code.

Signed-off-by: Christoph Hellwig 
---
 fs/iomap.c            | 260 +-
 include/linux/iomap.h |  31 +
 2 files changed, 262 insertions(+), 29 deletions(-)

diff --git a/fs/iomap.c b/fs/iomap.c
index cd4c563db80a..8d62f0eb874c 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -17,6 +17,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -110,6 +111,107 @@ iomap_block_needs_zeroing(struct inode *inode, loff_t pos, struct iomap *iomap)
return iomap->type != IOMAP_MAPPED || pos > i_size_read(inode);
 }
 
+static struct iomap_page *
+iomap_page_create(struct inode *inode, struct page *page)
+{
+   struct iomap_page *iop = to_iomap_page(page);
+
+   if (iop || i_blocksize(inode) == PAGE_SIZE)
+   return iop;
+
+   iop = kmalloc(sizeof(*iop), GFP_NOFS | __GFP_NOFAIL);
+   atomic_set(&iop->read_count, 0);
+   atomic_set(&iop->write_count, 0);
+   bitmap_zero(iop->uptodate, PAGE_SIZE / SECTOR_SIZE);
+   set_page_private(page, (unsigned long)iop);
+   SetPagePrivate(page);
+   return iop;
+}
+
+/*
+ * Calculate the range inside the page that we actually need to read.
+ */
+static void
+iomap_read_calculate_range(struct inode *inode, struct iomap_page *iop,
+   loff_t *pos, loff_t length, unsigned *offp, unsigned *lenp)
+{
+   unsigned poff = *pos & (PAGE_SIZE - 1);
+   unsigned plen = min_t(loff_t, PAGE_SIZE - poff, length);
+
+   if (iop) {
+   unsigned block_size = i_blocksize(inode);
+   unsigned first = poff >> inode->i_blkbits;
+   unsigned last = (poff + plen - 1) >> inode->i_blkbits;
+   unsigned int i;
+
+   /* move forward for each leading block marked uptodate */
+   for (i = first; i <= last; i++) {
+   if (!test_bit(i, iop->uptodate))
+   break;
+   *pos += block_size;
+   poff += block_size;
+   plen -= block_size;
+   }
+
+   /* truncate len if we find any trailing uptodate block(s) */
+   for ( ; i <= last; i++) {
+   if (test_bit(i, iop->uptodate)) {
+   plen -= (last - i + 1) * block_size;
+   break;
+   }
+   }
+   }
+
+   *offp = poff;
+   *lenp = plen;
+}
+
+static void
+iomap_set_range_uptodate(struct page *page, unsigned off, unsigned len)
+{
+   struct iomap_page *iop = to_iomap_page(page);
+   struct inode *inode = page->mapping->host;
+   unsigned first = off >> inode->i_blkbits;
+   unsigned last = (off + len - 1) >> inode->i_blkbits;
+   unsigned int i;
+   bool uptodate = true;
+
+   if (iop) {
+   for (i = 0; i < PAGE_SIZE / i_blocksize(inode); i++) {
+   if (i >= first && i <= last)
+   set_bit(i, iop->uptodate);
+   else if (!test_bit(i, iop->uptodate))
+   uptodate = false;
+   }
+   }
+
+   if (uptodate && !PageError(page))
+   SetPageUptodate(page);
+}
+
+static void
+iomap_read_finish(struct iomap_page *iop, struct page *page)
+{
+   if (!iop || atomic_dec_and_test(&iop->read_count))
+   unlock_page(page);
+}
+
+static void
+iomap_read_page_end_io(struct bio_vec *bvec, int error)
+{
+   struct page *page = bvec->bv_page;
+   struct iomap_page *iop = to_iomap_page(page);
+
+   if (unlikely(error)) {
+   ClearPageUptodate(page);
+   SetPageError(page);

sub-page blocksize support in iomap non-buffer head path

2018-05-18 Thread Christoph Hellwig
Hi all,

this series adds support for buffered I/O without buffer heads for
block size < PAGE_SIZE to the iomap and XFS code.

A git tree is available at:

git://git.infradead.org/users/hch/xfs.git xfs-iomap-read xfs-remove-bufferheads.2

Gitweb:


http://git.infradead.org/users/hch/xfs.git/shortlog/refs/heads/xfs-remove-bufferheads.2

Changes since v1:
 - call iomap_page_create in page_mkwrite to fix generic/095
 - split into a separate series


[PATCH 2/2] xfs: add support for sub-pagesize writeback without buffer_heads

2018-05-18 Thread Christoph Hellwig
Switch to using the iomap_page structure for checking sub-page uptodate
status and tracking sub-page I/O completion status, and remove large
quantities of boilerplate code working around buffer heads.

Signed-off-by: Christoph Hellwig 
---
 fs/xfs/xfs_aops.c  | 534 +++--
 fs/xfs/xfs_buf.h   |   1 -
 fs/xfs/xfs_iomap.c |   3 -
 fs/xfs/xfs_super.c |   2 +-
 fs/xfs/xfs_trace.h |  18 +-
 5 files changed, 77 insertions(+), 481 deletions(-)

diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index efa2cbb27d67..fd664ba423e6 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -32,9 +32,6 @@
 #include "xfs_bmap_util.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_reflink.h"
-#include 
-#include 
-#include 
 #include 
 
 /*
@@ -46,25 +43,6 @@ struct xfs_writepage_ctx {
struct xfs_ioend    *ioend;
 };
 
-void
-xfs_count_page_state(
-   struct page *page,
-   int *delalloc,
-   int *unwritten)
-{
-   struct buffer_head  *bh, *head;
-
-   *delalloc = *unwritten = 0;
-
-   bh = head = page_buffers(page);
-   do {
-   if (buffer_unwritten(bh))
-   (*unwritten) = 1;
-   else if (buffer_delay(bh))
-   (*delalloc) = 1;
-   } while ((bh = bh->b_this_page) != head);
-}
-
 struct block_device *
 xfs_find_bdev_for_inode(
struct inode    *inode)
@@ -97,67 +75,17 @@ xfs_finish_page_writeback(
struct bio_vec  *bvec,
int error)
 {
+   struct iomap_page   *iop = to_iomap_page(bvec->bv_page);
+
if (error) {
SetPageError(bvec->bv_page);
mapping_set_error(inode->i_mapping, -EIO);
}
-   end_page_writeback(bvec->bv_page);
-}
 
-/*
- * We're now finished for good with this page.  Update the page state via the
- * associated buffer_heads, paying attention to the start and end offsets that
- * we need to process on the page.
- *
- * Note that we open code the action in end_buffer_async_write here so that we
- * only have to iterate over the buffers attached to the page once.  This is not
- * only more efficient, but also ensures that we only calls end_page_writeback
- * at the end of the iteration, and thus avoids the pitfall of having the page
- * and buffers potentially freed after every call to end_buffer_async_write.
- */
-static void
-xfs_finish_buffer_writeback(
-   struct inode    *inode,
-   struct bio_vec  *bvec,
-   int error)
-{
-   struct buffer_head  *head = page_buffers(bvec->bv_page), *bh = head;
-   bool            busy = false;
-   unsigned int    off = 0;
-   unsigned long   flags;
-
-   ASSERT(bvec->bv_offset < PAGE_SIZE);
-   ASSERT((bvec->bv_offset & (i_blocksize(inode) - 1)) == 0);
-   ASSERT(bvec->bv_offset + bvec->bv_len <= PAGE_SIZE);
-   ASSERT((bvec->bv_len & (i_blocksize(inode) - 1)) == 0);
-
-   local_irq_save(flags);
-   bit_spin_lock(BH_Uptodate_Lock, &head->b_state);
-   do {
-   if (off >= bvec->bv_offset &&
-   off < bvec->bv_offset + bvec->bv_len) {
-   ASSERT(buffer_async_write(bh));
-   ASSERT(bh->b_end_io == NULL);
-
-   if (error) {
-   mark_buffer_write_io_error(bh);
-   clear_buffer_uptodate(bh);
-   SetPageError(bvec->bv_page);
-   } else {
-   set_buffer_uptodate(bh);
-   }
-   clear_buffer_async_write(bh);
-   unlock_buffer(bh);
-   } else if (buffer_async_write(bh)) {
-   ASSERT(buffer_locked(bh));
-   busy = true;
-   }
-   off += bh->b_size;
-   } while ((bh = bh->b_this_page) != head);
-   bit_spin_unlock(BH_Uptodate_Lock, &head->b_state);
-   local_irq_restore(flags);
+   ASSERT(iop || i_blocksize(inode) == PAGE_SIZE);
+   ASSERT(!iop || atomic_read(&iop->write_count) > 0);
 
-   if (!busy)
+   if (!iop || atomic_dec_and_test(&iop->write_count))
end_page_writeback(bvec->bv_page);
 }
 
@@ -191,12 +119,8 @@ xfs_destroy_ioend(
next = bio->bi_private;
 
/* walk each page on bio, ending page IO on them */
-   bio_for_each_segment_all(bvec, bio, i) {
-   if (page_has_buffers(bvec->bv_page))
-   xfs_finish_buffer_writeback(inode, bvec, error);
-   else
-   xfs_finish_page_writeback(inode, bvec, error);
-   }
+   bio_for_each_segment_all(bvec, bio, i)
+   xfs_finish_page_writeback(inode, bvec, error);

Re: [PATCH v2 01/26] rculist: introduce list_next_or_null_rr_rcu()

2018-05-18 Thread Linus Torvalds
On Fri, May 18, 2018 at 6:07 AM Roman Pen wrote:

> Function is going to be used in transport over RDMA module
> in subsequent patches.

Does this really merit its own helper macro in a generic header?

It honestly smells more like "just have an inline helper function that is
specific to rdma" to me. Particularly since it's probably just one specific
list where you want this oddly specific behavior.

Also, if we really want a round-robin list traversal macro, this isn't the
way it should be implemented, I suspect, and it probably shouldn't be
RCU-specific to begin with.
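
(For context, the helper under discussion looks roughly like the
following in Roman's patch; a sketch of the proposed macro, not an
endorsement of it:)

	/*
	 * Round-robin next: return the element after @ptr, wrapping
	 * around past the list head; NULL only if the list is empty.
	 */
	#define list_next_or_null_rr_rcu(head, ptr, type, memb)		\
	({								\
		list_next_or_null_rcu(head, ptr, type, memb) ?:		\
			list_next_or_null_rcu(head, READ_ONCE((ptr)->next), \
					      type, memb);		\
	})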

Side note: I notice that I should already have been more critical of even
the much simpler "list_next_or_null_rcu()" macro. The "documentation"
comment above the macro is pure and utter cut-and-paste garbage.

Paul, mind giving this a look?

 Linus


Re: [PATCH 01/34] block: add a lower-level bio_add_page interface

2018-05-18 Thread Jens Axboe
On 5/18/18 10:47 AM, Christoph Hellwig wrote:
> For the upcoming removal of buffer heads in XFS we need to keep track of
> the number of outstanding writeback requests per page.  For this we need
> to know if bio_add_page merged a region with the previous bvec or not.
> Instead of adding additional arguments this refactors bio_add_page to
> be implemented using three lower level helpers which users like XFS can
> use directly if they care about the merge decisions.
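
Sketched from that description, bio_add_page() ends up as a thin wrapper
over the two new lower-level helpers (a simplified illustration of the
patch, not the verbatim code):

	int bio_add_page(struct bio *bio, struct page *page,
			 unsigned int len, unsigned int offset)
	{
		if (!__bio_try_merge_page(bio, page, len, offset)) {
			if (bio_full(bio))
				return 0;	/* no room: caller allocates a new bio */
			__bio_add_page(bio, page, len, offset);
		}
		return len;
	}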

Reviewed-by: Jens Axboe 

-- 
Jens Axboe



[PATCH 34/34] xfs: allow writeback on pages without buffer heads

2018-05-18 Thread Christoph Hellwig
Disable the IOMAP_F_BUFFER_HEAD flag on file systems with a block size
equal to the page size, and deal with pages without buffer heads in
writeback.  Thanks to the previous refactoring this is basically trivial
now.
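
(The xfs_iomap.c hunk is cut off at the end of this message; per the
description above it amounts to gating the flag on the block size; a
sketch under that assumption:)

	/* sketch: only request buffer heads for sub-page block sizes */
	if (i_blocksize(inode) < PAGE_SIZE)
		iomap->flags |= IOMAP_F_BUFFER_HEAD;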

Signed-off-by: Christoph Hellwig 
---
 fs/xfs/xfs_aops.c  | 47 +-
 fs/xfs/xfs_iomap.c |  3 ++-
 2 files changed, 36 insertions(+), 14 deletions(-)

diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 492f4a4b1deb..efa2cbb27d67 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -91,6 +91,19 @@ xfs_find_daxdev_for_inode(
return mp->m_ddev_targp->bt_daxdev;
 }
 
+static void
+xfs_finish_page_writeback(
+   struct inode    *inode,
+   struct bio_vec  *bvec,
+   int error)
+{
+   if (error) {
+   SetPageError(bvec->bv_page);
+   mapping_set_error(inode->i_mapping, -EIO);
+   }
+   end_page_writeback(bvec->bv_page);
+}
+
 /*
  * We're now finished for good with this page.  Update the page state via the
  * associated buffer_heads, paying attention to the start and end offsets that
@@ -103,7 +116,7 @@ xfs_find_daxdev_for_inode(
  * and buffers potentially freed after every call to end_buffer_async_write.
  */
 static void
-xfs_finish_page_writeback(
+xfs_finish_buffer_writeback(
struct inode    *inode,
struct bio_vec  *bvec,
int error)
@@ -178,9 +191,12 @@ xfs_destroy_ioend(
next = bio->bi_private;
 
/* walk each page on bio, ending page IO on them */
-   bio_for_each_segment_all(bvec, bio, i)
-   xfs_finish_page_writeback(inode, bvec, error);
-
+   bio_for_each_segment_all(bvec, bio, i) {
+   if (page_has_buffers(bvec->bv_page))
+   xfs_finish_buffer_writeback(inode, bvec, error);
+   else
+   xfs_finish_page_writeback(inode, bvec, error);
+   }
bio_put(bio);
}
 
@@ -792,13 +808,16 @@ xfs_writepage_map(
 {
LIST_HEAD(submit_list);
struct xfs_ioend    *ioend, *next;
-   struct buffer_head  *bh;
+   struct buffer_head  *bh = NULL;
ssize_t len = i_blocksize(inode);
int error = 0;
int count = 0;
loff_t  file_offset;    /* file offset of page */
unsigned    poffset;    /* offset into page */
 
+   if (page_has_buffers(page))
+   bh = page_buffers(page);
+
/*
 * Walk the blocks on the page, and when we run off the end of the
 * current map or find the current map invalid, grab a new one.
@@ -807,11 +826,9 @@ xfs_writepage_map(
 * replace the bufferhead with some other state tracking mechanism in
 * future.
 */
-   file_offset = page_offset(page);
-   bh = page_buffers(page);
-   for (poffset = 0;
+   for (poffset = 0, file_offset = page_offset(page);
 poffset < PAGE_SIZE;
-poffset += len, file_offset += len, bh = bh->b_this_page) {
+poffset += len, file_offset += len) {
/* past the range we are writing, so nothing more to write. */
if (file_offset >= end_offset)
break;
@@ -819,9 +836,10 @@ xfs_writepage_map(
/*
 * Block does not contain valid data, skip it.
 */
-   if (!buffer_uptodate(bh)) {
+   if (bh && !buffer_uptodate(bh)) {
if (PageUptodate(page))
ASSERT(buffer_mapped(bh));
+   bh = bh->b_this_page;
continue;
}
 
@@ -846,10 +864,15 @@ xfs_writepage_map(
 * meaningless for holes (!mapped && uptodate), so check we did
 * have a buffer covering a hole here and continue.
 */
+   if (bh)
+   bh = bh->b_this_page;
continue;
}
 
-   xfs_map_at_offset(inode, bh, &wpc->imap, file_offset);
+   if (bh) {
+   xfs_map_at_offset(inode, bh, &wpc->imap, file_offset);
+   bh = bh->b_this_page;
+   }
xfs_add_to_ioend(inode, file_offset, page, wpc, wbc,
&submit_list);
count++;
@@ -949,8 +972,6 @@ xfs_do_writepage(
 
trace_xfs_writepage(inode, page, 0, 0);
 
-   ASSERT(page_has_buffers(page));
-
/*
 * Refuse to write the page out if we are called from reclaim context.
 *
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index f949f0dd7382..93c40da33

[PATCH 33/34] xfs: do not set the page uptodate in xfs_writepage_map

2018-05-18 Thread Christoph Hellwig
We already track the page uptodate status based on the buffer uptodate
status, which is updated whenever reading or zeroing blocks.

This code has been there since a ptool commit in 2002, which
claims to:

"merge" the 2.4 fsx fix for block size < page size to 2.5.  This needed
major changes to actually fit.

and isn't present in other writepage implementations.

Signed-off-by: Christoph Hellwig 
---
 fs/xfs/xfs_aops.c | 5 -
 1 file changed, 5 deletions(-)

diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index a4e53e0a57c2..492f4a4b1deb 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -796,7 +796,6 @@ xfs_writepage_map(
ssize_t len = i_blocksize(inode);
int error = 0;
int count = 0;
-   bool            uptodate = true;
loff_t  file_offset;    /* file offset of page */
unsigned    poffset;    /* offset into page */
 
@@ -823,7 +822,6 @@ xfs_writepage_map(
if (!buffer_uptodate(bh)) {
if (PageUptodate(page))
ASSERT(buffer_mapped(bh));
-   uptodate = false;
continue;
}
 
@@ -857,9 +855,6 @@ xfs_writepage_map(
count++;
}
 
-   if (uptodate && poffset == PAGE_SIZE)
-   SetPageUptodate(page);
-
ASSERT(wpc->ioend || list_empty(&submit_list));
 
 out:
-- 
2.17.0



[PATCH 32/34] xfs: refactor the tail of xfs_writepage_map

2018-05-18 Thread Christoph Hellwig
Rejuggle how we deal with the error vs non-error and have-ioends vs
no-ioends cases to keep the fast path streamlined and the duplicate
code to a minimum.

Signed-off-by: Christoph Hellwig 
---
 fs/xfs/xfs_aops.c | 65 +++
 1 file changed, 32 insertions(+), 33 deletions(-)

diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index dd92d99df51f..a4e53e0a57c2 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -883,7 +883,14 @@ xfs_writepage_map(
 * submission of outstanding ioends on the writepage context so they are
 * treated correctly on error.
 */
-   if (count) {
+   if (unlikely(error)) {
+   if (!count) {
+   xfs_aops_discard_page(page);
+   ClearPageUptodate(page);
+   unlock_page(page);
+   goto done;
+   }
+
/*
 * If the page was not fully cleaned, we need to ensure that the
 * higher layers come back to it correctly.  That means we need
@@ -892,43 +899,35 @@ xfs_writepage_map(
 * so another attempt to write this page in this writeback sweep
 * will be made.
 */
-   if (error) {
-   set_page_writeback_keepwrite(page);
-   } else {
-   clear_page_dirty_for_io(page);
-   set_page_writeback(page);
-   }
-   unlock_page(page);
-
-   /*
-* Preserve the original error if there was one, otherwise catch
-* submission errors here and propagate into subsequent ioend
-* submissions.
-*/
-   list_for_each_entry_safe(ioend, next, &submit_list, io_list) {
-   int error2;
-
-   list_del_init(&ioend->io_list);
-   error2 = xfs_submit_ioend(wbc, ioend, error);
-   if (error2 && !error)
-   error = error2;
-   }
-   } else if (error) {
-   xfs_aops_discard_page(page);
-   ClearPageUptodate(page);
-   unlock_page(page);
+   set_page_writeback_keepwrite(page);
} else {
-   /*
-* We can end up here with no error and nothing to write if we
-* race with a partial page truncate on a sub-page block sized
-* filesystem. In that case we need to mark the page clean.
-*/
clear_page_dirty_for_io(page);
set_page_writeback(page);
-   unlock_page(page);
-   end_page_writeback(page);
}
 
+   unlock_page(page);
+
+   /*
+* Preserve the original error if there was one, otherwise catch
+* submission errors here and propagate into subsequent ioend
+* submissions.
+*/
+   list_for_each_entry_safe(ioend, next, &submit_list, io_list) {
+   int error2;
+
+   list_del_init(&ioend->io_list);
+   error2 = xfs_submit_ioend(wbc, ioend, error);
+   if (error2 && !error)
+   error = error2;
+   }
+
+   /*
+* We can end up here with no error and nothing to write if we race with
+* a partial page truncate on a sub-page block sized filesystem.
+*/
+   if (!count)
+   end_page_writeback(page);
+done:
mapping_set_error(page->mapping, error);
return error;
 }
-- 
2.17.0


