Re: [PATCH 08/28] ibtrs_clt: add Makefile and Kconfig
Hi Jack, [auto build test WARNING on linus/master] [also build test WARNING on v4.11-rc3 next-20170324] [if your patch is applied to the wrong git tree, please drop us a note to help improve the system] url: https://github.com/0day-ci/linux/commits/Jack-Wang/INFINIBAND-NETWORK-BLOCK-DEVICE-IBNBD/20170325-101629 config: i386-allmodconfig (attached as .config) compiler: gcc-6 (Debian 6.2.0-3) 6.2.0 20160901 reproduce: # save the attached .config to linux build tree make ARCH=i386 All warnings (new ones prefixed by >>): In file included from include/linux/printk.h:329:0, from include/linux/kernel.h:13, from include/linux/list.h:8, from include/linux/module.h:9, from drivers/infiniband/ulp/ibtrs_client/ibtrs_clt.c:47: drivers/infiniband/ulp/ibtrs_client/ibtrs_clt.c: In function 'process_open_rsp': >> drivers/infiniband/ulp/ibtrs_client/ibtrs_clt.c:859:7: warning: cast to >> pointer from integer of different size [-Wint-to-pointer-cast] (void *)sess->srv_rdma_addr[i], ^ include/linux/dynamic_debug.h:127:10: note: in definition of macro 'dynamic_pr_debug' ##__VA_ARGS__); \ ^~~ >> include/rdma/ibtrs_log.h:51:23: note: in expansion of macro 'pr_debug' #define DEB(fmt, ...) pr_debug("ibtrs L%d " fmt, __LINE__, ##__VA_ARGS__) ^~~~ >> drivers/infiniband/ulp/ibtrs_client/ibtrs_clt.c:857:3: note: in expansion of >> macro 'DEB' DEB("Adding contiguous buffer %d, size %u, addr: 0x%p," ^~~ In file included from include/linux/kernel.h:13:0, from include/linux/list.h:8, from include/linux/module.h:9, from drivers/infiniband/ulp/ibtrs_client/ibtrs_clt.c:47: drivers/infiniband/ulp/ibtrs_client/ibtrs_clt.c: In function 'ibtrs_map_desc': >> include/rdma/ibtrs_log.h:51:32: warning: format '%llu' expects argument of >> type 'long long unsigned int', but argument 4 has type 'dma_addr_t {aka >> unsigned int}' [-Wformat=] #define DEB(fmt, ...) 
pr_debug("ibtrs L%d " fmt, __LINE__, ##__VA_ARGS__) ^ include/linux/printk.h:285:21: note: in definition of macro 'pr_fmt' #define pr_fmt(fmt) fmt ^~~ include/linux/printk.h:333:2: note: in expansion of macro 'dynamic_pr_debug' dynamic_pr_debug(fmt, ##__VA_ARGS__) ^~~~ >> include/rdma/ibtrs_log.h:51:23: note: in expansion of macro 'pr_debug' #define DEB(fmt, ...) pr_debug("ibtrs L%d " fmt, __LINE__, ##__VA_ARGS__) ^~~~ drivers/infiniband/ulp/ibtrs_client/ibtrs_clt.c:1106:2: note: in expansion of macro 'DEB' DEB("dma_addr %llu, key %u, dma_len %u\n", dma_addr, rkey, dma_len); ^~~ drivers/infiniband/ulp/ibtrs_client/ibtrs_clt.c: In function 'ibtrs_post_send_rdma': >> drivers/infiniband/ulp/ibtrs_client/ibtrs_clt.c:1440:23: warning: cast from >> pointer to integer of different size [-Wpointer-to-int-cast] addr + off, (u64)req->iu, imm, ^ drivers/infiniband/ulp/ibtrs_client/ibtrs_clt.c: In function 'ibtrs_post_send_rdma_desc': drivers/infiniband/ulp/ibtrs_client/ibtrs_clt.c:1565:17: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast] addr, (u64)req->iu, imm, ^ drivers/infiniband/ulp/ibtrs_client/ibtrs_clt.c: In function 'process_err_wc': drivers/infiniband/ulp/ibtrs_client/ibtrs_clt.c:1882:7: warning: cast to pointer from integer of different size [-Wint-to-pointer-cast] iu = (struct ibtrs_iu *)wc->wr_id; ^ drivers/infiniband/ulp/ibtrs_client/ibtrs_clt.c: In function 'process_wcs': drivers/infiniband/ulp/ibtrs_client/ibtrs_clt.c:1922:8: warning: cast to pointer from integer of different size [-Wint-to-pointer-cast] iu = (struct ibtrs_iu *)wc.wr_id; ^ -- In file included from include/linux/printk.h:6:0, from drivers/infiniband/ulp/ibtrs_client/../ibtrs_lib/ibtrs-proto.c:48: drivers/infiniband/ulp/ibtrs_client/../ibtrs_lib/ibtrs-proto.c: In function 'ibtrs_validate_msg_sess_open_resp': include/linux/kern_levels.h:4:18: warning: format '%lu' expects argument of type 'long unsigned int', but argument 4 has type 'unsigned int' [-Wformat=] #define 
KERN_SOH "\001" /* ASCII Start Of Header */ ^ include/linux/kern_levels.h:10:18: note: in expansion of macro 'KERN_SOH' #define KERN_ERR KERN_SOH "3" /* error conditions */ ^~~~ include/linux/printk.h:301:9: note: in expansion of macro 'KERN_ERR' printk(KERN_ERR pr_fmt(fmt), ##__VA_ARGS
Re: [PATCH] block: constify struct blk_integrity_profile
On 03/24/2017 07:03 PM, Eric Biggers wrote: > From: Eric Biggers > > blk_integrity_profile's are never modified, so mark them 'const' so that > they are placed in .rodata and benefit from memory protection. Thanks, that's a nice change. Applied for 4.12. -- Jens Axboe
[PATCH] block: constify struct blk_integrity_profile
From: Eric Biggersblk_integrity_profile's are never modified, so mark them 'const' so that they are placed in .rodata and benefit from memory protection. Signed-off-by: Eric Biggers --- block/blk-integrity.c | 2 +- block/t10-pi.c | 8 include/linux/genhd.h | 10 +- include/linux/t10-pi.h | 8 4 files changed, 14 insertions(+), 14 deletions(-) diff --git a/block/blk-integrity.c b/block/blk-integrity.c index 9f0ff5ba4f84..b3622cb00fc2 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -389,7 +389,7 @@ static int blk_integrity_nop_fn(struct blk_integrity_iter *iter) return 0; } -static struct blk_integrity_profile nop_profile = { +static const struct blk_integrity_profile nop_profile = { .name = "nop", .generate_fn = blk_integrity_nop_fn, .verify_fn = blk_integrity_nop_fn, diff --git a/block/t10-pi.c b/block/t10-pi.c index 2c97912335a9..680c6d636298 100644 --- a/block/t10-pi.c +++ b/block/t10-pi.c @@ -160,28 +160,28 @@ static int t10_pi_type3_verify_ip(struct blk_integrity_iter *iter) return t10_pi_verify(iter, t10_pi_ip_fn, 3); } -struct blk_integrity_profile t10_pi_type1_crc = { +const struct blk_integrity_profile t10_pi_type1_crc = { .name = "T10-DIF-TYPE1-CRC", .generate_fn= t10_pi_type1_generate_crc, .verify_fn = t10_pi_type1_verify_crc, }; EXPORT_SYMBOL(t10_pi_type1_crc); -struct blk_integrity_profile t10_pi_type1_ip = { +const struct blk_integrity_profile t10_pi_type1_ip = { .name = "T10-DIF-TYPE1-IP", .generate_fn= t10_pi_type1_generate_ip, .verify_fn = t10_pi_type1_verify_ip, }; EXPORT_SYMBOL(t10_pi_type1_ip); -struct blk_integrity_profile t10_pi_type3_crc = { +const struct blk_integrity_profile t10_pi_type3_crc = { .name = "T10-DIF-TYPE3-CRC", .generate_fn= t10_pi_type3_generate_crc, .verify_fn = t10_pi_type3_verify_crc, }; EXPORT_SYMBOL(t10_pi_type3_crc); -struct blk_integrity_profile t10_pi_type3_ip = { +const struct blk_integrity_profile t10_pi_type3_ip = { .name = "T10-DIF-TYPE3-IP", .generate_fn= t10_pi_type3_generate_ip, .verify_fn = 
t10_pi_type3_verify_ip, diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 76f39754e7b0..9e11082c7f9b 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -159,11 +159,11 @@ struct badblocks; #if defined(CONFIG_BLK_DEV_INTEGRITY) struct blk_integrity { - struct blk_integrity_profile*profile; - unsigned char flags; - unsigned char tuple_size; - unsigned char interval_exp; - unsigned char tag_size; + const struct blk_integrity_profile *profile; + unsigned char flags; + unsigned char tuple_size; + unsigned char interval_exp; + unsigned char tag_size; }; #endif /* CONFIG_BLK_DEV_INTEGRITY */ diff --git a/include/linux/t10-pi.h b/include/linux/t10-pi.h index 9fba9dd33544..9375d23a24e7 100644 --- a/include/linux/t10-pi.h +++ b/include/linux/t10-pi.h @@ -34,9 +34,9 @@ struct t10_pi_tuple { }; -extern struct blk_integrity_profile t10_pi_type1_crc; -extern struct blk_integrity_profile t10_pi_type1_ip; -extern struct blk_integrity_profile t10_pi_type3_crc; -extern struct blk_integrity_profile t10_pi_type3_ip; +extern const struct blk_integrity_profile t10_pi_type1_crc; +extern const struct blk_integrity_profile t10_pi_type1_ip; +extern const struct blk_integrity_profile t10_pi_type3_crc; +extern const struct blk_integrity_profile t10_pi_type3_ip; #endif -- 2.12.1.578.ge9c3154ca4-goog
[PATCH] blkcg: allocate struct blkcg_gq outside request queue spinlock
blkg_conf_prep() currently calls blkg_lookup_create() while holding the request queue spinlock. This means allocating memory for struct blkcg_gq has to be made non-blocking. This causes occasional -ENOMEM failures in call paths like below: pcpu_alloc+0x68f/0x710 __alloc_percpu_gfp+0xd/0x10 __percpu_counter_init+0x55/0xc0 cfq_pd_alloc+0x3b2/0x4e0 blkg_alloc+0x187/0x230 blkg_create+0x489/0x670 blkg_lookup_create+0x9a/0x230 blkg_conf_prep+0x1fb/0x240 __cfqg_set_weight_device.isra.105+0x5c/0x180 cfq_set_weight_on_dfl+0x69/0xc0 cgroup_file_write+0x39/0x1c0 kernfs_fop_write+0x13f/0x1d0 __vfs_write+0x23/0x120 vfs_write+0xc2/0x1f0 SyS_write+0x44/0xb0 entry_SYSCALL_64_fastpath+0x18/0xad In the code path above, the percpu allocator cannot call vmalloc() due to the queue spinlock. A failure in this call path gives grief to tools which are trying to configure io weights. We see occasional failures happen shortly after reboots even when the system is not under any memory pressure. Machines with a lot of cpus are more vulnerable to this condition. Do struct blkcg_gq allocations outside the queue spinlock to allow blocking during memory allocations. Suggested-by: Tejun Heo Signed-off-by: Tahsin Erdogan --- v6: Due to Jens' objection to conditionally dropping locks based on gfp flags, go back to v1 approach. Perform queue bypass and policy enabled checks at every iteration. Add blkg_lookup_check() to reduce code duplication. v5: Removed stale blkg_alloc() in blkcg_init_queue() Pushed down radix_tree_preload() into blkg_create() because it disables preemption on return and makes it unsafe to call blocking memory allocations. v4: Simplified error checking in blkg_create() Factored out __blkg_lookup_create() v3: Pushed down all blkg allocations into blkg_create() v2: Moved blkg creation into blkg_lookup_create() to avoid duplicating blkg_lookup_create() logic. 
block/blk-cgroup.c | 123 ++--- 1 file changed, 98 insertions(+), 25 deletions(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index bbe7ee00bd3d..7c2947128f58 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -772,6 +772,27 @@ struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkcg_gq *blkg, } EXPORT_SYMBOL_GPL(blkg_rwstat_recursive_sum); +/* Performs queue bypass and policy enabled checks then looks up blkg. */ +static struct blkcg_gq *blkg_lookup_check(struct blkcg *blkcg, + const struct blkcg_policy *pol, + struct request_queue *q) +{ + WARN_ON_ONCE(!rcu_read_lock_held()); + lockdep_assert_held(q->queue_lock); + + if (!blkcg_policy_enabled(q, pol)) + return ERR_PTR(-EOPNOTSUPP); + + /* +* This could be the first entry point of blkcg implementation and +* we shouldn't allow anything to go through for a bypassing queue. +*/ + if (unlikely(blk_queue_bypass(q))) + return ERR_PTR(blk_queue_dying(q) ? -ENODEV : -EBUSY); + + return __blkg_lookup(blkcg, q, true /* update_hint */); +} + /** * blkg_conf_prep - parse and prepare for per-blkg config update * @blkcg: target block cgroup @@ -789,6 +810,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, __acquires(rcu) __acquires(disk->queue->queue_lock) { struct gendisk *disk; + struct request_queue *q; struct blkcg_gq *blkg; struct module *owner; unsigned int major, minor; @@ -807,44 +829,95 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, if (!disk) return -ENODEV; if (part) { - owner = disk->fops->owner; - put_disk(disk); - module_put(owner); - return -ENODEV; + ret = -ENODEV; + goto fail; } - rcu_read_lock(); - spin_lock_irq(disk->queue->queue_lock); + q = disk->queue; - if (blkcg_policy_enabled(disk->queue, pol)) - blkg = blkg_lookup_create(blkcg, disk->queue); - else - blkg = ERR_PTR(-EOPNOTSUPP); + rcu_read_lock(); + spin_lock_irq(q->queue_lock); + blkg = blkg_lookup_check(blkcg, pol, q); if (IS_ERR(blkg)) { ret = PTR_ERR(blkg); + goto fail_unlock; 
+ } + + if (blkg) + goto success; + + /* +* Create blkgs walking down from blkcg_root to @blkcg, so that all +* non-root blkgs have access to their parents. +*/ + while (true) { + struct blkcg *pos = blkcg; + struct blkcg *parent; + struct blkcg_gq *new_blkg; + + parent = blkcg_parent(blkcg); + while (parent && !__blkg_lookup(parent, q, false)) { + pos =
Re: [PATCH] block: correct documentation for blkdev_issue_discard() flags
On 03/24/2017 03:39 PM, Eric Biggers wrote: > On Mon, Jan 23, 2017 at 11:41:39AM -0800, Eric Biggers wrote: >> From: Eric Biggers >> >> BLKDEV_IFL_* flags no longer exist; blkdev_issue_discard() now actually >> takes BLKDEV_DISCARD_* flags. >> >> Signed-off-by: Eric Biggers >> --- >> block/blk-lib.c | 2 +- >> 1 file changed, 1 insertion(+), 1 deletion(-) >> >> diff --git a/block/blk-lib.c b/block/blk-lib.c >> index ed89c8f4b2a0..463b76dd566f 100644 >> --- a/block/blk-lib.c >> +++ b/block/blk-lib.c >> @@ -109,7 +109,7 @@ EXPORT_SYMBOL(__blkdev_issue_discard); >> * @sector: start sector >> * @nr_sects: number of sectors to discard >> * @gfp_mask: memory allocation flags (for bio_alloc) >> - * @flags: BLKDEV_IFL_* flags to control behaviour >> + * @flags: BLKDEV_DISCARD_* flags to control behaviour >> * >> * Description: >> *Issue a discard request for the sectors in question. >> -- >> 2.11.0.483.g087da7b7c-goog >> > > Ping? Sorry, looks like that got lost. I've applied this, and your other patch. Thanks for the reminder! -- Jens Axboe
Re: [PATCH] block: correct documentation for blkdev_issue_discard() flags
On Mon, Jan 23, 2017 at 11:41:39AM -0800, Eric Biggers wrote: > From: Eric Biggers > > BLKDEV_IFL_* flags no longer exist; blkdev_issue_discard() now actually > takes BLKDEV_DISCARD_* flags. > > Signed-off-by: Eric Biggers > --- > block/blk-lib.c | 2 +- > 1 file changed, 1 insertion(+), 1 deletion(-) > > diff --git a/block/blk-lib.c b/block/blk-lib.c > index ed89c8f4b2a0..463b76dd566f 100644 > --- a/block/blk-lib.c > +++ b/block/blk-lib.c > @@ -109,7 +109,7 @@ EXPORT_SYMBOL(__blkdev_issue_discard); > * @sector: start sector > * @nr_sects:number of sectors to discard > * @gfp_mask:memory allocation flags (for bio_alloc) > - * @flags: BLKDEV_IFL_* flags to control behaviour > + * @flags: BLKDEV_DISCARD_* flags to control behaviour > * > * Description: > *Issue a discard request for the sectors in question. > -- > 2.11.0.483.g087da7b7c-goog > Ping?
Re: [PATCH 0/4] nbd fixes for this cycle
On 03/24/2017 12:08 PM, Josef Bacik wrote: > These 4 patches are to fix up various regressions and problems in NBD. The > ERESTARTSYS is the biggest patch but has been pretty well tested with a debug > patch that forced the behavior to happen. Everything else is relatively > small, > and the queue timeout patch is a regression from last cycle. Thanks, Added for 4.11. -- Jens Axboe
Re: [PATCH v2 2/4] block: add a read barrier in blk_queue_enter()
On Sat, 2017-03-25 at 01:38 +0800, Ming Lei wrote: > As I explained, the dying flag should only be mentioned after we change > the code in blk_set_queue_dying(). Hello Ming, If patches 2 and 4 were combined into a single patch, it would no longer be necessary for patch 4 to update the comment introduced in patch 2. I think that would make this patch series easier to review. Since the issues fixed by your patches are longstanding issues, have you considered adding a "Cc: stable" tag? Thanks, Bart.
Re: [PATCH] blk-mq: include errors in did_work calculation
On Fri, 2017-03-24 at 11:39 -0600, Jens Axboe wrote: > Currently we return true in blk_mq_dispatch_rq_list() if we queued IO > successfully, but we really want to return whether or not we made > progress. Progress includes if we got an error return. If we don't, > this can lead to a hang in blk_mq_sched_dispatch_requests() when a > driver is draining IO by returning BLK_MQ_RQ_QUEUE_ERROR instead of > manually ending the IO in error and returning BLK_MQ_RQ_QUEUE_OK. Reviewed-by: Bart Van Assche
Re: [PATCH] block: remove bio_clone_bioset_partial()
On 03/24/2017 11:55 AM, Shaohua Li wrote: > commit c18a1e0 (block: introduce bio_clone_bioset_partial()) introduced > bio_clone_bioset_partial() for raid1 write behind IO. Now the write behind is > rewritten by Ming. We don't need the API any more, so revert the commit. > > Jens, > this depends on Ming's patches, so it would be great if I could put this in the md branch. Looks fine to me, feel free to do so. Reviewed-by: Jens Axboe -- Jens Axboe
[PATCH 2/4] nbd: set rq->errors to actual error code
From: Josef Bacik We've been relying on the block layer to assume rq->errors being set translates into -EIO. I noticed in testing that sometimes this isn't true, and really there's not much of a reason to have a counter instead of just using -EIO. So set it properly so we don't leak random numbers to unsuspecting victims. Signed-off-by: Josef Bacik --- drivers/block/nbd.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 3d1fc37a..dbc22f4 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -192,7 +192,7 @@ static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req, dev_err(nbd_to_dev(nbd), "Connection timed out, shutting down connection\n"); set_bit(NBD_TIMEDOUT, &nbd->runtime_flags); - req->errors++; + req->errors = -EIO; mutex_lock(&nbd->config_lock); sock_shutdown(nbd); @@ -432,7 +432,7 @@ static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd, int index) if (ntohl(reply.error)) { dev_err(disk_to_dev(nbd->disk), "Other side returned error (%d)\n", ntohl(reply.error)); - req->errors++; + req->errors = -EIO; return cmd; } @@ -448,7 +448,7 @@ static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd, int index) if (result <= 0) { dev_err(disk_to_dev(nbd->disk), "Receive data failed (result %d)\n", result); - req->errors++; + req->errors = -EIO; return cmd; } dev_dbg(nbd_to_dev(nbd), "request %p: got %d bytes data\n", @@ -518,7 +518,7 @@ static void nbd_clear_req(struct request *req, void *data, bool reserved) if (!blk_mq_request_started(req)) return; cmd = blk_mq_rq_to_pdu(req); - req->errors++; + req->errors = -EIO; nbd_end_request(cmd); } -- 2.7.4
[PATCH 4/4] nbd: replace kill_bdev() with __invalidate_device()
From: Ratna Manoj Bolla When a filesystem is mounted on a nbd device and on a disconnect, because of kill_bdev(), and resetting bdev size to zero, buffer_head mappings are getting destroyed under mounted filesystem. After a bdev size reset(i.e bdev->bd_inode->i_size = 0) on a disconnect, followed by a sys_umount(), generic_shutdown_super()->... ->__sync_blockdev()->... -blkdev_writepages()->... ->do_invalidatepage()->... -discard_buffer() is discarding superblock buffer_head assumed to be in mapped state by ext4_commit_super(). [mlin: ported to 4.11-rc2] Signed-off-by: Ratna Manoj Bolla --- drivers/block/nbd.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index b0003da..d8a2356 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -126,7 +126,8 @@ static const char *nbdcmd_to_ascii(int cmd) static int nbd_size_clear(struct nbd_device *nbd, struct block_device *bdev) { - bd_set_size(bdev, 0); + if (bdev->bd_openers <= 1) + bd_set_size(bdev, 0); set_capacity(nbd->disk, 0); kobject_uevent(&nbd_to_dev(nbd)->kobj, KOBJ_CHANGE); @@ -665,6 +666,8 @@ static void nbd_reset(struct nbd_device *nbd) static void nbd_bdev_reset(struct block_device *bdev) { + if (bdev->bd_openers > 1) + return; set_device_ro(bdev, false); bdev->bd_inode->i_size = 0; if (max_part > 0) { @@ -728,7 +731,8 @@ static int nbd_clear_sock(struct nbd_device *nbd, struct block_device *bdev) { sock_shutdown(nbd); nbd_clear_que(nbd); - kill_bdev(bdev); + + __invalidate_device(bdev, true); nbd_bdev_reset(bdev); /* * We want to give the run thread a chance to wait for everybody -- 2.7.4
Re: [PATCH] blk-mq: include errors in did_work calculation
On Fri, Mar 24, 2017 at 11:39:10AM -0600, Jens Axboe wrote: > Currently we return true in blk_mq_dispatch_rq_list() if we queued IO > successfully, but we really want to return whether or not we made > progress. Progress includes if we got an error return. If we don't, > this can lead to a hang in blk_mq_sched_dispatch_requests() when a > driver is draining IO by returning BLK_MQ_RQ_QUEUE_ERROR instead of > manually ending the IO in error and returning BLK_MQ_RQ_QUEUE_OK. > > Signed-off-by: Jens Axboe Reviewed-by: Omar Sandoval
Re: [PATCH] blk-mq: include errors in did_work calculation
> On Mar 24, 2017, at 1:39 PM, Jens Axboe wrote: > > Currently we return true in blk_mq_dispatch_rq_list() if we queued IO > successfully, but we really want to return whether or not we made > progress. Progress includes if we got an error return. If we don't, > this can lead to a hang in blk_mq_sched_dispatch_requests() when a > driver is draining IO by returning BLK_MQ_RQ_QUEUE_ERROR instead of > manually ending the IO in error and returning BLK_MQ_RQ_QUEUE_OK. > > Signed-off-by: Jens Axboe > > diff --git a/block/blk-mq.c b/block/blk-mq.c > index a4546f060e80..e3b09abf9d5b 100644 > --- a/block/blk-mq.c > +++ b/block/blk-mq.c > @@ -978,7 +978,7 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, > struct list_head *list) > struct request *rq; > LIST_HEAD(driver_list); > struct list_head *dptr; > - int queued, ret = BLK_MQ_RQ_QUEUE_OK; > + int errors, queued, ret = BLK_MQ_RQ_QUEUE_OK; > > /* >* Start off with dptr being NULL, so we start the first request > @@ -989,7 +989,7 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, > struct list_head *list) > /* >* Now process all the entries, sending them to the driver. >*/ > - queued = 0; > + errors = queued = 0; > while (!list_empty(list)) { > struct blk_mq_queue_data bd; > > @@ -1046,6 +1046,7 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx > *hctx, struct list_head *list) > default: > pr_err("blk-mq: bad return on queue: %d\n", ret); > case BLK_MQ_RQ_QUEUE_ERROR: > + errors++; > rq->errors = -EIO; > blk_mq_end_request(rq, rq->errors); > break; > @@ -1097,7 +1098,7 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx > *hctx, struct list_head *list) > blk_mq_run_hw_queue(hctx, true); > } > > - return queued != 0; > + return (queued + errors) != 0; > } > > static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) > Thanks, this fixed it; you can add Tested-by: Josef Bacik Thanks, Josef
[PATCH] block: remove bio_clone_bioset_partial()
commit c18a1e0 (block: introduce bio_clone_bioset_partial()) introduced bio_clone_bioset_partial() for raid1 write behind IO. Now the write behind is rewritten by Ming. We don't need the API any more, so revert the commit. Jens, this depends on Ming's patches, so it would be great if I could put this in the md branch. Cc: Christoph Hellwig Cc: Jens Axboe Cc: Ming Lei Signed-off-by: Shaohua Li --- block/bio.c | 61 - include/linux/bio.h | 11 ++ 2 files changed, 15 insertions(+), 57 deletions(-) diff --git a/block/bio.c b/block/bio.c index 1ccff0d..0364359 100644 --- a/block/bio.c +++ b/block/bio.c @@ -631,20 +631,21 @@ struct bio *bio_clone_fast(struct bio *bio, gfp_t gfp_mask, struct bio_set *bs) } EXPORT_SYMBOL(bio_clone_fast); -static struct bio *__bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask, - struct bio_set *bs, int offset, - int size) +/** + * bio_clone_bioset - clone a bio + * @bio_src: bio to clone + * @gfp_mask: allocation priority + * @bs: bio_set to allocate from + * + * Clone bio. Caller will own the returned bio, but not the actual data it + * points to. Reference count of returned bio will be one. + */ +struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask, +struct bio_set *bs) { struct bvec_iter iter; struct bio_vec bv; struct bio *bio; - struct bvec_iter iter_src = bio_src->bi_iter; - - /* for supporting partial clone */ - if (offset || size != bio_src->bi_iter.bi_size) { - bio_advance_iter(bio_src, &iter_src, offset); - iter_src.bi_size = size; - } /* * Pre immutable biovecs, __bio_clone() used to just do a memcpy from @@ -668,8 +669,7 @@ static struct bio *__bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask, *__bio_clone_fast() anyways. 
*/ - bio = bio_alloc_bioset(gfp_mask, __bio_segments(bio_src, - &iter_src), bs); + bio = bio_alloc_bioset(gfp_mask, bio_segments(bio_src), bs); if (!bio) return NULL; bio->bi_bdev= bio_src->bi_bdev; @@ -686,7 +686,7 @@ static struct bio *__bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask, bio->bi_io_vec[bio->bi_vcnt++] = bio_src->bi_io_vec[0]; break; default: - __bio_for_each_segment(bv, bio_src, iter, iter_src) + bio_for_each_segment(bv, bio_src, iter) bio->bi_io_vec[bio->bi_vcnt++] = bv; break; } @@ -705,44 +705,9 @@ static struct bio *__bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask, return bio; } - -/** - * bio_clone_bioset - clone a bio - * @bio_src: bio to clone - * @gfp_mask: allocation priority - * @bs: bio_set to allocate from - * - * Clone bio. Caller will own the returned bio, but not the actual data it - * points to. Reference count of returned bio will be one. - */ -struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask, -struct bio_set *bs) -{ - return __bio_clone_bioset(bio_src, gfp_mask, bs, 0, - bio_src->bi_iter.bi_size); -} EXPORT_SYMBOL(bio_clone_bioset); /** - * bio_clone_bioset_partial - clone a partial bio - * @bio_src: bio to clone - * @gfp_mask: allocation priority - * @bs: bio_set to allocate from - * @offset: cloned starting from the offset - * @size: size for the cloned bio - * - * Clone bio. Caller will own the returned bio, but not the actual data it - * points to. Reference count of returned bio will be one. 
- */ -struct bio *bio_clone_bioset_partial(struct bio *bio_src, gfp_t gfp_mask, -struct bio_set *bs, int offset, -int size) -{ - return __bio_clone_bioset(bio_src, gfp_mask, bs, offset, size); -} -EXPORT_SYMBOL(bio_clone_bioset_partial); - -/** * bio_add_pc_page - attempt to add page to bio * @q: the target queue * @bio: destination bio diff --git a/include/linux/bio.h b/include/linux/bio.h index 42b62a0..fafef63 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -183,7 +183,7 @@ static inline void bio_advance_iter(struct bio *bio, struct bvec_iter *iter, #define bio_iter_last(bvec, iter) ((iter).bi_size == (bvec).bv_len) -static inline unsigned __bio_segments(struct bio *bio, struct bvec_iter *bvec) +static inline unsigned bio_segments(struct bio *bio) { unsigned segs = 0; struct bio_vec bv; @@ -205,17 +205,12 @@ static inline unsigned __bio_segments(struct bio *bio, struct bvec_iter *bvec) break; } -
Re: [PATCH v2 3/4] block: rename blk_mq_freeze_queue_start()
On Sat, Mar 25, 2017 at 1:29 AM, Bart Van Assche wrote: > On Fri, 2017-03-24 at 20:36 +0800, Ming Lei wrote: >> As the .q_usage_counter is used by both legacy and >> mq path, we need to block new I/O if queue becomes >> dead in blk_queue_enter(). >> >> So rename it and we can use this function in both >> pathes. > > Should "pathes" be changed into "paths" in the commit message? Additionally, > this patch breaks the symmetry the comment in blk_mq_freeze_queue() refers > to. Anyway: Really? Is there a function named blk_mq_freeze_queue_end()? The comment refers to blk_mq_freeze_queue() vs. blk_mq_unfreeze_queue(), which isn't affected by this patch. Thanks, Ming Lei
Re: [PATCH v2 4/4] block: block new I/O just after queue is set as dying
On Fri, 2017-03-24 at 20:36 +0800, Ming Lei wrote: > + /* block new I/O coming */ > + blk_freeze_queue_start(q); As I have already mentioned two times, the comment above blk_freeze_queue_start() should be made more clear. It should mention that without that call blk_queue_enter() won't check the "dying" flag after it has been set. If that is not mentioned in a comment the next person who reads the blk_set_queue_dying() function will wonder why the blk_freeze_queue_start() call is really needed and whether it can be removed. > /* >* read pair of barrier in blk_freeze_queue_start(), >* we need to order reading DEAD flag of .q_usage_counter > - * and reading .mq_freeze_depth, otherwise the following > - * wait may never return if the two read are reordered. > + * and reading .mq_freeze_depth or dying flag, otherwise > + * the following wait may never return if the two read > + * are reordered. >*/ > smp_rmb(); Please fix the spelling in the above comment ("two read"). Thanks, Bart.
[PATCH] blk-mq: include errors in did_work calculation
Currently we return true in blk_mq_dispatch_rq_list() if we queued IO successfully, but we really want to return whether or not we made progress. Progress includes if we got an error return. If we don't, this can lead to a hang in blk_mq_sched_dispatch_requests() when a driver is draining IO by returning BLK_MQ_RQ_QUEUE_ERROR instead of manually ending the IO in error and returning BLK_MQ_RQ_QUEUE_OK. Signed-off-by: Jens Axboe diff --git a/block/blk-mq.c b/block/blk-mq.c index a4546f060e80..e3b09abf9d5b 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -978,7 +978,7 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list) struct request *rq; LIST_HEAD(driver_list); struct list_head *dptr; - int queued, ret = BLK_MQ_RQ_QUEUE_OK; + int errors, queued, ret = BLK_MQ_RQ_QUEUE_OK; /* * Start off with dptr being NULL, so we start the first request @@ -989,7 +989,7 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list) /* * Now process all the entries, sending them to the driver. */ - queued = 0; + errors = queued = 0; while (!list_empty(list)) { struct blk_mq_queue_data bd; @@ -1046,6 +1046,7 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list) default: pr_err("blk-mq: bad return on queue: %d\n", ret); case BLK_MQ_RQ_QUEUE_ERROR: + errors++; rq->errors = -EIO; blk_mq_end_request(rq, rq->errors); break; @@ -1097,7 +1098,7 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list) blk_mq_run_hw_queue(hctx, true); } - return queued != 0; + return (queued + errors) != 0; } static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
Re: [PATCH v2 2/4] block: add a read barrier in blk_queue_enter()
On Sat, Mar 25, 2017 at 1:24 AM, Bart Van Assche wrote: > On Fri, 2017-03-24 at 20:36 +0800, Ming Lei wrote: >> Without the barrier, reading DEAD flag of .q_usage_counter >> and reading .mq_freeze_depth may be reordered, then the >> following wait_event_interruptible() may never return. >> >> Signed-off-by: Ming Lei >> --- >> block/blk-core.c | 8 ++++++++ >> 1 file changed, 8 insertions(+) >> >> diff --git a/block/blk-core.c b/block/blk-core.c >> index ad388d5e309a..44eed17319c0 100644 >> --- a/block/blk-core.c >> +++ b/block/blk-core.c >> @@ -669,6 +669,14 @@ int blk_queue_enter(struct request_queue *q, bool >> nowait) >> if (nowait) >> return -EBUSY; >> >> + /* >> + * read pair of barrier in blk_mq_freeze_queue_start(), >> + * we need to order reading DEAD flag of .q_usage_counter >> + * and reading .mq_freeze_depth, otherwise the following >> + * wait may never return if the two read are reordered. >> + */ >> + smp_rmb(); >> + >> ret = wait_event_interruptible(q->mq_freeze_wq, >> !atomic_read(&q->mq_freeze_depth) || >> blk_queue_dying(q)); > > Hello Ming, > > The code looks fine to me but the comment not. You probably wanted to refer > to the "dying" flag instead of the "dead" flag? The read order has to be No, it looks like you misunderstand the issue. I mean that the order between reading __PERCPU_REF_DEAD of .q_usage_counter and reading .mq_freeze_depth should be enforced, specifically in blk_queue_enter() vs. blk_mq_freeze_queue_start(). In the last patch, you will find that the dying flag is mentioned in the above comment, once we call blk_freeze_queue_start() just after the dying flag is set. > enforced for the "dying" flag and q_usage_counter because of the order in > which these are set by blk_set_queue_dying(). As I explained, the dying flag should only be mentioned after we change the code in blk_set_queue_dying(). Thanks, Ming Lei
Re: [PATCH v2 2/4] block: add a read barrier in blk_queue_enter()
On Fri, 2017-03-24 at 20:36 +0800, Ming Lei wrote: > Without the barrier, reading DEAD flag of .q_usage_counter > and reading .mq_freeze_depth may be reordered, then the > following wait_event_interruptible() may never return. > > Signed-off-by: Ming Lei > --- > block/blk-core.c | 8 ++++++++ > 1 file changed, 8 insertions(+) > > diff --git a/block/blk-core.c b/block/blk-core.c > index ad388d5e309a..44eed17319c0 100644 > --- a/block/blk-core.c > +++ b/block/blk-core.c > @@ -669,6 +669,14 @@ int blk_queue_enter(struct request_queue *q, bool nowait) > if (nowait) > return -EBUSY; > > + /* > + * read pair of barrier in blk_mq_freeze_queue_start(), > + * we need to order reading DEAD flag of .q_usage_counter > + * and reading .mq_freeze_depth, otherwise the following > + * wait may never return if the two read are reordered. > + */ > + smp_rmb(); > + > ret = wait_event_interruptible(q->mq_freeze_wq, > !atomic_read(&q->mq_freeze_depth) || > blk_queue_dying(q)); Hello Ming, The code looks fine to me, but the comment does not. You probably wanted to refer to the "dying" flag instead of the "dead" flag? The read order has to be enforced for the "dying" flag and q_usage_counter because of the order in which these are set by blk_set_queue_dying(). Thanks, Bart.
Re: [PATCH v3 02/14] md: move two macros into md.h
On Fri, Mar 24, 2017 at 04:57:37PM +1100, Neil Brown wrote: > On Fri, Mar 17 2017, Ming Lei wrote: > > > Both raid1 and raid10 share common resync > > block size and page count, so move them into md.h. > > I don't think this is necessary. > These are just "magic" numbers. They don't have any real > meaning and so don't belong in md.h, or any .h file. > > Possibly we should find more meaningful numbers, or make them auto-size > or something. I'm also happy for them to stay as they are for now. > But I don't think we should pretend that they are meaningful. I had the same concern when I first looked at this patch. The number for raid1/10 doesn't need to be the same. But if we don't move the number to a generic header, the third patch will become a little more complicated. I eventually ignored this issue. If we really need different numbers for raid1/10, let's do it at that time. I think your suggestion of moving the number to raid1-10.h makes sense, and we can add a comment declaring the number isn't required to be the same for raid1/10. Thanks, Shaohua
Re: [PATCH v3 08/14] block: introduce bio_copy_data_partial
On 03/16/2017 10:12 AM, Ming Lei wrote: > Turns out we can use bio_copy_data in raid1's write behind, > and we can make alloc_behind_pages() more clean/efficient, > but we need a partial version of bio_copy_data(). > > Signed-off-by: Ming Lei Reviewed-by: Jens Axboe Shaohua, feel free to pull this through the md tree, that will be much easier. -- Jens Axboe
Re: [PATCH v2 2/4] block: add a read barrier in blk_queue_enter()
On 03/24/2017 01:36 PM, Ming Lei wrote: > Without the barrier, reading DEAD flag of .q_usage_counter > and reading .mq_freeze_depth may be reordered, then the > following wait_event_interruptible() may never return. > > Signed-off-by: Ming Lei > --- > block/blk-core.c | 8 > 1 file changed, 8 insertions(+) > > diff --git a/block/blk-core.c b/block/blk-core.c > index ad388d5e309a..44eed17319c0 100644 > --- a/block/blk-core.c > +++ b/block/blk-core.c > @@ -669,6 +669,14 @@ int blk_queue_enter(struct request_queue *q, bool nowait) > if (nowait) > return -EBUSY; > > + /* > + * read pair of barrier in blk_mq_freeze_queue_start(), > + * we need to order reading DEAD flag of .q_usage_counter > + * and reading .mq_freeze_depth, otherwise the following > + * wait may never return if the two read are reordered. > + */ > + smp_rmb(); > + > ret = wait_event_interruptible(q->mq_freeze_wq, > !atomic_read(&q->mq_freeze_depth) || > blk_queue_dying(q)); > Reviewed-by: Hannes Reinecke Cheers, Hannes -- Dr. Hannes Reinecke zSeries & Storage h...@suse.de +49 911 74053 688 SUSE LINUX Products GmbH, Maxfeldstr. 5, 90409 Nürnberg GF: J. Hawn, J. Guild, F. Imendörffer, HRB 16746 (AG Nürnberg)
Re: [PATCH v2 3/4] block: rename blk_mq_freeze_queue_start()
On 03/24/2017 01:36 PM, Ming Lei wrote: > As the .q_usage_counter is used by both legacy and > mq path, we need to block new I/O if queue becomes > dead in blk_queue_enter(). > > So rename it and we can use this function in both > paths. > > Signed-off-by: Ming Lei > --- > block/blk-core.c | 2 +- > block/blk-mq.c | 10 +- > drivers/block/mtip32xx/mtip32xx.c | 2 +- > drivers/nvme/host/core.c | 2 +- > include/linux/blk-mq.h | 2 +- > 5 files changed, 9 insertions(+), 9 deletions(-) > Reviewed-by: Hannes Reinecke Cheers, Hannes -- Dr. Hannes Reinecke zSeries & Storage h...@suse.de +49 911 74053 688 SUSE LINUX Products GmbH, Maxfeldstr. 5, 90409 Nürnberg GF: J. Hawn, J. Guild, F. Imendörffer, HRB 16746 (AG Nürnberg)
Re: [PATCH 01/28] ibtrs: add header shared between ibtrs_client and ibtrs_server
On Fri, Mar 24, 2017 at 3:31 PM, Johannes Thumshirnwrote: > On Fri, Mar 24, 2017 at 01:54:04PM +0100, Jinpu Wang wrote: >> >> + >> >> +#define XX(a) case (a): return #a >> > >> > please no macros with retun in them and XX isn't quite too descriptive as >> > well. >> > >> > [...] >> > >> >> +static inline const char *ib_wc_opcode_str(enum ib_wc_opcode opcode) >> >> +{ >> >> + switch (opcode) { >> >> + XX(IB_WC_SEND); >> >> + XX(IB_WC_RDMA_WRITE); >> >> + XX(IB_WC_RDMA_READ); >> >> + XX(IB_WC_COMP_SWAP); >> >> + XX(IB_WC_FETCH_ADD); >> >> + /* recv-side); inbound completion */ >> >> + XX(IB_WC_RECV); >> >> + XX(IB_WC_RECV_RDMA_WITH_IMM); >> >> + default: return "IB_WC_OPCODE_UNKNOWN"; >> >> + } >> >> +} >> > >> > How about: >> > >> > struct { >> > char *name; >> > enum ib_wc_opcode opcode; >> > } ib_wc_opcode_table[] = { >> > { stringyfy(IB_WC_SEND), IB_WC_SEND }, >> > { stringyfy(IB_WC_RDMA_WRITE), IB_WC_RDMA_WRITE }, >> > { stringyfy(IB_WC_RDMA_READ ), IB_WC_RDMA_READ } >> > { stringyfy(IB_WC_COMP_SWAP), IB_WC_COMP_SWAP }, >> > { stringyfy(IB_WC_FETCH_ADD), IB_WC_FETCH_ADD }, >> > { stringyfy(IB_WC_RECV), IB_WC_RECV }, >> > { stringyfy(IB_WC_RECV_RDMA_WITH_IMM), IB_WC_RECV_RDMA_WITH_IMM }, >> > { NULL, 0 }, >> > }; >> > >> > static inline const char *ib_wc_opcode_str(enum ib_wc_opcode opcode) >> > { >> > int i; >> > >> > for (i = 0; i < ARRAY_SIZE(ib_wc_opcode_table); i++) >> > if (ib_wc_opcode_table[i].opcode == opcode) >> > return ib_wc_opcode_table[i].name; >> > >> > return "IB_WC_OPCODE_UNKNOWN"; >> > } >> > >> Looks nice, might be better to put it into ib_verbs.h? > > Probably yes, as are your kvec functions for lib/iov_iter.c Thanks, will do in next round! > > [...] > >> > What about resolving the kernel bug instead of making workarounds? >> I tried to send a patch upsteam, but was rejected by Sean. >> http://www.spinics.net/lists/linux-rdma/msg22381.html >> > > I don't see a NACK in this thread. 
> > From http://www.spinics.net/lists/linux-rdma/msg22410.html: > "The port space (which maps to the service ID) needs to be included as part of > the check that determines the format of the private data, and not simply the > address family." > > After such a state I would have expected to see a v2 of the patch with above > comment addressed. I might busy with other staff at that time, I will check again and revisit the bug. > > Byte, > Johannes > -- > Johannes Thumshirn Storage > jthumsh...@suse.de+49 911 74053 689 > SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg > GF: Felix Imendörffer, Jane Smithard, Graham Norton > HRB 21284 (AG Nürnberg) > Key fingerprint = EC38 9CAB C2C4 F25D 8600 D0D0 0393 969D 2D76 0850 Regards, -- Jack Wang Linux Kernel Developer ProfitBricks GmbH Greifswalder Str. 207 D - 10405 Berlin Tel: +49 30 577 008 042 Fax: +49 30 577 008 299 Email:jinpu.w...@profitbricks.com URL: https://www.profitbricks.de Sitz der Gesellschaft: Berlin Registergericht: Amtsgericht Charlottenburg, HRB 125506 B Geschäftsführer: Achim Weiss
Re: [PATCH 01/28] ibtrs: add header shared between ibtrs_client and ibtrs_server
On Fri, Mar 24, 2017 at 01:54:04PM +0100, Jinpu Wang wrote: > >> + > >> +#define XX(a) case (a): return #a > > > > please no macros with retun in them and XX isn't quite too descriptive as > > well. > > > > [...] > > > >> +static inline const char *ib_wc_opcode_str(enum ib_wc_opcode opcode) > >> +{ > >> + switch (opcode) { > >> + XX(IB_WC_SEND); > >> + XX(IB_WC_RDMA_WRITE); > >> + XX(IB_WC_RDMA_READ); > >> + XX(IB_WC_COMP_SWAP); > >> + XX(IB_WC_FETCH_ADD); > >> + /* recv-side); inbound completion */ > >> + XX(IB_WC_RECV); > >> + XX(IB_WC_RECV_RDMA_WITH_IMM); > >> + default: return "IB_WC_OPCODE_UNKNOWN"; > >> + } > >> +} > > > > How about: > > > > struct { > > char *name; > > enum ib_wc_opcode opcode; > > } ib_wc_opcode_table[] = { > > { stringyfy(IB_WC_SEND), IB_WC_SEND }, > > { stringyfy(IB_WC_RDMA_WRITE), IB_WC_RDMA_WRITE }, > > { stringyfy(IB_WC_RDMA_READ ), IB_WC_RDMA_READ } > > { stringyfy(IB_WC_COMP_SWAP), IB_WC_COMP_SWAP }, > > { stringyfy(IB_WC_FETCH_ADD), IB_WC_FETCH_ADD }, > > { stringyfy(IB_WC_RECV), IB_WC_RECV }, > > { stringyfy(IB_WC_RECV_RDMA_WITH_IMM), IB_WC_RECV_RDMA_WITH_IMM }, > > { NULL, 0 }, > > }; > > > > static inline const char *ib_wc_opcode_str(enum ib_wc_opcode opcode) > > { > > int i; > > > > for (i = 0; i < ARRAY_SIZE(ib_wc_opcode_table); i++) > > if (ib_wc_opcode_table[i].opcode == opcode) > > return ib_wc_opcode_table[i].name; > > > > return "IB_WC_OPCODE_UNKNOWN"; > > } > > > Looks nice, might be better to put it into ib_verbs.h? Probably yes, as are your kvec functions for lib/iov_iter.c [...] > > What about resolving the kernel bug instead of making workarounds? > I tried to send a patch upsteam, but was rejected by Sean. > http://www.spinics.net/lists/linux-rdma/msg22381.html > I don't see a NACK in this thread. 
>From http://www.spinics.net/lists/linux-rdma/msg22410.html: "The port space (which maps to the service ID) needs to be included as part of the check that determines the format of the private data, and not simply the address family." After such a state I would have expected to see a v2 of the patch with above comment addressed. Byte, Johannes -- Johannes Thumshirn Storage jthumsh...@suse.de+49 911 74053 689 SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg GF: Felix Imendörffer, Jane Smithard, Graham Norton HRB 21284 (AG Nürnberg) Key fingerprint = EC38 9CAB C2C4 F25D 8600 D0D0 0393 969D 2D76 0850
Re: [RFC PATCH 00/28] INFINIBAND NETWORK BLOCK DEVICE (IBNBD)
On Fri, Mar 24, 2017 at 2:31 PM, Bart Van Asschewrote: > On Fri, 2017-03-24 at 13:46 +0100, Jinpu Wang wrote: >> Our IBNBD project was started 3 years ago based on our need for Cloud >> Computing, NVMeOF is a bit younger. >> - IBNBD is one of our components, part of our software defined storage >> solution. >> - As I listed in features, IBNBD has it's own features >> >> We're planning to look more into NVMeOF, but it's not a replacement for >> IBNBD. > > Hello Jack, Danil and Roman, > > Thanks for having taken the time to open source this work and to travel to > Boston to present this work at the Vault conference. However, my > understanding of IBNBD is that this driver has several shortcomings neither > NVMeOF nor iSER nor SRP have: > * Doesn't scale in terms of number of CPUs submitting I/O. The graphs shown > during the Vault talk clearly illustrate this. This is probably the result > of sharing a data structure across all client CPUs, maybe the bitmap that > tracks which parts of the target buffer space are in use. > * Supports IB but none of the other RDMA transports (RoCE / iWARP). > > We also need performance numbers that compare IBNBD against SRP and/or > NVMeOF with memory registration disabled to see whether and how much faster > IBNBD is compared to these two protocols. > > The fact that IBNBD only needs to messages per I/O is an advantage it has > today over SRP but not over NVMeOF nor over iSER. The upstream initiator > drivers for the latter two protocols already support inline data. > > Another question I have is whether integration with multipathd is supported? > If multipathd tries to run scsi_id against an IBNBD client device that will > fail. > > Thanks, > > Bart. Hello Bart, Thanks for your comments. As usual in house driver mainly covers needs for ProfitBricks, We only tested in our hardware environment. We only use IB not RoCE/iWARP. The idea to opensource is : - Present our design/implementation/tradeoff, others might be interested. 
- Attract more attention from developers/testers, so we can improve the project better and faster. We will gather performance data compare with NVMeOF in next submitting. multipath is not supported, we're using APM for failover. (patch from Mellanox developers) Thanks, -- Jack Wang Linux Kernel Developer ProfitBricks GmbH Greifswalder Str. 207 D - 10405 Berlin Tel: +49 30 577 008 042 Fax: +49 30 577 008 299 Email:jinpu.w...@profitbricks.com URL: https://www.profitbricks.de Sitz der Gesellschaft: Berlin Registergericht: Amtsgericht Charlottenburg, HRB 125506 B Geschäftsführer: Achim Weiss
[GIT PULL] Block fixes for 4.11-rc
Hi Linus, A few fixes for the current series that should go into -rc4. This pull request contains: - A fix for a potential corruption of un-started requests from Ming. - A blk-stat fix from Omar, ensuring we flush the stat batch before checking nr_samples. - A set of fixes from Sagi for the nvmeof family. Please pull! git://git.kernel.dk/linux-block.git for-linus Jens Axboe (1): Merge branch 'nvme-4.11-rc' of git://git.infradead.org/nvme into for-linus Ming Lei (1): blk-mq: don't complete un-started request in timeout handler Omar Sandoval (1): blk-stat: fix blk_stat_sum() if all samples are batched Sagi Grimberg (5): nvme-loop: fix a possible use-after-free when destroying the admin queue nvmet: confirm sq percpu has scheduled and switched to atomic nvmet-rdma: Fix a possible uninitialized variable dereference nvme-rdma: handle cpu unplug when re-establishing the controller nvme-loop: handle cpu unplug when re-establishing the controller block/blk-mq.c | 11 +- block/blk-stat.c| 4 +- drivers/nvme/host/rdma.c| 28 +++--- drivers/nvme/target/core.c | 11 +- drivers/nvme/target/loop.c | 90 + drivers/nvme/target/nvmet.h | 1 + drivers/nvme/target/rdma.c | 8 ++-- 7 files changed, 82 insertions(+), 71 deletions(-) -- Jens Axboe
RE: [RFC PATCH 00/28] INFINIBAND NETWORK BLOCK DEVICE (IBNBD)
> > From: Jack Wang> > This series introduces IBNBD/IBTRS kernel modules. > > IBNBD (InfiniBand network block device) allows for an RDMA transfer of block IO > over InfiniBand network. The driver presents itself as a block device on client > side and transmits the block requests in a zero-copy fashion to the server-side > via InfiniBand. The server part of the driver converts the incoming buffers back > into BIOs and hands them down to the underlying block device. As soon as IO > responses come back from the drive, they are being transmitted back to the > client. Hey Jack, why is this IB specific? Can it work over iWARP transports as well? Steve.
Re: [RFC PATCH 00/28] INFINIBAND NETWORK BLOCK DEVICE (IBNBD)
On Fri, 2017-03-24 at 13:46 +0100, Jinpu Wang wrote: > Our IBNBD project was started 3 years ago based on our need for Cloud > Computing, NVMeOF is a bit younger. > - IBNBD is one of our components, part of our software defined storage > solution. > - As I listed in features, IBNBD has it's own features > > We're planning to look more into NVMeOF, but it's not a replacement for IBNBD. Hello Jack, Danil and Roman, Thanks for having taken the time to open source this work and to travel to Boston to present this work at the Vault conference. However, my understanding of IBNBD is that this driver has several shortcomings neither NVMeOF nor iSER nor SRP have: * Doesn't scale in terms of number of CPUs submitting I/O. The graphs shown during the Vault talk clearly illustrate this. This is probably the result of sharing a data structure across all client CPUs, maybe the bitmap that tracks which parts of the target buffer space are in use. * Supports IB but none of the other RDMA transports (RoCE / iWARP). We also need performance numbers that compare IBNBD against SRP and/or NVMeOF with memory registration disabled to see whether and how much faster IBNBD is compared to these two protocols. The fact that IBNBD only needs two messages per I/O is an advantage it has today over SRP but not over NVMeOF nor over iSER. The upstream initiator drivers for the latter two protocols already support inline data. Another question I have is whether integration with multipathd is supported? If multipathd tries to run scsi_id against an IBNBD client device that will fail. Thanks, Bart.
Re: [PATCH 01/28] ibtrs: add header shared between ibtrs_client and ibtrs_server
>> + >> +#define XX(a) case (a): return #a > > please no macros with retun in them and XX isn't quite too descriptive as > well. > > [...] > >> +static inline const char *ib_wc_opcode_str(enum ib_wc_opcode opcode) >> +{ >> + switch (opcode) { >> + XX(IB_WC_SEND); >> + XX(IB_WC_RDMA_WRITE); >> + XX(IB_WC_RDMA_READ); >> + XX(IB_WC_COMP_SWAP); >> + XX(IB_WC_FETCH_ADD); >> + /* recv-side); inbound completion */ >> + XX(IB_WC_RECV); >> + XX(IB_WC_RECV_RDMA_WITH_IMM); >> + default: return "IB_WC_OPCODE_UNKNOWN"; >> + } >> +} > > How about: > > struct { > char *name; > enum ib_wc_opcode opcode; > } ib_wc_opcode_table[] = { > { stringyfy(IB_WC_SEND), IB_WC_SEND }, > { stringyfy(IB_WC_RDMA_WRITE), IB_WC_RDMA_WRITE }, > { stringyfy(IB_WC_RDMA_READ ), IB_WC_RDMA_READ } > { stringyfy(IB_WC_COMP_SWAP), IB_WC_COMP_SWAP }, > { stringyfy(IB_WC_FETCH_ADD), IB_WC_FETCH_ADD }, > { stringyfy(IB_WC_RECV), IB_WC_RECV }, > { stringyfy(IB_WC_RECV_RDMA_WITH_IMM), IB_WC_RECV_RDMA_WITH_IMM }, > { NULL, 0 }, > }; > > static inline const char *ib_wc_opcode_str(enum ib_wc_opcode opcode) > { > int i; > > for (i = 0; i < ARRAY_SIZE(ib_wc_opcode_table); i++) > if (ib_wc_opcode_table[i].opcode == opcode) > return ib_wc_opcode_table[i].name; > > return "IB_WC_OPCODE_UNKNOWN"; > } > Looks nice, might be better to put it into ib_verbs.h? > > [...] > >> +/** >> + * struct ibtrs_msg_hdr - Common header of all IBTRS messages >> + * @type:Message type, valid values see: enum ibtrs_msg_types >> + * @tsize: Total size of transferred data >> + * >> + * Don't move the first 8 padding bytes! It's a workaround for a kernel bug. >> + * See IBNBD-610 for details > > What about resolving the kernel bug instead of making workarounds? I tried to send a patch upsteam, but was rejected by Sean. http://www.spinics.net/lists/linux-rdma/msg22381.html > >> + * >> + * DO NOT CHANGE! >> + */ >> +struct ibtrs_msg_hdr { >> + u8 __padding1; >> + u8 type; >> + u16 __padding2; >> + u32 tsize; >> +}; > > [...] 
> > -- > Johannes Thumshirn Storage > jthumsh...@suse.de+49 911 74053 689 > SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg > GF: Felix Imendörffer, Jane Smithard, Graham Norton > HRB 21284 (AG Nürnberg) > Key fingerprint = EC38 9CAB C2C4 F25D 8600 D0D0 0393 969D 2D76 0850 Thanks Johannes for review. -- Jack Wang Linux Kernel Developer ProfitBricks GmbH Greifswalder Str. 207 D - 10405 Berlin Tel: +49 30 577 008 042 Fax: +49 30 577 008 299 Email:jinpu.w...@profitbricks.com URL: https://www.profitbricks.de Sitz der Gesellschaft: Berlin Registergericht: Amtsgericht Charlottenburg, HRB 125506 B Geschäftsführer: Achim Weiss
Re: [RFC PATCH 00/28] INFINIBAND NETWORK BLOCK DEVICE (IBNBD)
On Fri, Mar 24, 2017 at 01:46:02PM +0100, Jinpu Wang wrote: > Hi Johnnes, > > Our IBNBD project was started 3 years ago based on our need for Cloud > Computing, NVMeOF is a bit younger. > - IBNBD is one of our components, part of our software defined storage > solution. > - As I listed in features, IBNBD has it's own features > > We're planning to look more into NVMeOF, but it's not a replacement for IBNBD. Ok thanks for the clarification. Byte, Johannes -- Johannes Thumshirn Storage jthumsh...@suse.de+49 911 74053 689 SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg GF: Felix Imendörffer, Jane Smithard, Graham Norton HRB 21284 (AG Nürnberg) Key fingerprint = EC38 9CAB C2C4 F25D 8600 D0D0 0393 969D 2D76 0850
Re: [RFC PATCH 00/28] INFINIBAND NETWORK BLOCK DEVICE (IBNBD)
On Fri, Mar 24, 2017 at 1:15 PM, Johannes Thumshirnwrote: > On Fri, Mar 24, 2017 at 11:45:15AM +0100, Jack Wang wrote: >> From: Jack Wang >> >> This series introduces IBNBD/IBTRS kernel modules. >> >> IBNBD (InfiniBand network block device) allows for an RDMA transfer of block >> IO >> over InfiniBand network. The driver presents itself as a block device on >> client >> side and transmits the block requests in a zero-copy fashion to the >> server-side >> via InfiniBand. The server part of the driver converts the incoming buffers >> back >> into BIOs and hands them down to the underlying block device. As soon as IO >> responses come back from the drive, they are being transmitted back to the >> client. >> >> We design and implement this solution based on our need for Cloud Computing, >> the key features are: >> - High throughput and low latency due to: >> 1) Only two rdma messages per IO >> 2) Simplified client side server memory management >> 3) Eliminated SCSI sublayer >> - Simple configuration and handling >> 1) Server side is completely passive: volumes do not need to be >> explicitly exported >> 2) Only IB port GID and device path needed on client side to map >> a block device >> 3) A device can be remapped automatically i.e. after storage >> reboot >> - Pinning of IO-related processing to the CPU of the producer >> >> For usage please refer to Documentation/IBNBD.txt in later patch. >> My colleague Danil Kpnis presents IBNBD in Vault-2017 about our >> design/feature/ >> tradeoff/performance: >> >> http://events.linuxfoundation.org/sites/events/files/slides/IBNBD-Vault-2017.pdf >> > > Hi Jack, > > Sorry to ask (I haven't attented the Vault presentation) but why can't you use > NVMe over Fabrics in your environment? From what I see in your presentation > and cover letter, it provides all you need and is in fact a standard Linux and > Windows already have implemented. 
> > Thanks, > Johannes > -- > Johannes Thumshirn Storage > jthumsh...@suse.de+49 911 74053 689 > SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg > GF: Felix Imendörffer, Jane Smithard, Graham Norton > HRB 21284 (AG Nürnberg) > Key fingerprint = EC38 9CAB C2C4 F25D 8600 D0D0 0393 969D 2D76 0850 Hi Johnnes, Our IBNBD project was started 3 years ago based on our need for Cloud Computing, NVMeOF is a bit younger. - IBNBD is one of our components, part of our software defined storage solution. - As I listed in features, IBNBD has it's own features We're planning to look more into NVMeOF, but it's not a replacement for IBNBD. Thanks, -- Jack Wang Linux Kernel Developer ProfitBricks GmbH Greifswalder Str. 207 D - 10405 Berlin Tel: +49 30 577 008 042 Fax: +49 30 577 008 299 Email:jinpu.w...@profitbricks.com URL: https://www.profitbricks.de Sitz der Gesellschaft: Berlin Registergericht: Amtsgericht Charlottenburg, HRB 125506 B Geschäftsführer: Achim Weiss
[PATCH v2 3/4] block: rename blk_mq_freeze_queue_start()
As the .q_usage_counter is used by both legacy and mq path, we need to block new I/O if queue becomes dead in blk_queue_enter(). So rename it and we can use this function in both pathes. Signed-off-by: Ming Lei--- block/blk-core.c | 2 +- block/blk-mq.c| 10 +- drivers/block/mtip32xx/mtip32xx.c | 2 +- drivers/nvme/host/core.c | 2 +- include/linux/blk-mq.h| 2 +- 5 files changed, 9 insertions(+), 9 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 44eed17319c0..5901133d105f 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -670,7 +670,7 @@ int blk_queue_enter(struct request_queue *q, bool nowait) return -EBUSY; /* -* read pair of barrier in blk_mq_freeze_queue_start(), +* read pair of barrier in blk_freeze_queue_start(), * we need to order reading DEAD flag of .q_usage_counter * and reading .mq_freeze_depth, otherwise the following * wait may never return if the two read are reordered. diff --git a/block/blk-mq.c b/block/blk-mq.c index b36f0481ba0e..5370b4f750ff 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -68,7 +68,7 @@ static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx, sbitmap_clear_bit(>ctx_map, ctx->index_hw); } -void blk_mq_freeze_queue_start(struct request_queue *q) +void blk_freeze_queue_start(struct request_queue *q) { int freeze_depth; @@ -78,7 +78,7 @@ void blk_mq_freeze_queue_start(struct request_queue *q) blk_mq_run_hw_queues(q, false); } } -EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_start); +EXPORT_SYMBOL_GPL(blk_freeze_queue_start); void blk_mq_freeze_queue_wait(struct request_queue *q) { @@ -108,7 +108,7 @@ void blk_freeze_queue(struct request_queue *q) * no blk_unfreeze_queue(), and blk_freeze_queue() is not * exported to drivers as the only user for unfreeze is blk_mq. 
*/ - blk_mq_freeze_queue_start(q); + blk_freeze_queue_start(q); blk_mq_freeze_queue_wait(q); } @@ -746,7 +746,7 @@ static void blk_mq_timeout_work(struct work_struct *work) * percpu_ref_tryget directly, because we need to be able to * obtain a reference even in the short window between the queue * starting to freeze, by dropping the first reference in -* blk_mq_freeze_queue_start, and the moment the last request is +* blk_freeze_queue_start, and the moment the last request is * consumed, marked by the instant q_usage_counter reaches * zero. */ @@ -2376,7 +2376,7 @@ static void blk_mq_queue_reinit_work(void) * take place in parallel. */ list_for_each_entry(q, _q_list, all_q_node) - blk_mq_freeze_queue_start(q); + blk_freeze_queue_start(q); list_for_each_entry(q, _q_list, all_q_node) blk_mq_freeze_queue_wait(q); diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index f96ab717534c..c96c35ab39df 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -4162,7 +4162,7 @@ static int mtip_block_remove(struct driver_data *dd) dev_info(>pdev->dev, "device %s surprise removal\n", dd->disk->disk_name); - blk_mq_freeze_queue_start(dd->queue); + blk_freeze_queue_start(dd->queue); blk_mq_stop_hw_queues(dd->queue); blk_mq_tagset_busy_iter(>tags, mtip_no_dev_cleanup, dd); diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 9b3b57fef446..4a6d7f408769 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2386,7 +2386,7 @@ void nvme_start_freeze(struct nvme_ctrl *ctrl) mutex_lock(>namespaces_mutex); list_for_each_entry(ns, >namespaces, list) - blk_mq_freeze_queue_start(ns->queue); + blk_freeze_queue_start(ns->queue); mutex_unlock(>namespaces_mutex); } EXPORT_SYMBOL_GPL(nvme_start_freeze); diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 5b3e201c8d4f..ea2e9dcd3aef 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -243,7 +243,7 @@ void 
blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset, busy_tag_iter_fn *fn, void *priv); void blk_mq_freeze_queue(struct request_queue *q); void blk_mq_unfreeze_queue(struct request_queue *q); -void blk_mq_freeze_queue_start(struct request_queue *q); +void blk_freeze_queue_start(struct request_queue *q); void blk_mq_freeze_queue_wait(struct request_queue *q); int blk_mq_freeze_queue_wait_timeout(struct request_queue *q, unsigned long timeout); -- 2.9.3
[PATCH v2 2/4] block: add a read barrier in blk_queue_enter()
Without the barrier, reading DEAD flag of .q_usage_counter and reading .mq_freeze_depth may be reordered, then the following wait_event_interruptible() may never return. Signed-off-by: Ming Lei --- block/blk-core.c | 8 1 file changed, 8 insertions(+) diff --git a/block/blk-core.c b/block/blk-core.c index ad388d5e309a..44eed17319c0 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -669,6 +669,14 @@ int blk_queue_enter(struct request_queue *q, bool nowait) if (nowait) return -EBUSY; + /* +* read pair of barrier in blk_mq_freeze_queue_start(), +* we need to order reading DEAD flag of .q_usage_counter +* and reading .mq_freeze_depth, otherwise the following +* wait may never return if the two read are reordered. +*/ + smp_rmb(); + ret = wait_event_interruptible(q->mq_freeze_wq, !atomic_read(&q->mq_freeze_depth) || blk_queue_dying(q)); -- 2.9.3
[PATCH v2 4/4] block: block new I/O just after queue is set as dying
Before commit 780db2071a(blk-mq: decouble blk-mq freezing from generic bypassing), the dying flag is checked before entering queue, and Tejun converts the checking into .mq_freeze_depth, and assumes the counter is increased just after dying flag is set. Unfortunately we don't do that in blk_set_queue_dying(). This patch calls blk_freeze_queue_start() in blk_set_queue_dying(), so that we can block new I/O coming once the queue is set as dying. Given blk_set_queue_dying() is always called in remove path of block device, and queue will be cleaned up later, we don't need to worry about undoing the counter. Cc: Bart Van Assche Cc: Tejun Heo Reviewed-by: Hannes Reinecke Signed-off-by: Ming Lei --- block/blk-core.c | 8 ++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 5901133d105f..f0dd9b0054ed 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -500,6 +500,9 @@ void blk_set_queue_dying(struct request_queue *q) queue_flag_set(QUEUE_FLAG_DYING, q); spin_unlock_irq(q->queue_lock); + /* block new I/O coming */ + blk_freeze_queue_start(q); + if (q->mq_ops) blk_mq_wake_waiters(q); else { @@ -672,8 +675,9 @@ int blk_queue_enter(struct request_queue *q, bool nowait) /* * read pair of barrier in blk_freeze_queue_start(), * we need to order reading DEAD flag of .q_usage_counter -* and reading .mq_freeze_depth, otherwise the following -* wait may never return if the two read are reordered. +* and reading .mq_freeze_depth or dying flag, otherwise +* the following wait may never return if the two read +* are reordered. */ smp_rmb(); -- 2.9.3
[PATCH v2 1/4] blk-mq: comment on races related with timeout handler
This patch adds comment on two races related with timeout handler: - requeue from queue busy vs. timeout - rq free & reallocation vs. timeout Both the races themselves and current solution aren't explicit enough, so add comments on them. Cc: Bart Van Assche Reviewed-by: Hannes Reinecke Signed-off-by: Ming Lei --- block/blk-mq.c | 22 ++ 1 file changed, 22 insertions(+) diff --git a/block/blk-mq.c b/block/blk-mq.c index c212b9644a9f..b36f0481ba0e 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -523,6 +523,15 @@ void blk_mq_start_request(struct request *rq) } EXPORT_SYMBOL(blk_mq_start_request); +/* + * When we reach here because queue is busy, REQ_ATOM_COMPLETE + * flag isn't set yet, so there may be race with timeout handler, + * but given rq->deadline is just set in .queue_rq() under + * this situation, the race won't be possible in reality because + * rq->timeout should be set as big enough to cover the window + * between blk_mq_start_request() called from .queue_rq() and + * clearing REQ_ATOM_STARTED here. + */ static void __blk_mq_requeue_request(struct request *rq) { struct request_queue *q = rq->q; @@ -696,6 +705,19 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx, if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) return; + /* +* The rq being checked may have been freed and reallocated +* out already here, we avoid this race by checking rq->deadline +* and REQ_ATOM_COMPLETE flag together: +* +* - if rq->deadline is observed as new value because of +* reusing, the rq won't be timed out because of timing. +* - if rq->deadline is observed as previous value, +* REQ_ATOM_COMPLETE flag won't be cleared in reuse path +* because we put a barrier between setting rq->deadline +* and clearing the flag in blk_mq_start_request(), so +* this rq won't be timed out too. +*/ if (time_after_eq(jiffies, rq->deadline)) { if (!blk_mark_rq_complete(rq)) blk_mq_rq_timed_out(rq, reserved); -- 2.9.3
[PATCH v2 0/4] block: misc changes
Hi, The 1st patch adds comments on blk-mq races with timeout handler. The other 3 patches improve handling for dying queue: - the 2nd one adds one barrier in blk_queue_enter() for avoiding hanging caused by out-of-order reads - the 3rd and 4th patches block new I/O entering queue after queue is set as dying V1: - add comments on races related with timeout handler - add Tested-by & Reviewed-by tag thanks, Ming Ming Lei (4): blk-mq: comment on races related with timeout handler block: add a read barrier in blk_queue_enter() block: rename blk_mq_freeze_queue_start() block: block new I/O just after queue is set as dying block/blk-core.c | 12 block/blk-mq.c | 32 +++- drivers/block/mtip32xx/mtip32xx.c | 2 +- drivers/nvme/host/core.c | 2 +- include/linux/blk-mq.h | 2 +- 5 files changed, 42 insertions(+), 8 deletions(-) -- 2.9.3
Re: [PATCH 01/28] ibtrs: add header shared between ibtrs_client and ibtrs_server
On Fri, Mar 24, 2017 at 11:45:16AM +0100, Jack Wang wrote: > From: Jack Wang> > Signed-off-by: Jack Wang > Signed-off-by: Kleber Souza > Signed-off-by: Danil Kipnis > Signed-off-by: Roman Pen > --- [...] > + > +#define XX(a) case (a): return #a please no macros with retun in them and XX isn't quite too descriptive as well. [...] > +static inline const char *ib_wc_opcode_str(enum ib_wc_opcode opcode) > +{ > + switch (opcode) { > + XX(IB_WC_SEND); > + XX(IB_WC_RDMA_WRITE); > + XX(IB_WC_RDMA_READ); > + XX(IB_WC_COMP_SWAP); > + XX(IB_WC_FETCH_ADD); > + /* recv-side); inbound completion */ > + XX(IB_WC_RECV); > + XX(IB_WC_RECV_RDMA_WITH_IMM); > + default: return "IB_WC_OPCODE_UNKNOWN"; > + } > +} How about: struct { char *name; enum ib_wc_opcode opcode; } ib_wc_opcode_table[] = { { stringyfy(IB_WC_SEND), IB_WC_SEND }, { stringyfy(IB_WC_RDMA_WRITE), IB_WC_RDMA_WRITE }, { stringyfy(IB_WC_RDMA_READ ), IB_WC_RDMA_READ } { stringyfy(IB_WC_COMP_SWAP), IB_WC_COMP_SWAP }, { stringyfy(IB_WC_FETCH_ADD), IB_WC_FETCH_ADD }, { stringyfy(IB_WC_RECV), IB_WC_RECV }, { stringyfy(IB_WC_RECV_RDMA_WITH_IMM), IB_WC_RECV_RDMA_WITH_IMM }, { NULL, 0 }, }; static inline const char *ib_wc_opcode_str(enum ib_wc_opcode opcode) { int i; for (i = 0; i < ARRAY_SIZE(ib_wc_opcode_table); i++) if (ib_wc_opcode_table[i].opcode == opcode) return ib_wc_opcode_table[i].name; return "IB_WC_OPCODE_UNKNOWN"; } [...] > +/** > + * struct ibtrs_msg_hdr - Common header of all IBTRS messages > + * @type:Message type, valid values see: enum ibtrs_msg_types > + * @tsize: Total size of transferred data > + * > + * Don't move the first 8 padding bytes! It's a workaround for a kernel bug. > + * See IBNBD-610 for details What about resolving the kernel bug instead of making workarounds? > + * > + * DO NOT CHANGE! > + */ > +struct ibtrs_msg_hdr { > + u8 __padding1; > + u8 type; > + u16 __padding2; > + u32 tsize; > +}; [...] -- Johannes Thumshirn Storage jthumsh...@suse.de+49 911 74053 689 SUSE LINUX GmbH, Maxfeldstr. 
5, 90409 Nürnberg GF: Felix Imendörffer, Jane Smithard, Graham Norton HRB 21284 (AG Nürnberg) Key fingerprint = EC38 9CAB C2C4 F25D 8600 D0D0 0393 969D 2D76 0850
Re: [RFC PATCH 00/28] INFINIBAND NETWORK BLOCK DEVICE (IBNBD)
On Fri, Mar 24, 2017 at 11:45:15AM +0100, Jack Wang wrote: > From: Jack Wang> > This series introduces IBNBD/IBTRS kernel modules. > > IBNBD (InfiniBand network block device) allows for an RDMA transfer of block > IO > over InfiniBand network. The driver presents itself as a block device on > client > side and transmits the block requests in a zero-copy fashion to the > server-side > via InfiniBand. The server part of the driver converts the incoming buffers > back > into BIOs and hands them down to the underlying block device. As soon as IO > responses come back from the drive, they are being transmitted back to the > client. > > We design and implement this solution based on our need for Cloud Computing, > the key features are: > - High throughput and low latency due to: > 1) Only two rdma messages per IO > 2) Simplified client side server memory management > 3) Eliminated SCSI sublayer > - Simple configuration and handling > 1) Server side is completely passive: volumes do not need to be > explicitly exported > 2) Only IB port GID and device path needed on client side to map > a block device > 3) A device can be remapped automatically i.e. after storage > reboot > - Pinning of IO-related processing to the CPU of the producer > > For usage please refer to Documentation/IBNBD.txt in later patch. > My colleague Danil Kipnis presents IBNBD in Vault-2017 about our > design/feature/ > tradeoff/performance: > > http://events.linuxfoundation.org/sites/events/files/slides/IBNBD-Vault-2017.pdf > Hi Jack, Sorry to ask (I haven't attended the Vault presentation) but why can't you use NVMe over Fabrics in your environment? From what I see in your presentation and cover letter, it provides all you need and is in fact a standard that Linux and Windows have already implemented. Thanks, Johannes -- Johannes Thumshirn Storage jthumsh...@suse.de+49 911 74053 689 SUSE LINUX GmbH, Maxfeldstr. 
5, 90409 Nürnberg GF: Felix Imendörffer, Jane Smithard, Graham Norton HRB 21284 (AG Nürnberg) Key fingerprint = EC38 9CAB C2C4 F25D 8600 D0D0 0393 969D 2D76 0850
Re: [PATCH 5/8] nowait aio: return on congested block device
On 03/16/2017 09:33 AM, Jens Axboe wrote: > On 03/15/2017 03:51 PM, Goldwyn Rodrigues wrote: >> diff --git a/block/blk-core.c b/block/blk-core.c >> index 0eeb99e..2e5cba2 100644 >> --- a/block/blk-core.c >> +++ b/block/blk-core.c >> @@ -2014,7 +2019,7 @@ blk_qc_t generic_make_request(struct bio *bio) >> do { >> struct request_queue *q = bdev_get_queue(bio->bi_bdev); >> >> -if (likely(blk_queue_enter(q, false) == 0)) { >> +if (likely(blk_queue_enter(q, bio_flagged(bio, BIO_NOWAIT)) == >> 0)) { >> struct bio_list hold; >> struct bio_list lower, same; >> >> @@ -2040,7 +2045,10 @@ blk_qc_t generic_make_request(struct bio *bio) >> bio_list_merge(&bio_list_on_stack, &same); >> bio_list_merge(&bio_list_on_stack, &hold); >> } else { >> -bio_io_error(bio); >> +if (unlikely(bio_flagged(bio, BIO_NOWAIT))) >> +bio_wouldblock_error(bio); >> +else >> +bio_io_error(bio); > > This doesn't look right. What if the queue is dying, and BIO_NOWAIT just > happened to be set? > Yes, I need to add a condition here to check for blk_queue_dying(). Thanks. > And you're missing wbt_wait() as well as a blocking point. Ditto in > blk-mq. wbt_wait() does not apply to WRITE_ODIRECT > >> diff --git a/block/blk-mq.c b/block/blk-mq.c >> index 159187a..942ce8c 100644 >> --- a/block/blk-mq.c >> +++ b/block/blk-mq.c >> @@ -1518,6 +1518,8 @@ static blk_qc_t blk_mq_make_request(struct >> request_queue *q, struct bio *bio) >> rq = blk_mq_sched_get_request(q, bio, bio->bi_opf, &data); >> if (unlikely(!rq)) { >> __wbt_done(q->rq_wb, wb_acct); >> +if (bio && bio_flagged(bio, BIO_NOWAIT)) >> +bio_wouldblock_error(bio); >> return BLK_QC_T_NONE; >> } >> > > This seems a little fragile now, since not both paths free the bio. > Direct I/O should free the bios in bio_dio_complete(). I am not sure why it would not free bio here originally, but IIRC, this path is for bio==NULL only. So, with this patch we would get a rq==NULL here and hence the bio_wouldblock_error() call. -- Goldwyn
[PATCH 18/28] ibnbd_clt: add sysfs interface
From: Jack WangSigned-off-by: Jack Wang Signed-off-by: Kleber Souza Signed-off-by: Danil Kipnis Signed-off-by: Roman Pen --- drivers/block/ibnbd_client/ibnbd_clt_sysfs.c | 863 +++ drivers/block/ibnbd_client/ibnbd_clt_sysfs.h | 64 ++ 2 files changed, 927 insertions(+) create mode 100644 drivers/block/ibnbd_client/ibnbd_clt_sysfs.c create mode 100644 drivers/block/ibnbd_client/ibnbd_clt_sysfs.h diff --git a/drivers/block/ibnbd_client/ibnbd_clt_sysfs.c b/drivers/block/ibnbd_client/ibnbd_clt_sysfs.c new file mode 100644 index 000..89d487c --- /dev/null +++ b/drivers/block/ibnbd_client/ibnbd_clt_sysfs.c @@ -0,0 +1,863 @@ +/* + * InfiniBand Network Block Driver + * + * Copyright (c) 2014 - 2017 ProfitBricks GmbH. All rights reserved. + * Authors: Fabian Holler < m...@fholler.de> + * Jack Wang + * Kleber Souza + * Danil Kipnis + * Roman Pen + * Milind Dumbare + * + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + *notice, this list of conditions, and the following disclaimer, + *without modification. + * 2. Redistributions in binary form must reproduce at minimum a disclaimer + *substantially similar to the "NO WARRANTY" disclaimer below + *("Disclaimer") and any redistribution must be conditioned upon + *including a substantially similar Disclaimer requirement for further + *binary redistribution. + * 3. Neither the names of the above-listed copyright holders nor the names + *of any contributors may be used to endorse or promote products derived + *from this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. 
+ * + * NO WARRANTY + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGES. + * + */ + +#include +#include +#include +#include +#include "ibnbd_clt_sysfs.h" +#include "ibnbd_clt.h" +#include +#include + +static struct kobject *ibnbd_kobject; +static struct kobject *ibnbd_devices_kobject; +static DEFINE_MUTEX(sess_lock); + +struct ibnbd_clt_dev_destroy_kobj_work { + struct ibnbd_dev*dev; + struct work_struct work; +}; + +enum { + IBNBD_OPT_ERR = 0, + IBNBD_OPT_SERVER= 1 << 0, + IBNBD_OPT_DEV_PATH = 1 << 1, + IBNBD_OPT_ACCESS_MODE = 1 << 3, + IBNBD_OPT_INPUT_MODE= 1 << 4, + IBNBD_OPT_IO_MODE = 1 << 5, +}; + +static unsigned ibnbd_opt_mandatory[] = { + IBNBD_OPT_SERVER, + IBNBD_OPT_DEV_PATH, +}; + +static const match_table_t ibnbd_opt_tokens = { + { IBNBD_OPT_SERVER, "server=%s" }, + { IBNBD_OPT_DEV_PATH, "device_path=%s"}, + { IBNBD_OPT_ACCESS_MODE, "access_mode=%s"}, + { IBNBD_OPT_INPUT_MODE, "input_mode=%s" }, + { IBNBD_OPT_IO_MODE, "io_mode=%s"}, + { IBNBD_OPT_ERR, NULL}, +}; + +/* remove new line from string */ +static void strip(char *s) +{ + char *p = s; + + while (*s != '\0') { + if (*s != '\n') + *p++ = *s++; + else + ++s; + } + *p = '\0'; +} + +static int ibnbd_clt_parse_map_options(const char *buf, char *server_addr, + char *pathname, + enum ibnbd_access_mode 
*access_mode, + enum ibnbd_queue_mode *queue_mode, + enum ibnbd_io_mode *io_mode) +{ + char *options, *sep_opt; + char *p; + substring_t
[PATCH 20/28] ibnbd_clt: add Makefile and Kconfig
From: Jack WangSigned-off-by: Jack Wang --- drivers/block/Kconfig | 2 ++ drivers/block/Makefile | 1 + drivers/block/ibnbd_client/Kconfig | 16 drivers/block/ibnbd_client/Makefile | 5 + 4 files changed, 24 insertions(+) create mode 100644 drivers/block/ibnbd_client/Kconfig create mode 100644 drivers/block/ibnbd_client/Makefile diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig index f744de7..c309e57 100644 --- a/drivers/block/Kconfig +++ b/drivers/block/Kconfig @@ -275,6 +275,8 @@ config BLK_DEV_CRYPTOLOOP source "drivers/block/drbd/Kconfig" +source "drivers/block/ibnbd_client/Kconfig" + config BLK_DEV_NBD tristate "Network block device support" depends on NET diff --git a/drivers/block/Makefile b/drivers/block/Makefile index 1e9661e..7da1813 100644 --- a/drivers/block/Makefile +++ b/drivers/block/Makefile @@ -34,6 +34,7 @@ obj-$(CONFIG_BLK_DEV_HD) += hd.o obj-$(CONFIG_XEN_BLKDEV_FRONTEND) += xen-blkfront.o obj-$(CONFIG_XEN_BLKDEV_BACKEND) += xen-blkback/ +obj-$(CONFIG_BLK_DEV_IBNBD_CLT)+= ibnbd_client/ obj-$(CONFIG_BLK_DEV_DRBD) += drbd/ obj-$(CONFIG_BLK_DEV_RBD) += rbd.o obj-$(CONFIG_BLK_DEV_PCIESSD_MTIP32XX) += mtip32xx/ diff --git a/drivers/block/ibnbd_client/Kconfig b/drivers/block/ibnbd_client/Kconfig new file mode 100644 index 000..162e4e1 --- /dev/null +++ b/drivers/block/ibnbd_client/Kconfig @@ -0,0 +1,16 @@ +config BLK_DEV_IBNBD_CLT + tristate "Network block device over Infiniband client support" + depends on INFINIBAND_IBTRS_CLT + ---help--- + Saying Y here will allow your computer to be a client for network + block devices over Infiniband, i.e. it will be able to use block + devices exported by + servers (mount file systems on them etc.). Communication between + client and server works over Infiniband networking, but to the client + program this is hidden: it looks like a regular local file access to + a block device special file such as /dev/ibnbd0. + + To compile this driver as a module, choose M here: the + module will be called ibnbd_client. 
+ + If unsure, say N. diff --git a/drivers/block/ibnbd_client/Makefile b/drivers/block/ibnbd_client/Makefile new file mode 100644 index 000..bbf211f --- /dev/null +++ b/drivers/block/ibnbd_client/Makefile @@ -0,0 +1,5 @@ + +obj-$(CONFIG_BLK_DEV_IBNBD_CLT)+= ibnbd_client.o + +ibnbd_client-y := ibnbd_clt.o ibnbd_clt_sysfs.o ../ibnbd_lib/ibnbd.o \ + ../ibnbd_lib/ibnbd-proto.o -- 2.7.4
[PATCH 14/28] ibnbd: add headers shared by ibnbd_client and ibnbd_server
From: Jack WangSigned-off-by: Jack Wang Signed-off-by: Kleber Souza Signed-off-by: Danil Kipnis Signed-off-by: Roman Pen --- drivers/block/ibnbd_inc/ibnbd-proto.h | 273 ++ drivers/block/ibnbd_inc/ibnbd.h | 55 +++ drivers/block/ibnbd_inc/log.h | 68 + 3 files changed, 396 insertions(+) create mode 100644 drivers/block/ibnbd_inc/ibnbd-proto.h create mode 100644 drivers/block/ibnbd_inc/ibnbd.h create mode 100644 drivers/block/ibnbd_inc/log.h diff --git a/drivers/block/ibnbd_inc/ibnbd-proto.h b/drivers/block/ibnbd_inc/ibnbd-proto.h new file mode 100644 index 000..4838177 --- /dev/null +++ b/drivers/block/ibnbd_inc/ibnbd-proto.h @@ -0,0 +1,273 @@ +/* + * InfiniBand Network Block Driver + * + * Copyright (c) 2014 - 2017 ProfitBricks GmbH. All rights reserved. + * Authors: Fabian Holler < m...@fholler.de> + * Jack Wang + * Kleber Souza + * Danil Kipnis + * Roman Pen + * Milind Dumbare + * + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + *notice, this list of conditions, and the following disclaimer, + *without modification. + * 2. Redistributions in binary form must reproduce at minimum a disclaimer + *substantially similar to the "NO WARRANTY" disclaimer below + *("Disclaimer") and any redistribution must be conditioned upon + *including a substantially similar Disclaimer requirement for further + *binary redistribution. + * 3. Neither the names of the above-listed copyright holders nor the names + *of any contributors may be used to endorse or promote products derived + *from this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. 
+ * + * NO WARRANTY + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGES. + * + */ + +#ifndef __IBNBD_PROTO_H +#define __IBNBD_PROTO_H +#include +#include "ibnbd.h" + +#define IBNBD_VERSION 1 + +#define GCC_DIAGNOSTIC_AWARE ((__GNUC__ > 6)) +#if GCC_DIAGNOSTIC_AWARE +#pragma GCC diagnostic push +#pragma GCC diagnostic warning "-Wpadded" +#endif + +/** + * enum ibnbd_msg_types - IBNBD message types + * @IBNBD_MSG_SESS_INFO: initial session info from client to server + * @IBNBD_MSG_SESS_INFO_RSP: initial session info from server to client + * @IBNBD_MSG_OPEN:open connection to ibnbd server instance + * @IBNBD_MSG_OPEN_RSP:response to an @IBNBD_MSG_OPEN + * @IBNBD_MSG_READ:request block device read operation + * @IBNBD_MSG_WRITE: request block device write operation + * @IBNBD_MSG_REVAL: notify client about changed device size + * + * Note: DO NOT REORDER THE MEMBERS OF THIS ENUM! + * If necessary, add new members after the last one. 
+ */ +enum ibnbd_msg_type { + __IBNBD_MSG_MIN, + IBNBD_MSG_SESS_INFO, + IBNBD_MSG_SESS_INFO_RSP, + IBNBD_MSG_OPEN, + IBNBD_MSG_OPEN_RSP, + IBNBD_MSG_IO, + IBNBD_MSG_CLOSE, + IBNBD_MSG_CLOSE_RSP, + IBNBD_MSG_REVAL, + __IBNBD_MSG_MAX +}; + +/** + * struct ibnbd_msg_hdr - header of IBNBD messages + * @type: Message type, valid values see: enum ibnbd_msg_types + */ +struct ibnbd_msg_hdr { + u16 type; + u16 __padding; +}; + +enum ibnbd_access_mode { + IBNBD_ACCESS_RO, + IBNBD_ACCESS_RW, + IBNBD_ACCESS_MIGRATION, +}; + +#define _IBNBD_FILEIO 0 +#define _IBNBD_BLOCKIO 1 +#define _IBNBD_AUTOIO 2 + +enum ibnbd_io_mode { + IBNBD_FILEIO = _IBNBD_FILEIO, + IBNBD_BLOCKIO = _IBNBD_BLOCKIO, + IBNBD_AUTOIO = _IBNBD_AUTOIO, +}; + +/** + * struct ibnbd_msg_sess_info
[PATCH 13/28] ibtrs_srv: add Makefile and Kconfig
From: Jack WangSigned-off-by: Jack Wang --- drivers/infiniband/Kconfig | 1 + drivers/infiniband/ulp/Makefile | 1 + drivers/infiniband/ulp/ibtrs_server/Kconfig | 8 drivers/infiniband/ulp/ibtrs_server/Makefile | 6 ++ 4 files changed, 16 insertions(+) create mode 100644 drivers/infiniband/ulp/ibtrs_server/Kconfig create mode 100644 drivers/infiniband/ulp/ibtrs_server/Makefile diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig index cb1b864..07aa050 100644 --- a/drivers/infiniband/Kconfig +++ b/drivers/infiniband/Kconfig @@ -86,6 +86,7 @@ source "drivers/infiniband/ulp/iser/Kconfig" source "drivers/infiniband/ulp/isert/Kconfig" source "drivers/infiniband/ulp/ibtrs_client/Kconfig" +source "drivers/infiniband/ulp/ibtrs_server/Kconfig" source "drivers/infiniband/sw/rdmavt/Kconfig" source "drivers/infiniband/sw/rxe/Kconfig" diff --git a/drivers/infiniband/ulp/Makefile b/drivers/infiniband/ulp/Makefile index acd8ce6..eb4da3f 100644 --- a/drivers/infiniband/ulp/Makefile +++ b/drivers/infiniband/ulp/Makefile @@ -4,3 +4,4 @@ obj-$(CONFIG_INFINIBAND_SRPT) += srpt/ obj-$(CONFIG_INFINIBAND_ISER) += iser/ obj-$(CONFIG_INFINIBAND_ISERT) += isert/ obj-$(CONFIG_INFINIBAND_IBTRS_CLT) += ibtrs_client/ +obj-$(CONFIG_INFINIBAND_IBTRS_SRV) += ibtrs_server/ diff --git a/drivers/infiniband/ulp/ibtrs_server/Kconfig b/drivers/infiniband/ulp/ibtrs_server/Kconfig new file mode 100644 index 000..6fbdc54 --- /dev/null +++ b/drivers/infiniband/ulp/ibtrs_server/Kconfig @@ -0,0 +1,8 @@ +config INFINIBAND_IBTRS_SRV + tristate "InfiniBand IBTRS SERVER" + depends on INFINIBAND_ADDR_TRANS + ---help--- + Support for the simplified data transfer over InfiniBand. + This offer API to user module IBNBD_SERVER + + The IBTRS protocol is defined by the ProfitBricks GmbH. 
diff --git a/drivers/infiniband/ulp/ibtrs_server/Makefile b/drivers/infiniband/ulp/ibtrs_server/Makefile new file mode 100644 index 000..39d9e1d --- /dev/null +++ b/drivers/infiniband/ulp/ibtrs_server/Makefile @@ -0,0 +1,6 @@ + +obj-$(CONFIG_INFINIBAND_IBTRS_SRV) += ibtrs_server.o + +ibtrs_server-y := ibtrs_srv.o ibtrs_srv_sysfs.o \ + ../ibtrs_lib/ibtrs.o ../ibtrs_lib/ibtrs-proto.o ../ibtrs_lib/iu.o \ + ../ibtrs_lib/heartbeat.o ../ibtrs_lib/common.o -- 2.7.4
[PATCH 15/28] ibnbd: add shared library functions
From: Jack WangSigned-off-by: Jack Wang Signed-off-by: Kleber Souza Signed-off-by: Danil Kipnis Signed-off-by: Roman Pen --- drivers/block/ibnbd_lib/ibnbd-proto.c | 244 ++ drivers/block/ibnbd_lib/ibnbd.c | 108 +++ 2 files changed, 352 insertions(+) create mode 100644 drivers/block/ibnbd_lib/ibnbd-proto.c create mode 100644 drivers/block/ibnbd_lib/ibnbd.c diff --git a/drivers/block/ibnbd_lib/ibnbd-proto.c b/drivers/block/ibnbd_lib/ibnbd-proto.c new file mode 100644 index 000..c6d83f2 --- /dev/null +++ b/drivers/block/ibnbd_lib/ibnbd-proto.c @@ -0,0 +1,244 @@ +/* + * InfiniBand Network Block Driver + * + * Copyright (c) 2014 - 2017 ProfitBricks GmbH. All rights reserved. + * Authors: Fabian Holler < m...@fholler.de> + * Jack Wang + * Kleber Souza + * Danil Kipnis + * Roman Pen + * Milind Dumbare + * + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + *notice, this list of conditions, and the following disclaimer, + *without modification. + * 2. Redistributions in binary form must reproduce at minimum a disclaimer + *substantially similar to the "NO WARRANTY" disclaimer below + *("Disclaimer") and any redistribution must be conditioned upon + *including a substantially similar Disclaimer requirement for further + *binary redistribution. + * 3. Neither the names of the above-listed copyright holders nor the names + *of any contributors may be used to endorse or promote products derived + *from this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. 
+ * + * NO WARRANTY + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGES. + * + */ + +#include "../ibnbd_inc/ibnbd-proto.h" +#include "../ibnbd_inc/log.h" + +static int ibnbd_validate_msg_sess_info(const struct ibnbd_msg_sess_info *msg, + size_t len) +{ + if (unlikely(len != sizeof(*msg))) { + ERR_NP("Sess info message with unexpected length received" + " %lu instead of %lu\n", len, sizeof(*msg)); + return -EINVAL; + } + + return 0; +} + +static int +ibnbd_validate_msg_sess_info_rsp(const struct ibnbd_msg_sess_info_rsp *msg, +size_t len) +{ + if (unlikely(len != sizeof(*msg))) { + ERR_NP("Sess info message with unexpected length received" + " %lu instead of %lu\n", len, sizeof(*msg)); + return -EINVAL; + } + + return 0; +} + +static int ibnbd_validate_msg_open_resp(const struct ibnbd_msg_open_rsp *msg, + size_t len) +{ + if (unlikely(msg->result)) + return 0; + + if (unlikely(len != sizeof(*msg))) { + ERR_NP("Open Response msg received with unexpected length" + " %zuB instead of %luB\n", len, sizeof(*msg)); + return -EINVAL; + } + + if (unlikely(!msg->logical_block_size)) { + ERR_NP("Open Resp msg received with unexpected with" + " invalid logical_block_size value %d\n", + msg->logical_block_size); + return -EINVAL; + } + + if (unlikely(!msg->physical_block_size)) 
{ + ERR_NP("Open Resp msg received with invalid" + " physical_block_size value %d\n", + msg->physical_block_size); + return -EINVAL; + } + + if (unlikely(!msg->max_hw_sectors)) { + ERR_NP("Open Resp msg received with
[PATCH 11/28] ibtrs_srv: add header shared in ibtrs_server
From: Jack WangSigned-off-by: Jack Wang Signed-off-by: Kleber Souza Signed-off-by: Danil Kipnis Signed-off-by: Roman Pen --- .../ulp/ibtrs_server/ibtrs_srv_internal.h | 201 + 1 file changed, 201 insertions(+) create mode 100644 drivers/infiniband/ulp/ibtrs_server/ibtrs_srv_internal.h diff --git a/drivers/infiniband/ulp/ibtrs_server/ibtrs_srv_internal.h b/drivers/infiniband/ulp/ibtrs_server/ibtrs_srv_internal.h new file mode 100644 index 000..79130a1 --- /dev/null +++ b/drivers/infiniband/ulp/ibtrs_server/ibtrs_srv_internal.h @@ -0,0 +1,201 @@ +/* + * InfiniBand Transport Layer + * + * Copyright (c) 2014 - 2017 ProfitBricks GmbH. All rights reserved. + * Authors: Fabian Holler < m...@fholler.de> + * Jack Wang + * Kleber Souza + * Danil Kipnis + * Roman Pen + * Milind Dumbare + * + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + *notice, this list of conditions, and the following disclaimer, + *without modification. + * 2. Redistributions in binary form must reproduce at minimum a disclaimer + *substantially similar to the "NO WARRANTY" disclaimer below + *("Disclaimer") and any redistribution must be conditioned upon + *including a substantially similar Disclaimer requirement for further + *binary redistribution. + * 3. Neither the names of the above-listed copyright holders nor the names + *of any contributors may be used to endorse or promote products derived + *from this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. 
+ * + * NO WARRANTY + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGES. + * + */ + +#ifndef _IBTRS_SRV_INTERNAL_H +#define _IBTRS_SRV_INTERNAL_H + +#include + +enum ssm_state { + SSM_STATE_IDLE, + SSM_STATE_CONNECTED, + SSM_STATE_CLOSING, + SSM_STATE_CLOSED +}; + +/* + * Describes the rdma buffer managed by client and used for his rdma writes + * Rdma info has to be sent in OPEN_RESP message to the client. 
+ */ +struct ibtrs_rcv_buf { + dma_addr_t rdma_addr; + void*buf; +}; + +/* to indicate that memory chunk was not allocated from a N-order contiguous + * pages area + */ +#define IBTRS_MEM_CHUNK_NOORDER -1 + +struct ibtrs_mem_chunk { + struct list_headlist; + int order; + void*addr; +}; + +struct ibtrs_rcv_buf_pool { + struct list_headlist; + struct list_headchunk_list; + struct ibtrs_rcv_buf*rcv_bufs; +}; + +struct ibtrs_stats_wc_comp { + atomic_tmax_wc_cnt; + atomic64_t calls; + atomic64_t total_wc_cnt; +}; + +struct ibtrs_srv_stats_rdma_stats { + atomic64_t cnt_read; + atomic64_t size_total_read; + atomic64_t cnt_write; + atomic64_t size_total_write; + + atomic_tinflight; + atomic64_t inflight_total; +}; + +struct ibtrs_srv_stats_user_ib_msgs { + atomic64_t recv_msg_cnt; + atomic64_t sent_msg_cnt; + atomic64_t recv_size; + atomic64_t sent_size; +}; + +struct ibtrs_srv_stats { + struct ibtrs_srv_stats_rdma_stats rdma_stats; + struct ibtrs_srv_stats_user_ib_msgs user_ib_msgs; + atomic_tapm_cnt; + struct ibtrs_stats_wc_comp wc_comp; +}; + +struct ibtrs_session { + struct list_headlist; + enum ssm_state state; + struct kref kref; + struct workqueue_struct *sm_wq; /* event processing */ + struct
[PATCH 23/28] ibnbd_srv: add abstraction for submit IO to file or block device
From: Jack WangSigned-off-by: Jack Wang Signed-off-by: Kleber Souza Signed-off-by: Danil Kipnis Signed-off-by: Roman Pen --- drivers/block/ibnbd_server/ibnbd_dev.c | 436 + drivers/block/ibnbd_server/ibnbd_dev.h | 149 +++ 2 files changed, 585 insertions(+) create mode 100644 drivers/block/ibnbd_server/ibnbd_dev.c create mode 100644 drivers/block/ibnbd_server/ibnbd_dev.h diff --git a/drivers/block/ibnbd_server/ibnbd_dev.c b/drivers/block/ibnbd_server/ibnbd_dev.c new file mode 100644 index 000..5f6b453 --- /dev/null +++ b/drivers/block/ibnbd_server/ibnbd_dev.c @@ -0,0 +1,436 @@ +/* + * InfiniBand Network Block Driver + * + * Copyright (c) 2014 - 2017 ProfitBricks GmbH. All rights reserved. + * Authors: Fabian Holler < m...@fholler.de> + * Jack Wang + * Kleber Souza + * Danil Kipnis + * Roman Pen + * Milind Dumbare + * + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + *notice, this list of conditions, and the following disclaimer, + *without modification. + * 2. Redistributions in binary form must reproduce at minimum a disclaimer + *substantially similar to the "NO WARRANTY" disclaimer below + *("Disclaimer") and any redistribution must be conditioned upon + *including a substantially similar Disclaimer requirement for further + *binary redistribution. + * 3. Neither the names of the above-listed copyright holders nor the names + *of any contributors may be used to endorse or promote products derived + *from this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. 
+ * + * NO WARRANTY + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGES. + * + */ + +#include "ibnbd_dev.h" +#include "ibnbd_srv_log.h" + +#define IBNBD_DEV_MAX_FILEIO_ACTIVE_WORKERS 0 + +struct ibnbd_dev_file_io_work { + struct ibnbd_dev*dev; + void*priv; + + sector_tsector; + void*data; + size_t len; + size_t bi_size; + enum ibnbd_io_flags flags; + + struct work_struct work; +}; + +struct ibnbd_dev_blk_io { + struct ibnbd_dev *dev; + void *priv; +}; + +static struct workqueue_struct *fileio_wq; + +int ibnbd_dev_init(void) +{ + fileio_wq = alloc_workqueue("%s", WQ_UNBOUND, + IBNBD_DEV_MAX_FILEIO_ACTIVE_WORKERS, + "ibnbd_server_fileio_wq"); + if (!fileio_wq) + return -ENOMEM; + + return 0; +} + +void ibnbd_dev_destroy(void) +{ + destroy_workqueue(fileio_wq); +} + +static inline struct block_device *ibnbd_dev_open_bdev(const char *path, + fmode_t flags) +{ + return blkdev_get_by_path(path, flags, THIS_MODULE); +} + +static int ibnbd_dev_blk_open(struct ibnbd_dev *dev, const char *path, + fmode_t flags) +{ + dev->bdev = ibnbd_dev_open_bdev(path, flags); + return PTR_ERR_OR_ZERO(dev->bdev); +} + +static int ibnbd_dev_vfs_open(struct ibnbd_dev *dev, const char *path, + fmode_t flags) +{ + int oflags = O_DSYNC; /* enable write-through */ + + if (flags & FMODE_WRITE) + 
oflags |= O_RDWR; + else if (flags & FMODE_READ) + oflags |= O_RDONLY; + else + return -EINVAL; + + dev->file = filp_open(path, oflags, 0); + return PTR_ERR_OR_ZERO(dev->file); +} + +struct ibnbd_dev *ibnbd_dev_open(const char
[PATCH 19/28] ibnbd_clt: add log helpers
From: Jack WangSigned-off-by: Jack Wang Signed-off-by: Kleber Souza Signed-off-by: Danil Kipnis Signed-off-by: Roman Pen --- drivers/block/ibnbd_client/ibnbd_clt_log.h | 79 ++ 1 file changed, 79 insertions(+) create mode 100644 drivers/block/ibnbd_client/ibnbd_clt_log.h diff --git a/drivers/block/ibnbd_client/ibnbd_clt_log.h b/drivers/block/ibnbd_client/ibnbd_clt_log.h new file mode 100644 index 000..b3184b7 --- /dev/null +++ b/drivers/block/ibnbd_client/ibnbd_clt_log.h @@ -0,0 +1,79 @@ +/* + * InfiniBand Network Block Driver + * + * Copyright (c) 2014 - 2017 ProfitBricks GmbH. All rights reserved. + * Authors: Fabian Holler < m...@fholler.de> + * Jack Wang + * Kleber Souza + * Danil Kipnis + * Roman Pen + * Milind Dumbare + * + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + *notice, this list of conditions, and the following disclaimer, + *without modification. + * 2. Redistributions in binary form must reproduce at minimum a disclaimer + *substantially similar to the "NO WARRANTY" disclaimer below + *("Disclaimer") and any redistribution must be conditioned upon + *including a substantially similar Disclaimer requirement for further + *binary redistribution. + * 3. Neither the names of the above-listed copyright holders nor the names + *of any contributors may be used to endorse or promote products derived + *from this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. 
+ * + * NO WARRANTY + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGES. + * + */ + +#ifndef __IBNBD_CLT_LOG_H__ +#define __IBNBD_CLT_LOG_H__ + +#include "../ibnbd_inc/log.h" + +#define blkdev_name(dev) ((dev->gd == NULL) ? "" : dev->gd->disk_name) + +#define ERR(dev, fmt, ...) pr_err("ibnbd L%d <%s@%s> %s ERR: " fmt,\ + __LINE__, dev->pathname, ibnbd_prefix(dev),\ + blkdev_name(dev), ##__VA_ARGS__) + +#define ERR_RL(dev, fmt, ...) pr_err_ratelimited("ibnbd L%d <%s@%s> %s ERR: "\ + fmt, __LINE__, dev->pathname,\ + ibnbd_prefix(dev), blkdev_name(dev),\ + ##__VA_ARGS__) + +#define WRN(dev, fmt, ...) pr_warn("ibnbd L%d <%s@%s> %s WARN: " fmt,\ + __LINE__, dev->pathname, ibnbd_prefix(dev),\ + blkdev_name(dev), ##__VA_ARGS__) + +#define WRN_RL(dev, fmt, ...) pr_warn_ratelimited("ibnbd L%d <%s@%s> %s WARN: "\ + fmt, __LINE__, dev->pathname, ibnbd_prefix(dev),\ + blkdev_name(dev), ##__VA_ARGS__) + +#define INFO(dev, fmt, ...) pr_info("ibnbd <%s@%s> %s: " \ + fmt, dev->pathname, ibnbd_prefix(dev),\ + blkdev_name(dev), ##__VA_ARGS__) + +#define INFO_RL(dev, fmt, ...) pr_info_ratelimited("ibnbd <%s@%s> %s: " \ + fmt, dev->pathname, ibnbd_prefix(dev),\ + blkdev_name(dev), ##__VA_ARGS__) + +#endif /*__IBNBD_CLT_LOG_H__*/ -- 2.7.4
[PATCH 27/28] ibnbd: add doc for how to use ibnbd and sysfs interface
From: Jack WangSigned-off-by: Jack Wang --- Documentation/IBNBD.txt | 284 1 file changed, 284 insertions(+) create mode 100644 Documentation/IBNBD.txt diff --git a/Documentation/IBNBD.txt b/Documentation/IBNBD.txt new file mode 100644 index 000..f7f490a --- /dev/null +++ b/Documentation/IBNBD.txt @@ -0,0 +1,284 @@ +Infiniband Network Block Device (IBNBD) +=== + +Introduction + + +IBNBD (InfiniBand Network Block Device) is a pair of kernel modules (client and +server) that allows access to a remote storage device on the server from +clients via an InfiniBand network. +Mapped storage devices appear transparent to the client, acting like any other +regular storage device. + +The data transport between client and server over the InfiniBand network +is performed by the IBTRS (InfiniBand Transport) kernel modules. + +The administration of these modules is done via sysfs. A command-line tool +(ibnbd-cli) is also available for a more user-friendly experience. + +Requirements + + - IBTRS kernel modules (available as git-submodule) + +Quick Start +--- +Server: + # insmod ibtrs/ibtrs_server/ibtrs_server.ko + # insmod ibnbd_server/ibnbd_server.ko + +Client: + # insmod ibtrs/ibtrs_client/ibtrs_client.ko + # insmod ibnbd_client/ibnbd_client.ko + # echo "server= device_path=" > /sys/kernel/ibnbd/map_device + +The block device will become available on the client as +/dev/ibnbd. It can be used like a local block device. + +Client Userspace Interface +-- +This chapter describes only the most important files of the userspace interface. +Full documentation can be found in the Architecture Documentation. + +All sysfs files that are not read-only will return usage information if they +are read. 
+ +example: + $ cat /sys/kernel/ibnbd/map_device + + +/sys/kernel/ibnbd/ entries +~~ + +map_device (RW) +^^^ +To map a volume on the client, information about the device has to be written +to: + /sys/kernel/ibnbd/map_device + +The format of the input is: + "server= device_path= + [access_mode= /sys/kernel/ibnbd/map_device + # echo "server=ip:10.50.100.64 device_path=3F2504E0-4F89-41D3-9A0C-0305E82C3301" > /sys/kernel/ibnbd/map_device + +Finding device file after mapping ++ +After mapping, the device file can be found by: +1.) The symlink /sys/kernel/ibnbd/devices/ points to +/sys/block/. +The last part of the symlink destination is the same as the device name. +By extracting the last part of the path the path to the device +/dev/ can be built. +2.) /dev/block/$(cat /sys/kernel/ibnbd/devices//dev) + +How to find the of the device is described in the next chapter +(devices/ directory). + +devices/ (DIRECTORY)
[PATCH 28/28] MAINTAINERS: Add maintainer for IBNBD/IBTRS
From: Jack WangSigned-off-by: Jack Wang --- MAINTAINERS | 14 ++ 1 file changed, 14 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index c776906..12a528a 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6263,6 +6263,20 @@ IBM ServeRAID RAID DRIVER S: Orphan F: drivers/scsi/ips.* +IBTRS TRANSPORT DRIVERS +M: Jack Wang +L: linux-r...@vger.kernel.org +S: Maintained +F: include/linux/ibtrs*.h +F: drivers/infiniband/ulp/ibtrs* + +IBNBD BLOCK DRIVERS +M: Jack Wang +L: linux-r...@vger.kernel.org +S: Maintained +F: Documentation/IBNBD.txt +F: drivers/block/ibnbd* + ICH LPC AND GPIO DRIVER M: Peter Tyser S: Maintained -- 2.7.4
[PATCH 25/28] ibnbd_srv: add sysfs interface
From: Jack WangSigned-off-by: Jack Wang Signed-off-by: Kleber Souza Signed-off-by: Danil Kipnis Signed-off-by: Roman Pen --- drivers/block/ibnbd_server/ibnbd_srv_sysfs.c | 317 +++ drivers/block/ibnbd_server/ibnbd_srv_sysfs.h | 64 ++ 2 files changed, 381 insertions(+) create mode 100644 drivers/block/ibnbd_server/ibnbd_srv_sysfs.c create mode 100644 drivers/block/ibnbd_server/ibnbd_srv_sysfs.h diff --git a/drivers/block/ibnbd_server/ibnbd_srv_sysfs.c b/drivers/block/ibnbd_server/ibnbd_srv_sysfs.c new file mode 100644 index 000..8774abe --- /dev/null +++ b/drivers/block/ibnbd_server/ibnbd_srv_sysfs.c @@ -0,0 +1,317 @@ +/* + * InfiniBand Network Block Driver + * + * Copyright (c) 2014 - 2017 ProfitBricks GmbH. All rights reserved. + * Authors: Fabian Holler < m...@fholler.de> + * Jack Wang + * Kleber Souza + * Danil Kipnis + * Roman Pen + * Milind Dumbare + * + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + *notice, this list of conditions, and the following disclaimer, + *without modification. + * 2. Redistributions in binary form must reproduce at minimum a disclaimer + *substantially similar to the "NO WARRANTY" disclaimer below + *("Disclaimer") and any redistribution must be conditioned upon + *including a substantially similar Disclaimer requirement for further + *binary redistribution. + * 3. Neither the names of the above-listed copyright holders nor the names + *of any contributors may be used to endorse or promote products derived + *from this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. 
+ * + * NO WARRANTY + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGES. + * + */ + +#include +#include +#include +#include +#include +#include + +#include "../ibnbd_inc/ibnbd.h" +#include "ibnbd_srv.h" +#include "ibnbd_srv_log.h" +#include "ibnbd_srv_sysfs.h" + +static struct kobject *ibnbd_srv_kobj; +static struct kobject *ibnbd_srv_devices_kobj; +#define IBNBD_SYSFS_DIR "ibnbd" +static char ibnbd_sysfs_dir[64] = IBNBD_SYSFS_DIR; + +static ssize_t ibnbd_srv_revalidate_dev_show(struct kobject *kobj, +struct kobj_attribute *attr, +char *page) +{ + return scnprintf(page, PAGE_SIZE, +"Usage: echo 1 > %s\n", attr->attr.name); +} + +static ssize_t ibnbd_srv_revalidate_dev_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int ret; + struct ibnbd_srv_dev *dev = container_of(kobj, struct ibnbd_srv_dev, +dev_kobj); + + if (!sysfs_streq(buf, "1")) { + ERR_NP("%s: invalid value: '%s'\n", attr->attr.name, buf); + return -EINVAL; + } + ret = ibnbd_srv_revalidate_dev(dev); + if (ret) + return ret; + + return count; +} + +static struct kobj_attribute ibnbd_srv_revalidate_dev_attr = + __ATTR(revalidate, + 0644, + ibnbd_srv_revalidate_dev_show, + ibnbd_srv_revalidate_dev_store); + +static struct attribute 
*ibnbd_srv_default_dev_attrs[] = { + _srv_revalidate_dev_attr.attr, + NULL, +}; + +static struct attribute_group ibnbd_srv_default_dev_attr_group = { + .attrs =
[PATCH 21/28] ibnbd_srv: add header shared in ibnbd_server
From: Jack WangSigned-off-by: Jack Wang Signed-off-by: Kleber Souza Signed-off-by: Danil Kipnis Signed-off-by: Roman Pen --- drivers/block/ibnbd_server/ibnbd_srv.h | 115 + 1 file changed, 115 insertions(+) create mode 100644 drivers/block/ibnbd_server/ibnbd_srv.h diff --git a/drivers/block/ibnbd_server/ibnbd_srv.h b/drivers/block/ibnbd_server/ibnbd_srv.h new file mode 100644 index 000..764a31f --- /dev/null +++ b/drivers/block/ibnbd_server/ibnbd_srv.h @@ -0,0 +1,115 @@ +/* + * InfiniBand Network Block Driver + * + * Copyright (c) 2014 - 2017 ProfitBricks GmbH. All rights reserved. + * Authors: Fabian Holler < m...@fholler.de> + * Jack Wang + * Kleber Souza + * Danil Kipnis + * Roman Pen + * Milind Dumbare + * + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + *notice, this list of conditions, and the following disclaimer, + *without modification. + * 2. Redistributions in binary form must reproduce at minimum a disclaimer + *substantially similar to the "NO WARRANTY" disclaimer below + *("Disclaimer") and any redistribution must be conditioned upon + *including a substantially similar Disclaimer requirement for further + *binary redistribution. + * 3. Neither the names of the above-listed copyright holders nor the names + *of any contributors may be used to endorse or promote products derived + *from this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. 
+ * + * NO WARRANTY + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGES. + * + */ + +#ifndef _IBNBD_SRV_H +#define _IBNBD_SRV_H + +#include +#include +#include +#include "../ibnbd_inc/ibnbd.h" +#include "../ibnbd_inc/ibnbd-proto.h" +#include + +enum sess_state { + SESS_STATE_CONNECTED, + SESS_STATE_DISCONNECTED +}; + +struct ibnbd_srv_session { + struct list_headlist; /* for the global sess_list */ + struct ibtrs_session*ibtrs_sess; + charstr_addr[IBTRS_ADDRLEN]; + charhostname[MAXHOSTNAMELEN]; + int queue_depth; + enum sess_state state; + struct bio_set *sess_bio_set; + + rwlock_tindex_lock cacheline_aligned; + struct idr index_idr; + struct mutexlock; /* protects sess_dev_list */ + struct list_headsess_dev_list; /* list of struct ibnbd_srv_sess_dev */ + u8 ver; /* IBNBD protocol version */ +}; + +struct ibnbd_srv_dev { + struct list_headlist; /* global dev_list */ + + struct kobject dev_kobj; + struct kobject dev_clients_kobj; + + struct kref kref; + charid[NAME_MAX]; + + struct mutexlock; /* protects sess_dev_list and open_write_cnt */ + struct list_headsess_dev_list; /* list of struct ibnbd_srv_sess_dev */ + int open_write_cnt; + enum ibnbd_io_mode mode; +}; + +struct ibnbd_srv_sess_dev { + struct list_headdev_list; /* for struct ibnbd_srv_dev->sess_dev_list 
*/ + struct list_headsess_list; /* for struct ibnbd_srv_session->sess_dev_list */ + + struct ibnbd_dev*ibnbd_dev; + struct ibnbd_srv_session*sess; + struct ibnbd_srv_dev*dev; + struct kobject kobj; + struct completion
[PATCH 17/28] ibnbd_clt: add header shared in ibnbd_client
From: Jack WangSigned-off-by: Jack Wang Signed-off-by: Kleber Souza Signed-off-by: Danil Kipnis Signed-off-by: Roman Pen --- drivers/block/ibnbd_client/ibnbd_clt.h | 231 + 1 file changed, 231 insertions(+) create mode 100644 drivers/block/ibnbd_client/ibnbd_clt.h diff --git a/drivers/block/ibnbd_client/ibnbd_clt.h b/drivers/block/ibnbd_client/ibnbd_clt.h new file mode 100644 index 000..3f0db78 --- /dev/null +++ b/drivers/block/ibnbd_client/ibnbd_clt.h @@ -0,0 +1,231 @@ +/* + * InfiniBand Network Block Driver + * + * Copyright (c) 2014 - 2017 ProfitBricks GmbH. All rights reserved. + * Authors: Fabian Holler < m...@fholler.de> + * Jack Wang + * Kleber Souza + * Danil Kipnis + * Roman Pen + * Milind Dumbare + * + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + *notice, this list of conditions, and the following disclaimer, + *without modification. + * 2. Redistributions in binary form must reproduce at minimum a disclaimer + *substantially similar to the "NO WARRANTY" disclaimer below + *("Disclaimer") and any redistribution must be conditioned upon + *including a substantially similar Disclaimer requirement for further + *binary redistribution. + * 3. Neither the names of the above-listed copyright holders nor the names + *of any contributors may be used to endorse or promote products derived + *from this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. 
+ * + * NO WARRANTY + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGES. + * + */ + +#ifndef _IBNBD_CLT_H +#define _IBNBD_CLT_H +#include +#include /* for wait_queue_head_t */ +#include /* for sockaddr_in */ +#include /* for sockaddr_in */ +#include +#include "ibnbd_clt_log.h" +#include "../ibnbd_inc/ibnbd.h" +#include "../ibnbd_inc/ibnbd-proto.h" /* ibnbd protocol messages */ +#include /* for ibtrs api */ +#include + +#define IP_PREFIX "ip:" +#define IP_PREFIX_LEN strlen(IP_PREFIX) +#define GID_PREFIX "gid:" +#define GID_PREFIX_LEN strlen(GID_PREFIX) + +#define BMAX_SEGMENTS 31 +#define RECONNECT_DELAY 30 +#define MAX_RECONNECTS -1 + +enum ibnbd_dev_state { + DEV_STATE_INIT, + DEV_STATE_INIT_CLOSED, + DEV_STATE_CLOSED, + DEV_STATE_UNMAPPED, + DEV_STATE_OPEN +}; + +enum ibnbd_queue_mode { + BLK_MQ, + BLK_RQ +}; + +struct ibnbd_iu { + struct request *rq; + struct ibtrs_tag*tag; + struct ibnbd_dev*dev; + struct ibnbd_msg_io msg; + int errno; + struct scatterlist sglist[BMAX_SEGMENTS]; +}; + +struct ibnbd_cpu_qlist { + struct list_headrequeue_list; + spinlock_t requeue_lock; + unsigned intcpu; +}; + +enum sess_state { + SESS_STATE_READY, + SESS_STATE_DISCONNECTED, + SESS_STATE_DESTROYED, +}; + +struct ibnbd_session { + struct list_headlist; + struct 
ibtrs_session*sess; + struct ibnbd_cpu_qlist __percpu + *cpu_queues; + DECLARE_BITMAP(cpu_queues_bm, NR_CPUS); + int __percpu*cpu_rr; /* per-cpu var for CPU round-robin */ + atomic_tbusy; + int queue_depth; + u32 max_io_size; + struct blk_mq_tag_set tag_set; + struct mutexlock; /* protects state and devs_list */ + struct list_headdevs_list; /* list of struct ibnbd_dev */ +
[PATCH 26/28] ibnbd_srv: add Makefile and Kconfig
From: Jack WangSigned-off-by: Jack Wang --- drivers/block/Kconfig | 1 + drivers/block/Makefile | 1 + drivers/block/ibnbd_server/Kconfig | 16 drivers/block/ibnbd_server/Makefile | 3 +++ 4 files changed, 21 insertions(+) create mode 100644 drivers/block/ibnbd_server/Kconfig create mode 100644 drivers/block/ibnbd_server/Makefile diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig index c309e57..e4823c4 100644 --- a/drivers/block/Kconfig +++ b/drivers/block/Kconfig @@ -276,6 +276,7 @@ config BLK_DEV_CRYPTOLOOP source "drivers/block/drbd/Kconfig" source "drivers/block/ibnbd_client/Kconfig" +source "drivers/block/ibnbd_server/Kconfig" config BLK_DEV_NBD tristate "Network block device support" diff --git a/drivers/block/Makefile b/drivers/block/Makefile index 7da1813..cd20888 100644 --- a/drivers/block/Makefile +++ b/drivers/block/Makefile @@ -35,6 +35,7 @@ obj-$(CONFIG_BLK_DEV_HD) += hd.o obj-$(CONFIG_XEN_BLKDEV_FRONTEND) += xen-blkfront.o obj-$(CONFIG_XEN_BLKDEV_BACKEND) += xen-blkback/ obj-$(CONFIG_BLK_DEV_IBNBD_CLT)+= ibnbd_client/ +obj-$(CONFIG_BLK_DEV_IBNBD_SRV)+= ibnbd_server/ obj-$(CONFIG_BLK_DEV_DRBD) += drbd/ obj-$(CONFIG_BLK_DEV_RBD) += rbd.o obj-$(CONFIG_BLK_DEV_PCIESSD_MTIP32XX) += mtip32xx/ diff --git a/drivers/block/ibnbd_server/Kconfig b/drivers/block/ibnbd_server/Kconfig new file mode 100644 index 000..943e1b2 --- /dev/null +++ b/drivers/block/ibnbd_server/Kconfig @@ -0,0 +1,16 @@ +config BLK_DEV_IBNBD_SRV + tristate "Network block device over Infiniband server support" + depends on INFINIBAND_IBTRS_SRV + ---help--- + Saying Y here will allow your computer to be a server for network + block devices over Infiniband, i.e. it will be able to export + its block devices to clients (which can mount file systems on them etc.). + Communication between client and server works over Infiniband + networking, but to the client program this is hidden: + it looks like a regular local file access to a block device + special file such as /dev/ibnbd0. 
+ + To compile this driver as a module, choose M here: the + module will be called ibnbd_server. + + If unsure, say N. diff --git a/drivers/block/ibnbd_server/Makefile b/drivers/block/ibnbd_server/Makefile new file mode 100644 index 000..e66860f --- /dev/null +++ b/drivers/block/ibnbd_server/Makefile @@ -0,0 +1,3 @@ +obj-$(CONFIG_BLK_DEV_IBNBD_SRV) += ibnbd_server.o +ibnbd_server-objs := ibnbd_srv.o ibnbd_srv_sysfs.o ibnbd_dev.o \ + ../ibnbd_lib/ibnbd.o ../ibnbd_lib/ibnbd-proto.o -- 2.7.4
[PATCH 24/28] ibnbd_srv: add log helpers
From: Jack WangSigned-off-by: Jack Wang Signed-off-by: Kleber Souza Signed-off-by: Danil Kipnis --- drivers/block/ibnbd_server/ibnbd_srv_log.h | 69 ++ 1 file changed, 69 insertions(+) create mode 100644 drivers/block/ibnbd_server/ibnbd_srv_log.h diff --git a/drivers/block/ibnbd_server/ibnbd_srv_log.h b/drivers/block/ibnbd_server/ibnbd_srv_log.h new file mode 100644 index 000..9217804 --- /dev/null +++ b/drivers/block/ibnbd_server/ibnbd_srv_log.h @@ -0,0 +1,69 @@ +/* + * InfiniBand Network Block Driver + * + * Copyright (c) 2014 - 2017 ProfitBricks GmbH. All rights reserved. + * Authors: Fabian Holler < m...@fholler.de> + * Jack Wang + * Kleber Souza + * Danil Kipnis + * Roman Pen + * Milind Dumbare + * + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + *notice, this list of conditions, and the following disclaimer, + *without modification. + * 2. Redistributions in binary form must reproduce at minimum a disclaimer + *substantially similar to the "NO WARRANTY" disclaimer below + *("Disclaimer") and any redistribution must be conditioned upon + *including a substantially similar Disclaimer requirement for further + *binary redistribution. + * 3. Neither the names of the above-listed copyright holders nor the names + *of any contributors may be used to endorse or promote products derived + *from this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. 
+ * + * NO WARRANTY + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGES. + * + */ + +#ifndef __IBNBD_SRV_LOG_H__ +#define __IBNBD_SRV_LOG_H__ + +#include "../ibnbd_inc/log.h" + +#define ERR(dev, fmt, ...) pr_err("ibnbd L%d <%s@%s> ERR: " fmt, \ + __LINE__, dev->pathname, ibnbd_prefix(dev),\ + ##__VA_ARGS__) +#define ERR_RL(dev, fmt, ...) pr_err_ratelimited("ibnbd L%d <%s@%s> ERR: " fmt,\ + __LINE__, dev->pathname, ibnbd_prefix(dev),\ + ##__VA_ARGS__) +#define WRN(dev, fmt, ...) pr_warn("ibnbd L%d <%s@%s> WARN: " fmt,\ + __LINE__, dev->pathname, ibnbd_prefix(dev),\ + ##__VA_ARGS__) +#define WRN_RL(dev, fmt, ...) pr_warn_ratelimited("ibnbd L%d <%s@%s> WARN: " \ + fmt, __LINE__, dev->pathname, ibnbd_prefix(dev),\ + ##__VA_ARGS__) +#define INFO(dev, fmt, ...) pr_info("ibnbd <%s@%s>: " \ + fmt, dev->pathname, ibnbd_prefix(dev), ##__VA_ARGS__) +#define INFO_RL(dev, fmt, ...) pr_info_ratelimited("ibnbd <%s@%s>: " \ + fmt, dev->pathname, ibnbd_prefix(dev), ##__VA_ARGS__) + +#endif /*__IBNBD_SRV_LOG_H__*/ -- 2.7.4
[PATCH 22/28] ibnbd_srv: add main functionality
From: Jack Wang Processes incoming IO from the ibtrs server and hands it down to the underlying block device. Signed-off-by: Jack Wang Signed-off-by: Kleber Souza Signed-off-by: Danil Kipnis Signed-off-by: Roman Pen --- drivers/block/ibnbd_server/ibnbd_srv.c | 1074 1 file changed, 1074 insertions(+) create mode 100644 drivers/block/ibnbd_server/ibnbd_srv.c diff --git a/drivers/block/ibnbd_server/ibnbd_srv.c b/drivers/block/ibnbd_server/ibnbd_srv.c new file mode 100644 index 000..13832b6 --- /dev/null +++ b/drivers/block/ibnbd_server/ibnbd_srv.c @@ -0,0 +1,1074 @@ +/* + * InfiniBand Network Block Driver + * + * Copyright (c) 2014 - 2017 ProfitBricks GmbH. All rights reserved. + * Authors: Fabian Holler < m...@fholler.de> + * Jack Wang + * Kleber Souza + * Danil Kipnis + * Roman Pen + * Milind Dumbare + * + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + *notice, this list of conditions, and the following disclaimer, + *without modification. + * 2. Redistributions in binary form must reproduce at minimum a disclaimer + *substantially similar to the "NO WARRANTY" disclaimer below + *("Disclaimer") and any redistribution must be conditioned upon + *including a substantially similar Disclaimer requirement for further + *binary redistribution. + * 3. Neither the names of the above-listed copyright holders nor the names + *of any contributors may be used to endorse or promote products derived + *from this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. 
+ * + * NO WARRANTY + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGES. + * + */ + +#include +#include +#include +#include +#include "../ibnbd_inc/ibnbd-proto.h" +#include +#include "../ibnbd_inc/ibnbd.h" +#include "ibnbd_srv.h" +#include "ibnbd_srv_log.h" +#include "ibnbd_srv_sysfs.h" +#include "ibnbd_dev.h" + +MODULE_AUTHOR("ib...@profitbricks.com"); +MODULE_VERSION(__stringify(IBNBD_VER)); +MODULE_DESCRIPTION("InfiniBand Network Block Device Server"); +MODULE_LICENSE("GPL"); + +#define DEFAULT_DEV_SEARCH_PATH "/" + +static char dev_search_path[PATH_MAX] = DEFAULT_DEV_SEARCH_PATH; + +static int dev_search_path_set(const char *val, const struct kernel_param *kp) +{ + char *dup; + + if (strlen(val) >= sizeof(dev_search_path)) + return -EINVAL; + + dup = kstrdup(val, GFP_KERNEL); + + if (dup[strlen(dup) - 1] == '\n') + dup[strlen(dup) - 1] = '\0'; + + strlcpy(dev_search_path, dup, sizeof(dev_search_path)); + + kfree(dup); + INFO_NP("dev_search_path changed to '%s'\n", dev_search_path); + + return 0; +} + +static struct kparam_string dev_search_path_kparam_str = { + .maxlen = sizeof(dev_search_path), + .string = dev_search_path +}; + +static const struct kernel_param_ops dev_search_path_ops = { + .set= dev_search_path_set, + .get= param_get_string, +}; + 
+module_param_cb(dev_search_path, _search_path_ops, + _search_path_kparam_str, 0444); +MODULE_PARM_DESC(dev_search_path, "Sets the device_search_path." +" When a device is mapped this path is prepended to the" +" device_path from the map_device operation." +" (default: " DEFAULT_DEV_SEARCH_PATH ")"); + +static int def_io_mode = IBNBD_BLOCKIO; +module_param(def_io_mode, int, 0444); +MODULE_PARM_DESC(def_io_mode, "By default, export devices in" +" blockio(" __stringify(_IBNBD_BLOCKIO) ") or" +" fileio(" __stringify(_IBNBD_FILEIO) ") mode." +
[PATCH 09/28] ibtrs_srv: add header file for exported interface
From: Jack WangSigned-off-by: Jack Wang Signed-off-by: Kleber Souza Signed-off-by: Danil Kipnis Signed-off-by: Roman Pen --- include/rdma/ibtrs_srv.h | 206 +++ 1 file changed, 206 insertions(+) create mode 100644 include/rdma/ibtrs_srv.h diff --git a/include/rdma/ibtrs_srv.h b/include/rdma/ibtrs_srv.h new file mode 100644 index 000..dbd535f --- /dev/null +++ b/include/rdma/ibtrs_srv.h @@ -0,0 +1,206 @@ +/* + * InfiniBand Transport Layer + * + * Copyright (c) 2014 - 2017 ProfitBricks GmbH. All rights reserved. + * Authors: Fabian Holler < m...@fholler.de> + * Jack Wang + * Kleber Souza + * Danil Kipnis + * Roman Pen + * Milind Dumbare + * + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + *notice, this list of conditions, and the following disclaimer, + *without modification. + * 2. Redistributions in binary form must reproduce at minimum a disclaimer + *substantially similar to the "NO WARRANTY" disclaimer below + *("Disclaimer") and any redistribution must be conditioned upon + *including a substantially similar Disclaimer requirement for further + *binary redistribution. + * 3. Neither the names of the above-listed copyright holders nor the names + *of any contributors may be used to endorse or promote products derived + *from this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * NO WARRANTY + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGES. + * + */ + +#ifndef _IBTRS_SRV_H +#define _IBTRS_SRV_H + +#include + +struct ibtrs_session; +struct ibtrs_ops_id; + +enum ibtrs_srv_rdma_ev { + IBTRS_SRV_RDMA_EV_RECV, + IBTRS_SRV_RDMA_EV_WRITE_REQ, +}; + +/** + * enum ibtrs_srv_sess_ev - Session events + * @IBTRS_SRV_SESS_EV_CONNECTED: Connection from client established + * @IBTRS_SRV_SESS_EV_DISCONNECTING: Connection is currently disconnected, + * sending data through the connection may + * fail, but could still recv messages. + * @IBTRS_SRV_SESS_EV_DISCONNECTED:Connection was disconnected, all + * connection IBTRS resources were freed. + */ + +enum ibtrs_srv_sess_ev { + IBTRS_SRV_SESS_EV_CONNECTED, + IBTRS_SRV_SESS_EV_DISCONNECTING, + IBTRS_SRV_SESS_EV_DISCONNECTED, +}; + +/** + * ibtrs_srv_ops - Callbacks for ibtrs_server + * @owner: module that uses ibtrs_server + * @rdma_ev: Event notification for RDMA operations + * If the callback returns a value != 0, an error message + * for the data transfer will be sent to the client. + + * @sess: Session + * @priv: Private data from user + * @id:internal IBTRS id + * @ev:Event + * @data: Data received by the client. The message of the user of + * ibtrs_client is allocated at the end of the buffer. + * Before the message the data of the ibtrs_client is + * located. + * If the event is %IBTRS_SRV_RDMA_EV_WRITE_REQ, the user + * can write his response into @data. When + * ibtrs_srv_resp_rdma() is called, this @data will be + * transferred to the client. 
+ * @len: length of data in @data + + * @sess_ev: Events about connective state changes + * If the callback returns != 0 and the event + *
[PATCH 10/28] ibtrs_srv: add main functionality for ibtrs_server
From: Jack Wang The service accepts connection requests from clients and reserves memory for them. It executes RDMA transfers and hands over received data to ibnbd_server. Signed-off-by: Jack Wang Signed-off-by: Kleber Souza Signed-off-by: Danil Kipnis Signed-off-by: Roman Pen --- drivers/infiniband/ulp/ibtrs_server/ibtrs_srv.c | 3744 +++ 1 file changed, 3744 insertions(+) create mode 100644 drivers/infiniband/ulp/ibtrs_server/ibtrs_srv.c diff --git a/drivers/infiniband/ulp/ibtrs_server/ibtrs_srv.c b/drivers/infiniband/ulp/ibtrs_server/ibtrs_srv.c new file mode 100644 index 000..513e90a --- /dev/null +++ b/drivers/infiniband/ulp/ibtrs_server/ibtrs_srv.c @@ -0,0 +1,3744 @@ +/* + * InfiniBand Transport Layer + * + * Copyright (c) 2014 - 2017 ProfitBricks GmbH. All rights reserved. + * Authors: Fabian Holler < m...@fholler.de> + * Jack Wang + * Kleber Souza + * Danil Kipnis + * Roman Pen + * Milind Dumbare + * + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + *notice, this list of conditions, and the following disclaimer, + *without modification. + * 2. Redistributions in binary form must reproduce at minimum a disclaimer + *substantially similar to the "NO WARRANTY" disclaimer below + *("Disclaimer") and any redistribution must be conditioned upon + *including a substantially similar Disclaimer requirement for further + *binary redistribution. + * 3. Neither the names of the above-listed copyright holders nor the names + *of any contributors may be used to endorse or promote products derived + *from this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. 
+ * + * NO WARRANTY + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGES. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include "ibtrs_srv_sysfs.h" +#include "ibtrs_srv_internal.h" +#include +#include + +MODULE_AUTHOR("ib...@profitbricks.com"); +MODULE_DESCRIPTION("InfiniBand Transport Server"); +MODULE_VERSION(__stringify(IBTRS_VER)); +MODULE_LICENSE("GPL"); + +#define DEFAULT_MAX_IO_SIZE_KB 128 +#define DEFAULT_MAX_IO_SIZE (DEFAULT_MAX_IO_SIZE_KB * 1024) +static int max_io_size = DEFAULT_MAX_IO_SIZE; +#define MAX_REQ_SIZE PAGE_SIZE +static int rcv_buf_size = DEFAULT_MAX_IO_SIZE + MAX_REQ_SIZE; + +static int max_io_size_set(const char *val, const struct kernel_param *kp) +{ + int err, ival; + + err = kstrtoint(val, 0, &ival); + if (err) + return err; + + if (ival < 4096 || ival + MAX_REQ_SIZE > (4096 * 1024) || + (ival + MAX_REQ_SIZE) % 512 != 0) { + ERR_NP("Invalid max io size value %d, has to be" + " > %d, < %d\n", ival, 4096, 4194304); + return -EINVAL; + } + + max_io_size = ival; + rcv_buf_size = max_io_size + MAX_REQ_SIZE; + INFO_NP("max io size changed to %d\n", ival); + + return 0; +} + +static const struct kernel_param_ops max_io_size_ops = { + .set= max_io_size_set, + .get= param_get_int, +}; 
+module_param_cb(max_io_size, &max_io_size_ops, &max_io_size, 0444); +MODULE_PARM_DESC(max_io_size, +"Max size for each IO request, when change the unit is in byte" +" (default: " __stringify(DEFAULT_MAX_IO_SIZE_KB) "KB)"); + +#define DEFAULT_SESS_QUEUE_DEPTH 512 +static int sess_queue_depth = DEFAULT_SESS_QUEUE_DEPTH; +module_param_named(sess_queue_depth, sess_queue_depth, int, 0444); +MODULE_PARM_DESC(sess_queue_depth, +"Number of buffers for
[PATCH 08/28] ibtrs_clt: add Makefile and Kconfig
From: Jack WangSigned-off-by: Jack Wang --- drivers/infiniband/Kconfig | 2 ++ drivers/infiniband/ulp/Makefile | 1 + drivers/infiniband/ulp/ibtrs_client/Kconfig | 8 drivers/infiniband/ulp/ibtrs_client/Makefile | 6 ++ 4 files changed, 17 insertions(+) create mode 100644 drivers/infiniband/ulp/ibtrs_client/Kconfig create mode 100644 drivers/infiniband/ulp/ibtrs_client/Makefile diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig index 66f8602..cb1b864 100644 --- a/drivers/infiniband/Kconfig +++ b/drivers/infiniband/Kconfig @@ -85,6 +85,8 @@ source "drivers/infiniband/ulp/srpt/Kconfig" source "drivers/infiniband/ulp/iser/Kconfig" source "drivers/infiniband/ulp/isert/Kconfig" +source "drivers/infiniband/ulp/ibtrs_client/Kconfig" + source "drivers/infiniband/sw/rdmavt/Kconfig" source "drivers/infiniband/sw/rxe/Kconfig" diff --git a/drivers/infiniband/ulp/Makefile b/drivers/infiniband/ulp/Makefile index f3c7dcf..acd8ce6 100644 --- a/drivers/infiniband/ulp/Makefile +++ b/drivers/infiniband/ulp/Makefile @@ -3,3 +3,4 @@ obj-$(CONFIG_INFINIBAND_SRP)+= srp/ obj-$(CONFIG_INFINIBAND_SRPT) += srpt/ obj-$(CONFIG_INFINIBAND_ISER) += iser/ obj-$(CONFIG_INFINIBAND_ISERT) += isert/ +obj-$(CONFIG_INFINIBAND_IBTRS_CLT) += ibtrs_client/ diff --git a/drivers/infiniband/ulp/ibtrs_client/Kconfig b/drivers/infiniband/ulp/ibtrs_client/Kconfig new file mode 100644 index 000..3cf0728 --- /dev/null +++ b/drivers/infiniband/ulp/ibtrs_client/Kconfig @@ -0,0 +1,8 @@ +config INFINIBAND_IBTRS_CLT + tristate "InfiniBand IBTRS CLIENT" + depends on INFINIBAND_ADDR_TRANS + ---help--- + Support for the simplified data transfer over InfiniBand. + This offer API to user module IBNBD_CLIENT + + The IBTRS protocol is defined by the ProfitBricks GmbH. 
diff --git a/drivers/infiniband/ulp/ibtrs_client/Makefile b/drivers/infiniband/ulp/ibtrs_client/Makefile new file mode 100644 index 000..d0fb226 --- /dev/null +++ b/drivers/infiniband/ulp/ibtrs_client/Makefile @@ -0,0 +1,6 @@ + +obj-$(CONFIG_INFINIBAND_IBTRS_CLT) += ibtrs_client.o + +ibtrs_client-y := ibtrs_clt.o ibtrs_clt_sysfs.o \ + ../ibtrs_lib/ibtrs.o ../ibtrs_lib/ibtrs-proto.o ../ibtrs_lib/iu.o \ + ../ibtrs_lib/heartbeat.o ../ibtrs_lib/common.o -- 2.7.4
[PATCH 12/28] ibtrs_srv: add sysfs interface
From: Jack WangSigned-off-by: Jack Wang Signed-off-by: Kleber Souza Signed-off-by: Danil Kipnis Signed-off-by: Roman Pen --- .../infiniband/ulp/ibtrs_server/ibtrs_srv_sysfs.c | 301 + .../infiniband/ulp/ibtrs_server/ibtrs_srv_sysfs.h | 59 2 files changed, 360 insertions(+) create mode 100644 drivers/infiniband/ulp/ibtrs_server/ibtrs_srv_sysfs.c create mode 100644 drivers/infiniband/ulp/ibtrs_server/ibtrs_srv_sysfs.h diff --git a/drivers/infiniband/ulp/ibtrs_server/ibtrs_srv_sysfs.c b/drivers/infiniband/ulp/ibtrs_server/ibtrs_srv_sysfs.c new file mode 100644 index 000..c95a124 --- /dev/null +++ b/drivers/infiniband/ulp/ibtrs_server/ibtrs_srv_sysfs.c @@ -0,0 +1,301 @@ +/* + * InfiniBand Transport Layer + * + * Copyright (c) 2014 - 2017 ProfitBricks GmbH. All rights reserved. + * Authors: Fabian Holler < m...@fholler.de> + * Jack Wang + * Kleber Souza + * Danil Kipnis + * Roman Pen + * Milind Dumbare + * + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + *notice, this list of conditions, and the following disclaimer, + *without modification. + * 2. Redistributions in binary form must reproduce at minimum a disclaimer + *substantially similar to the "NO WARRANTY" disclaimer below + *("Disclaimer") and any redistribution must be conditioned upon + *including a substantially similar Disclaimer requirement for further + *binary redistribution. + * 3. Neither the names of the above-listed copyright holders nor the names + *of any contributors may be used to endorse or promote products derived + *from this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. 
+ * + * NO WARRANTY + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGES. + * + */ + +#include "ibtrs_srv_sysfs.h" +#include "ibtrs_srv_internal.h" +#include +#include +#include + +static struct kobject *ibtrs_srv_kobj; +static struct kobject *ibtrs_srv_sessions_kobj; + +static ssize_t ibtrs_srv_hb_timeout_show(struct kobject *kobj, +struct kobj_attribute *attr, +char *page) +{ + struct ibtrs_session *sess = container_of(kobj, struct ibtrs_session, + kobj); + + return scnprintf(page, PAGE_SIZE, "%u\n", sess->heartbeat.timeout_ms); +} + +static ssize_t ibtrs_srv_hb_timeout_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int ret; + u32 timeout_ms; + struct ibtrs_session *sess = container_of(kobj, struct ibtrs_session, + kobj); + + ret = kstrtouint(buf, 0, &timeout_ms); + if (ret) + return ret; + + ret = ibtrs_heartbeat_timeout_validate(timeout_ms); + if (ret) + return ret; + + INFO(sess, "%s: changing value from %u to %u\n", attr->attr.name, +sess->heartbeat.timeout_ms, timeout_ms); + ibtrs_set_heartbeat_timeout(&sess->heartbeat, timeout_ms); + return count; +} + +static struct kobj_attribute ibtrs_srv_heartbeat_timeout_ms_attr = + __ATTR(heartbeat_timeout_ms, 0644, + ibtrs_srv_hb_timeout_show, ibtrs_srv_hb_timeout_store); + +static 
ssize_t ibtrs_srv_disconnect_show(struct kobject *kobj, +struct kobj_attribute *attr, +char *page) +{ + return scnprintf(page, PAGE_SIZE, "Usage: echo 1 > %s\n", +
[PATCH 07/28] ibtrs_clt: add files for sysfs interface
From: Jack WangSigned-off-by: Jack Wang Signed-off-by: Kleber Souza Signed-off-by: Danil Kipnis Signed-off-by: Roman Pen --- .../infiniband/ulp/ibtrs_client/ibtrs_clt_sysfs.c | 412 + .../infiniband/ulp/ibtrs_client/ibtrs_clt_sysfs.h | 62 2 files changed, 474 insertions(+) create mode 100644 drivers/infiniband/ulp/ibtrs_client/ibtrs_clt_sysfs.c create mode 100644 drivers/infiniband/ulp/ibtrs_client/ibtrs_clt_sysfs.h diff --git a/drivers/infiniband/ulp/ibtrs_client/ibtrs_clt_sysfs.c b/drivers/infiniband/ulp/ibtrs_client/ibtrs_clt_sysfs.c new file mode 100644 index 000..d430af0 --- /dev/null +++ b/drivers/infiniband/ulp/ibtrs_client/ibtrs_clt_sysfs.c @@ -0,0 +1,412 @@ +/* + * InfiniBand Transport Layer + * + * Copyright (c) 2014 - 2017 ProfitBricks GmbH. All rights reserved. + * Authors: Fabian Holler < m...@fholler.de> + * Jack Wang + * Kleber Souza + * Danil Kipnis + * Roman Pen + * Milind Dumbare + * + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + *notice, this list of conditions, and the following disclaimer, + *without modification. + * 2. Redistributions in binary form must reproduce at minimum a disclaimer + *substantially similar to the "NO WARRANTY" disclaimer below + *("Disclaimer") and any redistribution must be conditioned upon + *including a substantially similar Disclaimer requirement for further + *binary redistribution. + * 3. Neither the names of the above-listed copyright holders nor the names + *of any contributors may be used to endorse or promote products derived + *from this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. 
+ * + * NO WARRANTY + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGES. + * + */ + +#include +#include "ibtrs_clt_internal.h" +#include +#include "ibtrs_clt_sysfs.h" +#include +#include +#include + +static struct kobject *sessions_kobj; +static struct kobject *ibtrs_kobj; + +#define MIN_MAX_RECONN_ATT -1 +#define MAX_MAX_RECONN_ATT + +static ssize_t ibtrs_clt_max_reconn_attempts_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *page) +{ + struct ibtrs_session *sess = container_of(kobj, struct ibtrs_session, + kobj); + + return sprintf(page, "%d\n", + ibtrs_clt_get_max_reconnect_attempts(sess)); +} + +static ssize_t ibtrs_clt_max_reconn_attempts_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, + size_t count) +{ + int ret; + s16 value; + struct ibtrs_session *sess = container_of(kobj, struct ibtrs_session, + kobj); + + ret = kstrtos16(buf, 10, &value); + if (unlikely(ret)) { + ERR(sess, "%s: failed to convert string '%s' to int\n", + attr->attr.name, buf); + return ret; + } + if (unlikely(value > MAX_MAX_RECONN_ATT || +value < MIN_MAX_RECONN_ATT)) { + ERR(sess, "%s: invalid range" + " (provided: '%s', accepted: min: %d, max: %d)\n", + attr->attr.name, buf, MIN_MAX_RECONN_ATT, + MAX_MAX_RECONN_ATT); + return -EINVAL; + } + + 
INFO(sess, "%s: changing value from %d to %d\n",
[PATCH 03/28] ibtrs_lib: add common functions shared by client and server
From: Jack WangThese files define functions used by both client and server, eg validate protocol message, heartbeat helpers, etc. Signed-off-by: Jack Wang Signed-off-by: Kleber Souza Signed-off-by: Danil Kipnis Signed-off-by: Roman Pen --- drivers/infiniband/ulp/ibtrs_lib/common.c | 104 +++ drivers/infiniband/ulp/ibtrs_lib/heartbeat.c | 112 +++ drivers/infiniband/ulp/ibtrs_lib/ibtrs-proto.c | 248 +++ drivers/infiniband/ulp/ibtrs_lib/ibtrs.c | 412 + drivers/infiniband/ulp/ibtrs_lib/iu.c | 113 +++ 5 files changed, 989 insertions(+) create mode 100644 drivers/infiniband/ulp/ibtrs_lib/common.c create mode 100644 drivers/infiniband/ulp/ibtrs_lib/heartbeat.c create mode 100644 drivers/infiniband/ulp/ibtrs_lib/ibtrs-proto.c create mode 100644 drivers/infiniband/ulp/ibtrs_lib/ibtrs.c create mode 100644 drivers/infiniband/ulp/ibtrs_lib/iu.c diff --git a/drivers/infiniband/ulp/ibtrs_lib/common.c b/drivers/infiniband/ulp/ibtrs_lib/common.c new file mode 100644 index 000..81affa7 --- /dev/null +++ b/drivers/infiniband/ulp/ibtrs_lib/common.c @@ -0,0 +1,104 @@ +/* + * InfiniBand Transport Layer + * + * Copyright (c) 2014 - 2017 ProfitBricks GmbH. All rights reserved. + * Authors: Fabian Holler < m...@fholler.de> + * Jack Wang + * Kleber Souza + * Danil Kipnis + * Roman Pen + * Milind Dumbare + * + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + *notice, this list of conditions, and the following disclaimer, + *without modification. + * 2. Redistributions in binary form must reproduce at minimum a disclaimer + *substantially similar to the "NO WARRANTY" disclaimer below + *("Disclaimer") and any redistribution must be conditioned upon + *including a substantially similar Disclaimer requirement for further + *binary redistribution. + * 3. 
Neither the names of the above-listed copyright holders nor the names + *of any contributors may be used to endorse or promote products derived + *from this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * NO WARRANTY + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGES. 
+ * + */ + +#include +#include +#include + +u64 timediff_cur_ms(u64 cur_ms) +{ + struct timespec cur = CURRENT_TIME; + struct timespec ts = ns_to_timespec(cur_ms * NSEC_PER_MSEC); + + if (timespec_compare(&cur, &ts) < 0) + return timespec_to_ms(&ts) - timespec_to_ms(&cur); + else + return timespec_to_ms(&cur) - timespec_to_ms(&ts); +} + +/* + * ibtrs_malloc() - allocate kernel or virtual memory + * @size: size to be allocated + * + * The pointer returned must be freed with kvfree() + */ +void *ibtrs_malloc(size_t size) +{ + void *p; + + p = kmalloc(size, (GFP_KERNEL | __GFP_REPEAT)); + if (p) + return p; + + /* try allocating virtual memory */ + p = vmalloc(size); + if (p) + return p; + + return NULL; +} + +/* + * ibtrs_zalloc() - allocate kernel or virtual memory + * @size: size to be allocated + * + * The pointer returned must be freed with kvfree() + */ +void *ibtrs_zalloc(size_t size) +{ + void *p; + + p = kzalloc(size, GFP_KERNEL); + if (p) + return p; + + /* try allocating virtual memory */ + p = vzalloc(size); + if (p) + return p; + + return NULL; +} diff --git a/drivers/infiniband/ulp/ibtrs_lib/heartbeat.c b/drivers/infiniband/ulp/ibtrs_lib/heartbeat.c new file mode 100644 index 000..1575931 --- /dev/null
[PATCH 04/28] ibtrs_clt: add header file for exported interface
From: Jack WangUser module eg ibnbd_client will use this interface to transfer data later. Signed-off-by: Jack Wang Signed-off-by: Kleber Souza Signed-off-by: Danil Kipnis Signed-off-by: Roman Pen --- include/rdma/ibtrs_clt.h | 316 +++ 1 file changed, 316 insertions(+) create mode 100644 include/rdma/ibtrs_clt.h diff --git a/include/rdma/ibtrs_clt.h b/include/rdma/ibtrs_clt.h new file mode 100644 index 000..4fc9b12 --- /dev/null +++ b/include/rdma/ibtrs_clt.h @@ -0,0 +1,316 @@ +/* + * InfiniBand Transport Layer + * + * Copyright (c) 2014 - 2017 ProfitBricks GmbH. All rights reserved. + * Authors: Fabian Holler < m...@fholler.de> + * Jack Wang + * Kleber Souza + * Danil Kipnis + * Roman Pen + * Milind Dumbare + * + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + *notice, this list of conditions, and the following disclaimer, + *without modification. + * 2. Redistributions in binary form must reproduce at minimum a disclaimer + *substantially similar to the "NO WARRANTY" disclaimer below + *("Disclaimer") and any redistribution must be conditioned upon + *including a substantially similar Disclaimer requirement for further + *binary redistribution. + * 3. Neither the names of the above-listed copyright holders nor the names + *of any contributors may be used to endorse or promote products derived + *from this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. 
+ * + * NO WARRANTY + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGES. + * + */ + +#if !defined(IBTRS_CLIENT_H) +#define IBTRS_CLIENT_H + +#include + +struct ibtrs_session; + +/** + * ibtrs_clt_open() - Open a session to a ibtrs_server + * @addr: The IPv4, IPv6 or GID address of the peer + * @pdu_sz: Size of extra payload which can be accessed after tag allocation. + * @priv: Pointer passed back on _clt_ops->sess_ev() invocation + * @max_inflight_msg: Max. number of parallel inflight messages for the session + * @max_segments: Max. number of segments per IO request + * @reconnect_delay_sec: time between reconnect tries + * @max_reconnect_attempts: Number of times to reconnect on error before giving + * up, 0 for * disabled, -1 for forever + * + * Starts session establishment with the ibtrs_server. The function can block + * up to ~2000ms until it returns. + * + * Return a valid pointer on success otherwise PTR_ERR. + * -EINVAL:The provided addr could not be resolved to an Infiniband + * address, the route to the host could not be resolved or + * ibtrs_clt_register() was not called before. 
+ */ +struct ibtrs_session *ibtrs_clt_open(const struct sockaddr_storage *addr, +size_t pdu_sz, void *priv, +u8 reconnect_delay_sec, u16 max_segments, +s16 max_reconnect_attempts); + +/** + * ibtrs_clt_close() - Close a session + * @sess: Session handler, is freed on return + */ +int ibtrs_clt_close(struct ibtrs_session *sess); + +/** + * enum ibtrs_clt_rdma_ev - Events related to RDMA transfer operations + */ +enum ibtrs_clt_rdma_ev { + IBTRS_CLT_RDMA_EV_RDMA_REQUEST_WRITE_COMPL, + IBTRS_CLT_RDMA_EV_RDMA_WRITE_COMPL, +}; + +/** + * enum ibtrs_sess_ev - Events about connectivity state of a session + * @IBTRS_CLT_SESS_EV_RECONNECT The session was reconnected. + * @IBTRS_CLT_SESS_EV_DISCONNECTED The session was disconnected. + * @IBTRS_CLT_SESS_EV_MAX_RECONN_EXCEEDED Reconnect attempts stopped
[PATCH 06/28] ibtrs_clt: add header file shared only in ibtrs_client
From: Jack WangSigned-off-by: Jack Wang Signed-off-by: Kleber Souza Signed-off-by: Danil Kipnis Signed-off-by: Roman Pen --- .../ulp/ibtrs_client/ibtrs_clt_internal.h | 244 + 1 file changed, 244 insertions(+) create mode 100644 drivers/infiniband/ulp/ibtrs_client/ibtrs_clt_internal.h diff --git a/drivers/infiniband/ulp/ibtrs_client/ibtrs_clt_internal.h b/drivers/infiniband/ulp/ibtrs_client/ibtrs_clt_internal.h new file mode 100644 index 000..7274b2d --- /dev/null +++ b/drivers/infiniband/ulp/ibtrs_client/ibtrs_clt_internal.h @@ -0,0 +1,244 @@ +/* + * InfiniBand Transport Layer + * + * Copyright (c) 2014 - 2017 ProfitBricks GmbH. All rights reserved. + * Authors: Fabian Holler < m...@fholler.de> + * Jack Wang + * Kleber Souza + * Danil Kipnis + * Roman Pen + * Milind Dumbare + * + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + *notice, this list of conditions, and the following disclaimer, + *without modification. + * 2. Redistributions in binary form must reproduce at minimum a disclaimer + *substantially similar to the "NO WARRANTY" disclaimer below + *("Disclaimer") and any redistribution must be conditioned upon + *including a substantially similar Disclaimer requirement for further + *binary redistribution. + * 3. Neither the names of the above-listed copyright holders nor the names + *of any contributors may be used to endorse or promote products derived + *from this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. 
+ * + * NO WARRANTY + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGES. + * + */ + +#if !defined(IBTRS_CLT_INTERNAL_H) +#define IBTRS_CLT_INTERNAL_H + +#include + +enum ssm_state { + _SSM_STATE_MIN, + SSM_STATE_IDLE, + SSM_STATE_IDLE_RECONNECT, + SSM_STATE_WF_INFO, + SSM_STATE_WF_INFO_RECONNECT, + SSM_STATE_OPEN, + SSM_STATE_OPEN_RECONNECT, + SSM_STATE_CONNECTED, + SSM_STATE_RECONNECT, + SSM_STATE_RECONNECT_IMM, + SSM_STATE_CLOSE_DESTROY, + SSM_STATE_CLOSE_RECONNECT, + SSM_STATE_CLOSE_RECONNECT_IMM, + SSM_STATE_DISCONNECTED, + SSM_STATE_DESTROYED, + _SSM_STATE_MAX +}; + +enum ibtrs_fast_reg { + IBTRS_FAST_MEM_NONE, + IBTRS_FAST_MEM_FR, + IBTRS_FAST_MEM_FMR +}; + +struct ibtrs_stats_reconnects { + u32 successful_cnt; + u32 fail_cnt; +}; + +struct ibtrs_stats_wc_comp { + u32 max_wc_cnt; + u32 cnt; + u64 total_cnt; +}; + +struct ibtrs_stats_cpu_migration { + atomic_t *from; + int *to; +}; + +struct ibtrs_clt_stats_rdma_stats { + u64 cnt_read; + u64 size_total_read; + u64 cnt_write; + u64 size_total_write; + + u16 inflight; +}; + +#define MIN_LOG_SG 2 +#define MAX_LOG_SG 5 +#define MAX_LIN_SG BIT(MIN_LOG_SG) +#define SG_DISTR_LEN (MAX_LOG_SG - MIN_LOG_SG + MAX_LIN_SG + 1) + +struct ibtrs_clt_stats_rdma_lat_entry { + u64 read; + u64 write; +}; + +#define 
MAX_LOG_LATENCY16 +#define MIN_LOG_LATENCY0 + +struct ibtrs_clt_stats_user_ib_msgs { + u32 recv_msg_cnt; + u32 sent_msg_cnt; + u64 recv_size; + u64 sent_size; +}; + +struct ibtrs_clt_stats { + struct ibtrs_stats_cpu_migrationcpu_migr; + struct ibtrs_clt_stats_rdma_stats *rdma_stats; + u64 *sg_list_total; + u64 **sg_list_distr; + struct ibtrs_stats_reconnects reconnects; + struct
[PATCH 02/28] ibtrs: add header for log macros shared between ibtrs_client and ibtrs_server
From: Jack WangSigned-off-by: Jack Wang Signed-off-by: Kleber Souza Signed-off-by: Danil Kipnis Signed-off-by: Roman Pen --- include/rdma/ibtrs_log.h | 88 1 file changed, 88 insertions(+) create mode 100644 include/rdma/ibtrs_log.h diff --git a/include/rdma/ibtrs_log.h b/include/rdma/ibtrs_log.h new file mode 100644 index 000..28ff5b4 --- /dev/null +++ b/include/rdma/ibtrs_log.h @@ -0,0 +1,88 @@ +/* + * InfiniBand Transport Layer + * + * Copyright (c) 2014 - 2017 ProfitBricks GmbH. All rights reserved. + * Authors: Fabian Holler + * Jack Wang + * Kleber Souza + * Danil Kipnis + * Roman Pen + * Milind Dumbare + * + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + *notice, this list of conditions, and the following disclaimer, + *without modification. + * 2. Redistributions in binary form must reproduce at minimum a disclaimer + *substantially similar to the "NO WARRANTY" disclaimer below + *("Disclaimer") and any redistribution must be conditioned upon + *including a substantially similar Disclaimer requirement for further + *binary redistribution. + * 3. Neither the names of the above-listed copyright holders nor the names + *of any contributors may be used to endorse or promote products derived + *from this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * NO WARRANTY + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGES. + * + */ + +#ifndef __IBTRS_LOG_H__ +#define __IBTRS_LOG_H__ +#include "ibtrs.h" + +#define DEB(fmt, ...) pr_debug("ibtrs L%d " fmt, __LINE__, ##__VA_ARGS__) +#define DEB_RL(fmt, ...) pr_debug_ratelimited("ibtrs L%d " fmt, \ + __LINE__, ##__VA_ARGS__) +static inline void ibtrs_deb_msg_hdr(const char *prep, +const struct ibtrs_msg_hdr *hdr) +{ + DEB("%sibtrs msg hdr:\n" + "\ttype: %d\n" + "\ttsize: %d\n", prep, hdr->type, hdr->tsize); +} + +#define ERR_NP(fmt, ...) pr_err("ibtrs L%d ERR: " fmt, \ + __LINE__, ##__VA_ARGS__) + +#define WRN_NP(fmt, ...) pr_warn("ibtrs L%d WARN: " fmt, \ + __LINE__, ##__VA_ARGS__) +#define INFO_NP(fmt, ...) pr_info("ibtrs: " fmt, ##__VA_ARGS__) + +#define INFO_NP_RL(fmt, ...) pr_info_ratelimited("ibtrs: " fmt, ##__VA_ARGS__) + +#define ibtrs_prefix(sess) ((sess->hostname[0] != '\0') ? sess->hostname : \ + sess->addr) + +#define ERR(sess, fmt, ...) pr_err("ibtrs L%d <%s> ERR: " fmt, \ + __LINE__, ibtrs_prefix(sess), ##__VA_ARGS__) +#define ERR_RL(sess, fmt, ...) pr_err_ratelimited("ibtrs L%d <%s> ERR: " fmt, \ + __LINE__, ibtrs_prefix(sess), ##__VA_ARGS__) + +#define WRN(sess, fmt, ...) pr_warn("ibtrs L%d <%s> WARN: " fmt, \ + __LINE__, ibtrs_prefix(sess), ##__VA_ARGS__) +#define WRN_RL(sess, fmt, ...) pr_warn_ratelimited("ibtrs L%d <%s> WARN: " \ + fmt, __LINE__, ibtrs_prefix(sess), ##__VA_ARGS__) + +#define INFO(sess, fmt, ...) pr_info("ibtrs <%s>: " fmt, \ + ibtrs_prefix(sess), ##__VA_ARGS__) +#define INFO_RL(sess, fmt, ...) 
pr_info_ratelimited("ibtrs <%s>: " fmt, \ + ibtrs_prefix(sess), ##__VA_ARGS__) +#endif /*__IBTRS_LOG_H__*/ -- 2.7.4
[PATCH 01/28] ibtrs: add header shared between ibtrs_client and ibtrs_server
From: Jack WangSigned-off-by: Jack Wang Signed-off-by: Kleber Souza Signed-off-by: Danil Kipnis Signed-off-by: Roman Pen --- include/rdma/ibtrs.h | 514 +++ 1 file changed, 514 insertions(+) create mode 100644 include/rdma/ibtrs.h diff --git a/include/rdma/ibtrs.h b/include/rdma/ibtrs.h new file mode 100644 index 000..4fc572b --- /dev/null +++ b/include/rdma/ibtrs.h @@ -0,0 +1,514 @@ +/* + * InfiniBand Transport Layer + * + * Copyright (c) 2014 - 2017 ProfitBricks GmbH. All rights reserved. + * Authors: Fabian Holler < m...@fholler.de> + * Jack Wang + * Kleber Souza + * Danil Kipnis + * Roman Pen + * Milind Dumbare + * + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + *notice, this list of conditions, and the following disclaimer, + *without modification. + * 2. Redistributions in binary form must reproduce at minimum a disclaimer + *substantially similar to the "NO WARRANTY" disclaimer below + *("Disclaimer") and any redistribution must be conditioned upon + *including a substantially similar Disclaimer requirement for further + *binary redistribution. + * 3. Neither the names of the above-listed copyright holders nor the names + *of any contributors may be used to endorse or promote products derived + *from this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * NO WARRANTY + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGES. + * + */ + +#ifndef __IBTRS_H +#define __IBTRS_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define IBTRS_SERVER_PORT 1234 +#define WC_ARRAY_SIZE 16 +#define IB_APM_TIMEOUT 16 /* 4.096 * 2 ^ 16 = 260 msec */ + +#define USR_MSG_CNT 64 +#define USR_CON_BUF_SIZE (USR_MSG_CNT * 2) /* double bufs for ACK's */ + +#define DEFAULT_HEARTBEAT_TIMEOUT_MS 2 +#define MIN_HEARTBEAT_TIMEOUT_MS 5000 +#define HEARTBEAT_INTV_MS 500 +#define HEARTBEAT_INTV_JIFFIES msecs_to_jiffies(HEARTBEAT_INTV_MS) + +#define MIN_RTR_CNT 1 +#define MAX_RTR_CNT 7 + +/* + * With the current size of the tag allocated on the client, 4K is the maximum + * number of tags we can allocate. (see IBNBD-2321) + * This number is also used on the client to allocate the IU for the user + * connection to receive the RDMA addresses from the server. 
+ */ +#define MAX_SESS_QUEUE_DEPTH 4096 + +#define XX(a) case (a): return #a + +#define IBTRS_ADDRLEN sizeof("ipv6:[:::::::]") + +static inline const char *ib_wc_opcode_str(enum ib_wc_opcode opcode) +{ + switch (opcode) { + XX(IB_WC_SEND); + XX(IB_WC_RDMA_WRITE); + XX(IB_WC_RDMA_READ); + XX(IB_WC_COMP_SWAP); + XX(IB_WC_FETCH_ADD); + /* recv-side); inbound completion */ + XX(IB_WC_RECV); + XX(IB_WC_RECV_RDMA_WITH_IMM); + default: return "IB_WC_OPCODE_UNKNOWN"; + } +} + + +struct ib_session { + struct ib_pd*pd; + struct ib_mr*mr; + struct ib_event_handler event_handler; +}; + +struct ibtrs_ib_path { + union ib_gidp_sgid; + union ib_gidp_dgid; +}; + +struct ib_con { + struct ib_qp*qp cacheline_aligned; + struct ib_cq*cq cacheline_aligned; + struct ib_send_wr beacon; + struct rdma_cm_id *cm_id; + struct ibtrs_ib_pathpri_path; + struct ibtrs_ib_path cur_path; + char*addr; + char*hostname; +}; + +struct ibtrs_iu { +
[RFC PATCH 00/28] INFINIBAND NETWORK BLOCK DEVICE (IBNBD)
From: Jack Wang This series introduces IBNBD/IBTRS kernel modules. IBNBD (InfiniBand network block device) allows for an RDMA transfer of block IO over InfiniBand network. The driver presents itself as a block device on client side and transmits the block requests in a zero-copy fashion to the server-side via InfiniBand. The server part of the driver converts the incoming buffers back into BIOs and hands them down to the underlying block device. As soon as IO responses come back from the drive, they are transmitted back to the client. We designed and implemented this solution based on our needs for cloud computing; the key features are: - High throughput and low latency due to: 1) Only two rdma messages per IO 2) Simplified client side server memory management 3) Eliminated SCSI sublayer - Simple configuration and handling 1) Server side is completely passive: volumes do not need to be explicitly exported 2) Only IB port GID and device path needed on client side to map a block device 3) A device can be remapped automatically i.e. after storage reboot - Pinning of IO-related processing to the CPU of the producer For usage please refer to Documentation/IBNBD.txt in later patch. My colleague Danil Kipnis presented IBNBD at Vault 2017, covering our design, features, tradeoffs and performance: http://events.linuxfoundation.org/sites/events/files/slides/IBNBD-Vault-2017.pdf The patchset is based on Linux 4.11-rc3. I've done functional tests with our test framework on AMD64 machines with Mellanox CX-2 and CX-3. TODOs: - move some helpers to core - use new cq api, drain_cq etc - support poll callback in MQ - big endian machine support - better files layout We've learned a lot from other open-source projects, namely SRP/SCST/LIO, etc, thanks to all the contributors. We hope our IBNBD brings more value to the open-source world. A git tree is also available at: https://github.com/xjtuwjp/linux-2.6/commits/ibnbdv0 As usual, comments and reviews are welcome. 
Jack Wang (28): ibtrs: add header shared between ibtrs_client and ibtrs_server ibtrs: add header for log MICROs shared between ibtrs_client and ibtrs_server ibtrs_lib: add common functions shared by client and server ibtrs_clt: add header file for exported interface ibtrs_clt: main functionality of ibtrs_client ibtrs_clt: add header file shared only in ibtrs_client ibtrs_clt: add files for sysfs interface ibtrs_clt: add Makefile and Kconfig ibtrs_srv: add header file for exported interface ibtrs_srv: add main functionality for ibtrs_server ibtrs_srv: add header shared in ibtrs_server ibtrs_srv: add sysfs interface ibtrs_srv: add Makefile and Kconfig ibnbd: add headers shared by ibnbd_client and ibnbd_server ibnbd: add shared library functions ibnbd_clt: add main functionality of ibnbd_client ibnbd_clt: add header shared in ibnbd_client ibnbd_clt: add sysfs interface ibnbd_clt: add log helpers ibnbd_clt: add Makefile and Kconfig ibnbd_srv: add header shared in ibnbd_server ibnbd_srv: add main functionality ibnbd_srv: add abstraction for submit IO to file or block device ibnbd_srv: add log helpers ibnbd_srv: add sysfs interface ibnbd_srv: add Makefile and Kconfig ibnbd: add doc for how to use ibnbd and sysfs interface MAINTRAINERS: Add maintainer for IBNBD/IBTRS Documentation/IBNBD.txt| 284 ++ MAINTAINERS| 14 + drivers/block/Kconfig |3 + drivers/block/Makefile |2 + drivers/block/ibnbd_client/Kconfig | 16 + drivers/block/ibnbd_client/Makefile|5 + drivers/block/ibnbd_client/ibnbd_clt.c | 2007 drivers/block/ibnbd_client/ibnbd_clt.h | 231 + drivers/block/ibnbd_client/ibnbd_clt_log.h | 79 + drivers/block/ibnbd_client/ibnbd_clt_sysfs.c | 863 drivers/block/ibnbd_client/ibnbd_clt_sysfs.h | 64 + drivers/block/ibnbd_inc/ibnbd-proto.h | 273 + drivers/block/ibnbd_inc/ibnbd.h| 55 + drivers/block/ibnbd_inc/log.h | 68 + drivers/block/ibnbd_lib/ibnbd-proto.c | 244 + drivers/block/ibnbd_lib/ibnbd.c| 108 + drivers/block/ibnbd_server/Kconfig | 16 + drivers/block/ibnbd_server/Makefile|3 
+ drivers/block/ibnbd_server/ibnbd_dev.c | 436 ++ drivers/block/ibnbd_server/ibnbd_dev.h | 149 + drivers/block/ibnbd_server/ibnbd_srv.c | 1074 drivers/block/ibnbd_server/ibnbd_srv.h | 115 + drivers/block/ibnbd_server/ibnbd_srv_log.h | 69 + drivers/block/ibnbd_server/ibnbd_srv_sysfs.c | 317 ++ drivers/block/ibnbd_server/ibnbd_srv_sysfs.h | 64 + drivers/infiniband/Kconfig |3 + drivers/infiniband/ulp/Makefile
Re: [PATCH v3] block: trace completion of all bios.
On Fri, Mar 24, 2017 at 8:07 AM, NeilBrown wrote: > > Currently only dm and md/raid5 bios trigger > trace_block_bio_complete(). Now that we have bio_chain() and > bio_inc_remaining(), it is not possible, in general, for a driver to > know when the bio is really complete. Only bio_endio() knows that. > > So move the trace_block_bio_complete() call to bio_endio(). > > Now trace_block_bio_complete() pairs with trace_block_bio_queue(). > Any bio for which a 'queue' event is traced, will subsequently > generate a 'complete' event. > > There are a few cases where completion tracing is not wanted. > 1/ If blk_update_request() has already generated a completion >trace event at the 'request' level, there is no point generating >one at the bio level too. In this case the bi_sector and bi_size >will have changed, so the bio level event would be wrong. > > 2/ If the bio hasn't actually been queued yet, but is being aborted >early, then a trace event could be confusing. Some filesystems >call bio_endio() but do not want tracing. > > 3/ The bio_integrity code interposes itself by replacing bi_end_io, >then restoring it and calling bio_endio() again. This would produce >two identical trace events if left like that. > > To handle these, we introduce a flag BIO_TRACE_COMPLETION and only > produce the trace event when this is set. > We address point 1 above by clearing the flag in blk_update_request(). > We address point 2 above by only setting the flag when > generic_make_request() is called. > We address point 3 above by clearing the flag after generating a > completion event. > > When bio_split() is used on a bio, particularly in blk_queue_split(), > there is an extra complication. A new bio is split off the front, and > may be handled directly without going through generic_make_request(). > The old bio, which has been advanced, is passed to > generic_make_request(), so it will trigger a trace event a second > time. 
> Probably the best result when a split happens is to see a single > 'queue' event for the whole bio, then multiple 'complete' events - one > for each component. To achieve this we can: > - copy the BIO_TRACE_COMPLETION flag to the new bio in bio_split() > - avoid generating a 'queue' event if BIO_TRACE_COMPLETION is already set. > This way, the split-off bio won't create a queue event, the original > won't either even if it is re-submitted to generic_make_request(), > but both will produce completion events, each for their own range. > > So if generic_make_request() is called (which generates a QUEUED > event), then bio_endio() will create a single COMPLETE event for each > range that the bio is split into, unless the driver has explicitly > requested it not to. > > Signed-off-by: NeilBrown > --- > block/bio.c | 13 + > block/blk-core.c | 10 +- > drivers/md/dm.c | 1 - > drivers/md/raid5.c| 8 > include/linux/blk_types.h | 4 +++- > 5 files changed, 25 insertions(+), 11 deletions(-) > > diff --git a/block/bio.c b/block/bio.c > index 5eec5e08417f..c1272986133e 100644 > --- a/block/bio.c > +++ b/block/bio.c > @@ -1818,6 +1818,11 @@ static inline bool bio_remaining_done(struct bio *bio) > * bio_endio() will end I/O on the whole bio. bio_endio() is the preferred > * way to end I/O on a bio. No one should call bi_end_io() directly on a > * bio unless they own it and thus know that it has an end_io function. > + * > + * bio_endio() can be called several times on a bio that has been chained > + * using bio_chain(). The ->bi_end_io() function will only be called the > + * last time. At this point the BLK_TA_COMPLETE tracing event will be > + * generated if BIO_TRACE_COMPLETION is set. 
> **/ > void bio_endio(struct bio *bio) > { > @@ -1838,6 +1843,11 @@ void bio_endio(struct bio *bio) > goto again; > } > > + if (bio->bi_bdev && bio_flagged(bio, BIO_TRACE_COMPLETION)) { > + trace_block_bio_complete(bdev_get_queue(bio->bi_bdev), > +bio, bio->bi_error); > + bio_clear_flag(bio, BIO_TRACE_COMPLETION); > + } > if (bio->bi_end_io) > bio->bi_end_io(bio); > } > @@ -1876,6 +1886,9 @@ struct bio *bio_split(struct bio *bio, int sectors, > > bio_advance(bio, split->bi_iter.bi_size); > > + if (bio_flagged(bio, BIO_TRACE_COMPLETION)) > + bio_set_flag(bio, BIO_TRACE_COMPLETION); > + > return split; > } > EXPORT_SYMBOL(bio_split); > diff --git a/block/blk-core.c b/block/blk-core.c > index 0eeb99ef654f..b34b5b1b1bbf 100644 > --- a/block/blk-core.c > +++ b/block/blk-core.c > @@ -1936,7 +1936,13 @@ generic_make_request_checks(struct bio *bio) > if (!blkcg_bio_issue_check(q, bio)) > return false; > > - trace_block_bio_queue(q, bio); > + if (!bio_flagged(bio,
Re: [PATCH v3 02/14] md: move two macros into md.h
On Fri, Mar 24, 2017 at 1:57 PM, NeilBrown wrote: > On Fri, Mar 17 2017, Ming Lei wrote: > >> Both raid1 and raid10 share common resync >> block size and page count, so move them into md.h. > > I don't think this is necessary. > These are just "magic" numbers. They don't have any real > meaning and so don't belong in md.h, or any .h file. The thing is that RESYNC_PAGES is needed in the following patch 3: [PATCH v3 03/14] md: prepare for managing resync I/O pages in clean way So how about moving the macros into raid1-10.h, since you suggested creating that header to hold the helpers introduced in patch 3? Thanks, Ming
Re: [PATCH v3 03/14] md: prepare for managing resync I/O pages in clean way
On Fri, Mar 17 2017, Ming Lei wrote: > Now resync I/O uses bio's bvec table to manage pages, > this way is very hacky, and may not work any more > once multipage bvec is introduced. > > So introduce helpers and new data structure for > managing resync I/O pages more cleanly. > > Signed-off-by: Ming Lei > --- > drivers/md/md.h | 50 ++ > 1 file changed, 50 insertions(+) I don't think this should go in md.h. Maybe create a "raid1-10.h" or similar if you really want to. NeilBrown > > diff --git a/drivers/md/md.h b/drivers/md/md.h > index 1d63239a1be4..20c48032493b 100644 > --- a/drivers/md/md.h > +++ b/drivers/md/md.h > @@ -720,4 +720,54 @@ static inline void mddev_check_writesame(struct mddev > *mddev, struct bio *bio) > #define RESYNC_BLOCK_SIZE (64*1024) > #define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE) > > +/* for managing resync I/O pages */ > +struct resync_pages { > + unsignedidx;/* for get/put page from the pool */ > + void*raid_bio; > + struct page *pages[RESYNC_PAGES]; > +}; > + > +static inline int resync_alloc_pages(struct resync_pages *rp, > + gfp_t gfp_flags) > +{ > + int i; > + > + for (i = 0; i < RESYNC_PAGES; i++) { > + rp->pages[i] = alloc_page(gfp_flags); > + if (!rp->pages[i]) > + goto out_free; > + } > + > + return 0; > + > + out_free: > + while (--i >= 0) > + put_page(rp->pages[i]); > + return -ENOMEM; > +} > + > +static inline void resync_free_pages(struct resync_pages *rp) > +{ > + int i; > + > + for (i = 0; i < RESYNC_PAGES; i++) > + put_page(rp->pages[i]); > +} > + > +static inline void resync_get_all_pages(struct resync_pages *rp) > +{ > + int i; > + > + for (i = 0; i < RESYNC_PAGES; i++) > + get_page(rp->pages[i]); > +} > + > +static inline struct page *resync_fetch_page(struct resync_pages *rp, > + unsigned idx) > +{ > + if (WARN_ON_ONCE(idx >= RESYNC_PAGES)) > + return NULL; > + return rp->pages[idx]; > +} > + > #endif /* _MD_MD_H */ > -- > 2.9.3 > > -- > To unsubscribe from this list: send the line "unsubscribe 
linux-raid" in > the body of a message to majord...@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html signature.asc Description: PGP signature