Re: [PATCH 08/28] ibtrs_clt: add Makefile and Kconfig

2017-03-24 Thread kbuild test robot
Hi Jack,

[auto build test WARNING on linus/master]
[also build test WARNING on v4.11-rc3 next-20170324]
[if your patch is applied to the wrong git tree, please drop us a note to help 
improve the system]

url:
https://github.com/0day-ci/linux/commits/Jack-Wang/INFINIBAND-NETWORK-BLOCK-DEVICE-IBNBD/20170325-101629
config: i386-allmodconfig (attached as .config)
compiler: gcc-6 (Debian 6.2.0-3) 6.2.0 20160901
reproduce:
# save the attached .config to linux build tree
make ARCH=i386 

All warnings (new ones prefixed by >>):

   In file included from include/linux/printk.h:329:0,
from include/linux/kernel.h:13,
from include/linux/list.h:8,
from include/linux/module.h:9,
from drivers/infiniband/ulp/ibtrs_client/ibtrs_clt.c:47:
   drivers/infiniband/ulp/ibtrs_client/ibtrs_clt.c: In function 
'process_open_rsp':
>> drivers/infiniband/ulp/ibtrs_client/ibtrs_clt.c:859:7: warning: cast to 
>> pointer from integer of different size [-Wint-to-pointer-cast]
  (void *)sess->srv_rdma_addr[i],
  ^
   include/linux/dynamic_debug.h:127:10: note: in definition of macro 
'dynamic_pr_debug'
   ##__VA_ARGS__);  \
 ^~~
>> include/rdma/ibtrs_log.h:51:23: note: in expansion of macro 'pr_debug'
#define DEB(fmt, ...) pr_debug("ibtrs L%d " fmt, __LINE__, ##__VA_ARGS__)
  ^~~~
>> drivers/infiniband/ulp/ibtrs_client/ibtrs_clt.c:857:3: note: in expansion of 
>> macro 'DEB'
  DEB("Adding contiguous buffer %d, size %u, addr: 0x%p,"
  ^~~
   In file included from include/linux/kernel.h:13:0,
from include/linux/list.h:8,
from include/linux/module.h:9,
from drivers/infiniband/ulp/ibtrs_client/ibtrs_clt.c:47:
   drivers/infiniband/ulp/ibtrs_client/ibtrs_clt.c: In function 
'ibtrs_map_desc':
>> include/rdma/ibtrs_log.h:51:32: warning: format '%llu' expects argument of 
>> type 'long long unsigned int', but argument 4 has type 'dma_addr_t {aka 
>> unsigned int}' [-Wformat=]
#define DEB(fmt, ...) pr_debug("ibtrs L%d " fmt, __LINE__, ##__VA_ARGS__)
   ^
   include/linux/printk.h:285:21: note: in definition of macro 'pr_fmt'
#define pr_fmt(fmt) fmt
^~~
   include/linux/printk.h:333:2: note: in expansion of macro 'dynamic_pr_debug'
 dynamic_pr_debug(fmt, ##__VA_ARGS__)
 ^~~~
>> include/rdma/ibtrs_log.h:51:23: note: in expansion of macro 'pr_debug'
#define DEB(fmt, ...) pr_debug("ibtrs L%d " fmt, __LINE__, ##__VA_ARGS__)
  ^~~~
   drivers/infiniband/ulp/ibtrs_client/ibtrs_clt.c:1106:2: note: in expansion 
of macro 'DEB'
 DEB("dma_addr %llu, key %u, dma_len %u\n", dma_addr, rkey, dma_len);
 ^~~
   drivers/infiniband/ulp/ibtrs_client/ibtrs_clt.c: In function 
'ibtrs_post_send_rdma':
>> drivers/infiniband/ulp/ibtrs_client/ibtrs_clt.c:1440:23: warning: cast from 
>> pointer to integer of different size [-Wpointer-to-int-cast]
  addr + off, (u64)req->iu, imm,
  ^
   drivers/infiniband/ulp/ibtrs_client/ibtrs_clt.c: In function 
'ibtrs_post_send_rdma_desc':
   drivers/infiniband/ulp/ibtrs_client/ibtrs_clt.c:1565:17: warning: cast from 
pointer to integer of different size [-Wpointer-to-int-cast]
  addr, (u64)req->iu, imm,
^
   drivers/infiniband/ulp/ibtrs_client/ibtrs_clt.c: In function 
'process_err_wc':
   drivers/infiniband/ulp/ibtrs_client/ibtrs_clt.c:1882:7: warning: cast to 
pointer from integer of different size [-Wint-to-pointer-cast]
 iu = (struct ibtrs_iu *)wc->wr_id;
  ^
   drivers/infiniband/ulp/ibtrs_client/ibtrs_clt.c: In function 'process_wcs':
   drivers/infiniband/ulp/ibtrs_client/ibtrs_clt.c:1922:8: warning: cast to 
pointer from integer of different size [-Wint-to-pointer-cast]
  iu = (struct ibtrs_iu *)wc.wr_id;
   ^
--
   In file included from include/linux/printk.h:6:0,
from 
drivers/infiniband/ulp/ibtrs_client/../ibtrs_lib/ibtrs-proto.c:48:
   drivers/infiniband/ulp/ibtrs_client/../ibtrs_lib/ibtrs-proto.c: In function 
'ibtrs_validate_msg_sess_open_resp':
   include/linux/kern_levels.h:4:18: warning: format '%lu' expects argument of 
type 'long unsigned int', but argument 4 has type 'unsigned int' [-Wformat=]
#define KERN_SOH "\001"  /* ASCII Start Of Header */
 ^
   include/linux/kern_levels.h:10:18: note: in expansion of macro 'KERN_SOH'
#define KERN_ERR KERN_SOH "3" /* error conditions */
 ^~~~
   include/linux/printk.h:301:9: note: in expansion of macro 'KERN_ERR'
 printk(KERN_ERR pr_fmt(fmt), ##__VA_ARGS

Re: [PATCH] block: constify struct blk_integrity_profile

2017-03-24 Thread Jens Axboe
On 03/24/2017 07:03 PM, Eric Biggers wrote:
> From: Eric Biggers 
> 
> blk_integrity_profile's are never modified, so mark them 'const' so that
> they are placed in .rodata and benefit from memory protection.

Thanks, that's a nice change. Applied for 4.12.

-- 
Jens Axboe



[PATCH] block: constify struct blk_integrity_profile

2017-03-24 Thread Eric Biggers
From: Eric Biggers 

blk_integrity_profile's are never modified, so mark them 'const' so that
they are placed in .rodata and benefit from memory protection.

Signed-off-by: Eric Biggers 
---
 block/blk-integrity.c  |  2 +-
 block/t10-pi.c |  8 
 include/linux/genhd.h  | 10 +-
 include/linux/t10-pi.h |  8 
 4 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/block/blk-integrity.c b/block/blk-integrity.c
index 9f0ff5ba4f84..b3622cb00fc2 100644
--- a/block/blk-integrity.c
+++ b/block/blk-integrity.c
@@ -389,7 +389,7 @@ static int blk_integrity_nop_fn(struct blk_integrity_iter 
*iter)
return 0;
 }
 
-static struct blk_integrity_profile nop_profile = {
+static const struct blk_integrity_profile nop_profile = {
.name = "nop",
.generate_fn = blk_integrity_nop_fn,
.verify_fn = blk_integrity_nop_fn,
diff --git a/block/t10-pi.c b/block/t10-pi.c
index 2c97912335a9..680c6d636298 100644
--- a/block/t10-pi.c
+++ b/block/t10-pi.c
@@ -160,28 +160,28 @@ static int t10_pi_type3_verify_ip(struct 
blk_integrity_iter *iter)
return t10_pi_verify(iter, t10_pi_ip_fn, 3);
 }
 
-struct blk_integrity_profile t10_pi_type1_crc = {
+const struct blk_integrity_profile t10_pi_type1_crc = {
.name   = "T10-DIF-TYPE1-CRC",
.generate_fn= t10_pi_type1_generate_crc,
.verify_fn  = t10_pi_type1_verify_crc,
 };
 EXPORT_SYMBOL(t10_pi_type1_crc);
 
-struct blk_integrity_profile t10_pi_type1_ip = {
+const struct blk_integrity_profile t10_pi_type1_ip = {
.name   = "T10-DIF-TYPE1-IP",
.generate_fn= t10_pi_type1_generate_ip,
.verify_fn  = t10_pi_type1_verify_ip,
 };
 EXPORT_SYMBOL(t10_pi_type1_ip);
 
-struct blk_integrity_profile t10_pi_type3_crc = {
+const struct blk_integrity_profile t10_pi_type3_crc = {
.name   = "T10-DIF-TYPE3-CRC",
.generate_fn= t10_pi_type3_generate_crc,
.verify_fn  = t10_pi_type3_verify_crc,
 };
 EXPORT_SYMBOL(t10_pi_type3_crc);
 
-struct blk_integrity_profile t10_pi_type3_ip = {
+const struct blk_integrity_profile t10_pi_type3_ip = {
.name   = "T10-DIF-TYPE3-IP",
.generate_fn= t10_pi_type3_generate_ip,
.verify_fn  = t10_pi_type3_verify_ip,
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 76f39754e7b0..9e11082c7f9b 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -159,11 +159,11 @@ struct badblocks;
 #if defined(CONFIG_BLK_DEV_INTEGRITY)
 
 struct blk_integrity {
-   struct blk_integrity_profile*profile;
-   unsigned char   flags;
-   unsigned char   tuple_size;
-   unsigned char   interval_exp;
-   unsigned char   tag_size;
+   const struct blk_integrity_profile  *profile;
+   unsigned char   flags;
+   unsigned char   tuple_size;
+   unsigned char   interval_exp;
+   unsigned char   tag_size;
 };
 
 #endif /* CONFIG_BLK_DEV_INTEGRITY */
diff --git a/include/linux/t10-pi.h b/include/linux/t10-pi.h
index 9fba9dd33544..9375d23a24e7 100644
--- a/include/linux/t10-pi.h
+++ b/include/linux/t10-pi.h
@@ -34,9 +34,9 @@ struct t10_pi_tuple {
 };
 
 
-extern struct blk_integrity_profile t10_pi_type1_crc;
-extern struct blk_integrity_profile t10_pi_type1_ip;
-extern struct blk_integrity_profile t10_pi_type3_crc;
-extern struct blk_integrity_profile t10_pi_type3_ip;
+extern const struct blk_integrity_profile t10_pi_type1_crc;
+extern const struct blk_integrity_profile t10_pi_type1_ip;
+extern const struct blk_integrity_profile t10_pi_type3_crc;
+extern const struct blk_integrity_profile t10_pi_type3_ip;
 
 #endif
-- 
2.12.1.578.ge9c3154ca4-goog



[PATCH] blkcg: allocate struct blkcg_gq outside request queue spinlock

2017-03-24 Thread Tahsin Erdogan
blkg_conf_prep() currently calls blkg_lookup_create() while holding
request queue spinlock. This means allocating memory for struct
blkcg_gq has to be made non-blocking. This causes occasional -ENOMEM
failures in call paths like below:

  pcpu_alloc+0x68f/0x710
  __alloc_percpu_gfp+0xd/0x10
  __percpu_counter_init+0x55/0xc0
  cfq_pd_alloc+0x3b2/0x4e0
  blkg_alloc+0x187/0x230
  blkg_create+0x489/0x670
  blkg_lookup_create+0x9a/0x230
  blkg_conf_prep+0x1fb/0x240
  __cfqg_set_weight_device.isra.105+0x5c/0x180
  cfq_set_weight_on_dfl+0x69/0xc0
  cgroup_file_write+0x39/0x1c0
  kernfs_fop_write+0x13f/0x1d0
  __vfs_write+0x23/0x120
  vfs_write+0xc2/0x1f0
  SyS_write+0x44/0xb0
  entry_SYSCALL_64_fastpath+0x18/0xad

In the code path above, percpu allocator cannot call vmalloc() due to
queue spinlock.

A failure in this call path gives grief to tools which are trying to
configure io weights. We see occasional failures happen shortly after
reboots even when system is not under any memory pressure. Machines
with a lot of cpus are more vulnerable to this condition.

Do struct blkcg_gq allocations outside the queue spinlock to allow
blocking during memory allocations.

Suggested-by: Tejun Heo 
Signed-off-by: Tahsin Erdogan 
---
v6:
  Due to Jens' objection to conditionally dropping locks based on gfp
  flags, go back to v1 approach.
  Perform queue bypass and policy enabled checks at every iteration.
  Add blkg_lookup_check() to reduce code duplication.

v5:
  Removed stale blkg_alloc() in blkcg_init_queue()

  Pushed down radix_tree_preload() into blkg_create() because it
  disables preemption on return and makes it unsafe to call blocking
  memory allocations.

v4:
  Simplified error checking in blkg_create()
  Factored out __blkg_lookup_create()

v3:
  Pushed down all blkg allocations into blkg_create()

v2:
  Moved blkg creation into blkg_lookup_create() to avoid duplicating
  blkg_lookup_create() logic.

 block/blk-cgroup.c | 123 ++---
 1 file changed, 98 insertions(+), 25 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index bbe7ee00bd3d..7c2947128f58 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -772,6 +772,27 @@ struct blkg_rwstat blkg_rwstat_recursive_sum(struct 
blkcg_gq *blkg,
 }
 EXPORT_SYMBOL_GPL(blkg_rwstat_recursive_sum);
 
+/* Performs queue bypass and policy enabled checks then looks up blkg. */
+static struct blkcg_gq *blkg_lookup_check(struct blkcg *blkcg,
+ const struct blkcg_policy *pol,
+ struct request_queue *q)
+{
+   WARN_ON_ONCE(!rcu_read_lock_held());
+   lockdep_assert_held(q->queue_lock);
+
+   if (!blkcg_policy_enabled(q, pol))
+   return ERR_PTR(-EOPNOTSUPP);
+
+   /*
+* This could be the first entry point of blkcg implementation and
+* we shouldn't allow anything to go through for a bypassing queue.
+*/
+   if (unlikely(blk_queue_bypass(q)))
+   return ERR_PTR(blk_queue_dying(q) ? -ENODEV : -EBUSY);
+
+   return __blkg_lookup(blkcg, q, true /* update_hint */);
+}
+
 /**
  * blkg_conf_prep - parse and prepare for per-blkg config update
  * @blkcg: target block cgroup
@@ -789,6 +810,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct 
blkcg_policy *pol,
__acquires(rcu) __acquires(disk->queue->queue_lock)
 {
struct gendisk *disk;
+   struct request_queue *q;
struct blkcg_gq *blkg;
struct module *owner;
unsigned int major, minor;
@@ -807,44 +829,95 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct 
blkcg_policy *pol,
if (!disk)
return -ENODEV;
if (part) {
-   owner = disk->fops->owner;
-   put_disk(disk);
-   module_put(owner);
-   return -ENODEV;
+   ret = -ENODEV;
+   goto fail;
}
 
-   rcu_read_lock();
-   spin_lock_irq(disk->queue->queue_lock);
+   q = disk->queue;
 
-   if (blkcg_policy_enabled(disk->queue, pol))
-   blkg = blkg_lookup_create(blkcg, disk->queue);
-   else
-   blkg = ERR_PTR(-EOPNOTSUPP);
+   rcu_read_lock();
+   spin_lock_irq(q->queue_lock);
 
+   blkg = blkg_lookup_check(blkcg, pol, q);
if (IS_ERR(blkg)) {
ret = PTR_ERR(blkg);
+   goto fail_unlock;
+   }
+
+   if (blkg)
+   goto success;
+
+   /*
+* Create blkgs walking down from blkcg_root to @blkcg, so that all
+* non-root blkgs have access to their parents.
+*/
+   while (true) {
+   struct blkcg *pos = blkcg;
+   struct blkcg *parent;
+   struct blkcg_gq *new_blkg;
+
+   parent = blkcg_parent(blkcg);
+   while (parent && !__blkg_lookup(parent, q, false)) {
+   pos = 

Re: [PATCH] block: correct documentation for blkdev_issue_discard() flags

2017-03-24 Thread Jens Axboe
On 03/24/2017 03:39 PM, Eric Biggers wrote:
> On Mon, Jan 23, 2017 at 11:41:39AM -0800, Eric Biggers wrote:
>> From: Eric Biggers 
>>
>> BLKDEV_IFL_* flags no longer exist; blkdev_issue_discard() now actually
>> takes BLKDEV_DISCARD_* flags.
>>
>> Signed-off-by: Eric Biggers 
>> ---
>>  block/blk-lib.c | 2 +-
>>  1 file changed, 1 insertion(+), 1 deletion(-)
>>
>> diff --git a/block/blk-lib.c b/block/blk-lib.c
>> index ed89c8f4b2a0..463b76dd566f 100644
>> --- a/block/blk-lib.c
>> +++ b/block/blk-lib.c
>> @@ -109,7 +109,7 @@ EXPORT_SYMBOL(__blkdev_issue_discard);
>>   * @sector: start sector
>>   * @nr_sects:   number of sectors to discard
>>   * @gfp_mask:   memory allocation flags (for bio_alloc)
>> - * @flags:  BLKDEV_IFL_* flags to control behaviour
>> + * @flags:  BLKDEV_DISCARD_* flags to control behaviour
>>   *
>>   * Description:
>>   *Issue a discard request for the sectors in question.
>> -- 
>> 2.11.0.483.g087da7b7c-goog
>>
> 
> Ping?

Sorry, looks like that got lost. I've applied this, and your other
patch. Thanks for the reminder!

-- 
Jens Axboe



Re: [PATCH] block: correct documentation for blkdev_issue_discard() flags

2017-03-24 Thread Eric Biggers
On Mon, Jan 23, 2017 at 11:41:39AM -0800, Eric Biggers wrote:
> From: Eric Biggers 
> 
> BLKDEV_IFL_* flags no longer exist; blkdev_issue_discard() now actually
> takes BLKDEV_DISCARD_* flags.
> 
> Signed-off-by: Eric Biggers 
> ---
>  block/blk-lib.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/block/blk-lib.c b/block/blk-lib.c
> index ed89c8f4b2a0..463b76dd566f 100644
> --- a/block/blk-lib.c
> +++ b/block/blk-lib.c
> @@ -109,7 +109,7 @@ EXPORT_SYMBOL(__blkdev_issue_discard);
>   * @sector:  start sector
>   * @nr_sects:number of sectors to discard
>   * @gfp_mask:memory allocation flags (for bio_alloc)
> - * @flags:   BLKDEV_IFL_* flags to control behaviour
> + * @flags:   BLKDEV_DISCARD_* flags to control behaviour
>   *
>   * Description:
>   *Issue a discard request for the sectors in question.
> -- 
> 2.11.0.483.g087da7b7c-goog
> 

Ping?


Re: [PATCH 0/4] nbd fixes for this cycle

2017-03-24 Thread Jens Axboe
On 03/24/2017 12:08 PM, Josef Bacik wrote:
> These 4 patches are to fix up various regressions and problems in NBD.  The
> ERESTARTSYS is the biggest patch but has been pretty well tested with a debug
> patch that forced the behavior to happen.  Everything else is relatively 
> small,
> and the queue timeout patch is a regression from last cycle.  Thanks,

Added for 4.11.

-- 
Jens Axboe



Re: [PATCH v2 2/4] block: add a read barrier in blk_queue_enter()

2017-03-24 Thread Bart Van Assche
On Sat, 2017-03-25 at 01:38 +0800, Ming Lei wrote:
> As I explained, the dying flag should only be mentioned after we change
> the code in blk_set_queue_dying().

Hello Ming,

If patches 2 and 4 would be combined into a single patch then it wouldn't
be necessary anymore to update the comment introduced in patch 2 in patch 4.
I think that would make this patch series easier to review.

Since the issues fixed by your patches are longstanding issues, have you
considered to add a "Cc: stable" tag?

Thanks,

Bart.

Re: [PATCH] blk-mq: include errors in did_work calculation

2017-03-24 Thread Bart Van Assche
On Fri, 2017-03-24 at 11:39 -0600, Jens Axboe wrote:
> Currently we return true in blk_mq_dispatch_rq_list() if we queued IO
> successfully, but we really want to return whether or not the we made
> progress. Progress includes if we got an error return.  If we don't,
> this can lead to a hang in blk_mq_sched_dispatch_requests() when a
> driver is draining IO by returning BLK_MQ_QUEUE_ERROR instead of
> manually ending the IO in error and return BLK_MQ_QUEUE_OK.

Reviewed-by: Bart Van Assche 

Re: [PATCH] block: remove bio_clone_bioset_partial()

2017-03-24 Thread Jens Axboe
On 03/24/2017 11:55 AM, Shaohua Li wrote:
> commit c18a1e0(block: introduce bio_clone_bioset_partial()) introduced
> bio_clone_bioset_partial() for raid1 write behind IO. Now the write behind is
> rewritten by Ming. We don't need the API any more, so revert the commit.
> 
> Jens,
> this depends on Ming's patches, so it would be great I put this to md branch.

Looks fine to me, feel free to do so.

Reviewed-by: Jens Axboe 

-- 
Jens Axboe



[PATCH 2/4] nbd: set rq->errors to actual error code

2017-03-24 Thread Josef Bacik
From: Josef Bacik 

We've been relying on the block layer to assume rq->errors being set
translates into -EIO.  I noticed in testing that sometimes this isn't
true, and really there's not much of a reason to have a counter instead
of just using -EIO.  So set it properly so we don't leak random numbers
to unsuspecting victims.

Signed-off-by: Josef Bacik 
---
 drivers/block/nbd.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 3d1fc37a..dbc22f4 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -192,7 +192,7 @@ static enum blk_eh_timer_return nbd_xmit_timeout(struct 
request *req,
 
dev_err(nbd_to_dev(nbd), "Connection timed out, shutting down 
connection\n");
set_bit(NBD_TIMEDOUT, >runtime_flags);
-   req->errors++;
+   req->errors = -EIO;
 
mutex_lock(>config_lock);
sock_shutdown(nbd);
@@ -432,7 +432,7 @@ static struct nbd_cmd *nbd_read_stat(struct nbd_device 
*nbd, int index)
if (ntohl(reply.error)) {
dev_err(disk_to_dev(nbd->disk), "Other side returned error 
(%d)\n",
ntohl(reply.error));
-   req->errors++;
+   req->errors = -EIO;
return cmd;
}
 
@@ -448,7 +448,7 @@ static struct nbd_cmd *nbd_read_stat(struct nbd_device 
*nbd, int index)
if (result <= 0) {
dev_err(disk_to_dev(nbd->disk), "Receive data 
failed (result %d)\n",
result);
-   req->errors++;
+   req->errors = -EIO;
return cmd;
}
dev_dbg(nbd_to_dev(nbd), "request %p: got %d bytes 
data\n",
@@ -518,7 +518,7 @@ static void nbd_clear_req(struct request *req, void *data, 
bool reserved)
if (!blk_mq_request_started(req))
return;
cmd = blk_mq_rq_to_pdu(req);
-   req->errors++;
+   req->errors = -EIO;
nbd_end_request(cmd);
 }
 
-- 
2.7.4



[PATCH 4/4] nbd: replace kill_bdev() with __invalidate_device()

2017-03-24 Thread Josef Bacik
From: Ratna Manoj Bolla 

When a filesystem is mounted on a nbd device and on a disconnect, because
of kill_bdev(), and resetting bdev size to zero, buffer_head mappings are
getting destroyed under mounted filesystem.

After a bdev size reset(i.e bdev->bd_inode->i_size = 0) on a disconnect,
followed by a sys_umount(),
generic_shutdown_super()->...
->__sync_blockdev()->...
-blkdev_writepages()->...
->do_invalidatepage()->...
-discard_buffer()   is discarding superblock buffer_head assumed
to be in mapped state by ext4_commit_super().

[mlin: ported to 4.11-rc2]
Signed-off-by: Ratna Manoj Bolla 
---
 drivers/block/nbd.c | 8 ++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index b0003da..d8a2356 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -126,7 +126,8 @@ static const char *nbdcmd_to_ascii(int cmd)
 
 static int nbd_size_clear(struct nbd_device *nbd, struct block_device *bdev)
 {
-   bd_set_size(bdev, 0);
+   if (bdev->bd_openers <= 1)
+   bd_set_size(bdev, 0);
set_capacity(nbd->disk, 0);
kobject_uevent(_to_dev(nbd)->kobj, KOBJ_CHANGE);
 
@@ -665,6 +666,8 @@ static void nbd_reset(struct nbd_device *nbd)
 
 static void nbd_bdev_reset(struct block_device *bdev)
 {
+   if (bdev->bd_openers > 1)
+   return;
set_device_ro(bdev, false);
bdev->bd_inode->i_size = 0;
if (max_part > 0) {
@@ -728,7 +731,8 @@ static int nbd_clear_sock(struct nbd_device *nbd, struct 
block_device *bdev)
 {
sock_shutdown(nbd);
nbd_clear_que(nbd);
-   kill_bdev(bdev);
+
+   __invalidate_device(bdev, true);
nbd_bdev_reset(bdev);
/*
 * We want to give the run thread a chance to wait for everybody
-- 
2.7.4



Re: [PATCH] blk-mq: include errors in did_work calculation

2017-03-24 Thread Omar Sandoval
On Fri, Mar 24, 2017 at 11:39:10AM -0600, Jens Axboe wrote:
> Currently we return true in blk_mq_dispatch_rq_list() if we queued IO
> successfully, but we really want to return whether or not the we made
> progress. Progress includes if we got an error return.  If we don't,
> this can lead to a hang in blk_mq_sched_dispatch_requests() when a
> driver is draining IO by returning BLK_MQ_QUEUE_ERROR instead of
> manually ending the IO in error and return BLK_MQ_QUEUE_OK.
> 
> Signed-off-by: Jens Axboe 

Reviewed-by: Omar Sandoval 


Re: [PATCH] blk-mq: include errors in did_work calculation

2017-03-24 Thread Josef Bacik

> On Mar 24, 2017, at 1:39 PM, Jens Axboe  wrote:
> 
> Currently we return true in blk_mq_dispatch_rq_list() if we queued IO
> successfully, but we really want to return whether or not the we made
> progress. Progress includes if we got an error return.  If we don't,
> this can lead to a hang in blk_mq_sched_dispatch_requests() when a
> driver is draining IO by returning BLK_MQ_QUEUE_ERROR instead of
> manually ending the IO in error and return BLK_MQ_QUEUE_OK.
> 
> Signed-off-by: Jens Axboe 
> 
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index a4546f060e80..e3b09abf9d5b 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -978,7 +978,7 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, 
> struct list_head *list)
>   struct request *rq;
>   LIST_HEAD(driver_list);
>   struct list_head *dptr;
> - int queued, ret = BLK_MQ_RQ_QUEUE_OK;
> + int errors, queued, ret = BLK_MQ_RQ_QUEUE_OK;
> 
>   /*
>* Start off with dptr being NULL, so we start the first request
> @@ -989,7 +989,7 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, 
> struct list_head *list)
>   /*
>* Now process all the entries, sending them to the driver.
>*/
> - queued = 0;
> + errors = queued = 0;
>   while (!list_empty(list)) {
>   struct blk_mq_queue_data bd;
> 
> @@ -1046,6 +1046,7 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx 
> *hctx, struct list_head *list)
>   default:
>   pr_err("blk-mq: bad return on queue: %d\n", ret);
>   case BLK_MQ_RQ_QUEUE_ERROR:
> + errors++;
>   rq->errors = -EIO;
>   blk_mq_end_request(rq, rq->errors);
>   break;
> @@ -1097,7 +1098,7 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx 
> *hctx, struct list_head *list)
>   blk_mq_run_hw_queue(hctx, true);
>   }
> 
> - return queued != 0;
> + return (queued + errors) != 0;
> }
> 
> static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
> 

Thanks this fixed it, you can add

Tested-by: Josef Bacik 

Thanks,

Josef

[PATCH] block: remove bio_clone_bioset_partial()

2017-03-24 Thread Shaohua Li
commit c18a1e0(block: introduce bio_clone_bioset_partial()) introduced
bio_clone_bioset_partial() for raid1 write behind IO. Now the write behind is
rewritten by Ming. We don't need the API any more, so revert the commit.

Jens,
this depends on Ming's patches, so it would be great I put this to md branch.

Cc: Christoph Hellwig 
Cc: Jens Axboe 
Cc: Ming Lei 
Signed-off-by: Shaohua Li 
---
 block/bio.c | 61 -
 include/linux/bio.h | 11 ++
 2 files changed, 15 insertions(+), 57 deletions(-)

diff --git a/block/bio.c b/block/bio.c
index 1ccff0d..0364359 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -631,20 +631,21 @@ struct bio *bio_clone_fast(struct bio *bio, gfp_t 
gfp_mask, struct bio_set *bs)
 }
 EXPORT_SYMBOL(bio_clone_fast);
 
-static struct bio *__bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask,
- struct bio_set *bs, int offset,
- int size)
+/**
+ * bio_clone_bioset - clone a bio
+ * @bio_src: bio to clone
+ * @gfp_mask: allocation priority
+ * @bs: bio_set to allocate from
+ *
+ * Clone bio. Caller will own the returned bio, but not the actual data it
+ * points to. Reference count of returned bio will be one.
+ */
+struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask,
+struct bio_set *bs)
 {
struct bvec_iter iter;
struct bio_vec bv;
struct bio *bio;
-   struct bvec_iter iter_src = bio_src->bi_iter;
-
-   /* for supporting partial clone */
-   if (offset || size != bio_src->bi_iter.bi_size) {
-   bio_advance_iter(bio_src, _src, offset);
-   iter_src.bi_size = size;
-   }
 
/*
 * Pre immutable biovecs, __bio_clone() used to just do a memcpy from
@@ -668,8 +669,7 @@ static struct bio *__bio_clone_bioset(struct bio *bio_src, 
gfp_t gfp_mask,
 *__bio_clone_fast() anyways.
 */
 
-   bio = bio_alloc_bioset(gfp_mask, __bio_segments(bio_src,
-  _src), bs);
+   bio = bio_alloc_bioset(gfp_mask, bio_segments(bio_src), bs);
if (!bio)
return NULL;
bio->bi_bdev= bio_src->bi_bdev;
@@ -686,7 +686,7 @@ static struct bio *__bio_clone_bioset(struct bio *bio_src, 
gfp_t gfp_mask,
bio->bi_io_vec[bio->bi_vcnt++] = bio_src->bi_io_vec[0];
break;
default:
-   __bio_for_each_segment(bv, bio_src, iter, iter_src)
+   bio_for_each_segment(bv, bio_src, iter)
bio->bi_io_vec[bio->bi_vcnt++] = bv;
break;
}
@@ -705,44 +705,9 @@ static struct bio *__bio_clone_bioset(struct bio *bio_src, 
gfp_t gfp_mask,
 
return bio;
 }
-
-/**
- * bio_clone_bioset - clone a bio
- * @bio_src: bio to clone
- * @gfp_mask: allocation priority
- * @bs: bio_set to allocate from
- *
- * Clone bio. Caller will own the returned bio, but not the actual data it
- * points to. Reference count of returned bio will be one.
- */
-struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask,
-struct bio_set *bs)
-{
-   return __bio_clone_bioset(bio_src, gfp_mask, bs, 0,
- bio_src->bi_iter.bi_size);
-}
 EXPORT_SYMBOL(bio_clone_bioset);
 
 /**
- * bio_clone_bioset_partial - clone a partial bio
- * @bio_src: bio to clone
- * @gfp_mask: allocation priority
- * @bs: bio_set to allocate from
- * @offset: cloned starting from the offset
- * @size: size for the cloned bio
- *
- * Clone bio. Caller will own the returned bio, but not the actual data it
- * points to. Reference count of returned bio will be one.
- */
-struct bio *bio_clone_bioset_partial(struct bio *bio_src, gfp_t gfp_mask,
-struct bio_set *bs, int offset,
-int size)
-{
-   return __bio_clone_bioset(bio_src, gfp_mask, bs, offset, size);
-}
-EXPORT_SYMBOL(bio_clone_bioset_partial);
-
-/**
  * bio_add_pc_page -   attempt to add page to bio
  * @q: the target queue
  * @bio: destination bio
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 42b62a0..fafef63 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -183,7 +183,7 @@ static inline void bio_advance_iter(struct bio *bio, struct 
bvec_iter *iter,
 
 #define bio_iter_last(bvec, iter) ((iter).bi_size == (bvec).bv_len)
 
-static inline unsigned __bio_segments(struct bio *bio, struct bvec_iter *bvec)
+static inline unsigned bio_segments(struct bio *bio)
 {
unsigned segs = 0;
struct bio_vec bv;
@@ -205,17 +205,12 @@ static inline unsigned __bio_segments(struct bio *bio, 
struct bvec_iter *bvec)
break;
}
 
-   

Re: [PATCH v2 3/4] block: rename blk_mq_freeze_queue_start()

2017-03-24 Thread Ming Lei
On Sat, Mar 25, 2017 at 1:29 AM, Bart Van Assche
 wrote:
> On Fri, 2017-03-24 at 20:36 +0800, Ming Lei wrote:
>> As the .q_usage_counter is used by both legacy and
>> mq path, we need to block new I/O if queue becomes
>> dead in blk_queue_enter().
>>
>> So rename it and we can use this function in both
>> pathes.
>
> Should "pathes" be changed into "paths" in the commit message? Additionally,
> this patch breaks the symmetry the comment in blk_mq_freeze_queue() refers
> to. Anyway:

Really? Is there one function named as blk_mq_freeze_queue_end()?

The comment means blk_mq_freeze_queue() vs. blk_mq_unfreeze_queue(), which can't
be affected by this patch.


Thanks,
Ming Lei


Re: [PATCH v2 4/4] block: block new I/O just after queue is set as dying

2017-03-24 Thread Bart Van Assche
On Fri, 2017-03-24 at 20:36 +0800, Ming Lei wrote: 
> + /* block new I/O coming */
> + blk_freeze_queue_start(q);

As I have already mentioned two times, the comment above
blk_freeze_queue_start() should be made more clear. It should mention that
without that call blk_queue_enter() won't check the "dying" flag after it
has been set. If that is not mentioned in a comment the next person who
reads the blk_set_queue_dying() function will wonder why the
blk_freeze_queue_start() call is really needed and whether it can be removed.

>   /*
>* read pair of barrier in blk_freeze_queue_start(),
>* we need to order reading DEAD flag of .q_usage_counter
> -  * and reading .mq_freeze_depth, otherwise the following
> -  * wait may never return if the two read are reordered.
> +  * and reading .mq_freeze_depth or dying flag, otherwise
> +  * the following wait may never return if the two read
> +  * are reordered.
>*/
>   smp_rmb();

Please fix the spelling in the above comment ("two read").

Thanks,

Bart.


[PATCH] blk-mq: include errors in did_work calculation

2017-03-24 Thread Jens Axboe
Currently we return true in blk_mq_dispatch_rq_list() if we queued IO
successfully, but we really want to return whether or not the we made
progress. Progress includes if we got an error return.  If we don't,
this can lead to a hang in blk_mq_sched_dispatch_requests() when a
driver is draining IO by returning BLK_MQ_QUEUE_ERROR instead of
manually ending the IO in error and return BLK_MQ_QUEUE_OK.

Signed-off-by: Jens Axboe 

diff --git a/block/blk-mq.c b/block/blk-mq.c
index a4546f060e80..e3b09abf9d5b 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -978,7 +978,7 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, 
struct list_head *list)
struct request *rq;
LIST_HEAD(driver_list);
struct list_head *dptr;
-   int queued, ret = BLK_MQ_RQ_QUEUE_OK;
+   int errors, queued, ret = BLK_MQ_RQ_QUEUE_OK;
 
/*
 * Start off with dptr being NULL, so we start the first request
@@ -989,7 +989,7 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, 
struct list_head *list)
/*
 * Now process all the entries, sending them to the driver.
 */
-   queued = 0;
+   errors = queued = 0;
while (!list_empty(list)) {
struct blk_mq_queue_data bd;
 
@@ -1046,6 +1046,7 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, 
struct list_head *list)
default:
pr_err("blk-mq: bad return on queue: %d\n", ret);
case BLK_MQ_RQ_QUEUE_ERROR:
+   errors++;
rq->errors = -EIO;
blk_mq_end_request(rq, rq->errors);
break;
@@ -1097,7 +1098,7 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, 
struct list_head *list)
blk_mq_run_hw_queue(hctx, true);
}
 
-   return queued != 0;
+   return (queued + errors) != 0;
 }
 
 static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)



Re: [PATCH v2 2/4] block: add a read barrier in blk_queue_enter()

2017-03-24 Thread Ming Lei
On Sat, Mar 25, 2017 at 1:24 AM, Bart Van Assche
 wrote:
> On Fri, 2017-03-24 at 20:36 +0800, Ming Lei wrote:
>> Without the barrier, reading DEAD flag of .q_usage_counter
>> and reading .mq_freeze_depth may be reordered, then the
>> following wait_event_interruptible() may never return.
>>
>> Signed-off-by: Ming Lei 
>> ---
>>  block/blk-core.c | 8 
>>  1 file changed, 8 insertions(+)
>>
>> diff --git a/block/blk-core.c b/block/blk-core.c
>> index ad388d5e309a..44eed17319c0 100644
>> --- a/block/blk-core.c
>> +++ b/block/blk-core.c
>> @@ -669,6 +669,14 @@ int blk_queue_enter(struct request_queue *q, bool 
>> nowait)
>>   if (nowait)
>>   return -EBUSY;
>>
>> + /*
>> +  * read pair of barrier in blk_mq_freeze_queue_start(),
>> +  * we need to order reading DEAD flag of .q_usage_counter
>> +  * and reading .mq_freeze_depth, otherwise the following
>> +  * wait may never return if the two read are reordered.
>> +  */
>> + smp_rmb();
>> +
>>   ret = wait_event_interruptible(q->mq_freeze_wq,
>>   !atomic_read(>mq_freeze_depth) ||
>>   blk_queue_dying(q));
>
> Hello Ming,
>
> The code looks fine to me but the comment not. You probably wanted to refer
> to the "dying" flag instead of the "dead" flag? The read order has to be

No, looks you misunderstand the issue.

I mean the order between reading __PERCPU_REF_DEAD of .q_usage_counter
and reading .mq_freeze_depth should be enhanced, especially it is in
blk_queue_enter() vs. blk_mq_freeze_queue_start().

In the last patch, you will find the dying flag is mentioned in above comment
after we call blk_freeze_queue_start() just after the dying flag is set.

> enforced for the "dying" flag and q_usage_counter because of the order in
> which these are set by blk_set_queue_dying().

As I explained, the dying flag should only be mentioned after we change
the code in blk_set_queue_dying().


Thanks,
Ming Lei


Re: [PATCH v2 2/4] block: add a read barrier in blk_queue_enter()

2017-03-24 Thread Bart Van Assche
On Fri, 2017-03-24 at 20:36 +0800, Ming Lei wrote:
> Without the barrier, reading DEAD flag of .q_usage_counter
> and reading .mq_freeze_depth may be reordered, then the
> following wait_event_interruptible() may never return.
> 
> Signed-off-by: Ming Lei 
> ---
>  block/blk-core.c | 8 
>  1 file changed, 8 insertions(+)
> 
> diff --git a/block/blk-core.c b/block/blk-core.c
> index ad388d5e309a..44eed17319c0 100644
> --- a/block/blk-core.c
> +++ b/block/blk-core.c
> @@ -669,6 +669,14 @@ int blk_queue_enter(struct request_queue *q, bool nowait)
>   if (nowait)
>   return -EBUSY;
>  
> + /*
> +  * read pair of barrier in blk_mq_freeze_queue_start(),
> +  * we need to order reading DEAD flag of .q_usage_counter
> +  * and reading .mq_freeze_depth, otherwise the following
> +  * wait may never return if the two read are reordered.
> +  */
> + smp_rmb();
> +
>   ret = wait_event_interruptible(q->mq_freeze_wq,
>   !atomic_read(>mq_freeze_depth) ||
>   blk_queue_dying(q));

Hello Ming,

The code looks fine to me but the comment not. You probably wanted to refer
to the "dying" flag instead of the "dead" flag? The read order has to be
enforced for the "dying" flag and q_usage_counter because of the order in
which these are set by blk_set_queue_dying().

Thanks,

Bart.

Re: [PATCH v3 02/14] md: move two macros into md.h

2017-03-24 Thread Shaohua Li
On Fri, Mar 24, 2017 at 04:57:37PM +1100, Neil Brown wrote:
> On Fri, Mar 17 2017, Ming Lei wrote:
> 
> > Both raid1 and raid10 share common resync
> > block size and page count, so move them into md.h.
> 
> I don't think this is necessary.
> These are just "magic" numbers.  They don't have any real
> meaning and so don't belong in md.h, or and .h file.
> 
> Possibly we should find more meaningful numbers, or make them auto-size
> or something.  I'm also happy for them to stay as they are for now.
> But I don't think we should pretend that they are meaningful.

I had the same concern when I looked at this patch firstly. The number for
raid1/10 doesn't need to be the same. But if we don't move the number to a
generic header, the third patch will become a little more complicated. I
eventually ignored this issue. If we really need different number for raid1/10,
lets do it at that time.

I think your suggestion that moving the number to raid1-10.h makes sense, and
add a comment declaring the number isn't required to be the same for raid1/10.

Thanks,
Shaohua


Re: [PATCH v3 08/14] block: introduce bio_copy_data_partial

2017-03-24 Thread Jens Axboe
On 03/16/2017 10:12 AM, Ming Lei wrote:
> Turns out we can use bio_copy_data in raid1's write behind,
> and we can make alloc_behind_pages() more clean/efficient,
> but we need to partial version of bio_copy_data().
> 
> Signed-off-by: Ming Lei 

Reviewed-by: Jens Axboe 

Shaohua, feel free to pull this through the md tree, that will be much
easier.

-- 
Jens Axboe



Re: [PATCH v2 2/4] block: add a read barrier in blk_queue_enter()

2017-03-24 Thread Hannes Reinecke
On 03/24/2017 01:36 PM, Ming Lei wrote:
> Without the barrier, reading DEAD flag of .q_usage_counter
> and reading .mq_freeze_depth may be reordered, then the
> following wait_event_interruptible() may never return.
> 
> Signed-off-by: Ming Lei 
> ---
>  block/blk-core.c | 8 
>  1 file changed, 8 insertions(+)
> 
> diff --git a/block/blk-core.c b/block/blk-core.c
> index ad388d5e309a..44eed17319c0 100644
> --- a/block/blk-core.c
> +++ b/block/blk-core.c
> @@ -669,6 +669,14 @@ int blk_queue_enter(struct request_queue *q, bool nowait)
>   if (nowait)
>   return -EBUSY;
>  
> + /*
> +  * read pair of barrier in blk_mq_freeze_queue_start(),
> +  * we need to order reading DEAD flag of .q_usage_counter
> +  * and reading .mq_freeze_depth, otherwise the following
> +  * wait may never return if the two read are reordered.
> +  */
> + smp_rmb();
> +
>   ret = wait_event_interruptible(q->mq_freeze_wq,
>   !atomic_read(>mq_freeze_depth) ||
>   blk_queue_dying(q));
> 
Reviewed-by: Hannes Reinecke 

Cheers,

Hannes
-- 
Dr. Hannes Reinecke   zSeries & Storage
h...@suse.de  +49 911 74053 688
SUSE LINUX Products GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: J. Hawn, J. Guild, F. Imendörffer, HRB 16746 (AG Nürnberg)


Re: [PATCH v2 3/4] block: rename blk_mq_freeze_queue_start()

2017-03-24 Thread Hannes Reinecke
On 03/24/2017 01:36 PM, Ming Lei wrote:
> As the .q_usage_counter is used by both legacy and
> mq path, we need to block new I/O if queue becomes
> dead in blk_queue_enter().
> 
> So rename it and we can use this function in both
> pathes.
> 
> Signed-off-by: Ming Lei 
> ---
>  block/blk-core.c  |  2 +-
>  block/blk-mq.c| 10 +-
>  drivers/block/mtip32xx/mtip32xx.c |  2 +-
>  drivers/nvme/host/core.c  |  2 +-
>  include/linux/blk-mq.h|  2 +-
>  5 files changed, 9 insertions(+), 9 deletions(-)
> 
Reviewed-by: Hannes Reinecke 

Cheers,

Hannes
-- 
Dr. Hannes Reinecke   zSeries & Storage
h...@suse.de  +49 911 74053 688
SUSE LINUX Products GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: J. Hawn, J. Guild, F. Imendörffer, HRB 16746 (AG Nürnberg)


Re: [PATCH 01/28] ibtrs: add header shared between ibtrs_client and ibtrs_server

2017-03-24 Thread Jinpu Wang
On Fri, Mar 24, 2017 at 3:31 PM, Johannes Thumshirn  wrote:
> On Fri, Mar 24, 2017 at 01:54:04PM +0100, Jinpu Wang wrote:
>> >> +
>> >> +#define XX(a) case (a): return #a
>> >
>> > please no macros with retun in them and XX isn't quite too descriptive as
>> > well.
>> >
>> > [...]
>> >
>> >> +static inline const char *ib_wc_opcode_str(enum ib_wc_opcode opcode)
>> >> +{
>> >> + switch (opcode) {
>> >> + XX(IB_WC_SEND);
>> >> + XX(IB_WC_RDMA_WRITE);
>> >> + XX(IB_WC_RDMA_READ);
>> >> + XX(IB_WC_COMP_SWAP);
>> >> + XX(IB_WC_FETCH_ADD);
>> >> + /* recv-side); inbound completion */
>> >> + XX(IB_WC_RECV);
>> >> + XX(IB_WC_RECV_RDMA_WITH_IMM);
>> >> + default: return "IB_WC_OPCODE_UNKNOWN";
>> >> + }
>> >> +}
>> >
>> > How about:
>> >
>> > struct {
>> > char *name;
>> > enum ib_wc_opcode opcode;
>> > } ib_wc_opcode_table[] = {
>> > { stringyfy(IB_WC_SEND), IB_WC_SEND },
>> > { stringyfy(IB_WC_RDMA_WRITE), IB_WC_RDMA_WRITE },
>> > { stringyfy(IB_WC_RDMA_READ ), IB_WC_RDMA_READ }
>> > { stringyfy(IB_WC_COMP_SWAP), IB_WC_COMP_SWAP },
>> > { stringyfy(IB_WC_FETCH_ADD), IB_WC_FETCH_ADD },
>> > { stringyfy(IB_WC_RECV), IB_WC_RECV },
>> > { stringyfy(IB_WC_RECV_RDMA_WITH_IMM), IB_WC_RECV_RDMA_WITH_IMM },
>> > { NULL, 0 },
>> > };
>> >
>> > static inline const char *ib_wc_opcode_str(enum ib_wc_opcode opcode)
>> > {
>> > int i;
>> >
>> > for (i = 0; i < ARRAY_SIZE(ib_wc_opcode_table); i++)
>> > if (ib_wc_opcode_table[i].opcode == opcode)
>> > return ib_wc_opcode_table[i].name;
>> >
>> > return "IB_WC_OPCODE_UNKNOWN";
>> > }
>> >
>> Looks nice, might be better to put it into ib_verbs.h?
>
> Probably yes, as are your kvec functions for lib/iov_iter.c
Thanks, will do in next round!

>
> [...]
>
>> > What about resolving the kernel bug instead of making workarounds?
>> I tried to send a patch upsteam, but was rejected by Sean.
>> http://www.spinics.net/lists/linux-rdma/msg22381.html
>>
>
> I don't see a NACK in this thread.
>
> From http://www.spinics.net/lists/linux-rdma/msg22410.html:
> "The port space (which maps to the service ID) needs to be included as part of
> the check that determines the format of the private data, and not simply the
> address family."
>
> After such a state I would have expected to see a v2 of the patch with above
> comment addressed.
I might busy with other staff at that time, I will check again and
revisit the bug.

>
> Byte,
> Johannes
> --
> Johannes Thumshirn  Storage
> jthumsh...@suse.de+49 911 74053 689
> SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
> GF: Felix Imendörffer, Jane Smithard, Graham Norton
> HRB 21284 (AG Nürnberg)
> Key fingerprint = EC38 9CAB C2C4 F25D 8600 D0D0 0393 969D 2D76 0850

Regards,
-- 
Jack Wang
Linux Kernel Developer

ProfitBricks GmbH
Greifswalder Str. 207
D - 10405 Berlin

Tel:   +49 30 577 008  042
Fax:  +49 30 577 008 299
Email:jinpu.w...@profitbricks.com
URL:  https://www.profitbricks.de

Sitz der Gesellschaft: Berlin
Registergericht: Amtsgericht Charlottenburg, HRB 125506 B
Geschäftsführer: Achim Weiss


Re: [PATCH 01/28] ibtrs: add header shared between ibtrs_client and ibtrs_server

2017-03-24 Thread Johannes Thumshirn
On Fri, Mar 24, 2017 at 01:54:04PM +0100, Jinpu Wang wrote:
> >> +
> >> +#define XX(a) case (a): return #a
> >
> > please no macros with retun in them and XX isn't quite too descriptive as
> > well.
> >
> > [...]
> >
> >> +static inline const char *ib_wc_opcode_str(enum ib_wc_opcode opcode)
> >> +{
> >> + switch (opcode) {
> >> + XX(IB_WC_SEND);
> >> + XX(IB_WC_RDMA_WRITE);
> >> + XX(IB_WC_RDMA_READ);
> >> + XX(IB_WC_COMP_SWAP);
> >> + XX(IB_WC_FETCH_ADD);
> >> + /* recv-side); inbound completion */
> >> + XX(IB_WC_RECV);
> >> + XX(IB_WC_RECV_RDMA_WITH_IMM);
> >> + default: return "IB_WC_OPCODE_UNKNOWN";
> >> + }
> >> +}
> >
> > How about:
> >
> > struct {
> > char *name;
> > enum ib_wc_opcode opcode;
> > } ib_wc_opcode_table[] = {
> > { stringyfy(IB_WC_SEND), IB_WC_SEND },
> > { stringyfy(IB_WC_RDMA_WRITE), IB_WC_RDMA_WRITE },
> > { stringyfy(IB_WC_RDMA_READ ), IB_WC_RDMA_READ }
> > { stringyfy(IB_WC_COMP_SWAP), IB_WC_COMP_SWAP },
> > { stringyfy(IB_WC_FETCH_ADD), IB_WC_FETCH_ADD },
> > { stringyfy(IB_WC_RECV), IB_WC_RECV },
> > { stringyfy(IB_WC_RECV_RDMA_WITH_IMM), IB_WC_RECV_RDMA_WITH_IMM },
> > { NULL, 0 },
> > };
> >
> > static inline const char *ib_wc_opcode_str(enum ib_wc_opcode opcode)
> > {
> > int i;
> >
> > for (i = 0; i < ARRAY_SIZE(ib_wc_opcode_table); i++)
> > if (ib_wc_opcode_table[i].opcode == opcode)
> > return ib_wc_opcode_table[i].name;
> >
> > return "IB_WC_OPCODE_UNKNOWN";
> > }
> >
> Looks nice, might be better to put it into ib_verbs.h?

Probably yes, as are your kvec functions for lib/iov_iter.c

[...]

> > What about resolving the kernel bug instead of making workarounds?
> I tried to send a patch upsteam, but was rejected by Sean.
> http://www.spinics.net/lists/linux-rdma/msg22381.html
> 

I don't see a NACK in this thread.

>From http://www.spinics.net/lists/linux-rdma/msg22410.html:
"The port space (which maps to the service ID) needs to be included as part of
the check that determines the format of the private data, and not simply the
address family." 

After such a state I would have expected to see a v2 of the patch with above
comment addressed.

Byte,
Johannes
-- 
Johannes Thumshirn  Storage
jthumsh...@suse.de+49 911 74053 689
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: Felix Imendörffer, Jane Smithard, Graham Norton
HRB 21284 (AG Nürnberg)
Key fingerprint = EC38 9CAB C2C4 F25D 8600 D0D0 0393 969D 2D76 0850


Re: [RFC PATCH 00/28] INFINIBAND NETWORK BLOCK DEVICE (IBNBD)

2017-03-24 Thread Jinpu Wang
On Fri, Mar 24, 2017 at 2:31 PM, Bart Van Assche
 wrote:
> On Fri, 2017-03-24 at 13:46 +0100, Jinpu Wang wrote:
>> Our IBNBD project was started 3 years ago based on our need for Cloud
>> Computing, NVMeOF is a bit younger.
>> - IBNBD is one of our components, part of our software defined storage 
>> solution.
>> - As I listed in features, IBNBD has it's own features
>>
>> We're planning to look more into NVMeOF, but it's not a replacement for 
>> IBNBD.
>
> Hello Jack, Danil and Roman,
>
> Thanks for having taken the time to open source this work and to travel to
> Boston to present this work at the Vault conference. However, my
> understanding of IBNBD is that this driver has several shortcomings neither
> NVMeOF nor iSER nor SRP have:
> * Doesn't scale in terms of number of CPUs submitting I/O. The graphs shown
>   during the Vault talk clearly illustrate this. This is probably the result
>   of sharing a data structure across all client CPUs, maybe the bitmap that
>   tracks which parts of the target buffer space are in use.
> * Supports IB but none of the other RDMA transports (RoCE / iWARP).
>
> We also need performance numbers that compare IBNBD against SRP and/or
> NVMeOF with memory registration disabled to see whether and how much faster
> IBNBD is compared to these two protocols.
>
> The fact that IBNBD only needs to messages per I/O is an advantage it has
> today over SRP but not over NVMeOF nor over iSER. The upstream initiator
> drivers for the latter two protocols already support inline data.
>
> Another question I have is whether integration with multipathd is supported?
> If multipathd tries to run scsi_id against an IBNBD client device that will
> fail.
>
> Thanks,
>
> Bart.
Hello Bart,

Thanks for your comments. As usual in house driver mainly covers needs
for ProfitBricks,
We only tested in our hardware environment. We only use IB not
RoCE/iWARP. The idea to
opensource is :
- Present our design/implementation/tradeoff, others might be interested.
- Attract more attention from developers/testers, so we can improve
the project better and faster.

We will gather performance data compare with NVMeOF in next submitting.

multipath is not supported, we're using APM for failover. (patch from
Mellanox developers)

Thanks,
-- 
Jack Wang
Linux Kernel Developer

ProfitBricks GmbH
Greifswalder Str. 207
D - 10405 Berlin

Tel:   +49 30 577 008  042
Fax:  +49 30 577 008 299
Email:jinpu.w...@profitbricks.com
URL:  https://www.profitbricks.de

Sitz der Gesellschaft: Berlin
Registergericht: Amtsgericht Charlottenburg, HRB 125506 B
Geschäftsführer: Achim Weiss


[GIT PULL] Block fixes for 4.11-rc

2017-03-24 Thread Jens Axboe
Hi Linus,

A few fixes for the current series that should go into -rc4. This pull
request contains:

- A fix for a potential corruption of un-started requests from Ming.

- A blk-stat fix from Omar, ensuring we flush the stat batch before
  checking nr_samples.

- A set of fixes from Sagi for the nvmeof family.

Please pull!


  git://git.kernel.dk/linux-block.git for-linus



Jens Axboe (1):
  Merge branch 'nvme-4.11-rc' of git://git.infradead.org/nvme into for-linus

Ming Lei (1):
  blk-mq: don't complete un-started request in timeout handler

Omar Sandoval (1):
  blk-stat: fix blk_stat_sum() if all samples are batched

Sagi Grimberg (5):
  nvme-loop: fix a possible use-after-free when destroying the admin queue
  nvmet: confirm sq percpu has scheduled and switched to atomic
  nvmet-rdma: Fix a possible uninitialized variable dereference
  nvme-rdma: handle cpu unplug when re-establishing the controller
  nvme-loop: handle cpu unplug when re-establishing the controller

 block/blk-mq.c  | 11 +-
 block/blk-stat.c|  4 +-
 drivers/nvme/host/rdma.c| 28 +++---
 drivers/nvme/target/core.c  | 11 +-
 drivers/nvme/target/loop.c  | 90 +
 drivers/nvme/target/nvmet.h |  1 +
 drivers/nvme/target/rdma.c  |  8 ++--
 7 files changed, 82 insertions(+), 71 deletions(-)

-- 
Jens Axboe



RE: [RFC PATCH 00/28] INFINIBAND NETWORK BLOCK DEVICE (IBNBD)

2017-03-24 Thread Steve Wise
> 
> From: Jack Wang 
> 
> This series introduces IBNBD/IBTRS kernel modules.
> 
> IBNBD (InfiniBand network block device) allows for an RDMA transfer of block
IO
> over InfiniBand network. The driver presents itself as a block device on
client
> side and transmits the block requests in a zero-copy fashion to the
server-side
> via InfiniBand. The server part of the driver converts the incoming buffers
back
> into BIOs and hands them down to the underlying block device. As soon as IO
> responses come back from the drive, they are being transmitted back to the
> client.

Hey Jack, why is this IB specific?  Can it work over iWARP transports as well?

Steve.





Re: [RFC PATCH 00/28] INFINIBAND NETWORK BLOCK DEVICE (IBNBD)

2017-03-24 Thread Bart Van Assche
On Fri, 2017-03-24 at 13:46 +0100, Jinpu Wang wrote:
> Our IBNBD project was started 3 years ago based on our need for Cloud
> Computing, NVMeOF is a bit younger.
> - IBNBD is one of our components, part of our software defined storage 
> solution.
> - As I listed in features, IBNBD has it's own features
> 
> We're planning to look more into NVMeOF, but it's not a replacement for IBNBD.

Hello Jack, Danil and Roman,

Thanks for having taken the time to open source this work and to travel to
Boston to present this work at the Vault conference. However, my
understanding of IBNBD is that this driver has several shortcomings neither
NVMeOF nor iSER nor SRP have:
* Doesn't scale in terms of number of CPUs submitting I/O. The graphs shown
  during the Vault talk clearly illustrate this. This is probably the result
  of sharing a data structure across all client CPUs, maybe the bitmap that
  tracks which parts of the target buffer space are in use.
* Supports IB but none of the other RDMA transports (RoCE / iWARP).

We also need performance numbers that compare IBNBD against SRP and/or
NVMeOF with memory registration disabled to see whether and how much faster
IBNBD is compared to these two protocols.

The fact that IBNBD only needs to messages per I/O is an advantage it has
today over SRP but not over NVMeOF nor over iSER. The upstream initiator
drivers for the latter two protocols already support inline data.

Another question I have is whether integration with multipathd is supported?
If multipathd tries to run scsi_id against an IBNBD client device that will
fail.

Thanks,

Bart.

Re: [PATCH 01/28] ibtrs: add header shared between ibtrs_client and ibtrs_server

2017-03-24 Thread Jinpu Wang
>> +
>> +#define XX(a) case (a): return #a
>
> please no macros with retun in them and XX isn't quite too descriptive as
> well.
>
> [...]
>
>> +static inline const char *ib_wc_opcode_str(enum ib_wc_opcode opcode)
>> +{
>> + switch (opcode) {
>> + XX(IB_WC_SEND);
>> + XX(IB_WC_RDMA_WRITE);
>> + XX(IB_WC_RDMA_READ);
>> + XX(IB_WC_COMP_SWAP);
>> + XX(IB_WC_FETCH_ADD);
>> + /* recv-side); inbound completion */
>> + XX(IB_WC_RECV);
>> + XX(IB_WC_RECV_RDMA_WITH_IMM);
>> + default: return "IB_WC_OPCODE_UNKNOWN";
>> + }
>> +}
>
> How about:
>
> struct {
> char *name;
> enum ib_wc_opcode opcode;
> } ib_wc_opcode_table[] = {
> { stringyfy(IB_WC_SEND), IB_WC_SEND },
> { stringyfy(IB_WC_RDMA_WRITE), IB_WC_RDMA_WRITE },
> { stringyfy(IB_WC_RDMA_READ ), IB_WC_RDMA_READ }
> { stringyfy(IB_WC_COMP_SWAP), IB_WC_COMP_SWAP },
> { stringyfy(IB_WC_FETCH_ADD), IB_WC_FETCH_ADD },
> { stringyfy(IB_WC_RECV), IB_WC_RECV },
> { stringyfy(IB_WC_RECV_RDMA_WITH_IMM), IB_WC_RECV_RDMA_WITH_IMM },
> { NULL, 0 },
> };
>
> static inline const char *ib_wc_opcode_str(enum ib_wc_opcode opcode)
> {
> int i;
>
> for (i = 0; i < ARRAY_SIZE(ib_wc_opcode_table); i++)
> if (ib_wc_opcode_table[i].opcode == opcode)
> return ib_wc_opcode_table[i].name;
>
> return "IB_WC_OPCODE_UNKNOWN";
> }
>
Looks nice, might be better to put it into ib_verbs.h?

>
> [...]
>
>> +/**
>> + * struct ibtrs_msg_hdr - Common header of all IBTRS messages
>> + * @type:Message type, valid values see: enum ibtrs_msg_types
>> + * @tsize:   Total size of transferred data
>> + *
>> + * Don't move the first 8 padding bytes! It's a workaround for a kernel bug.
>> + * See IBNBD-610 for details
>
> What about resolving the kernel bug instead of making workarounds?
I tried to send a patch upsteam, but was rejected by Sean.
http://www.spinics.net/lists/linux-rdma/msg22381.html

>
>> + *
>> + * DO NOT CHANGE!
>> + */
>> +struct ibtrs_msg_hdr {
>> + u8  __padding1;
>> + u8  type;
>> + u16 __padding2;
>> + u32 tsize;
>> +};
>
> [...]
>
> --
> Johannes Thumshirn  Storage
> jthumsh...@suse.de+49 911 74053 689
> SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
> GF: Felix Imendörffer, Jane Smithard, Graham Norton
> HRB 21284 (AG Nürnberg)
> Key fingerprint = EC38 9CAB C2C4 F25D 8600 D0D0 0393 969D 2D76 0850

Thanks Johannes for review.


-- 
Jack Wang
Linux Kernel Developer

ProfitBricks GmbH
Greifswalder Str. 207
D - 10405 Berlin

Tel:   +49 30 577 008  042
Fax:  +49 30 577 008 299
Email:jinpu.w...@profitbricks.com
URL:  https://www.profitbricks.de

Sitz der Gesellschaft: Berlin
Registergericht: Amtsgericht Charlottenburg, HRB 125506 B
Geschäftsführer: Achim Weiss


Re: [RFC PATCH 00/28] INFINIBAND NETWORK BLOCK DEVICE (IBNBD)

2017-03-24 Thread Johannes Thumshirn
On Fri, Mar 24, 2017 at 01:46:02PM +0100, Jinpu Wang wrote:
> Hi Johnnes,
> 
> Our IBNBD project was started 3 years ago based on our need for Cloud
> Computing, NVMeOF is a bit younger.
> - IBNBD is one of our components, part of our software defined storage 
> solution.
> - As I listed in features, IBNBD has it's own features
> 
> We're planning to look more into NVMeOF, but it's not a replacement for IBNBD.

Ok thanks for the clarification.

Byte,
Johannes

-- 
Johannes Thumshirn  Storage
jthumsh...@suse.de+49 911 74053 689
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: Felix Imendörffer, Jane Smithard, Graham Norton
HRB 21284 (AG Nürnberg)
Key fingerprint = EC38 9CAB C2C4 F25D 8600 D0D0 0393 969D 2D76 0850


Re: [RFC PATCH 00/28] INFINIBAND NETWORK BLOCK DEVICE (IBNBD)

2017-03-24 Thread Jinpu Wang
On Fri, Mar 24, 2017 at 1:15 PM, Johannes Thumshirn  wrote:
> On Fri, Mar 24, 2017 at 11:45:15AM +0100, Jack Wang wrote:
>> From: Jack Wang 
>>
>> This series introduces IBNBD/IBTRS kernel modules.
>>
>> IBNBD (InfiniBand network block device) allows for an RDMA transfer of block 
>> IO
>> over InfiniBand network. The driver presents itself as a block device on 
>> client
>> side and transmits the block requests in a zero-copy fashion to the 
>> server-side
>> via InfiniBand. The server part of the driver converts the incoming buffers 
>> back
>> into BIOs and hands them down to the underlying block device. As soon as IO
>> responses come back from the drive, they are being transmitted back to the
>> client.
>>
>> We design and implement this solution based on our need for Cloud Computing,
>> the key features are:
>> - High throughput and low latency due to:
>> 1) Only two rdma messages per IO
>> 2) Simplified client side server memory management
>> 3) Eliminated SCSI sublayer
>> - Simple configuration and handling
>> 1) Server side is completely passive: volumes do not need to be
>> explicitly exported
>> 2) Only IB port GID and device path needed on client side to map
>> a block device
>> 3) A device can be remapped automatically i.e. after storage
>> reboot
>> - Pinning of IO-related processing to the CPU of the producer
>>
>> For usage please refer to Documentation/IBNBD.txt in later patch.
>> My colleague Danil Kpnis presents IBNBD in Vault-2017 about our 
>> design/feature/
>> tradeoff/performance:
>>
>> http://events.linuxfoundation.org/sites/events/files/slides/IBNBD-Vault-2017.pdf
>>
>
> Hi Jack,
>
> Sorry to ask (I haven't attented the Vault presentation) but why can't you use
> NVMe over Fabrics in your environment? From what I see in your presentation
> and cover letter, it provides all you need and is in fact a standard Linux and
> Windows already have implemented.
>
> Thanks,
> Johannes
> --
> Johannes Thumshirn  Storage
> jthumsh...@suse.de+49 911 74053 689
> SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
> GF: Felix Imendörffer, Jane Smithard, Graham Norton
> HRB 21284 (AG Nürnberg)
> Key fingerprint = EC38 9CAB C2C4 F25D 8600 D0D0 0393 969D 2D76 0850

Hi Johnnes,

Our IBNBD project was started 3 years ago based on our need for Cloud
Computing, NVMeOF is a bit younger.
- IBNBD is one of our components, part of our software defined storage solution.
- As I listed in features, IBNBD has it's own features

We're planning to look more into NVMeOF, but it's not a replacement for IBNBD.

Thanks,
-- 
Jack Wang
Linux Kernel Developer

ProfitBricks GmbH
Greifswalder Str. 207
D - 10405 Berlin

Tel:   +49 30 577 008  042
Fax:  +49 30 577 008 299
Email:jinpu.w...@profitbricks.com
URL:  https://www.profitbricks.de

Sitz der Gesellschaft: Berlin
Registergericht: Amtsgericht Charlottenburg, HRB 125506 B
Geschäftsführer: Achim Weiss


[PATCH v2 3/4] block: rename blk_mq_freeze_queue_start()

2017-03-24 Thread Ming Lei
As the .q_usage_counter is used by both legacy and
mq path, we need to block new I/O if queue becomes
dead in blk_queue_enter().

So rename it and we can use this function in both
pathes.

Signed-off-by: Ming Lei 
---
 block/blk-core.c  |  2 +-
 block/blk-mq.c| 10 +-
 drivers/block/mtip32xx/mtip32xx.c |  2 +-
 drivers/nvme/host/core.c  |  2 +-
 include/linux/blk-mq.h|  2 +-
 5 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 44eed17319c0..5901133d105f 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -670,7 +670,7 @@ int blk_queue_enter(struct request_queue *q, bool nowait)
return -EBUSY;
 
/*
-* read pair of barrier in blk_mq_freeze_queue_start(),
+* read pair of barrier in blk_freeze_queue_start(),
 * we need to order reading DEAD flag of .q_usage_counter
 * and reading .mq_freeze_depth, otherwise the following
 * wait may never return if the two read are reordered.
diff --git a/block/blk-mq.c b/block/blk-mq.c
index b36f0481ba0e..5370b4f750ff 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -68,7 +68,7 @@ static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx 
*hctx,
sbitmap_clear_bit(>ctx_map, ctx->index_hw);
 }
 
-void blk_mq_freeze_queue_start(struct request_queue *q)
+void blk_freeze_queue_start(struct request_queue *q)
 {
int freeze_depth;
 
@@ -78,7 +78,7 @@ void blk_mq_freeze_queue_start(struct request_queue *q)
blk_mq_run_hw_queues(q, false);
}
 }
-EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_start);
+EXPORT_SYMBOL_GPL(blk_freeze_queue_start);
 
 void blk_mq_freeze_queue_wait(struct request_queue *q)
 {
@@ -108,7 +108,7 @@ void blk_freeze_queue(struct request_queue *q)
 * no blk_unfreeze_queue(), and blk_freeze_queue() is not
 * exported to drivers as the only user for unfreeze is blk_mq.
 */
-   blk_mq_freeze_queue_start(q);
+   blk_freeze_queue_start(q);
blk_mq_freeze_queue_wait(q);
 }
 
@@ -746,7 +746,7 @@ static void blk_mq_timeout_work(struct work_struct *work)
 * percpu_ref_tryget directly, because we need to be able to
 * obtain a reference even in the short window between the queue
 * starting to freeze, by dropping the first reference in
-* blk_mq_freeze_queue_start, and the moment the last request is
+* blk_freeze_queue_start, and the moment the last request is
 * consumed, marked by the instant q_usage_counter reaches
 * zero.
 */
@@ -2376,7 +2376,7 @@ static void blk_mq_queue_reinit_work(void)
 * take place in parallel.
 */
list_for_each_entry(q, _q_list, all_q_node)
-   blk_mq_freeze_queue_start(q);
+   blk_freeze_queue_start(q);
list_for_each_entry(q, _q_list, all_q_node)
blk_mq_freeze_queue_wait(q);
 
diff --git a/drivers/block/mtip32xx/mtip32xx.c 
b/drivers/block/mtip32xx/mtip32xx.c
index f96ab717534c..c96c35ab39df 100644
--- a/drivers/block/mtip32xx/mtip32xx.c
+++ b/drivers/block/mtip32xx/mtip32xx.c
@@ -4162,7 +4162,7 @@ static int mtip_block_remove(struct driver_data *dd)
dev_info(>pdev->dev, "device %s surprise removal\n",
dd->disk->disk_name);
 
-   blk_mq_freeze_queue_start(dd->queue);
+   blk_freeze_queue_start(dd->queue);
blk_mq_stop_hw_queues(dd->queue);
blk_mq_tagset_busy_iter(>tags, mtip_no_dev_cleanup, dd);
 
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 9b3b57fef446..4a6d7f408769 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -2386,7 +2386,7 @@ void nvme_start_freeze(struct nvme_ctrl *ctrl)
 
mutex_lock(>namespaces_mutex);
list_for_each_entry(ns, >namespaces, list)
-   blk_mq_freeze_queue_start(ns->queue);
+   blk_freeze_queue_start(ns->queue);
mutex_unlock(>namespaces_mutex);
 }
 EXPORT_SYMBOL_GPL(nvme_start_freeze);
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 5b3e201c8d4f..ea2e9dcd3aef 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -243,7 +243,7 @@ void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
busy_tag_iter_fn *fn, void *priv);
 void blk_mq_freeze_queue(struct request_queue *q);
 void blk_mq_unfreeze_queue(struct request_queue *q);
-void blk_mq_freeze_queue_start(struct request_queue *q);
+void blk_freeze_queue_start(struct request_queue *q);
 void blk_mq_freeze_queue_wait(struct request_queue *q);
 int blk_mq_freeze_queue_wait_timeout(struct request_queue *q,
 unsigned long timeout);
-- 
2.9.3



[PATCH v2 2/4] block: add a read barrier in blk_queue_enter()

2017-03-24 Thread Ming Lei
Without the barrier, reading DEAD flag of .q_usage_counter
and reading .mq_freeze_depth may be reordered, then the
following wait_event_interruptible() may never return.

Signed-off-by: Ming Lei 
---
 block/blk-core.c | 8 
 1 file changed, 8 insertions(+)

diff --git a/block/blk-core.c b/block/blk-core.c
index ad388d5e309a..44eed17319c0 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -669,6 +669,14 @@ int blk_queue_enter(struct request_queue *q, bool nowait)
if (nowait)
return -EBUSY;
 
+   /*
+* read pair of barrier in blk_mq_freeze_queue_start(),
+* we need to order reading DEAD flag of .q_usage_counter
+* and reading .mq_freeze_depth, otherwise the following
+* wait may never return if the two read are reordered.
+*/
+   smp_rmb();
+
ret = wait_event_interruptible(q->mq_freeze_wq,
!atomic_read(>mq_freeze_depth) ||
blk_queue_dying(q));
-- 
2.9.3



[PATCH v2 4/4] block: block new I/O just after queue is set as dying

2017-03-24 Thread Ming Lei
Before commit 780db2071a(blk-mq: decouble blk-mq freezing
from generic bypassing), the dying flag is checked before
entering queue, and Tejun converts the checking into .mq_freeze_depth,
and assumes the counter is increased just after dying flag
is set. Unfortunately we doesn't do that in blk_set_queue_dying().

This patch calls blk_freeze_queue_start() in blk_set_queue_dying(),
so that we can block new I/O coming once the queue is set as dying.

Given blk_set_queue_dying() is always called in remove path
of block device, and queue will be cleaned up later, we don't
need to worry about undoing the counter.

Cc: Bart Van Assche 
Cc: Tejun Heo 
Reviewed-by: Hannes Reinecke 
Signed-off-by: Ming Lei 
---
 block/blk-core.c | 8 ++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 5901133d105f..f0dd9b0054ed 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -500,6 +500,9 @@ void blk_set_queue_dying(struct request_queue *q)
queue_flag_set(QUEUE_FLAG_DYING, q);
spin_unlock_irq(q->queue_lock);
 
+   /* block new I/O coming */
+   blk_freeze_queue_start(q);
+
if (q->mq_ops)
blk_mq_wake_waiters(q);
else {
@@ -672,8 +675,9 @@ int blk_queue_enter(struct request_queue *q, bool nowait)
/*
 * read pair of barrier in blk_freeze_queue_start(),
 * we need to order reading DEAD flag of .q_usage_counter
-* and reading .mq_freeze_depth, otherwise the following
-* wait may never return if the two read are reordered.
+* and reading .mq_freeze_depth or dying flag, otherwise
+* the following wait may never return if the two read
+* are reordered.
 */
smp_rmb();
 
-- 
2.9.3



[PATCH v2 1/4] blk-mq: comment on races related with timeout handler

2017-03-24 Thread Ming Lei
This patch adds comment on two races related with
timeout handler:

- requeue from queue busy vs. timeout
- rq free & reallocation vs. timeout

Both the races themselves and current solution aren't
explicit enough, so add comments on them.

Cc: Bart Van Assche 
Reviewed-by: Hannes Reinecke 
Signed-off-by: Ming Lei 
---
 block/blk-mq.c | 22 ++
 1 file changed, 22 insertions(+)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index c212b9644a9f..b36f0481ba0e 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -523,6 +523,15 @@ void blk_mq_start_request(struct request *rq)
 }
 EXPORT_SYMBOL(blk_mq_start_request);
 
+/*
+ * When we reach here because queue is busy, REQ_ATOM_COMPLETE
+ * flag isn't set yet, so there may be race with timeout hanlder,
+ * but given rq->deadline is just set in .queue_rq() under
+ * this situation, the race won't be possible in reality because
+ * rq->timeout should be set as big enough to cover the window
+ * between blk_mq_start_request() called from .queue_rq() and
+ * clearing REQ_ATOM_STARTED here.
+ */
 static void __blk_mq_requeue_request(struct request *rq)
 {
struct request_queue *q = rq->q;
@@ -696,6 +705,19 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx 
*hctx,
if (!test_bit(REQ_ATOM_STARTED, >atomic_flags))
return;
 
+   /*
+* The rq being checked may have been freed and reallocated
+* out already here, we avoid this race by checking rq->deadline
+* and REQ_ATOM_COMPLETE flag together:
+*
+* - if rq->deadline is observed as new value because of
+*   reusing, the rq won't be timed out because of timing.
+* - if rq->deadline is observed as previous value,
+*   REQ_ATOM_COMPLETE flag won't be cleared in reuse path
+*   because we put a barrier between setting rq->deadline
+*   and clearing the flag in blk_mq_start_request(), so
+*   this rq won't be timed out too.
+*/
if (time_after_eq(jiffies, rq->deadline)) {
if (!blk_mark_rq_complete(rq))
blk_mq_rq_timed_out(rq, reserved);
-- 
2.9.3



[PATCH v2 0/4] block: misc changes

2017-03-24 Thread Ming Lei
Hi,

The 1st  patch add comments on blk-mq races with timeout handler.

The other 3 patches improves handling for dying queue:
- the 2nd one adds one barrier in blk_queue_enter() for
avoiding hanging caused by out-of-order
- the 3rd and 4th patches block new I/O entering queue
after queue is set as dying

V1:
- add comments on races related with timeout handler
- add Tested-by & Reviewed-by tag

thanks,
Ming

Ming Lei (4):
  blk-mq: comment on races related with timeout handler
  block: add a read barrier in blk_queue_enter()
  block: rename blk_mq_freeze_queue_start()
  block: block new I/O just after queue is set as dying

 block/blk-core.c  | 12 
 block/blk-mq.c| 32 +++-
 drivers/block/mtip32xx/mtip32xx.c |  2 +-
 drivers/nvme/host/core.c  |  2 +-
 include/linux/blk-mq.h|  2 +-
 5 files changed, 42 insertions(+), 8 deletions(-)

-- 
2.9.3



Re: [PATCH 01/28] ibtrs: add header shared between ibtrs_client and ibtrs_server

2017-03-24 Thread Johannes Thumshirn
On Fri, Mar 24, 2017 at 11:45:16AM +0100, Jack Wang wrote:
> From: Jack Wang 
> 
> Signed-off-by: Jack Wang 
> Signed-off-by: Kleber Souza 
> Signed-off-by: Danil Kipnis 
> Signed-off-by: Roman Pen 
> ---

[...]

> +
> +#define XX(a) case (a): return #a

please no macros with retun in them and XX isn't quite too descriptive as
well.

[...]

> +static inline const char *ib_wc_opcode_str(enum ib_wc_opcode opcode)
> +{
> + switch (opcode) {
> + XX(IB_WC_SEND);
> + XX(IB_WC_RDMA_WRITE);
> + XX(IB_WC_RDMA_READ);
> + XX(IB_WC_COMP_SWAP);
> + XX(IB_WC_FETCH_ADD);
> + /* recv-side); inbound completion */
> + XX(IB_WC_RECV);
> + XX(IB_WC_RECV_RDMA_WITH_IMM);
> + default: return "IB_WC_OPCODE_UNKNOWN";
> + }
> +}

How about:

struct {
char *name;
enum ib_wc_opcode opcode;
} ib_wc_opcode_table[] = {
{ stringyfy(IB_WC_SEND), IB_WC_SEND },
{ stringyfy(IB_WC_RDMA_WRITE), IB_WC_RDMA_WRITE },
{ stringyfy(IB_WC_RDMA_READ ), IB_WC_RDMA_READ }
{ stringyfy(IB_WC_COMP_SWAP), IB_WC_COMP_SWAP },
{ stringyfy(IB_WC_FETCH_ADD), IB_WC_FETCH_ADD },
{ stringyfy(IB_WC_RECV), IB_WC_RECV },
{ stringyfy(IB_WC_RECV_RDMA_WITH_IMM), IB_WC_RECV_RDMA_WITH_IMM },
{ NULL, 0 },
};

static inline const char *ib_wc_opcode_str(enum ib_wc_opcode opcode)
{
int i;

for (i = 0; i < ARRAY_SIZE(ib_wc_opcode_table); i++)
if (ib_wc_opcode_table[i].opcode == opcode)
return ib_wc_opcode_table[i].name;

return "IB_WC_OPCODE_UNKNOWN";
}


[...]

> +/**
> + * struct ibtrs_msg_hdr - Common header of all IBTRS messages
> + * @type:Message type, valid values see: enum ibtrs_msg_types
> + * @tsize:   Total size of transferred data
> + *
> + * Don't move the first 8 padding bytes! It's a workaround for a kernel bug.
> + * See IBNBD-610 for details

What about resolving the kernel bug instead of making workarounds?

> + *
> + * DO NOT CHANGE!
> + */
> +struct ibtrs_msg_hdr {
> + u8  __padding1;
> + u8  type;
> + u16 __padding2;
> + u32 tsize;
> +};

[...]

-- 
Johannes Thumshirn  Storage
jthumsh...@suse.de+49 911 74053 689
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: Felix Imendörffer, Jane Smithard, Graham Norton
HRB 21284 (AG Nürnberg)
Key fingerprint = EC38 9CAB C2C4 F25D 8600 D0D0 0393 969D 2D76 0850


Re: [RFC PATCH 00/28] INFINIBAND NETWORK BLOCK DEVICE (IBNBD)

2017-03-24 Thread Johannes Thumshirn
On Fri, Mar 24, 2017 at 11:45:15AM +0100, Jack Wang wrote:
> From: Jack Wang 
> 
> This series introduces IBNBD/IBTRS kernel modules.
> 
> IBNBD (InfiniBand network block device) allows for an RDMA transfer of block 
> IO
> over InfiniBand network. The driver presents itself as a block device on 
> client
> side and transmits the block requests in a zero-copy fashion to the 
> server-side
> via InfiniBand. The server part of the driver converts the incoming buffers 
> back
> into BIOs and hands them down to the underlying block device. As soon as IO
> responses come back from the drive, they are being transmitted back to the
> client.
> 
> We design and implement this solution based on our need for Cloud Computing,
> the key features are:
> - High throughput and low latency due to:
> 1) Only two rdma messages per IO
> 2) Simplified client side server memory management
> 3) Eliminated SCSI sublayer
> - Simple configuration and handling
> 1) Server side is completely passive: volumes do not need to be
> explicitly exported
> 2) Only IB port GID and device path needed on client side to map
> a block device
> 3) A device can be remapped automatically i.e. after storage
> reboot
> - Pinning of IO-related processing to the CPU of the producer
> 
> For usage please refer to Documentation/IBNBD.txt in later patch.
> My colleague Danil Kpnis presents IBNBD in Vault-2017 about our 
> design/feature/
> tradeoff/performance:
> 
> http://events.linuxfoundation.org/sites/events/files/slides/IBNBD-Vault-2017.pdf
> 

Hi Jack,

Sorry to ask (I haven't attented the Vault presentation) but why can't you use
NVMe over Fabrics in your environment? From what I see in your presentation
and cover letter, it provides all you need and is in fact a standard Linux and
Windows already have implemented.

Thanks,
Johannes
-- 
Johannes Thumshirn  Storage
jthumsh...@suse.de+49 911 74053 689
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: Felix Imendörffer, Jane Smithard, Graham Norton
HRB 21284 (AG Nürnberg)
Key fingerprint = EC38 9CAB C2C4 F25D 8600 D0D0 0393 969D 2D76 0850


Re: [PATCH 5/8] nowait aio: return on congested block device

2017-03-24 Thread Goldwyn Rodrigues


On 03/16/2017 09:33 AM, Jens Axboe wrote:
> On 03/15/2017 03:51 PM, Goldwyn Rodrigues wrote:
>> diff --git a/block/blk-core.c b/block/blk-core.c
>> index 0eeb99e..2e5cba2 100644
>> --- a/block/blk-core.c
>> +++ b/block/blk-core.c
>> @@ -2014,7 +2019,7 @@ blk_qc_t generic_make_request(struct bio *bio)
>>  do {
>>  struct request_queue *q = bdev_get_queue(bio->bi_bdev);
>>  
>> -if (likely(blk_queue_enter(q, false) == 0)) {
>> +if (likely(blk_queue_enter(q, bio_flagged(bio, BIO_NOWAIT)) == 
>> 0)) {
>>  struct bio_list hold;
>>  struct bio_list lower, same;
>>  
>> @@ -2040,7 +2045,10 @@ blk_qc_t generic_make_request(struct bio *bio)
>>  bio_list_merge(_list_on_stack, );
>>  bio_list_merge(_list_on_stack, );
>>  } else {
>> -bio_io_error(bio);
>> +if (unlikely(bio_flagged(bio, BIO_NOWAIT)))
>> +bio_wouldblock_error(bio);
>> +else
>> +bio_io_error(bio);
> 
> This doesn't look right. What if the queue is dying, and BIO_NOWAIT just
> happened to be set?
> 

Yes, I need to add a condition here to check for blk_queue_dying(). Thanks.

> And you're missing wbt_wait() as well as a blocking point. Ditto in
> blk-mq.

wbt_wait() does not apply to WRITE_ODIRECT


> 
>> diff --git a/block/blk-mq.c b/block/blk-mq.c
>> index 159187a..942ce8c 100644
>> --- a/block/blk-mq.c
>> +++ b/block/blk-mq.c
>> @@ -1518,6 +1518,8 @@ static blk_qc_t blk_mq_make_request(struct 
>> request_queue *q, struct bio *bio)
>>  rq = blk_mq_sched_get_request(q, bio, bio->bi_opf, );
>>  if (unlikely(!rq)) {
>>  __wbt_done(q->rq_wb, wb_acct);
>> +if (bio && bio_flagged(bio, BIO_NOWAIT))
>> +bio_wouldblock_error(bio);
>>  return BLK_QC_T_NONE;
>>  }
>>  
> 
> This seems a little fragile now, since not both paths free the bio.
> 

Direct I/O should free the bios in bio_dio_complete(). I am not sure why
it would not free bio here originally, but IIRC, this path is for
bio==NULL only. So, with this patch we would get a rq==NULL here and
hence the bio_wouldblock_error() call.

-- 
Goldwyn


[PATCH 18/28] ibnbd_clt: add sysfs interface

2017-03-24 Thread Jack Wang
From: Jack Wang 

Signed-off-by: Jack Wang 
Signed-off-by: Kleber Souza 
Signed-off-by: Danil Kipnis 
Signed-off-by: Roman Pen 
---
 drivers/block/ibnbd_client/ibnbd_clt_sysfs.c | 863 +++
 drivers/block/ibnbd_client/ibnbd_clt_sysfs.h |  64 ++
 2 files changed, 927 insertions(+)
 create mode 100644 drivers/block/ibnbd_client/ibnbd_clt_sysfs.c
 create mode 100644 drivers/block/ibnbd_client/ibnbd_clt_sysfs.h

diff --git a/drivers/block/ibnbd_client/ibnbd_clt_sysfs.c 
b/drivers/block/ibnbd_client/ibnbd_clt_sysfs.c
new file mode 100644
index 000..89d487c
--- /dev/null
+++ b/drivers/block/ibnbd_client/ibnbd_clt_sysfs.c
@@ -0,0 +1,863 @@
+/*
+ * InfiniBand Network Block Driver
+ *
+ * Copyright (c) 2014 - 2017 ProfitBricks GmbH. All rights reserved.
+ * Authors: Fabian Holler < m...@fholler.de>
+ *  Jack Wang 
+ * Kleber Souza 
+ * Danil Kipnis 
+ * Roman Pen 
+ *  Milind Dumbare 
+ *
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *notice, this list of conditions, and the following disclaimer,
+ *without modification.
+ * 2. Redistributions in binary form must reproduce at minimum a disclaimer
+ *substantially similar to the "NO WARRANTY" disclaimer below
+ *("Disclaimer") and any redistribution must be conditioned upon
+ *including a substantially similar Disclaimer requirement for further
+ *binary redistribution.
+ * 3. Neither the names of the above-listed copyright holders nor the names
+ *of any contributors may be used to endorse or promote products derived
+ *from this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * NO WARRANTY
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGES.
+ *
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include "ibnbd_clt_sysfs.h"
+#include "ibnbd_clt.h"
+#include 
+#include 
+
+static struct kobject *ibnbd_kobject;
+static struct kobject *ibnbd_devices_kobject;
+static DEFINE_MUTEX(sess_lock);
+
+struct ibnbd_clt_dev_destroy_kobj_work {
+   struct ibnbd_dev*dev;
+   struct work_struct  work;
+};
+
+enum {
+   IBNBD_OPT_ERR   = 0,
+   IBNBD_OPT_SERVER= 1 << 0,
+   IBNBD_OPT_DEV_PATH  = 1 << 1,
+   IBNBD_OPT_ACCESS_MODE   = 1 << 3,
+   IBNBD_OPT_INPUT_MODE= 1 << 4,
+   IBNBD_OPT_IO_MODE   = 1 << 5,
+};
+
+static unsigned ibnbd_opt_mandatory[] = {
+   IBNBD_OPT_SERVER,
+   IBNBD_OPT_DEV_PATH,
+};
+
+static const match_table_t ibnbd_opt_tokens = {
+   {   IBNBD_OPT_SERVER,   "server=%s" },
+   {   IBNBD_OPT_DEV_PATH, "device_path=%s"},
+   {   IBNBD_OPT_ACCESS_MODE,  "access_mode=%s"},
+   {   IBNBD_OPT_INPUT_MODE,   "input_mode=%s" },
+   {   IBNBD_OPT_IO_MODE,  "io_mode=%s"},
+   {   IBNBD_OPT_ERR,  NULL},
+};
+
+/* remove new line from string */
+static void strip(char *s)
+{
+   char *p = s;
+
+   while (*s != '\0') {
+   if (*s != '\n')
+   *p++ = *s++;
+   else
+   ++s;
+   }
+   *p = '\0';
+}
+
+static int ibnbd_clt_parse_map_options(const char *buf, char *server_addr,
+  char *pathname,
+  enum ibnbd_access_mode *access_mode,
+  enum ibnbd_queue_mode *queue_mode,
+  enum ibnbd_io_mode *io_mode)
+{
+   char *options, *sep_opt;
+   char *p;
+   substring_t 

[PATCH 20/28] ibnbd_clt: add Makefile and Kconfig

2017-03-24 Thread Jack Wang
From: Jack Wang 

Signed-off-by: Jack Wang 
---
 drivers/block/Kconfig   |  2 ++
 drivers/block/Makefile  |  1 +
 drivers/block/ibnbd_client/Kconfig  | 16 
 drivers/block/ibnbd_client/Makefile |  5 +
 4 files changed, 24 insertions(+)
 create mode 100644 drivers/block/ibnbd_client/Kconfig
 create mode 100644 drivers/block/ibnbd_client/Makefile

diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index f744de7..c309e57 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -275,6 +275,8 @@ config BLK_DEV_CRYPTOLOOP
 
 source "drivers/block/drbd/Kconfig"
 
+source "drivers/block/ibnbd_client/Kconfig"
+
 config BLK_DEV_NBD
tristate "Network block device support"
depends on NET
diff --git a/drivers/block/Makefile b/drivers/block/Makefile
index 1e9661e..7da1813 100644
--- a/drivers/block/Makefile
+++ b/drivers/block/Makefile
@@ -34,6 +34,7 @@ obj-$(CONFIG_BLK_DEV_HD)  += hd.o
 
 obj-$(CONFIG_XEN_BLKDEV_FRONTEND)  += xen-blkfront.o
 obj-$(CONFIG_XEN_BLKDEV_BACKEND)   += xen-blkback/
+obj-$(CONFIG_BLK_DEV_IBNBD_CLT)+= ibnbd_client/
 obj-$(CONFIG_BLK_DEV_DRBD) += drbd/
 obj-$(CONFIG_BLK_DEV_RBD) += rbd.o
 obj-$(CONFIG_BLK_DEV_PCIESSD_MTIP32XX) += mtip32xx/
diff --git a/drivers/block/ibnbd_client/Kconfig 
b/drivers/block/ibnbd_client/Kconfig
new file mode 100644
index 000..162e4e1
--- /dev/null
+++ b/drivers/block/ibnbd_client/Kconfig
@@ -0,0 +1,16 @@
+config BLK_DEV_IBNBD_CLT
+   tristate "Network block device over Infiniband client support"
+   depends on INFINIBAND_IBTRS_CLT
+   ---help---
+ Saying Y here will allow your computer to be a client for network
+ block devices over Infiniband, i.e. it will be able to use block
+ devices exported by
+ servers (mount file systems on them etc.). Communication between
+ client and server works over Infiniband networking, but to the client
+ program this is hidden: it looks like a regular local file access to
+ a block device special file such as /dev/ibnbd0.
+
+ To compile this driver as a module, choose M here: the
+ module will be called ibnbd_client.
+
+ If unsure, say N.
diff --git a/drivers/block/ibnbd_client/Makefile 
b/drivers/block/ibnbd_client/Makefile
new file mode 100644
index 000..bbf211f
--- /dev/null
+++ b/drivers/block/ibnbd_client/Makefile
@@ -0,0 +1,5 @@
+
+obj-$(CONFIG_BLK_DEV_IBNBD_CLT)+= ibnbd_client.o
+
+ibnbd_client-y := ibnbd_clt.o ibnbd_clt_sysfs.o ../ibnbd_lib/ibnbd.o \
+  ../ibnbd_lib/ibnbd-proto.o
-- 
2.7.4



[PATCH 14/28] ibnbd: add headers shared by ibnbd_client and ibnbd_server

2017-03-24 Thread Jack Wang
From: Jack Wang 

Signed-off-by: Jack Wang 
Signed-off-by: Kleber Souza 
Signed-off-by: Danil Kipnis 
Signed-off-by: Roman Pen 
---
 drivers/block/ibnbd_inc/ibnbd-proto.h | 273 ++
 drivers/block/ibnbd_inc/ibnbd.h   |  55 +++
 drivers/block/ibnbd_inc/log.h |  68 +
 3 files changed, 396 insertions(+)
 create mode 100644 drivers/block/ibnbd_inc/ibnbd-proto.h
 create mode 100644 drivers/block/ibnbd_inc/ibnbd.h
 create mode 100644 drivers/block/ibnbd_inc/log.h

diff --git a/drivers/block/ibnbd_inc/ibnbd-proto.h 
b/drivers/block/ibnbd_inc/ibnbd-proto.h
new file mode 100644
index 000..4838177
--- /dev/null
+++ b/drivers/block/ibnbd_inc/ibnbd-proto.h
@@ -0,0 +1,273 @@
+/*
+ * InfiniBand Network Block Driver
+ *
+ * Copyright (c) 2014 - 2017 ProfitBricks GmbH. All rights reserved.
+ * Authors: Fabian Holler < m...@fholler.de>
+ *  Jack Wang 
+ * Kleber Souza 
+ * Danil Kipnis 
+ * Roman Pen 
+ *  Milind Dumbare 
+ *
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *notice, this list of conditions, and the following disclaimer,
+ *without modification.
+ * 2. Redistributions in binary form must reproduce at minimum a disclaimer
+ *substantially similar to the "NO WARRANTY" disclaimer below
+ *("Disclaimer") and any redistribution must be conditioned upon
+ *including a substantially similar Disclaimer requirement for further
+ *binary redistribution.
+ * 3. Neither the names of the above-listed copyright holders nor the names
+ *of any contributors may be used to endorse or promote products derived
+ *from this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * NO WARRANTY
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGES.
+ *
+ */
+
+#ifndef __IBNBD_PROTO_H
+#define __IBNBD_PROTO_H
+#include 
+#include "ibnbd.h"
+
+#define IBNBD_VERSION 1
+
+#define GCC_DIAGNOSTIC_AWARE ((__GNUC__ > 6))
+#if GCC_DIAGNOSTIC_AWARE
+#pragma GCC diagnostic push
+#pragma GCC diagnostic warning "-Wpadded"
+#endif
+
+/**
+ * enum ibnbd_msg_types - IBNBD message types
+ * @IBNBD_MSG_SESS_INFO:   initial session info from client to server
+ * @IBNBD_MSG_SESS_INFO_RSP:   initial session info from server to client
+ * @IBNBD_MSG_OPEN:open connection to ibnbd server instance
+ * @IBNBD_MSG_OPEN_RSP:response to an @IBNBD_MSG_OPEN
+ * @IBNBD_MSG_READ:request block device read operation
+ * @IBNBD_MSG_WRITE:   request block device write operation
+ * @IBNBD_MSG_REVAL:   notify client about changed device size
+ *
+ * Note: DO NOT REORDER THE MEMBERS OF THIS ENUM!
+ * If necessary, add new members after the last one.
+ */
+enum ibnbd_msg_type {
+   __IBNBD_MSG_MIN,
+   IBNBD_MSG_SESS_INFO,
+   IBNBD_MSG_SESS_INFO_RSP,
+   IBNBD_MSG_OPEN,
+   IBNBD_MSG_OPEN_RSP,
+   IBNBD_MSG_IO,
+   IBNBD_MSG_CLOSE,
+   IBNBD_MSG_CLOSE_RSP,
+   IBNBD_MSG_REVAL,
+   __IBNBD_MSG_MAX
+};
+
+/**
+ * struct ibnbd_msg_hdr - header of IBNBD messages
+ * @type:  Message type, valid values see: enum ibnbd_msg_types
+ */
+struct ibnbd_msg_hdr {
+   u16 type;
+   u16 __padding;
+};
+
+enum ibnbd_access_mode {
+   IBNBD_ACCESS_RO,
+   IBNBD_ACCESS_RW,
+   IBNBD_ACCESS_MIGRATION,
+};
+
+#define _IBNBD_FILEIO  0
+#define _IBNBD_BLOCKIO 1
+#define _IBNBD_AUTOIO  2
+
+enum ibnbd_io_mode {
+   IBNBD_FILEIO = _IBNBD_FILEIO,
+   IBNBD_BLOCKIO = _IBNBD_BLOCKIO,
+   IBNBD_AUTOIO = _IBNBD_AUTOIO,
+};
+
+/**
+ * struct ibnbd_msg_sess_info 

[PATCH 13/28] ibtrs_srv: add Makefile and Kconfig

2017-03-24 Thread Jack Wang
From: Jack Wang 

Signed-off-by: Jack Wang 
---
 drivers/infiniband/Kconfig   | 1 +
 drivers/infiniband/ulp/Makefile  | 1 +
 drivers/infiniband/ulp/ibtrs_server/Kconfig  | 8 
 drivers/infiniband/ulp/ibtrs_server/Makefile | 6 ++
 4 files changed, 16 insertions(+)
 create mode 100644 drivers/infiniband/ulp/ibtrs_server/Kconfig
 create mode 100644 drivers/infiniband/ulp/ibtrs_server/Makefile

diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig
index cb1b864..07aa050 100644
--- a/drivers/infiniband/Kconfig
+++ b/drivers/infiniband/Kconfig
@@ -86,6 +86,7 @@ source "drivers/infiniband/ulp/iser/Kconfig"
 source "drivers/infiniband/ulp/isert/Kconfig"
 
 source "drivers/infiniband/ulp/ibtrs_client/Kconfig"
+source "drivers/infiniband/ulp/ibtrs_server/Kconfig"
 
 source "drivers/infiniband/sw/rdmavt/Kconfig"
 source "drivers/infiniband/sw/rxe/Kconfig"
diff --git a/drivers/infiniband/ulp/Makefile b/drivers/infiniband/ulp/Makefile
index acd8ce6..eb4da3f 100644
--- a/drivers/infiniband/ulp/Makefile
+++ b/drivers/infiniband/ulp/Makefile
@@ -4,3 +4,4 @@ obj-$(CONFIG_INFINIBAND_SRPT)   += srpt/
 obj-$(CONFIG_INFINIBAND_ISER)  += iser/
 obj-$(CONFIG_INFINIBAND_ISERT) += isert/
 obj-$(CONFIG_INFINIBAND_IBTRS_CLT)  += ibtrs_client/
+obj-$(CONFIG_INFINIBAND_IBTRS_SRV)  += ibtrs_server/
diff --git a/drivers/infiniband/ulp/ibtrs_server/Kconfig 
b/drivers/infiniband/ulp/ibtrs_server/Kconfig
new file mode 100644
index 000..6fbdc54
--- /dev/null
+++ b/drivers/infiniband/ulp/ibtrs_server/Kconfig
@@ -0,0 +1,8 @@
+config INFINIBAND_IBTRS_SRV
+   tristate "InfiniBand IBTRS SERVER"
+   depends on INFINIBAND_ADDR_TRANS
+   ---help---
+ Support for the simplified data transfer over InfiniBand.
+ This offer API to user module IBNBD_SERVER
+
+ The IBTRS protocol is defined by the ProfitBricks GmbH.
diff --git a/drivers/infiniband/ulp/ibtrs_server/Makefile 
b/drivers/infiniband/ulp/ibtrs_server/Makefile
new file mode 100644
index 000..39d9e1d
--- /dev/null
+++ b/drivers/infiniband/ulp/ibtrs_server/Makefile
@@ -0,0 +1,6 @@
+
+obj-$(CONFIG_INFINIBAND_IBTRS_SRV) += ibtrs_server.o
+
+ibtrs_server-y := ibtrs_srv.o ibtrs_srv_sysfs.o \
+  ../ibtrs_lib/ibtrs.o ../ibtrs_lib/ibtrs-proto.o 
../ibtrs_lib/iu.o \
+  ../ibtrs_lib/heartbeat.o ../ibtrs_lib/common.o
-- 
2.7.4



[PATCH 15/28] ibnbd: add shared library functions

2017-03-24 Thread Jack Wang
From: Jack Wang 

Signed-off-by: Jack Wang 
Signed-off-by: Kleber Souza 
Signed-off-by: Danil Kipnis 
Signed-off-by: Roman Pen 
---
 drivers/block/ibnbd_lib/ibnbd-proto.c | 244 ++
 drivers/block/ibnbd_lib/ibnbd.c   | 108 +++
 2 files changed, 352 insertions(+)
 create mode 100644 drivers/block/ibnbd_lib/ibnbd-proto.c
 create mode 100644 drivers/block/ibnbd_lib/ibnbd.c

diff --git a/drivers/block/ibnbd_lib/ibnbd-proto.c 
b/drivers/block/ibnbd_lib/ibnbd-proto.c
new file mode 100644
index 000..c6d83f2
--- /dev/null
+++ b/drivers/block/ibnbd_lib/ibnbd-proto.c
@@ -0,0 +1,244 @@
+/*
+ * InfiniBand Network Block Driver
+ *
+ * Copyright (c) 2014 - 2017 ProfitBricks GmbH. All rights reserved.
+ * Authors: Fabian Holler < m...@fholler.de>
+ *  Jack Wang 
+ * Kleber Souza 
+ * Danil Kipnis 
+ * Roman Pen 
+ *  Milind Dumbare 
+ *
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *notice, this list of conditions, and the following disclaimer,
+ *without modification.
+ * 2. Redistributions in binary form must reproduce at minimum a disclaimer
+ *substantially similar to the "NO WARRANTY" disclaimer below
+ *("Disclaimer") and any redistribution must be conditioned upon
+ *including a substantially similar Disclaimer requirement for further
+ *binary redistribution.
+ * 3. Neither the names of the above-listed copyright holders nor the names
+ *of any contributors may be used to endorse or promote products derived
+ *from this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * NO WARRANTY
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGES.
+ *
+ */
+
+#include "../ibnbd_inc/ibnbd-proto.h"
+#include "../ibnbd_inc/log.h"
+
+static int ibnbd_validate_msg_sess_info(const struct ibnbd_msg_sess_info *msg,
+   size_t len)
+{
+   if (unlikely(len != sizeof(*msg))) {
+   ERR_NP("Sess info message with unexpected length received"
+  " %lu instead of %lu\n", len, sizeof(*msg));
+   return -EINVAL;
+   }
+
+   return 0;
+}
+
+static int
+ibnbd_validate_msg_sess_info_rsp(const struct ibnbd_msg_sess_info_rsp *msg,
+size_t len)
+{
+   if (unlikely(len != sizeof(*msg))) {
+   ERR_NP("Sess info message with unexpected length received"
+  " %lu instead of %lu\n", len, sizeof(*msg));
+   return -EINVAL;
+   }
+
+   return 0;
+}
+
+static int ibnbd_validate_msg_open_resp(const struct ibnbd_msg_open_rsp *msg,
+   size_t len)
+{
+   if (unlikely(msg->result))
+   return 0;
+
+   if (unlikely(len != sizeof(*msg))) {
+   ERR_NP("Open Response msg received with unexpected length"
+  " %zuB instead of %luB\n", len, sizeof(*msg));
+   return -EINVAL;
+   }
+
+   if (unlikely(!msg->logical_block_size)) {
+   ERR_NP("Open Resp msg received with unexpected with"
+  " invalid logical_block_size value %d\n",
+  msg->logical_block_size);
+   return -EINVAL;
+   }
+
+   if (unlikely(!msg->physical_block_size)) {
+   ERR_NP("Open Resp msg received with invalid"
+  " physical_block_size value %d\n",
+  msg->physical_block_size);
+   return -EINVAL;
+   }
+
+   if (unlikely(!msg->max_hw_sectors)) {
+   ERR_NP("Open Resp msg received with 

[PATCH 11/28] ibtrs_srv: add header shared in ibtrs_server

2017-03-24 Thread Jack Wang
From: Jack Wang 

Signed-off-by: Jack Wang 
Signed-off-by: Kleber Souza 
Signed-off-by: Danil Kipnis 
Signed-off-by: Roman Pen 
---
 .../ulp/ibtrs_server/ibtrs_srv_internal.h  | 201 +
 1 file changed, 201 insertions(+)
 create mode 100644 drivers/infiniband/ulp/ibtrs_server/ibtrs_srv_internal.h

diff --git a/drivers/infiniband/ulp/ibtrs_server/ibtrs_srv_internal.h 
b/drivers/infiniband/ulp/ibtrs_server/ibtrs_srv_internal.h
new file mode 100644
index 000..79130a1
--- /dev/null
+++ b/drivers/infiniband/ulp/ibtrs_server/ibtrs_srv_internal.h
@@ -0,0 +1,201 @@
+/*
+ * InfiniBand Transport Layer
+ *
+ * Copyright (c) 2014 - 2017 ProfitBricks GmbH. All rights reserved.
+ * Authors: Fabian Holler < m...@fholler.de>
+ *  Jack Wang 
+ * Kleber Souza 
+ * Danil Kipnis 
+ * Roman Pen 
+ *  Milind Dumbare 
+ *
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *notice, this list of conditions, and the following disclaimer,
+ *without modification.
+ * 2. Redistributions in binary form must reproduce at minimum a disclaimer
+ *substantially similar to the "NO WARRANTY" disclaimer below
+ *("Disclaimer") and any redistribution must be conditioned upon
+ *including a substantially similar Disclaimer requirement for further
+ *binary redistribution.
+ * 3. Neither the names of the above-listed copyright holders nor the names
+ *of any contributors may be used to endorse or promote products derived
+ *from this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * NO WARRANTY
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGES.
+ *
+ */
+
+#ifndef _IBTRS_SRV_INTERNAL_H
+#define _IBTRS_SRV_INTERNAL_H
+
+#include 
+
+enum ssm_state {
+   SSM_STATE_IDLE,
+   SSM_STATE_CONNECTED,
+   SSM_STATE_CLOSING,
+   SSM_STATE_CLOSED
+};
+
+/*
+ * Describes the rdma buffer managed by client and used for his rdma writes
+ * Rdma info has to be sent in OPEN_RESP message to the client.
+ */
+struct ibtrs_rcv_buf {
+   dma_addr_t  rdma_addr;
+   void*buf;
+};
+
+/* to indicate that memory chunk was not allocated from a N-order contiguous
+ * pages area
+ */
+#define IBTRS_MEM_CHUNK_NOORDER -1
+
+struct ibtrs_mem_chunk {
+   struct list_headlist;
+   int order;
+   void*addr;
+};
+
+struct ibtrs_rcv_buf_pool {
+   struct list_headlist;
+   struct list_headchunk_list;
+   struct ibtrs_rcv_buf*rcv_bufs;
+};
+
+struct ibtrs_stats_wc_comp {
+   atomic_tmax_wc_cnt;
+   atomic64_t  calls;
+   atomic64_t  total_wc_cnt;
+};
+
+struct ibtrs_srv_stats_rdma_stats {
+   atomic64_t  cnt_read;
+   atomic64_t  size_total_read;
+   atomic64_t  cnt_write;
+   atomic64_t  size_total_write;
+
+   atomic_tinflight;
+   atomic64_t  inflight_total;
+};
+
+struct ibtrs_srv_stats_user_ib_msgs {
+   atomic64_t recv_msg_cnt;
+   atomic64_t sent_msg_cnt;
+   atomic64_t recv_size;
+   atomic64_t sent_size;
+};
+
+struct ibtrs_srv_stats {
+   struct ibtrs_srv_stats_rdma_stats   rdma_stats;
+   struct ibtrs_srv_stats_user_ib_msgs user_ib_msgs;
+   atomic_tapm_cnt;
+   struct ibtrs_stats_wc_comp  wc_comp;
+};
+
+struct ibtrs_session {
+   struct list_headlist;
+   enum ssm_state  state;
+   struct kref kref;
+   struct workqueue_struct *sm_wq; /* event processing */
+   struct 

[PATCH 23/28] ibnbd_srv: add abstraction for submit IO to file or block device

2017-03-24 Thread Jack Wang
From: Jack Wang 

Signed-off-by: Jack Wang 
Signed-off-by: Kleber Souza 
Signed-off-by: Danil Kipnis 
Signed-off-by: Roman Pen 
---
 drivers/block/ibnbd_server/ibnbd_dev.c | 436 +
 drivers/block/ibnbd_server/ibnbd_dev.h | 149 +++
 2 files changed, 585 insertions(+)
 create mode 100644 drivers/block/ibnbd_server/ibnbd_dev.c
 create mode 100644 drivers/block/ibnbd_server/ibnbd_dev.h

diff --git a/drivers/block/ibnbd_server/ibnbd_dev.c 
b/drivers/block/ibnbd_server/ibnbd_dev.c
new file mode 100644
index 000..5f6b453
--- /dev/null
+++ b/drivers/block/ibnbd_server/ibnbd_dev.c
@@ -0,0 +1,436 @@
+/*
+ * InfiniBand Network Block Driver
+ *
+ * Copyright (c) 2014 - 2017 ProfitBricks GmbH. All rights reserved.
+ * Authors: Fabian Holler < m...@fholler.de>
+ *  Jack Wang 
+ * Kleber Souza 
+ * Danil Kipnis 
+ * Roman Pen 
+ *  Milind Dumbare 
+ *
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *notice, this list of conditions, and the following disclaimer,
+ *without modification.
+ * 2. Redistributions in binary form must reproduce at minimum a disclaimer
+ *substantially similar to the "NO WARRANTY" disclaimer below
+ *("Disclaimer") and any redistribution must be conditioned upon
+ *including a substantially similar Disclaimer requirement for further
+ *binary redistribution.
+ * 3. Neither the names of the above-listed copyright holders nor the names
+ *of any contributors may be used to endorse or promote products derived
+ *from this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * NO WARRANTY
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGES.
+ *
+ */
+
+#include "ibnbd_dev.h"
+#include "ibnbd_srv_log.h"
+
+#define IBNBD_DEV_MAX_FILEIO_ACTIVE_WORKERS 0
+
+struct ibnbd_dev_file_io_work {
+   struct ibnbd_dev*dev;
+   void*priv;
+
+   sector_tsector;
+   void*data;
+   size_t  len;
+   size_t  bi_size;
+   enum ibnbd_io_flags flags;
+
+   struct work_struct  work;
+};
+
+struct ibnbd_dev_blk_io {
+   struct ibnbd_dev *dev;
+   void *priv;
+};
+
+static struct workqueue_struct *fileio_wq;
+
+int ibnbd_dev_init(void)
+{
+   fileio_wq = alloc_workqueue("%s", WQ_UNBOUND,
+   IBNBD_DEV_MAX_FILEIO_ACTIVE_WORKERS,
+   "ibnbd_server_fileio_wq");
+   if (!fileio_wq)
+   return -ENOMEM;
+
+   return 0;
+}
+
+void ibnbd_dev_destroy(void)
+{
+   destroy_workqueue(fileio_wq);
+}
+
+static inline struct block_device *ibnbd_dev_open_bdev(const char *path,
+  fmode_t flags)
+{
+   return blkdev_get_by_path(path, flags, THIS_MODULE);
+}
+
+static int ibnbd_dev_blk_open(struct ibnbd_dev *dev, const char *path,
+ fmode_t flags)
+{
+   dev->bdev = ibnbd_dev_open_bdev(path, flags);
+   return PTR_ERR_OR_ZERO(dev->bdev);
+}
+
+static int ibnbd_dev_vfs_open(struct ibnbd_dev *dev, const char *path,
+ fmode_t flags)
+{
+   int oflags = O_DSYNC; /* enable write-through */
+
+   if (flags & FMODE_WRITE)
+   oflags |= O_RDWR;
+   else if (flags & FMODE_READ)
+   oflags |= O_RDONLY;
+   else
+   return -EINVAL;
+
+   dev->file = filp_open(path, oflags, 0);
+   return PTR_ERR_OR_ZERO(dev->file);
+}
+
+struct ibnbd_dev *ibnbd_dev_open(const char 

[PATCH 19/28] ibnbd_clt: add log helpers

2017-03-24 Thread Jack Wang
From: Jack Wang 

Signed-off-by: Jack Wang 
Signed-off-by: Kleber Souza 
Signed-off-by: Danil Kipnis 
Signed-off-by: Roman Pen 
---
 drivers/block/ibnbd_client/ibnbd_clt_log.h | 79 ++
 1 file changed, 79 insertions(+)
 create mode 100644 drivers/block/ibnbd_client/ibnbd_clt_log.h

diff --git a/drivers/block/ibnbd_client/ibnbd_clt_log.h 
b/drivers/block/ibnbd_client/ibnbd_clt_log.h
new file mode 100644
index 000..b3184b7
--- /dev/null
+++ b/drivers/block/ibnbd_client/ibnbd_clt_log.h
@@ -0,0 +1,79 @@
+/*
+ * InfiniBand Network Block Driver
+ *
+ * Copyright (c) 2014 - 2017 ProfitBricks GmbH. All rights reserved.
+ * Authors: Fabian Holler < m...@fholler.de>
+ *  Jack Wang 
+ * Kleber Souza 
+ * Danil Kipnis 
+ * Roman Pen 
+ *  Milind Dumbare 
+ *
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *notice, this list of conditions, and the following disclaimer,
+ *without modification.
+ * 2. Redistributions in binary form must reproduce at minimum a disclaimer
+ *substantially similar to the "NO WARRANTY" disclaimer below
+ *("Disclaimer") and any redistribution must be conditioned upon
+ *including a substantially similar Disclaimer requirement for further
+ *binary redistribution.
+ * 3. Neither the names of the above-listed copyright holders nor the names
+ *of any contributors may be used to endorse or promote products derived
+ *from this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * NO WARRANTY
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGES.
+ *
+ */
+
+#ifndef __IBNBD_CLT_LOG_H__
+#define __IBNBD_CLT_LOG_H__
+
+#include "../ibnbd_inc/log.h"
+
+#define blkdev_name(dev) ((dev->gd == NULL) ? "" : dev->gd->disk_name)
+
+#define ERR(dev, fmt, ...) pr_err("ibnbd L%d <%s@%s> %s ERR: " fmt,\
+   __LINE__, dev->pathname, ibnbd_prefix(dev),\
+   blkdev_name(dev), ##__VA_ARGS__)
+
+#define ERR_RL(dev, fmt, ...) pr_err_ratelimited("ibnbd L%d <%s@%s> %s ERR: "\
+   fmt, __LINE__, dev->pathname,\
+   ibnbd_prefix(dev), blkdev_name(dev),\
+   ##__VA_ARGS__)
+
+#define WRN(dev, fmt, ...) pr_warn("ibnbd L%d <%s@%s> %s WARN: " fmt,\
+   __LINE__, dev->pathname, ibnbd_prefix(dev),\
+   blkdev_name(dev), ##__VA_ARGS__)
+
+#define WRN_RL(dev, fmt, ...) pr_warn_ratelimited("ibnbd L%d <%s@%s> %s WARN: 
"\
+   fmt, __LINE__, dev->pathname, ibnbd_prefix(dev),\
+   blkdev_name(dev), ##__VA_ARGS__)
+
+#define INFO(dev, fmt, ...) pr_info("ibnbd <%s@%s> %s: " \
+   fmt, dev->pathname, ibnbd_prefix(dev),\
+   blkdev_name(dev), ##__VA_ARGS__)
+
+#define INFO_RL(dev, fmt, ...) pr_info_ratelimited("ibnbd <%s@%s> %s: " \
+   fmt, dev->pathname, ibnbd_prefix(dev),\
+   blkdev_name(dev), ##__VA_ARGS__)
+
+#endif /*__IBNBD_CLT_LOG_H__*/
-- 
2.7.4



[PATCH 27/28] ibnbd: add doc for how to use ibnbd and sysfs interface

2017-03-24 Thread Jack Wang
From: Jack Wang 

Signed-off-by: Jack Wang 
---
 Documentation/IBNBD.txt | 284 
 1 file changed, 284 insertions(+)
 create mode 100644 Documentation/IBNBD.txt

diff --git a/Documentation/IBNBD.txt b/Documentation/IBNBD.txt
new file mode 100644
index 000..f7f490a
--- /dev/null
+++ b/Documentation/IBNBD.txt
@@ -0,0 +1,284 @@
+Infiniband Network Block Device (IBNBD)
+===
+
+Introduction
+
+
+IBNBD (InfiniBand Network Block Device) is a pair of kernel modules (client and
+server) that allows to access a remote storage device on the server from
+clients via an InfiniBand network.
+Mapped storage devices appear transparent for the client, acting as any other
+regular storage devices.
+
+The data transport between client and server over the InfiniBand network
+is performed by the IBTRS (InfiniBand Transport) kernel modules.
+
+The administration of these modules is done via sysfs. A Command-line tool
+(ibnbd-cli) is also available for a more user-friendly experience.
+
+Requirements
+
+  - IBTRS kernel modules (available as git-submodule)
+
+Quick Start
+---
+Server:
+  # insmod ibtrs/ibtrs_server/ibtrs_server.ko
+  # insmod ibnbd_server/ibnbd_server.ko
+
+Client:
+  # insmod ibtrs/ibtrs_client/ibtrs_client.ko
+  # insmod ibnbd_client/ibnbd_client.ko
+  # echo "server= device_path=" > 
/sys/kernel/ibnbd/map_device
+
+The block device  will become available on the client as
+/dev/ibnbd. It can be used like a local block device.
+
+Client Userspace Interface
+--
+This chapter describes only the most important files of Userspace Interface.
+A full documentation can be found in the Architecture Documentation.
+
+All sysfs files that are not read-only will return a usage information if they
+are read.
+
+example:
+  $ cat /sys/kernel/ibnbd/map_device
+
+
+/sys/kernel/ibnbd/ entries
+~~
+
+map_device (RW)
+^^^
+To map a volume on the client, information about the device has to be written
+to:
+  /sys/kernel/ibnbd/map_device
+
+The format of the input is:
+  "server= device_path=
+   [access_mode= 
/sys/kernel/ibnbd/map_device
+  # echo "server=ip:10.50.100.64 
device_path=3F2504E0-4F89-41D3-9A0C-0305E82C3301" > /sys/kernel/ibnbd/map_device
+
+Finding device file after mapping
++
+After mapping, the device file can be found by:
+1.) The symlink /sys/kernel/ibnbd/devices/ points to
+/sys/block/.
+The last part of the symlink destination is the same than the device name.
+By extracting the last part of the path the path to the device
+/dev/ can be build.
+2.) /dev/block/$(cat /sys/kernel/ibnbd/devices//dev)
+
+How to find the  of the device is described on the next chapter
+(devices/ directory).
+
+devices/ (DIRECTORY)

[PATCH 28/28] MAINTRAINERS: Add maintainer for IBNBD/IBTRS

2017-03-24 Thread Jack Wang
From: Jack Wang 

Signed-off-by: Jack Wang 
---
 MAINTAINERS | 14 ++
 1 file changed, 14 insertions(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index c776906..12a528a 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -6263,6 +6263,20 @@ IBM ServeRAID RAID DRIVER
 S: Orphan
 F: drivers/scsi/ips.*
 
+IBTRS TRANSPORT DRIVERS
+M:  Jack Wang 
+L:  linux-r...@vger.kernel.org
+S:  Maintained
+F:  include/linux/ibtrs*.h
+F:  drivers/infiniband/ulp/ibtrs*
+
+IBNBD BLOCK DRIVERS
+M:  Jack Wang 
+L:  linux-r...@vger.kernel.org
+S:  Maintained
+F: Documentation/IBNBD.txt
+F:  drivers/block/ibnbd*
+
 ICH LPC AND GPIO DRIVER
 M: Peter Tyser 
 S: Maintained
-- 
2.7.4



[PATCH 25/28] ibnbd_srv: add sysfs interface

2017-03-24 Thread Jack Wang
From: Jack Wang 

Signed-off-by: Jack Wang 
Signed-off-by: Kleber Souza 
Signed-off-by: Danil Kipnis 
Signed-off-by: Roman Pen 
---
 drivers/block/ibnbd_server/ibnbd_srv_sysfs.c | 317 +++
 drivers/block/ibnbd_server/ibnbd_srv_sysfs.h |  64 ++
 2 files changed, 381 insertions(+)
 create mode 100644 drivers/block/ibnbd_server/ibnbd_srv_sysfs.c
 create mode 100644 drivers/block/ibnbd_server/ibnbd_srv_sysfs.h

diff --git a/drivers/block/ibnbd_server/ibnbd_srv_sysfs.c 
b/drivers/block/ibnbd_server/ibnbd_srv_sysfs.c
new file mode 100644
index 000..8774abe
--- /dev/null
+++ b/drivers/block/ibnbd_server/ibnbd_srv_sysfs.c
@@ -0,0 +1,317 @@
+/*
+ * InfiniBand Network Block Driver
+ *
+ * Copyright (c) 2014 - 2017 ProfitBricks GmbH. All rights reserved.
+ * Authors: Fabian Holler < m...@fholler.de>
+ *  Jack Wang 
+ * Kleber Souza 
+ * Danil Kipnis 
+ * Roman Pen 
+ *  Milind Dumbare 
+ *
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *notice, this list of conditions, and the following disclaimer,
+ *without modification.
+ * 2. Redistributions in binary form must reproduce at minimum a disclaimer
+ *substantially similar to the "NO WARRANTY" disclaimer below
+ *("Disclaimer") and any redistribution must be conditioned upon
+ *including a substantially similar Disclaimer requirement for further
+ *binary redistribution.
+ * 3. Neither the names of the above-listed copyright holders nor the names
+ *of any contributors may be used to endorse or promote products derived
+ *from this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * NO WARRANTY
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGES.
+ *
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "../ibnbd_inc/ibnbd.h"
+#include "ibnbd_srv.h"
+#include "ibnbd_srv_log.h"
+#include "ibnbd_srv_sysfs.h"
+
+static struct kobject *ibnbd_srv_kobj;
+static struct kobject *ibnbd_srv_devices_kobj;
+#define IBNBD_SYSFS_DIR "ibnbd"
+static char ibnbd_sysfs_dir[64] = IBNBD_SYSFS_DIR;
+
+static ssize_t ibnbd_srv_revalidate_dev_show(struct kobject *kobj,
+struct kobj_attribute *attr,
+char *page)
+{
+   return scnprintf(page, PAGE_SIZE,
+"Usage: echo 1 > %s\n", attr->attr.name);
+}
+
+static ssize_t ibnbd_srv_revalidate_dev_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+   int ret;
+   struct ibnbd_srv_dev *dev = container_of(kobj, struct ibnbd_srv_dev,
+dev_kobj);
+
+   if (!sysfs_streq(buf, "1")) {
+   ERR_NP("%s: invalid value: '%s'\n", attr->attr.name, buf);
+   return -EINVAL;
+   }
+   ret = ibnbd_srv_revalidate_dev(dev);
+   if (ret)
+   return ret;
+
+   return count;
+}
+
+static struct kobj_attribute ibnbd_srv_revalidate_dev_attr =
+   __ATTR(revalidate,
+  0644,
+  ibnbd_srv_revalidate_dev_show,
+  ibnbd_srv_revalidate_dev_store);
+
+static struct attribute *ibnbd_srv_default_dev_attrs[] = {
+   _srv_revalidate_dev_attr.attr,
+   NULL,
+};
+
+static struct attribute_group ibnbd_srv_default_dev_attr_group = {
+   .attrs = 

[PATCH 21/28] ibnbd_srv: add header shared in ibnbd_server

2017-03-24 Thread Jack Wang
From: Jack Wang 

Signed-off-by: Jack Wang 
Signed-off-by: Kleber Souza 
Signed-off-by: Danil Kipnis 
Signed-off-by: Roman Pen 
---
 drivers/block/ibnbd_server/ibnbd_srv.h | 115 +
 1 file changed, 115 insertions(+)
 create mode 100644 drivers/block/ibnbd_server/ibnbd_srv.h

diff --git a/drivers/block/ibnbd_server/ibnbd_srv.h 
b/drivers/block/ibnbd_server/ibnbd_srv.h
new file mode 100644
index 000..764a31f
--- /dev/null
+++ b/drivers/block/ibnbd_server/ibnbd_srv.h
@@ -0,0 +1,115 @@
+/*
+ * InfiniBand Network Block Driver
+ *
+ * Copyright (c) 2014 - 2017 ProfitBricks GmbH. All rights reserved.
+ * Authors: Fabian Holler < m...@fholler.de>
+ *  Jack Wang 
+ * Kleber Souza 
+ * Danil Kipnis 
+ * Roman Pen 
+ *  Milind Dumbare 
+ *
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *notice, this list of conditions, and the following disclaimer,
+ *without modification.
+ * 2. Redistributions in binary form must reproduce at minimum a disclaimer
+ *substantially similar to the "NO WARRANTY" disclaimer below
+ *("Disclaimer") and any redistribution must be conditioned upon
+ *including a substantially similar Disclaimer requirement for further
+ *binary redistribution.
+ * 3. Neither the names of the above-listed copyright holders nor the names
+ *of any contributors may be used to endorse or promote products derived
+ *from this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * NO WARRANTY
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGES.
+ *
+ */
+
+#ifndef _IBNBD_SRV_H
+#define _IBNBD_SRV_H
+
+#include 
+#include 
+#include 
+#include "../ibnbd_inc/ibnbd.h"
+#include "../ibnbd_inc/ibnbd-proto.h"
+#include 
+
+enum sess_state {
+   SESS_STATE_CONNECTED,
+   SESS_STATE_DISCONNECTED
+};
+
+struct ibnbd_srv_session {
+   struct list_headlist; /* for the global sess_list */
+   struct ibtrs_session*ibtrs_sess;
+   charstr_addr[IBTRS_ADDRLEN];
+   charhostname[MAXHOSTNAMELEN];
+   int queue_depth;
+   enum sess_state state;
+   struct bio_set  *sess_bio_set;
+
+   rwlock_tindex_lock cacheline_aligned;
+   struct idr  index_idr;
+   struct mutexlock; /* protects sess_dev_list */
+   struct list_headsess_dev_list; /* list of struct 
ibnbd_srv_sess_dev */
+   u8  ver; /* IBNBD protocol version */
+};
+
+struct ibnbd_srv_dev {
+   struct list_headlist; /* global dev_list */
+
+   struct kobject  dev_kobj;
+   struct kobject  dev_clients_kobj;
+
+   struct kref kref;
+   charid[NAME_MAX];
+
+   struct mutexlock; /* protects sess_dev_list and 
open_write_cnt */
+   struct list_headsess_dev_list; /* list of struct 
ibnbd_srv_sess_dev */
+   int open_write_cnt;
+   enum ibnbd_io_mode  mode;
+};
+
+struct ibnbd_srv_sess_dev {
+   struct list_headdev_list; /* for struct 
ibnbd_srv_dev->sess_dev_list */
+   struct list_headsess_list; /* for struct 
ibnbd_srv_session->sess_dev_list */
+
+   struct ibnbd_dev*ibnbd_dev;
+   struct ibnbd_srv_session*sess;
+   struct ibnbd_srv_dev*dev;
+   struct kobject  kobj;
+   struct completion   

[PATCH 17/28] ibnbd_clt: add header shared in ibnbd_client

2017-03-24 Thread Jack Wang
From: Jack Wang 

Signed-off-by: Jack Wang 
Signed-off-by: Kleber Souza 
Signed-off-by: Danil Kipnis 
Signed-off-by: Roman Pen 
---
 drivers/block/ibnbd_client/ibnbd_clt.h | 231 +
 1 file changed, 231 insertions(+)
 create mode 100644 drivers/block/ibnbd_client/ibnbd_clt.h

diff --git a/drivers/block/ibnbd_client/ibnbd_clt.h 
b/drivers/block/ibnbd_client/ibnbd_clt.h
new file mode 100644
index 000..3f0db78
--- /dev/null
+++ b/drivers/block/ibnbd_client/ibnbd_clt.h
@@ -0,0 +1,231 @@
+/*
+ * InfiniBand Network Block Driver
+ *
+ * Copyright (c) 2014 - 2017 ProfitBricks GmbH. All rights reserved.
+ * Authors: Fabian Holler < m...@fholler.de>
+ *  Jack Wang 
+ * Kleber Souza 
+ * Danil Kipnis 
+ * Roman Pen 
+ *  Milind Dumbare 
+ *
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *notice, this list of conditions, and the following disclaimer,
+ *without modification.
+ * 2. Redistributions in binary form must reproduce at minimum a disclaimer
+ *substantially similar to the "NO WARRANTY" disclaimer below
+ *("Disclaimer") and any redistribution must be conditioned upon
+ *including a substantially similar Disclaimer requirement for further
+ *binary redistribution.
+ * 3. Neither the names of the above-listed copyright holders nor the names
+ *of any contributors may be used to endorse or promote products derived
+ *from this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * NO WARRANTY
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGES.
+ *
+ */
+
+#ifndef _IBNBD_CLT_H
+#define _IBNBD_CLT_H
+#include 
+#include /* for wait_queue_head_t */
+#include   /* for sockaddr_in */
+#include /* for sockaddr_in */
+#include 
+#include "ibnbd_clt_log.h"
+#include "../ibnbd_inc/ibnbd.h"
+#include "../ibnbd_inc/ibnbd-proto.h"  /* ibnbd protocol messages */
+#include /* for ibtrs api */
+#include 
+
+#define IP_PREFIX "ip:"
+#define IP_PREFIX_LEN strlen(IP_PREFIX)
+#define GID_PREFIX "gid:"
+#define GID_PREFIX_LEN strlen(GID_PREFIX)
+
+#define BMAX_SEGMENTS 31
+#define RECONNECT_DELAY 30
+#define MAX_RECONNECTS -1
+
+enum ibnbd_dev_state {
+   DEV_STATE_INIT,
+   DEV_STATE_INIT_CLOSED,
+   DEV_STATE_CLOSED,
+   DEV_STATE_UNMAPPED,
+   DEV_STATE_OPEN
+};
+
+enum ibnbd_queue_mode {
+   BLK_MQ,
+   BLK_RQ
+};
+
+struct ibnbd_iu {
+   struct request  *rq;
+   struct ibtrs_tag*tag;
+   struct ibnbd_dev*dev;
+   struct ibnbd_msg_io msg;
+   int errno;
+   struct scatterlist  sglist[BMAX_SEGMENTS];
+};
+
+struct ibnbd_cpu_qlist {
+   struct list_headrequeue_list;
+   spinlock_t  requeue_lock;
+   unsigned intcpu;
+};
+
+enum sess_state {
+   SESS_STATE_READY,
+   SESS_STATE_DISCONNECTED,
+   SESS_STATE_DESTROYED,
+};
+
+struct ibnbd_session {
+   struct list_headlist;
+   struct ibtrs_session*sess;
+   struct ibnbd_cpu_qlist  __percpu
+   *cpu_queues;
+   DECLARE_BITMAP(cpu_queues_bm, NR_CPUS);
+   int __percpu*cpu_rr; /* per-cpu var for CPU round-robin */
+   atomic_tbusy;
+   int queue_depth;
+   u32 max_io_size;
+   struct blk_mq_tag_set   tag_set;
+   struct mutexlock; /* protects state and devs_list */
+   struct list_headdevs_list; /* list of struct ibnbd_dev */
+   

[PATCH 26/28] ibnbd_srv: add Makefile and Kconfig

2017-03-24 Thread Jack Wang
From: Jack Wang 

Signed-off-by: Jack Wang 
---
 drivers/block/Kconfig   |  1 +
 drivers/block/Makefile  |  1 +
 drivers/block/ibnbd_server/Kconfig  | 16 
 drivers/block/ibnbd_server/Makefile |  3 +++
 4 files changed, 21 insertions(+)
 create mode 100644 drivers/block/ibnbd_server/Kconfig
 create mode 100644 drivers/block/ibnbd_server/Makefile

diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index c309e57..e4823c4 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -276,6 +276,7 @@ config BLK_DEV_CRYPTOLOOP
 source "drivers/block/drbd/Kconfig"
 
 source "drivers/block/ibnbd_client/Kconfig"
+source "drivers/block/ibnbd_server/Kconfig"
 
 config BLK_DEV_NBD
tristate "Network block device support"
diff --git a/drivers/block/Makefile b/drivers/block/Makefile
index 7da1813..cd20888 100644
--- a/drivers/block/Makefile
+++ b/drivers/block/Makefile
@@ -35,6 +35,7 @@ obj-$(CONFIG_BLK_DEV_HD)  += hd.o
 obj-$(CONFIG_XEN_BLKDEV_FRONTEND)  += xen-blkfront.o
 obj-$(CONFIG_XEN_BLKDEV_BACKEND)   += xen-blkback/
 obj-$(CONFIG_BLK_DEV_IBNBD_CLT)+= ibnbd_client/
+obj-$(CONFIG_BLK_DEV_IBNBD_SRV)+= ibnbd_server/
 obj-$(CONFIG_BLK_DEV_DRBD) += drbd/
 obj-$(CONFIG_BLK_DEV_RBD) += rbd.o
 obj-$(CONFIG_BLK_DEV_PCIESSD_MTIP32XX) += mtip32xx/
diff --git a/drivers/block/ibnbd_server/Kconfig 
b/drivers/block/ibnbd_server/Kconfig
new file mode 100644
index 000..943e1b2
--- /dev/null
+++ b/drivers/block/ibnbd_server/Kconfig
@@ -0,0 +1,16 @@
+config BLK_DEV_IBNBD_SRV
+   tristate "Network block device over Infiniband server support"
+   depends on INFINIBAND_IBTRS_SRV
+   ---help---
+ Saying Y here will allow your computer to be a server for network
+ block devices over Infiniband, i.e. it will be able to use block
+ devices exported by servers (mount file systems on them etc.).
+ Communication between client and server works over Infiniband
+ networking, but to the client program this is hidden:
+ it looks like a regular local file access to a block device
+ special file such as /dev/ibnbd0.
+
+ To compile this driver as a module, choose M here: the
+ module will be called ibnbd_client.
+
+ If unsure, say N.
diff --git a/drivers/block/ibnbd_server/Makefile 
b/drivers/block/ibnbd_server/Makefile
new file mode 100644
index 000..e66860f
--- /dev/null
+++ b/drivers/block/ibnbd_server/Makefile
@@ -0,0 +1,3 @@
+obj-$(CONFIG_BLK_DEV_IBNBD_SRV) += ibnbd_server.o
+ibnbd_server-objs  := ibnbd_srv.o ibnbd_srv_sysfs.o ibnbd_dev.o \
+   ../ibnbd_lib/ibnbd.o ../ibnbd_lib/ibnbd-proto.o
-- 
2.7.4



[PATCH 24/28] ibnbd_srv: add log helpers

2017-03-24 Thread Jack Wang
From: Jack Wang 

Signed-off-by: Jack Wang 
Signed-off-by: Kleber Souza 
Signed-off-by: Danil Kipnis 
---
 drivers/block/ibnbd_server/ibnbd_srv_log.h | 69 ++
 1 file changed, 69 insertions(+)
 create mode 100644 drivers/block/ibnbd_server/ibnbd_srv_log.h

diff --git a/drivers/block/ibnbd_server/ibnbd_srv_log.h 
b/drivers/block/ibnbd_server/ibnbd_srv_log.h
new file mode 100644
index 000..9217804
--- /dev/null
+++ b/drivers/block/ibnbd_server/ibnbd_srv_log.h
@@ -0,0 +1,69 @@
+/*
+ * InfiniBand Network Block Driver
+ *
+ * Copyright (c) 2014 - 2017 ProfitBricks GmbH. All rights reserved.
+ * Authors: Fabian Holler < m...@fholler.de>
+ *  Jack Wang 
+ * Kleber Souza 
+ * Danil Kipnis 
+ * Roman Pen 
+ *  Milind Dumbare 
+ *
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *notice, this list of conditions, and the following disclaimer,
+ *without modification.
+ * 2. Redistributions in binary form must reproduce at minimum a disclaimer
+ *substantially similar to the "NO WARRANTY" disclaimer below
+ *("Disclaimer") and any redistribution must be conditioned upon
+ *including a substantially similar Disclaimer requirement for further
+ *binary redistribution.
+ * 3. Neither the names of the above-listed copyright holders nor the names
+ *of any contributors may be used to endorse or promote products derived
+ *from this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * NO WARRANTY
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGES.
+ *
+ */
+
+#ifndef __IBNBD_SRV_LOG_H__
+#define __IBNBD_SRV_LOG_H__
+
+#include "../ibnbd_inc/log.h"
+
+#define ERR(dev, fmt, ...) pr_err("ibnbd L%d <%s@%s> ERR: " fmt, \
+   __LINE__, dev->pathname, ibnbd_prefix(dev),\
+   ##__VA_ARGS__)
+#define ERR_RL(dev, fmt, ...) pr_err_ratelimited("ibnbd L%d <%s@%s> ERR: " 
fmt,\
+   __LINE__, dev->pathname, ibnbd_prefix(dev),\
+   ##__VA_ARGS__)
+#define WRN(dev, fmt, ...) pr_warn("ibnbd L%d <%s@%s> WARN: " fmt,\
+   __LINE__, dev->pathname, ibnbd_prefix(dev),\
+   ##__VA_ARGS__)
+#define WRN_RL(dev, fmt, ...) pr_warn_ratelimited("ibnbd L%d <%s@%s> WARN: " \
+   fmt, __LINE__, dev->pathname, ibnbd_prefix(dev),\
+   ##__VA_ARGS__)
+#define INFO(dev, fmt, ...) pr_info("ibnbd <%s@%s>: " \
+   fmt, dev->pathname, ibnbd_prefix(dev), ##__VA_ARGS__)
+#define INFO_RL(dev, fmt, ...) pr_info_ratelimited("ibnbd <%s@%s>: " \
+   fmt, dev->pathname, ibnbd_prefix(dev), ##__VA_ARGS__)
+
+#endif /*__IBNBD_SRV_LOG_H__*/
-- 
2.7.4



[PATCH 22/28] ibnbd_srv: add main functionality

2017-03-24 Thread Jack Wang
From: Jack Wang 

Process incoming IO from ibtrs server, and hands them down to
underlying block device.

Signed-off-by: Jack Wang 
Signed-off-by: Kleber Souza 
Signed-off-by: Danil Kipnis 
Signed-off-by: Roman Pen 
---
 drivers/block/ibnbd_server/ibnbd_srv.c | 1074 
 1 file changed, 1074 insertions(+)
 create mode 100644 drivers/block/ibnbd_server/ibnbd_srv.c

diff --git a/drivers/block/ibnbd_server/ibnbd_srv.c 
b/drivers/block/ibnbd_server/ibnbd_srv.c
new file mode 100644
index 000..13832b6
--- /dev/null
+++ b/drivers/block/ibnbd_server/ibnbd_srv.c
@@ -0,0 +1,1074 @@
+/*
+ * InfiniBand Network Block Driver
+ *
+ * Copyright (c) 2014 - 2017 ProfitBricks GmbH. All rights reserved.
+ * Authors: Fabian Holler < m...@fholler.de>
+ *  Jack Wang 
+ * Kleber Souza 
+ * Danil Kipnis 
+ * Roman Pen 
+ *  Milind Dumbare 
+ *
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *notice, this list of conditions, and the following disclaimer,
+ *without modification.
+ * 2. Redistributions in binary form must reproduce at minimum a disclaimer
+ *substantially similar to the "NO WARRANTY" disclaimer below
+ *("Disclaimer") and any redistribution must be conditioned upon
+ *including a substantially similar Disclaimer requirement for further
+ *binary redistribution.
+ * 3. Neither the names of the above-listed copyright holders nor the names
+ *of any contributors may be used to endorse or promote products derived
+ *from this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * NO WARRANTY
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGES.
+ *
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include "../ibnbd_inc/ibnbd-proto.h"
+#include 
+#include "../ibnbd_inc/ibnbd.h"
+#include "ibnbd_srv.h"
+#include "ibnbd_srv_log.h"
+#include "ibnbd_srv_sysfs.h"
+#include "ibnbd_dev.h"
+
+MODULE_AUTHOR("ib...@profitbricks.com");
+MODULE_VERSION(__stringify(IBNBD_VER));
+MODULE_DESCRIPTION("InfiniBand Network Block Device Server");
+MODULE_LICENSE("GPL");
+
+#define DEFAULT_DEV_SEARCH_PATH "/"
+
+static char dev_search_path[PATH_MAX] = DEFAULT_DEV_SEARCH_PATH;
+
+static int dev_search_path_set(const char *val, const struct kernel_param *kp)
+{
+   char *dup;
+
+   if (strlen(val) >= sizeof(dev_search_path))
+   return -EINVAL;
+
+   dup = kstrdup(val, GFP_KERNEL);
+
+   if (dup[strlen(dup) - 1] == '\n')
+   dup[strlen(dup) - 1] = '\0';
+
+   strlcpy(dev_search_path, dup, sizeof(dev_search_path));
+
+   kfree(dup);
+   INFO_NP("dev_search_path changed to '%s'\n", dev_search_path);
+
+   return 0;
+}
+
+static struct kparam_string dev_search_path_kparam_str = {
+   .maxlen = sizeof(dev_search_path),
+   .string = dev_search_path
+};
+
+static const struct kernel_param_ops dev_search_path_ops = {
+   .set= dev_search_path_set,
+   .get= param_get_string,
+};
+
+module_param_cb(dev_search_path, _search_path_ops,
+   _search_path_kparam_str, 0444);
+MODULE_PARM_DESC(dev_search_path, "Sets the device_search_path."
+" When a device is mapped this path is prepended to the"
+" device_path from the map_device operation."
+" (default: " DEFAULT_DEV_SEARCH_PATH ")");
+
+static int def_io_mode = IBNBD_BLOCKIO;
+module_param(def_io_mode, int, 0444);
+MODULE_PARM_DESC(def_io_mode, "By default, export devices in"
+" blockio(" __stringify(_IBNBD_BLOCKIO) ") or"
+" fileio(" __stringify(_IBNBD_FILEIO) ") mode."
+   

[PATCH 09/28] ibtrs_srv: add header file for exported interface

2017-03-24 Thread Jack Wang
From: Jack Wang 

Signed-off-by: Jack Wang 
Signed-off-by: Kleber Souza 
Signed-off-by: Danil Kipnis 
Signed-off-by: Roman Pen 
---
 include/rdma/ibtrs_srv.h | 206 +++
 1 file changed, 206 insertions(+)
 create mode 100644 include/rdma/ibtrs_srv.h

diff --git a/include/rdma/ibtrs_srv.h b/include/rdma/ibtrs_srv.h
new file mode 100644
index 000..dbd535f
--- /dev/null
+++ b/include/rdma/ibtrs_srv.h
@@ -0,0 +1,206 @@
+/*
+ * InfiniBand Transport Layer
+ *
+ * Copyright (c) 2014 - 2017 ProfitBricks GmbH. All rights reserved.
+ * Authors: Fabian Holler < m...@fholler.de>
+ *  Jack Wang 
+ * Kleber Souza 
+ * Danil Kipnis 
+ * Roman Pen 
+ *  Milind Dumbare 
+ *
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *notice, this list of conditions, and the following disclaimer,
+ *without modification.
+ * 2. Redistributions in binary form must reproduce at minimum a disclaimer
+ *substantially similar to the "NO WARRANTY" disclaimer below
+ *("Disclaimer") and any redistribution must be conditioned upon
+ *including a substantially similar Disclaimer requirement for further
+ *binary redistribution.
+ * 3. Neither the names of the above-listed copyright holders nor the names
+ *of any contributors may be used to endorse or promote products derived
+ *from this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * NO WARRANTY
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGES.
+ *
+ */
+
+#ifndef _IBTRS_SRV_H
+#define _IBTRS_SRV_H
+
+#include 
+
+struct ibtrs_session;
+struct ibtrs_ops_id;
+
+enum ibtrs_srv_rdma_ev {
+   IBTRS_SRV_RDMA_EV_RECV,
+   IBTRS_SRV_RDMA_EV_WRITE_REQ,
+};
+
+/**
+ * enum ibtrs_srv_sess_ev - Session events
+ * @IBTRS_SRV_SESS_EV_CONNECTED:   Connection from client established
+ * @IBTRS_SRV_SESS_EV_DISCONNECTING:   Connection is currently disconnected,
+ * sending data through the connection may
+ * fail, but could still recv messages.
+ * @IBTRS_SRV_SESS_EV_DISCONNECTED:Connection was disconnected, all
+ * connection IBTRS resources were freed.
+ */
+
+enum ibtrs_srv_sess_ev {
+   IBTRS_SRV_SESS_EV_CONNECTED,
+   IBTRS_SRV_SESS_EV_DISCONNECTING,
+   IBTRS_SRV_SESS_EV_DISCONNECTED,
+};
+
+/**
+ *  ibtrs_srv_ops - Callbacks for ibtrs_server
+ * @owner: module that uses ibtrs_server
+ * @rdma_ev:   Event notification for RDMA operations
+ * If the callback returns a value != 0, an error message
+ * for the data transfer will be sent to the client.
+
+ * @sess:  Session
+ * @priv:  Private data from user
+ * @id:internal IBTRS id
+ * @ev:Event
+ * @data:  Data received by the client. The message of the user of
+ * ibtrs_client is allocated at the end of the buffer.
+ * Before the message the data of the ibtrs_client is
+ * located.
+ * If the event is %IBTRS_SRV_RDMA_EV_WRITE_REQ, the user
+ * can write his response into @data. When
+ * ibtrs_srv_resp_rdma() is called, this @data will be
+ * transferred to the client.
+ * @len:   length of data in @data
+
+ * @sess_ev:   Events about connective state changes
+ * If the callback returns != 0 and the event
+ * 

[PATCH 10/28] ibtrs_srv: add main functionality for ibtrs_server

2017-03-24 Thread Jack Wang
From: Jack Wang 

Service accept connection requests from clients and reserve memory
for them.

It excutes rdma transfers, hands over received data to ibnbd_server.

Signed-off-by: Jack Wang 
Signed-off-by: Kleber Souza 
Signed-off-by: Danil Kipnis 
Signed-off-by: Roman Pen 
---
 drivers/infiniband/ulp/ibtrs_server/ibtrs_srv.c | 3744 +++
 1 file changed, 3744 insertions(+)
 create mode 100644 drivers/infiniband/ulp/ibtrs_server/ibtrs_srv.c

diff --git a/drivers/infiniband/ulp/ibtrs_server/ibtrs_srv.c 
b/drivers/infiniband/ulp/ibtrs_server/ibtrs_srv.c
new file mode 100644
index 000..513e90a
--- /dev/null
+++ b/drivers/infiniband/ulp/ibtrs_server/ibtrs_srv.c
@@ -0,0 +1,3744 @@
+/*
+ * InfiniBand Transport Layer
+ *
+ * Copyright (c) 2014 - 2017 ProfitBricks GmbH. All rights reserved.
+ * Authors: Fabian Holler < m...@fholler.de>
+ *  Jack Wang 
+ * Kleber Souza 
+ * Danil Kipnis 
+ * Roman Pen 
+ *  Milind Dumbare 
+ *
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *notice, this list of conditions, and the following disclaimer,
+ *without modification.
+ * 2. Redistributions in binary form must reproduce at minimum a disclaimer
+ *substantially similar to the "NO WARRANTY" disclaimer below
+ *("Disclaimer") and any redistribution must be conditioned upon
+ *including a substantially similar Disclaimer requirement for further
+ *binary redistribution.
+ * 3. Neither the names of the above-listed copyright holders nor the names
+ *of any contributors may be used to endorse or promote products derived
+ *from this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * NO WARRANTY
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGES.
+ *
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+#include "ibtrs_srv_sysfs.h"
+#include "ibtrs_srv_internal.h"
+#include 
+#include 
+
+MODULE_AUTHOR("ib...@profitbricks.com");
+MODULE_DESCRIPTION("InfiniBand Transport Server");
+MODULE_VERSION(__stringify(IBTRS_VER));
+MODULE_LICENSE("GPL");
+
+#define DEFAULT_MAX_IO_SIZE_KB 128
+#define DEFAULT_MAX_IO_SIZE (DEFAULT_MAX_IO_SIZE_KB * 1024)
+static int max_io_size = DEFAULT_MAX_IO_SIZE;
+#define MAX_REQ_SIZE PAGE_SIZE
+static int rcv_buf_size = DEFAULT_MAX_IO_SIZE + MAX_REQ_SIZE;
+
+static int max_io_size_set(const char *val, const struct kernel_param *kp)
+{
+   int err, ival;
+
+   err = kstrtoint(val, 0, );
+   if (err)
+   return err;
+
+   if (ival < 4096 || ival + MAX_REQ_SIZE > (4096 * 1024) ||
+   (ival + MAX_REQ_SIZE) % 512 != 0) {
+   ERR_NP("Invalid max io size value %d, has to be"
+  " > %d, < %d\n", ival, 4096, 4194304);
+   return -EINVAL;
+   }
+
+   max_io_size = ival;
+   rcv_buf_size = max_io_size + MAX_REQ_SIZE;
+   INFO_NP("max io size changed to %d\n", ival);
+
+   return 0;
+}
+
+static const struct kernel_param_ops max_io_size_ops = {
+   .set= max_io_size_set,
+   .get= param_get_int,
+};
+module_param_cb(max_io_size, _io_size_ops, _io_size, 0444);
+MODULE_PARM_DESC(max_io_size,
+"Max size for each IO request, when change the unit is in byte"
+" (default: " __stringify(DEFAULT_MAX_IO_SIZE_KB) "KB)");
+
+#define DEFAULT_SESS_QUEUE_DEPTH 512
+static int sess_queue_depth = DEFAULT_SESS_QUEUE_DEPTH;
+module_param_named(sess_queue_depth, sess_queue_depth, int, 0444);
+MODULE_PARM_DESC(sess_queue_depth,
+"Number of buffers for 

[PATCH 08/28] ibtrs_clt: add Makefile and Kconfig

2017-03-24 Thread Jack Wang
From: Jack Wang 

Signed-off-by: Jack Wang 
---
 drivers/infiniband/Kconfig   | 2 ++
 drivers/infiniband/ulp/Makefile  | 1 +
 drivers/infiniband/ulp/ibtrs_client/Kconfig  | 8 
 drivers/infiniband/ulp/ibtrs_client/Makefile | 6 ++
 4 files changed, 17 insertions(+)
 create mode 100644 drivers/infiniband/ulp/ibtrs_client/Kconfig
 create mode 100644 drivers/infiniband/ulp/ibtrs_client/Makefile

diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig
index 66f8602..cb1b864 100644
--- a/drivers/infiniband/Kconfig
+++ b/drivers/infiniband/Kconfig
@@ -85,6 +85,8 @@ source "drivers/infiniband/ulp/srpt/Kconfig"
 source "drivers/infiniband/ulp/iser/Kconfig"
 source "drivers/infiniband/ulp/isert/Kconfig"
 
+source "drivers/infiniband/ulp/ibtrs_client/Kconfig"
+
 source "drivers/infiniband/sw/rdmavt/Kconfig"
 source "drivers/infiniband/sw/rxe/Kconfig"
 
diff --git a/drivers/infiniband/ulp/Makefile b/drivers/infiniband/ulp/Makefile
index f3c7dcf..acd8ce6 100644
--- a/drivers/infiniband/ulp/Makefile
+++ b/drivers/infiniband/ulp/Makefile
@@ -3,3 +3,4 @@ obj-$(CONFIG_INFINIBAND_SRP)+= srp/
 obj-$(CONFIG_INFINIBAND_SRPT)  += srpt/
 obj-$(CONFIG_INFINIBAND_ISER)  += iser/
 obj-$(CONFIG_INFINIBAND_ISERT) += isert/
+obj-$(CONFIG_INFINIBAND_IBTRS_CLT)  += ibtrs_client/
diff --git a/drivers/infiniband/ulp/ibtrs_client/Kconfig 
b/drivers/infiniband/ulp/ibtrs_client/Kconfig
new file mode 100644
index 000..3cf0728
--- /dev/null
+++ b/drivers/infiniband/ulp/ibtrs_client/Kconfig
@@ -0,0 +1,8 @@
+config INFINIBAND_IBTRS_CLT
+   tristate "InfiniBand IBTRS CLIENT"
+   depends on INFINIBAND_ADDR_TRANS
+   ---help---
+ Support for the simplified data transfer over InfiniBand.
+ This offer API to user module IBNBD_CLIENT
+
+ The IBTRS protocol is defined by the ProfitBricks GmbH.
diff --git a/drivers/infiniband/ulp/ibtrs_client/Makefile 
b/drivers/infiniband/ulp/ibtrs_client/Makefile
new file mode 100644
index 000..d0fb226
--- /dev/null
+++ b/drivers/infiniband/ulp/ibtrs_client/Makefile
@@ -0,0 +1,6 @@
+
+obj-$(CONFIG_INFINIBAND_IBTRS_CLT) += ibtrs_client.o
+
+ibtrs_client-y := ibtrs_clt.o ibtrs_clt_sysfs.o \
+   ../ibtrs_lib/ibtrs.o ../ibtrs_lib/ibtrs-proto.o ../ibtrs_lib/iu.o \
+   ../ibtrs_lib/heartbeat.o ../ibtrs_lib/common.o
-- 
2.7.4



[PATCH 12/28] ibtrs_srv: add sysfs interface

2017-03-24 Thread Jack Wang
From: Jack Wang 

Signed-off-by: Jack Wang 
Signed-off-by: Kleber Souza 
Signed-off-by: Danil Kipnis 
Signed-off-by: Roman Pen 
---
 .../infiniband/ulp/ibtrs_server/ibtrs_srv_sysfs.c  | 301 +
 .../infiniband/ulp/ibtrs_server/ibtrs_srv_sysfs.h  |  59 
 2 files changed, 360 insertions(+)
 create mode 100644 drivers/infiniband/ulp/ibtrs_server/ibtrs_srv_sysfs.c
 create mode 100644 drivers/infiniband/ulp/ibtrs_server/ibtrs_srv_sysfs.h

diff --git a/drivers/infiniband/ulp/ibtrs_server/ibtrs_srv_sysfs.c 
b/drivers/infiniband/ulp/ibtrs_server/ibtrs_srv_sysfs.c
new file mode 100644
index 000..c95a124
--- /dev/null
+++ b/drivers/infiniband/ulp/ibtrs_server/ibtrs_srv_sysfs.c
@@ -0,0 +1,301 @@
+/*
+ * InfiniBand Transport Layer
+ *
+ * Copyright (c) 2014 - 2017 ProfitBricks GmbH. All rights reserved.
+ * Authors: Fabian Holler < m...@fholler.de>
+ *  Jack Wang 
+ * Kleber Souza 
+ * Danil Kipnis 
+ * Roman Pen 
+ *  Milind Dumbare 
+ *
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *notice, this list of conditions, and the following disclaimer,
+ *without modification.
+ * 2. Redistributions in binary form must reproduce at minimum a disclaimer
+ *substantially similar to the "NO WARRANTY" disclaimer below
+ *("Disclaimer") and any redistribution must be conditioned upon
+ *including a substantially similar Disclaimer requirement for further
+ *binary redistribution.
+ * 3. Neither the names of the above-listed copyright holders nor the names
+ *of any contributors may be used to endorse or promote products derived
+ *from this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * NO WARRANTY
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGES.
+ *
+ */
+
+#include "ibtrs_srv_sysfs.h"
+#include "ibtrs_srv_internal.h"
+#include 
+#include 
+#include 
+
+static struct kobject *ibtrs_srv_kobj;
+static struct kobject *ibtrs_srv_sessions_kobj;
+
+static ssize_t ibtrs_srv_hb_timeout_show(struct kobject *kobj,
+struct kobj_attribute *attr,
+char *page)
+{
+   struct ibtrs_session *sess = container_of(kobj, struct ibtrs_session,
+ kobj);
+
+   return scnprintf(page, PAGE_SIZE, "%u\n", sess->heartbeat.timeout_ms);
+}
+
+static ssize_t ibtrs_srv_hb_timeout_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+   int ret;
+   u32 timeout_ms;
+   struct ibtrs_session *sess = container_of(kobj, struct ibtrs_session,
+ kobj);
+
+   ret = kstrtouint(buf, 0, _ms);
+   if (ret)
+   return ret;
+
+   ret = ibtrs_heartbeat_timeout_validate(timeout_ms);
+   if (ret)
+   return ret;
+
+   INFO(sess, "%s: changing value from %u to %u\n", attr->attr.name,
+sess->heartbeat.timeout_ms, timeout_ms);
+   ibtrs_set_heartbeat_timeout(>heartbeat, timeout_ms);
+   return count;
+}
+
+static struct kobj_attribute ibtrs_srv_heartbeat_timeout_ms_attr =
+   __ATTR(heartbeat_timeout_ms, 0644,
+  ibtrs_srv_hb_timeout_show, ibtrs_srv_hb_timeout_store);
+
+static ssize_t ibtrs_srv_disconnect_show(struct kobject *kobj,
+struct kobj_attribute *attr,
+char *page)
+{
+   return scnprintf(page, PAGE_SIZE, "Usage: echo 1 > %s\n",
+  

[PATCH 07/28] ibtrs_clt: add files for sysfs interface

2017-03-24 Thread Jack Wang
From: Jack Wang 

Signed-off-by: Jack Wang 
Signed-off-by: Kleber Souza 
Signed-off-by: Danil Kipnis 
Signed-off-by: Roman Pen 
---
 .../infiniband/ulp/ibtrs_client/ibtrs_clt_sysfs.c  | 412 +
 .../infiniband/ulp/ibtrs_client/ibtrs_clt_sysfs.h  |  62 
 2 files changed, 474 insertions(+)
 create mode 100644 drivers/infiniband/ulp/ibtrs_client/ibtrs_clt_sysfs.c
 create mode 100644 drivers/infiniband/ulp/ibtrs_client/ibtrs_clt_sysfs.h

diff --git a/drivers/infiniband/ulp/ibtrs_client/ibtrs_clt_sysfs.c 
b/drivers/infiniband/ulp/ibtrs_client/ibtrs_clt_sysfs.c
new file mode 100644
index 000..d430af0
--- /dev/null
+++ b/drivers/infiniband/ulp/ibtrs_client/ibtrs_clt_sysfs.c
@@ -0,0 +1,412 @@
+/*
+ * InfiniBand Transport Layer
+ *
+ * Copyright (c) 2014 - 2017 ProfitBricks GmbH. All rights reserved.
+ * Authors: Fabian Holler < m...@fholler.de>
+ *  Jack Wang 
+ * Kleber Souza 
+ * Danil Kipnis 
+ * Roman Pen 
+ *  Milind Dumbare 
+ *
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *notice, this list of conditions, and the following disclaimer,
+ *without modification.
+ * 2. Redistributions in binary form must reproduce at minimum a disclaimer
+ *substantially similar to the "NO WARRANTY" disclaimer below
+ *("Disclaimer") and any redistribution must be conditioned upon
+ *including a substantially similar Disclaimer requirement for further
+ *binary redistribution.
+ * 3. Neither the names of the above-listed copyright holders nor the names
+ *of any contributors may be used to endorse or promote products derived
+ *from this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * NO WARRANTY
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGES.
+ *
+ */
+
+#include 
+#include "ibtrs_clt_internal.h"
+#include 
+#include "ibtrs_clt_sysfs.h"
+#include 
+#include 
+#include 
+
+static struct kobject *sessions_kobj;
+static struct kobject *ibtrs_kobj;
+
+#define MIN_MAX_RECONN_ATT -1
+#define MAX_MAX_RECONN_ATT 
+
+static ssize_t ibtrs_clt_max_reconn_attempts_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *page)
+{
+   struct ibtrs_session *sess = container_of(kobj, struct ibtrs_session,
+ kobj);
+
+   return sprintf(page, "%d\n",
+  ibtrs_clt_get_max_reconnect_attempts(sess));
+}
+
+static ssize_t ibtrs_clt_max_reconn_attempts_store(struct kobject *kobj,
+  struct kobj_attribute *attr,
+  const char *buf,
+  size_t count)
+{
+   int ret;
+   s16 value;
+   struct ibtrs_session *sess = container_of(kobj, struct ibtrs_session,
+ kobj);
+
+   ret = kstrtos16(buf, 10, );
+   if (unlikely(ret)) {
+   ERR(sess, "%s: failed to convert string '%s' to int\n",
+   attr->attr.name, buf);
+   return ret;
+   }
+   if (unlikely(value > MAX_MAX_RECONN_ATT ||
+value < MIN_MAX_RECONN_ATT)) {
+   ERR(sess, "%s: invalid range"
+   " (provided: '%s', accepted: min: %d, max: %d)\n",
+   attr->attr.name, buf, MIN_MAX_RECONN_ATT,
+   MAX_MAX_RECONN_ATT);
+   return -EINVAL;
+   }
+
+   INFO(sess, "%s: changing value from %d to %d\n", 

[PATCH 03/28] ibtrs_lib: add common functions shared by client and server

2017-03-24 Thread Jack Wang
From: Jack Wang 

These files define functions used by both client and server, eg
validate protocol message, heartbeat helpers, etc.

Signed-off-by: Jack Wang 
Signed-off-by: Kleber Souza 
Signed-off-by: Danil Kipnis 
Signed-off-by: Roman Pen 
---
 drivers/infiniband/ulp/ibtrs_lib/common.c  | 104 +++
 drivers/infiniband/ulp/ibtrs_lib/heartbeat.c   | 112 +++
 drivers/infiniband/ulp/ibtrs_lib/ibtrs-proto.c | 248 +++
 drivers/infiniband/ulp/ibtrs_lib/ibtrs.c   | 412 +
 drivers/infiniband/ulp/ibtrs_lib/iu.c  | 113 +++
 5 files changed, 989 insertions(+)
 create mode 100644 drivers/infiniband/ulp/ibtrs_lib/common.c
 create mode 100644 drivers/infiniband/ulp/ibtrs_lib/heartbeat.c
 create mode 100644 drivers/infiniband/ulp/ibtrs_lib/ibtrs-proto.c
 create mode 100644 drivers/infiniband/ulp/ibtrs_lib/ibtrs.c
 create mode 100644 drivers/infiniband/ulp/ibtrs_lib/iu.c

diff --git a/drivers/infiniband/ulp/ibtrs_lib/common.c 
b/drivers/infiniband/ulp/ibtrs_lib/common.c
new file mode 100644
index 000..81affa7
--- /dev/null
+++ b/drivers/infiniband/ulp/ibtrs_lib/common.c
@@ -0,0 +1,104 @@
+/*
+ * InfiniBand Transport Layer
+ *
+ * Copyright (c) 2014 - 2017 ProfitBricks GmbH. All rights reserved.
+ * Authors: Fabian Holler < m...@fholler.de>
+ *  Jack Wang 
+ * Kleber Souza 
+ * Danil Kipnis 
+ * Roman Pen 
+ *  Milind Dumbare 
+ *
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *notice, this list of conditions, and the following disclaimer,
+ *without modification.
+ * 2. Redistributions in binary form must reproduce at minimum a disclaimer
+ *substantially similar to the "NO WARRANTY" disclaimer below
+ *("Disclaimer") and any redistribution must be conditioned upon
+ *including a substantially similar Disclaimer requirement for further
+ *binary redistribution.
+ * 3. Neither the names of the above-listed copyright holders nor the names
+ *of any contributors may be used to endorse or promote products derived
+ *from this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * NO WARRANTY
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGES.
+ *
+ */
+
+#include 
+#include 
+#include 
+
+u64 timediff_cur_ms(u64 cur_ms)
+{
+   struct timespec cur = CURRENT_TIME;
+   struct timespec ts = ns_to_timespec(cur_ms * NSEC_PER_MSEC);
+
+   if (timespec_compare(, ) < 0)
+   return timespec_to_ms() - timespec_to_ms();
+   else
+   return timespec_to_ms() - timespec_to_ms();
+}
+
+/*
+ * ibtrs_malloc() - allocate kernel or virtual memory
+ * @size: size to be allocated
+ *
+ * The pointer returned must be freed with kvfree()
+ */
+void *ibtrs_malloc(size_t size)
+{
+   void *p;
+
+   p = kmalloc(size, (GFP_KERNEL | __GFP_REPEAT));
+   if (p)
+   return p;
+
+   /* try allocating virtual memory */
+   p = vmalloc(size);
+   if (p)
+   return p;
+
+   return NULL;
+}
+
+/*
+ * ibtrs_zalloc() - allocate kernel or virtual memory
+ * @size: size to be allocated
+ *
+ * The pointer returned must be freed with kvfree()
+ */
+void *ibtrs_zalloc(size_t size)
+{
+   void *p;
+
+   p = kzalloc(size, GFP_KERNEL);
+   if (p)
+   return p;
+
+   /* try allocating virtual memory */
+   p = vzalloc(size);
+   if (p)
+   return p;
+
+   return NULL;
+}
diff --git a/drivers/infiniband/ulp/ibtrs_lib/heartbeat.c 
b/drivers/infiniband/ulp/ibtrs_lib/heartbeat.c
new file mode 100644
index 000..1575931
--- /dev/null

[PATCH 04/28] ibtrs_clt: add header file for exported interface

2017-03-24 Thread Jack Wang
From: Jack Wang 

User module eg ibnbd_client will use this interface to transfer
data later.

Signed-off-by: Jack Wang 
Signed-off-by: Kleber Souza 
Signed-off-by: Danil Kipnis 
Signed-off-by: Roman Pen 
---
 include/rdma/ibtrs_clt.h | 316 +++
 1 file changed, 316 insertions(+)
 create mode 100644 include/rdma/ibtrs_clt.h

diff --git a/include/rdma/ibtrs_clt.h b/include/rdma/ibtrs_clt.h
new file mode 100644
index 000..4fc9b12
--- /dev/null
+++ b/include/rdma/ibtrs_clt.h
@@ -0,0 +1,316 @@
+/*
+ * InfiniBand Transport Layer
+ *
+ * Copyright (c) 2014 - 2017 ProfitBricks GmbH. All rights reserved.
+ * Authors: Fabian Holler < m...@fholler.de>
+ *  Jack Wang 
+ * Kleber Souza 
+ * Danil Kipnis 
+ * Roman Pen 
+ *  Milind Dumbare 
+ *
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *notice, this list of conditions, and the following disclaimer,
+ *without modification.
+ * 2. Redistributions in binary form must reproduce at minimum a disclaimer
+ *substantially similar to the "NO WARRANTY" disclaimer below
+ *("Disclaimer") and any redistribution must be conditioned upon
+ *including a substantially similar Disclaimer requirement for further
+ *binary redistribution.
+ * 3. Neither the names of the above-listed copyright holders nor the names
+ *of any contributors may be used to endorse or promote products derived
+ *from this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * NO WARRANTY
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGES.
+ *
+ */
+
+#if !defined(IBTRS_CLIENT_H)
+#define IBTRS_CLIENT_H
+
+#include 
+
+struct ibtrs_session;
+
+/**
+ * ibtrs_clt_open() - Open a session to a ibtrs_server
+ * @addr: The IPv4, IPv6 or GID address of the peer
+ * @pdu_sz: Size of extra payload which can be accessed after tag allocation.
+ * @priv: Pointer passed back on _clt_ops->sess_ev() invocation
+ * @max_inflight_msg: Max. number of parallel inflight messages for the session
+ * @max_segments: Max. number of segments per IO request
+ * @reconnect_delay_sec: time between reconnect tries
+ * @max_reconnect_attempts: Number of times to reconnect on error before giving
+ * up, 0 for * disabled, -1 for forever
+ *
+ * Starts session establishment with the ibtrs_server. The function can block
+ * up to ~2000ms until it returns.
+ *
+ * Return a valid pointer on success otherwise PTR_ERR.
+ * -EINVAL:The provided addr could not be resolved to an Infiniband
+ * address, the route to the host could not be resolved or
+ * ibtrs_clt_register() was not called before.
+ */
+struct ibtrs_session *ibtrs_clt_open(const struct sockaddr_storage *addr,
+size_t pdu_sz, void *priv,
+u8 reconnect_delay_sec, u16 max_segments,
+s16 max_reconnect_attempts);
+
+/**
+ * ibtrs_clt_close() - Close a session
+ * @sess: Session handler, is freed on return
+ */
+int ibtrs_clt_close(struct ibtrs_session *sess);
+
+/**
+ * enum ibtrs_clt_rdma_ev - Events related to RDMA transfer operations
+ */
+enum ibtrs_clt_rdma_ev {
+   IBTRS_CLT_RDMA_EV_RDMA_REQUEST_WRITE_COMPL,
+   IBTRS_CLT_RDMA_EV_RDMA_WRITE_COMPL,
+};
+
+/**
+ * enum ibtrs_sess_ev - Events about connectivity state of a session
+ * @IBTRS_CLT_SESS_EV_RECONNECTThe session was reconnected.
+ * @IBTRS_CLT_SESS_EV_DISCONNECTED The session was disconnected.
+ * @IBTRS_CLT_SESS_EV_MAX_RECONN_EXCEEDED Reconect attempts stopped 

[PATCH 06/28] ibtrs_clt: add header file shared only in ibtrs_client

2017-03-24 Thread Jack Wang
From: Jack Wang 

Signed-off-by: Jack Wang 
Signed-off-by: Kleber Souza 
Signed-off-by: Danil Kipnis 
Signed-off-by: Roman Pen 
---
 .../ulp/ibtrs_client/ibtrs_clt_internal.h  | 244 +
 1 file changed, 244 insertions(+)
 create mode 100644 drivers/infiniband/ulp/ibtrs_client/ibtrs_clt_internal.h

diff --git a/drivers/infiniband/ulp/ibtrs_client/ibtrs_clt_internal.h 
b/drivers/infiniband/ulp/ibtrs_client/ibtrs_clt_internal.h
new file mode 100644
index 000..7274b2d
--- /dev/null
+++ b/drivers/infiniband/ulp/ibtrs_client/ibtrs_clt_internal.h
@@ -0,0 +1,244 @@
+/*
+ * InfiniBand Transport Layer
+ *
+ * Copyright (c) 2014 - 2017 ProfitBricks GmbH. All rights reserved.
+ * Authors: Fabian Holler < m...@fholler.de>
+ *  Jack Wang 
+ * Kleber Souza 
+ * Danil Kipnis 
+ * Roman Pen 
+ *  Milind Dumbare 
+ *
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *notice, this list of conditions, and the following disclaimer,
+ *without modification.
+ * 2. Redistributions in binary form must reproduce at minimum a disclaimer
+ *substantially similar to the "NO WARRANTY" disclaimer below
+ *("Disclaimer") and any redistribution must be conditioned upon
+ *including a substantially similar Disclaimer requirement for further
+ *binary redistribution.
+ * 3. Neither the names of the above-listed copyright holders nor the names
+ *of any contributors may be used to endorse or promote products derived
+ *from this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * NO WARRANTY
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGES.
+ *
+ */
+
+#if !defined(IBTRS_CLT_INTERNAL_H)
+#define IBTRS_CLT_INTERNAL_H
+
+#include 
+
+enum ssm_state {
+   _SSM_STATE_MIN,
+   SSM_STATE_IDLE,
+   SSM_STATE_IDLE_RECONNECT,
+   SSM_STATE_WF_INFO,
+   SSM_STATE_WF_INFO_RECONNECT,
+   SSM_STATE_OPEN,
+   SSM_STATE_OPEN_RECONNECT,
+   SSM_STATE_CONNECTED,
+   SSM_STATE_RECONNECT,
+   SSM_STATE_RECONNECT_IMM,
+   SSM_STATE_CLOSE_DESTROY,
+   SSM_STATE_CLOSE_RECONNECT,
+   SSM_STATE_CLOSE_RECONNECT_IMM,
+   SSM_STATE_DISCONNECTED,
+   SSM_STATE_DESTROYED,
+   _SSM_STATE_MAX
+};
+
+enum ibtrs_fast_reg {
+   IBTRS_FAST_MEM_NONE,
+   IBTRS_FAST_MEM_FR,
+   IBTRS_FAST_MEM_FMR
+};
+
+struct ibtrs_stats_reconnects {
+   u32 successful_cnt;
+   u32 fail_cnt;
+};
+
+struct ibtrs_stats_wc_comp {
+   u32 max_wc_cnt;
+   u32 cnt;
+   u64 total_cnt;
+};
+
+struct ibtrs_stats_cpu_migration {
+   atomic_t *from;
+   int *to;
+};
+
+struct ibtrs_clt_stats_rdma_stats {
+   u64 cnt_read;
+   u64 size_total_read;
+   u64 cnt_write;
+   u64 size_total_write;
+
+   u16 inflight;
+};
+
+#define MIN_LOG_SG 2
+#define MAX_LOG_SG 5
+#define MAX_LIN_SG BIT(MIN_LOG_SG)
+#define SG_DISTR_LEN (MAX_LOG_SG - MIN_LOG_SG + MAX_LIN_SG + 1)
+
+struct ibtrs_clt_stats_rdma_lat_entry {
+   u64 read;
+   u64 write;
+};
+
+#define MAX_LOG_LATENCY16
+#define MIN_LOG_LATENCY0
+
+struct ibtrs_clt_stats_user_ib_msgs {
+   u32 recv_msg_cnt;
+   u32 sent_msg_cnt;
+   u64 recv_size;
+   u64 sent_size;
+};
+
+struct ibtrs_clt_stats {
+   struct ibtrs_stats_cpu_migrationcpu_migr;
+   struct ibtrs_clt_stats_rdma_stats   *rdma_stats;
+   u64 *sg_list_total;
+   u64 **sg_list_distr;
+   struct ibtrs_stats_reconnects   reconnects;
+   struct 

[PATCH 02/28] ibtrs: add header for log MICROs shared between ibtrs_client and ibtrs_server

2017-03-24 Thread Jack Wang
From: Jack Wang 

Signed-off-by: Jack Wang 
Signed-off-by: Kleber Souza 
Signed-off-by: Danil Kipnis 
Signed-off-by: Roman Pen 
---
 include/rdma/ibtrs_log.h | 88 
 1 file changed, 88 insertions(+)
 create mode 100644 include/rdma/ibtrs_log.h

diff --git a/include/rdma/ibtrs_log.h b/include/rdma/ibtrs_log.h
new file mode 100644
index 000..28ff5b4
--- /dev/null
+++ b/include/rdma/ibtrs_log.h
@@ -0,0 +1,88 @@
+/*
+ * InfiniBand Transport Layer
+ *
+ * Copyright (c) 2014 - 2017 ProfitBricks GmbH. All rights reserved.
+ * Authors: Fabian Holler 
+ *  Jack Wang 
+ * Kleber Souza 
+ * Danil Kipnis 
+ * Roman Pen 
+ *  Milind Dumbare 
+ *
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *notice, this list of conditions, and the following disclaimer,
+ *without modification.
+ * 2. Redistributions in binary form must reproduce at minimum a disclaimer
+ *substantially similar to the "NO WARRANTY" disclaimer below
+ *("Disclaimer") and any redistribution must be conditioned upon
+ *including a substantially similar Disclaimer requirement for further
+ *binary redistribution.
+ * 3. Neither the names of the above-listed copyright holders nor the names
+ *of any contributors may be used to endorse or promote products derived
+ *from this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * NO WARRANTY
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGES.
+ *
+ */
+
+#ifndef __IBTRS_LOG_H__
+#define __IBTRS_LOG_H__
+#include "ibtrs.h"
+
+#define DEB(fmt, ...) pr_debug("ibtrs L%d " fmt, __LINE__, ##__VA_ARGS__)
+#define DEB_RL(fmt, ...) pr_debug_ratelimited("ibtrs L%d " fmt, \
+ __LINE__, ##__VA_ARGS__)
+static inline void ibtrs_deb_msg_hdr(const char *prep,
+const struct ibtrs_msg_hdr *hdr)
+{
+   DEB("%sibtrs msg hdr:\n"
+   "\ttype: %d\n"
+   "\ttsize: %d\n", prep, hdr->type, hdr->tsize);
+}
+
+#define ERR_NP(fmt, ...) pr_err("ibtrs L%d ERR: " fmt, \
+   __LINE__, ##__VA_ARGS__)
+
+#define WRN_NP(fmt, ...) pr_warn("ibtrs L%d WARN: " fmt, \
+   __LINE__, ##__VA_ARGS__)
+#define INFO_NP(fmt, ...)  pr_info("ibtrs: " fmt, ##__VA_ARGS__)
+
+#define INFO_NP_RL(fmt, ...) pr_info_ratelimited("ibtrs: " fmt, ##__VA_ARGS__)
+
+#define ibtrs_prefix(sess) ((sess->hostname[0] != '\0') ? sess->hostname : \
+ sess->addr)
+
+#define ERR(sess, fmt, ...) pr_err("ibtrs L%d <%s> ERR: " fmt, \
+   __LINE__, ibtrs_prefix(sess), ##__VA_ARGS__)
+#define ERR_RL(sess, fmt, ...) pr_err_ratelimited("ibtrs L%d <%s> ERR: " fmt, \
+   __LINE__, ibtrs_prefix(sess), ##__VA_ARGS__)
+
+#define WRN(sess, fmt, ...) pr_warn("ibtrs L%d <%s> WARN: " fmt, \
+   __LINE__, ibtrs_prefix(sess), ##__VA_ARGS__)
+#define WRN_RL(sess, fmt, ...) pr_warn_ratelimited("ibtrs L%d <%s> WARN: " \
+   fmt, __LINE__, ibtrs_prefix(sess), ##__VA_ARGS__)
+
+#define INFO(sess, fmt, ...) pr_info("ibtrs <%s>: " fmt, \
+   ibtrs_prefix(sess), ##__VA_ARGS__)
+#define INFO_RL(sess, fmt, ...) pr_info_ratelimited("ibtrs <%s>: " fmt, \
+   ibtrs_prefix(sess), ##__VA_ARGS__)
+#endif /*__IBTRS_LOG_H__*/
-- 
2.7.4



[PATCH 01/28] ibtrs: add header shared between ibtrs_client and ibtrs_server

2017-03-24 Thread Jack Wang
From: Jack Wang 

Signed-off-by: Jack Wang 
Signed-off-by: Kleber Souza 
Signed-off-by: Danil Kipnis 
Signed-off-by: Roman Pen 
---
 include/rdma/ibtrs.h | 514 +++
 1 file changed, 514 insertions(+)
 create mode 100644 include/rdma/ibtrs.h

diff --git a/include/rdma/ibtrs.h b/include/rdma/ibtrs.h
new file mode 100644
index 000..4fc572b
--- /dev/null
+++ b/include/rdma/ibtrs.h
@@ -0,0 +1,514 @@
+/*
+ * InfiniBand Transport Layer
+ *
+ * Copyright (c) 2014 - 2017 ProfitBricks GmbH. All rights reserved.
+ * Authors: Fabian Holler < m...@fholler.de>
+ *  Jack Wang 
+ * Kleber Souza 
+ * Danil Kipnis 
+ * Roman Pen 
+ *  Milind Dumbare 
+ *
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *notice, this list of conditions, and the following disclaimer,
+ *without modification.
+ * 2. Redistributions in binary form must reproduce at minimum a disclaimer
+ *substantially similar to the "NO WARRANTY" disclaimer below
+ *("Disclaimer") and any redistribution must be conditioned upon
+ *including a substantially similar Disclaimer requirement for further
+ *binary redistribution.
+ * 3. Neither the names of the above-listed copyright holders nor the names
+ *of any contributors may be used to endorse or promote products derived
+ *from this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * NO WARRANTY
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGES.
+ *
+ */
+
+#ifndef __IBTRS_H
+#define __IBTRS_H
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#define IBTRS_SERVER_PORT 1234
+#define WC_ARRAY_SIZE 16
+#define IB_APM_TIMEOUT 16 /* 4.096 * 2 ^ 16 = 260 msec */
+
+#define USR_MSG_CNT 64
+#define USR_CON_BUF_SIZE (USR_MSG_CNT * 2) /* double bufs for ACK's */
+
+#define DEFAULT_HEARTBEAT_TIMEOUT_MS 2
+#define MIN_HEARTBEAT_TIMEOUT_MS 5000
+#define HEARTBEAT_INTV_MS 500
+#define HEARTBEAT_INTV_JIFFIES msecs_to_jiffies(HEARTBEAT_INTV_MS)
+
+#define MIN_RTR_CNT 1
+#define MAX_RTR_CNT 7
+
+/*
+ * With the current size of the tag allocated on the client, 4K is the maximum
+ * number of tags we can allocate. (see IBNBD-2321)
+ * This number is also used on the client to allocate the IU for the user
+ * connection to receive the RDMA addresses from the server.
+ */
+#define MAX_SESS_QUEUE_DEPTH 4096
+
+#define XX(a) case (a): return #a
+
+#define IBTRS_ADDRLEN sizeof("ipv6:[:::::::]")
+
+static inline const char *ib_wc_opcode_str(enum ib_wc_opcode opcode)
+{
+   switch (opcode) {
+   XX(IB_WC_SEND);
+   XX(IB_WC_RDMA_WRITE);
+   XX(IB_WC_RDMA_READ);
+   XX(IB_WC_COMP_SWAP);
+   XX(IB_WC_FETCH_ADD);
+   /* recv-side); inbound completion */
+   XX(IB_WC_RECV);
+   XX(IB_WC_RECV_RDMA_WITH_IMM);
+   default: return "IB_WC_OPCODE_UNKNOWN";
+   }
+}
+
+
+struct ib_session {
+   struct ib_pd*pd;
+   struct ib_mr*mr;
+   struct ib_event_handler event_handler;
+};
+
+struct ibtrs_ib_path {
+   union ib_gidp_sgid;
+   union ib_gidp_dgid;
+};
+
+struct ib_con {
+   struct ib_qp*qp cacheline_aligned;
+   struct ib_cq*cq cacheline_aligned;
+   struct ib_send_wr   beacon;
+   struct rdma_cm_id   *cm_id;
+   struct ibtrs_ib_pathpri_path;
+   struct ibtrs_ib_path   cur_path;
+   char*addr;
+   char*hostname;
+};
+
+struct ibtrs_iu {
+  

[RFC PATCH 00/28] INFINIBAND NETWORK BLOCK DEVICE (IBNBD)

2017-03-24 Thread Jack Wang
From: Jack Wang 

This series introduces IBNBD/IBTRS kernel modules.

IBNBD (InfiniBand network block device) allows for an RDMA transfer of block IO
over InfiniBand network. The driver presents itself as a block device on client
side and transmits the block requests in a zero-copy fashion to the server-side
via InfiniBand. The server part of the driver converts the incoming buffers back
into BIOs and hands them down to the underlying block device. As soon as IO
responses come back from the drive, they are being transmitted back to the
client.

We design and implement this solution based on our need for Cloud Computing,
the key features are:
- High throughput and low latency due to:
1) Only two rdma messages per IO
2) Simplified client side server memory management
3) Eliminated SCSI sublayer
- Simple configuration and handling
1) Server side is completely passive: volumes do not need to be
explicitly exported
2) Only IB port GID and device path needed on client side to map
a block device
3) A device can be remapped automatically i.e. after storage
reboot
- Pinning of IO-related processing to the CPU of the producer

For usage please refer to Documentation/IBNBD.txt in later patch.
My colleague Danil Kpnis presents IBNBD in Vault-2017 about our design/feature/
tradeoff/performance:

http://events.linuxfoundation.org/sites/events/files/slides/IBNBD-Vault-2017.pdf

The patchset is based on Linux 4.11-rc3. I've done functional tests with our
tests framework on AMD64 machines with Mellanox CX-2 and CX-3.

TODOs:
- move some helpers to core
- use new cq api, drain_cq etc
- support poll callback in MQ
- big endian machine support
- better files layout

We've learned a lot from other opensource project, namely SRP/SCST/LIO, etc,
thanks all the contributors. We hope our IBNBD bring more value to 
the opensource world.

A git tree is also avaiable at:
https://github.com/xjtuwjp/linux-2.6/commits/ibnbdv0

As usual, comments and reviews are welcome.

Jack Wang (28):
  ibtrs: add header shared between ibtrs_client and ibtrs_server
  ibtrs: add header for log MICROs shared between ibtrs_client and
ibtrs_server
  ibtrs_lib: add common functions shared by client and server
  ibtrs_clt: add header file for exported interface
  ibtrs_clt: main functionality of ibtrs_client
  ibtrs_clt: add header file shared only in ibtrs_client
  ibtrs_clt: add files for sysfs interface
  ibtrs_clt: add Makefile and Kconfig
  ibtrs_srv: add header file for exported interface
  ibtrs_srv: add main functionality for ibtrs_server
  ibtrs_srv: add header shared in ibtrs_server
  ibtrs_srv: add sysfs interface
  ibtrs_srv: add Makefile and Kconfig
  ibnbd: add headers shared by ibnbd_client and ibnbd_server
  ibnbd: add shared library functions
  ibnbd_clt: add main functionality of ibnbd_client
  ibnbd_clt: add header shared in ibnbd_client
  ibnbd_clt: add sysfs interface
  ibnbd_clt: add log helpers
  ibnbd_clt: add Makefile and Kconfig
  ibnbd_srv: add header shared in ibnbd_server
  ibnbd_srv: add main functionality
  ibnbd_srv: add abstraction for submit IO to file or block device
  ibnbd_srv: add log helpers
  ibnbd_srv: add sysfs interface
  ibnbd_srv: add Makefile and Kconfig
  ibnbd: add doc for how to use ibnbd and sysfs interface
  MAINTRAINERS: Add maintainer for IBNBD/IBTRS

 Documentation/IBNBD.txt|  284 ++
 MAINTAINERS|   14 +
 drivers/block/Kconfig  |3 +
 drivers/block/Makefile |2 +
 drivers/block/ibnbd_client/Kconfig |   16 +
 drivers/block/ibnbd_client/Makefile|5 +
 drivers/block/ibnbd_client/ibnbd_clt.c | 2007 
 drivers/block/ibnbd_client/ibnbd_clt.h |  231 +
 drivers/block/ibnbd_client/ibnbd_clt_log.h |   79 +
 drivers/block/ibnbd_client/ibnbd_clt_sysfs.c   |  863 
 drivers/block/ibnbd_client/ibnbd_clt_sysfs.h   |   64 +
 drivers/block/ibnbd_inc/ibnbd-proto.h  |  273 +
 drivers/block/ibnbd_inc/ibnbd.h|   55 +
 drivers/block/ibnbd_inc/log.h  |   68 +
 drivers/block/ibnbd_lib/ibnbd-proto.c  |  244 +
 drivers/block/ibnbd_lib/ibnbd.c|  108 +
 drivers/block/ibnbd_server/Kconfig |   16 +
 drivers/block/ibnbd_server/Makefile|3 +
 drivers/block/ibnbd_server/ibnbd_dev.c |  436 ++
 drivers/block/ibnbd_server/ibnbd_dev.h |  149 +
 drivers/block/ibnbd_server/ibnbd_srv.c | 1074 
 drivers/block/ibnbd_server/ibnbd_srv.h |  115 +
 drivers/block/ibnbd_server/ibnbd_srv_log.h |   69 +
 drivers/block/ibnbd_server/ibnbd_srv_sysfs.c   |  317 ++
 drivers/block/ibnbd_server/ibnbd_srv_sysfs.h   |   64 +
 drivers/infiniband/Kconfig |3 +
 drivers/infiniband/ulp/Makefile

Re: [PATCH v3] block: trace completion of all bios.

2017-03-24 Thread Ming Lei
On Fri, Mar 24, 2017 at 8:07 AM, NeilBrown  wrote:
>
> Currently only dm and md/raid5 bios trigger
> trace_block_bio_complete().  Now that we have bio_chain() and
> bio_inc_remaining(), it is not possible, in general, for a driver to
> know when the bio is really complete.  Only bio_endio() knows that.
>
> So move the trace_block_bio_complete() call to bio_endio().
>
> Now trace_block_bio_complete() pairs with trace_block_bio_queue().
> Any bio for which a 'queue' event is traced, will subsequently
> generate a 'complete' event.
>
> There are a few cases where completion tracing is not wanted.
> 1/ If blk_update_request() has already generated a completion
>trace event at the 'request' level, there is no point generating
>one at the bio level too.  In this case the bi_sector and bi_size
>will have changed, so the bio level event would be wrong
>
> 2/ If the bio hasn't actually been queued yet, but is being aborted
>early, then a trace event could be confusing.  Some filesystems
>call bio_endio() but do not want tracing.
>
> 3/ The bio_integrity code interposes itself by replacing bi_end_io,
>then restoring it and calling bio_endio() again.  This would produce
>two identical trace events if left like that.
>
> To handle these, we introduce a flag BIO_TRACE_COMPLETION and only
> produce the trace event when this is set.
> We address point 1 above by clearing the flag in blk_update_request().
> We address point 2 above by only setting the flag when
> generic_make_request() is called.
> We address point 3 above by clearing the flag after generating a
> completion event.
>
> When bio_split() is used on a bio, particularly in blk_queue_split(),
> there is an extra complication.  A new bio is split off the front, and
> may be handle directly without going through generic_make_request().
> The old bio, which has been advanced, is passed to
> generic_make_request(), so it will trigger a trace event a second
> time.
> Probably the best result when a split happens is to see a single
> 'queue' event for the whole bio, then multiple 'complete' events - one
> for each component.  To achieve this was can:
> - copy the BIO_TRACE_COMPLETION flag to the new bio in bio_split()
> - avoid generating a 'queue' event if BIO_TRACE_COMPLETION is already set.
> This way, the split-off bio won't create a queue event, the original
> won't either even if it re-submitted to generic_make_request(),
> but both will produce completion events, each for their own range.
>
> So if generic_make_request() is called (which generates a QUEUED
> event), then bi_endio() will create a single COMPLETE event for each
> range that the bio is split into, unless the driver has explicitly
> requested it not to.
>
> Signed-off-by: NeilBrown 
> ---
>  block/bio.c   | 13 +
>  block/blk-core.c  | 10 +-
>  drivers/md/dm.c   |  1 -
>  drivers/md/raid5.c|  8 
>  include/linux/blk_types.h |  4 +++-
>  5 files changed, 25 insertions(+), 11 deletions(-)
>
> diff --git a/block/bio.c b/block/bio.c
> index 5eec5e08417f..c1272986133e 100644
> --- a/block/bio.c
> +++ b/block/bio.c
> @@ -1818,6 +1818,11 @@ static inline bool bio_remaining_done(struct bio *bio)
>   *   bio_endio() will end I/O on the whole bio. bio_endio() is the preferred
>   *   way to end I/O on a bio. No one should call bi_end_io() directly on a
>   *   bio unless they own it and thus know that it has an end_io function.
> + *
> + *   bio_endio() can be called several times on a bio that has been chained
> + *   using bio_chain().  The ->bi_end_io() function will only be call the
> + *   last time.  At this point the BLK_TA_COMPLETE tracing event will be
> + *   generated if BIO_TRACE_COMPLETION is set.
>   **/
>  void bio_endio(struct bio *bio)
>  {
> @@ -1838,6 +1843,11 @@ void bio_endio(struct bio *bio)
> goto again;
> }
>
> +   if (bio->bi_bdev && bio_flagged(bio, BIO_TRACE_COMPLETION)) {
> +   trace_block_bio_complete(bdev_get_queue(bio->bi_bdev),
> +bio, bio->bi_error);
> +   bio_clear_flag(bio, BIO_TRACE_COMPLETION);
> +   }
> if (bio->bi_end_io)
> bio->bi_end_io(bio);
>  }
> @@ -1876,6 +1886,9 @@ struct bio *bio_split(struct bio *bio, int sectors,
>
> bio_advance(bio, split->bi_iter.bi_size);
>
> +   if (bio_flagged(bio, BIO_TRACE_COMPLETION))
> +   bio_set_flag(bio, BIO_TRACE_COMPLETION);
> +
> return split;
>  }
>  EXPORT_SYMBOL(bio_split);
> diff --git a/block/blk-core.c b/block/blk-core.c
> index 0eeb99ef654f..b34b5b1b1bbf 100644
> --- a/block/blk-core.c
> +++ b/block/blk-core.c
> @@ -1936,7 +1936,13 @@ generic_make_request_checks(struct bio *bio)
> if (!blkcg_bio_issue_check(q, bio))
> return false;
>
> -   trace_block_bio_queue(q, bio);
> +   if (!bio_flagged(bio, 

Re: [PATCH v3 02/14] md: move two macros into md.h

2017-03-24 Thread Ming Lei
On Fri, Mar 24, 2017 at 1:57 PM, NeilBrown  wrote:
> On Fri, Mar 17 2017, Ming Lei wrote:
>
>> Both raid1 and raid10 share common resync
>> block size and page count, so move them into md.h.
>
> I don't think this is necessary.
> These are just "magic" numbers.  They don't have any real
> meaning and so don't belong in md.h, or and .h file.

The thing is that RESYNC_PAGES is needed in the following patch 3:

 [PATCH v3 03/14] md: prepare for managing resync I/O pages in clean way

so how about moving the macros into raid1-10.h? Cause you suggest
to create that header for holding the introduced helpers in patch3.

Thanks,
Ming


Re: [PATCH v3 03/14] md: prepare for managing resync I/O pages in clean way

2017-03-24 Thread NeilBrown
On Fri, Mar 17 2017, Ming Lei wrote:

> Now resync I/O use bio's bec table to manage pages,
> this way is very hacky, and may not work any more
> once multipage bvec is introduced.
>
> So introduce helpers and new data structure for
> managing resync I/O pages more cleanly.
>
> Signed-off-by: Ming Lei 
> ---
>  drivers/md/md.h | 50 ++
>  1 file changed, 50 insertions(+)

I don't think this should go in md.h

Maybe create a "raid1-10.h" or similar if you really want to.

NeilBrown

>
> diff --git a/drivers/md/md.h b/drivers/md/md.h
> index 1d63239a1be4..20c48032493b 100644
> --- a/drivers/md/md.h
> +++ b/drivers/md/md.h
> @@ -720,4 +720,54 @@ static inline void mddev_check_writesame(struct mddev 
> *mddev, struct bio *bio)
>  #define RESYNC_BLOCK_SIZE (64*1024)
>  #define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE)
>  
> +/* for managing resync I/O pages */
> +struct resync_pages {
> + unsignedidx;/* for get/put page from the pool */
> + void*raid_bio;
> + struct page *pages[RESYNC_PAGES];
> +};
> +
> +static inline int resync_alloc_pages(struct resync_pages *rp,
> +  gfp_t gfp_flags)
> +{
> + int i;
> +
> + for (i = 0; i < RESYNC_PAGES; i++) {
> + rp->pages[i] = alloc_page(gfp_flags);
> + if (!rp->pages[i])
> + goto out_free;
> + }
> +
> + return 0;
> +
> + out_free:
> + while (--i >= 0)
> + put_page(rp->pages[i]);
> + return -ENOMEM;
> +}
> +
> +static inline void resync_free_pages(struct resync_pages *rp)
> +{
> + int i;
> +
> + for (i = 0; i < RESYNC_PAGES; i++)
> + put_page(rp->pages[i]);
> +}
> +
> +static inline void resync_get_all_pages(struct resync_pages *rp)
> +{
> + int i;
> +
> + for (i = 0; i < RESYNC_PAGES; i++)
> + get_page(rp->pages[i]);
> +}
> +
> +static inline struct page *resync_fetch_page(struct resync_pages *rp,
> +  unsigned idx)
> +{
> + if (WARN_ON_ONCE(idx >= RESYNC_PAGES))
> + return NULL;
> + return rp->pages[idx];
> +}
> +
>  #endif /* _MD_MD_H */
> -- 
> 2.9.3
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-raid" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html


signature.asc
Description: PGP signature