Re: krbd blk-mq support ?

2014-12-10 Thread Christoph Hellwig
On Thu, Nov 13, 2014 at 10:44:18AM +0100, Alexandre DERUMIER wrote:
 Did you manage to get those numbers?
 
 Not yet, I'll try next week.

What's the result?  I'd really like to get rid of old request drivers
as much as possible.


Re: krbd blk-mq support ?

2014-12-10 Thread Alexandre DERUMIER
Hi Christoph,

I have redone the benchmark, but I think I don't have enough iops per OSD.

I'm stuck around 120000 iops randread 4k, with or without your patch.

(But I don't see any speed regression.)

I'm going to have a bigger full-SSD production cluster in the coming months,
so I'll redo the tests when it's ready.

Regards,

Alexandre
- Original message -
From: Christoph Hellwig h...@infradead.org
To: aderumier aderum...@odiso.com
Cc: ceph-devel ceph-devel@vger.kernel.org
Sent: Wednesday, December 10, 2014 15:05:18
Subject: Re: krbd blk-mq support ?

On Thu, Nov 13, 2014 at 10:44:18AM +0100, Alexandre DERUMIER wrote: 
 Did you manage to get those numbers? 
 
 Not yet, I'll try next week. 

What's the result? I'd really like to get rid of old request drivers 
as much as possible. 



Re: krbd blk-mq support ?

2014-11-12 Thread Christoph Hellwig
On Tue, Nov 04, 2014 at 08:19:32AM +0100, Alexandre DERUMIER wrote:
 Now : 3.18 kernel + your patch : 120000 iops
   3.10 kernel : 80000 iops
 
 
 I'll try 3.18 kernel without your patch to compare.

Did you manage to get those numbers?


Re: krbd blk-mq support ?

2014-11-03 Thread Christoph Hellwig
Hi Alexandre,

can you try the patch below instead of the previous three patches?
This one uses a per-request work struct to allow for more concurrency.

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 0a54c58..b981096 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -38,6 +38,7 @@
 #include <linux/kernel.h>
 #include <linux/device.h>
 #include <linux/module.h>
+#include <linux/blk-mq.h>
 #include <linux/fs.h>
 #include <linux/blkdev.h>
 #include <linux/slab.h>
@@ -343,7 +344,6 @@ struct rbd_device {
        struct list_head        rq_queue;       /* incoming rq queue */
        spinlock_t              lock;           /* queue, flags, open_count */
        struct workqueue_struct *rq_wq;
-       struct work_struct      rq_work;
 
        struct rbd_image_header header;
        unsigned long           flags;          /* possibly lock protected */
@@ -361,6 +361,9 @@ struct rbd_device {
        atomic_t                parent_ref;
        struct rbd_device       *parent;
 
+       /* Block layer tags. */
+       struct blk_mq_tag_set   tag_set;
+
        /* protects updating the header */
        struct rw_semaphore     header_rwsem;
 
@@ -1816,7 +1819,8 @@ static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
 
        /*
         * We support a 64-bit length, but ultimately it has to be
-        * passed to blk_end_request(), which takes an unsigned int.
+        * passed to the block layer, which just supports a 32-bit
+        * length field.
         */
        obj_request->xferred = osd_req->r_reply_op_len[0];
        rbd_assert(obj_request->xferred < (u64)UINT_MAX);
@@ -2280,7 +2284,10 @@ static bool rbd_img_obj_end_request(struct rbd_obj_request *obj_request)
                more = obj_request->which < img_request->obj_request_count - 1;
        } else {
                rbd_assert(img_request->rq != NULL);
-               more = blk_end_request(img_request->rq, result, xferred);
+
+               more = blk_update_request(img_request->rq, result, xferred);
+               if (!more)
+                       __blk_mq_end_request(img_request->rq, result);
        }
 
        return more;
@@ -3305,8 +3312,10 @@ out:
        return ret;
 }
 
-static void rbd_handle_request(struct rbd_device *rbd_dev, struct request *rq)
+static void rbd_queue_workfn(struct work_struct *work)
 {
+       struct request *rq = blk_mq_rq_from_pdu(work);
+       struct rbd_device *rbd_dev = rq->q->queuedata;
        struct rbd_img_request *img_request;
        struct ceph_snap_context *snapc = NULL;
        u64 offset = (u64)blk_rq_pos(rq) << SECTOR_SHIFT;
@@ -3314,6 +3323,13 @@ static void rbd_handle_request(struct rbd_device *rbd_dev, struct request *rq)
        enum obj_operation_type op_type;
        u64 mapping_size;
        int result;
+
+       if (rq->cmd_type != REQ_TYPE_FS) {
+               dout("%s: non-fs request type %d\n", __func__,
+                       (int) rq->cmd_type);
+               result = -EIO;
+               goto err;
+       }
 
        if (rq->cmd_flags & REQ_DISCARD)
                op_type = OBJ_OP_DISCARD;
@@ -3353,6 +3369,8 @@ static void rbd_handle_request(struct rbd_device *rbd_dev, struct request *rq)
                goto err_rq;
        }
 
+       blk_mq_start_request(rq);
+
        if (offset && length > U64_MAX - offset + 1) {
                rbd_warn(rbd_dev, "bad request range (%llu~%llu)", offset,
                         length);
@@ -3406,53 +3424,18 @@ err_rq:
                         obj_op_name(op_type), length, offset, result);
        if (snapc)
                ceph_put_snap_context(snapc);
-       blk_end_request_all(rq, result);
+err:
+       blk_mq_end_request(rq, result);
 }
 
-static void rbd_request_workfn(struct work_struct *work)
+static int rbd_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq,
+               bool last)
 {
-       struct rbd_device *rbd_dev =
-               container_of(work, struct rbd_device, rq_work);
-       struct request *rq, *next;
-       LIST_HEAD(requests);
-
-       spin_lock_irq(&rbd_dev->lock); /* rq->q->queue_lock */
-       list_splice_init(&rbd_dev->rq_queue, &requests);
-       spin_unlock_irq(&rbd_dev->lock);
-
-       list_for_each_entry_safe(rq, next, &requests, queuelist) {
-               list_del_init(&rq->queuelist);
-               rbd_handle_request(rbd_dev, rq);
-       }
-}
+       struct rbd_device *rbd_dev = rq->q->queuedata;
+       struct work_struct *work = blk_mq_rq_to_pdu(rq);
 
-/*
- * Called with q->queue_lock held and interrupts disabled, possibly on
- * the way to schedule().  Do not sleep here!
- */
-static void rbd_request_fn(struct request_queue *q)
-{
-       struct rbd_device *rbd_dev = q->queuedata;
-       struct request *rq;
-       int queued = 0;
-
-       rbd_assert(rbd_dev);
-
-       while ((rq = blk_fetch_request(q))) {
-               /* Ignore any non-FS requests that filter through. */
-               if (rq->cmd_type != REQ_TYPE_FS) {
-                       dout("%s: non-fs request type %d\n", __func__,
-                               (int) rq->cmd_type);
-                       __blk_end_request_all(rq, 0);
-                       continue;
-               }
-
-               list_add_tail(&rq->queuelist, &rbd_dev->rq_queue);
-               queued++;
-       }
-
-       if (queued)
-               queue_work(rbd_dev->rq_wq, &rbd_dev->rq_work);
+       queue_work(rbd_dev->rq_wq, work);
+       return 0;
 }
 
 /*
@@ -3513,6 +3496,7 @@ static void rbd_free_disk(struct rbd_device *rbd_dev)
        del_gendisk(disk);
        if (disk->queue
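The patch above is cut off before the tag-set and queue setup code. Purely as a rough sketch of how the per-request work struct is typically wired up (the callback name rbd_init_request(), the queue depth, and the flags below are illustrative assumptions, not lines from the patch):

static int rbd_init_request(void *data, struct request *rq,
                unsigned int hctx_idx, unsigned int rq_idx,
                unsigned int numa_node)
{
        struct work_struct *work = blk_mq_rq_to_pdu(rq);

        /* one work item per request, so submissions can run concurrently */
        INIT_WORK(work, rbd_queue_workfn);
        return 0;
}

static struct blk_mq_ops rbd_mq_ops = {
        .queue_rq       = rbd_queue_rq,
        .map_queue      = blk_mq_map_queue,
        .init_request   = rbd_init_request,
};

        /* in the disk/queue setup path */
        rbd_dev->tag_set.ops = &rbd_mq_ops;
        rbd_dev->tag_set.queue_depth = 128;             /* illustrative value */
        rbd_dev->tag_set.numa_node = NUMA_NO_NODE;
        rbd_dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
        rbd_dev->tag_set.nr_hw_queues = 1;
        /* room for the per-request work item behind each struct request */
        rbd_dev->tag_set.cmd_size = sizeof(struct work_struct);

        err = blk_mq_alloc_tag_set(&rbd_dev->tag_set);
        if (err)
                goto out;
        q = blk_mq_init_queue(&rbd_dev->tag_set);
        if (IS_ERR(q)) {
                err = PTR_ERR(q);
                goto out_tag_set;
        }
        q->queuedata = rbd_dev;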

Re: krbd blk-mq support ?

2014-11-03 Thread Alexandre DERUMIER
can you try the patch below instead of the previous three patches? 

Sure, I'll try tomorrow.


- Original message -

From: Christoph Hellwig h...@infradead.org
To: Alexandre DERUMIER aderum...@odiso.com
Cc: Christoph Hellwig h...@infradead.org, Ceph Devel ceph-devel@vger.kernel.org
Sent: Monday, November 3, 2014 12:08:07
Subject: Re: krbd blk-mq support ?

Hi Alexandre, 

can you try the patch below instead of the previous three patches? 
This one uses a per-request work struct to allow for more concurrency. 


Re: krbd blk-mq support ?

2014-11-03 Thread Alexandre DERUMIER
Hi again,

I have good news, I finally solved my problem!

Simply by installing irqbalance:

#apt-get install irqbalance

So maybe the problem was at the NIC/network level.



Now : 3.18 kernel + your patch : 120000 iops
  3.10 kernel : 80000 iops


I'll try the 3.18 kernel without your patch to compare.




  

- Original message -

From: Alexandre DERUMIER aderum...@odiso.com
To: Christoph Hellwig h...@infradead.org
Cc: Ceph Devel ceph-devel@vger.kernel.org
Sent: Tuesday, November 4, 2014 07:57:19
Subject: Re: krbd blk-mq support ?

Hi Christoph, 

I had tried your patch, but no improvement for my problem.

I still have a kworker near 100% on one core.

I have finally been able to run perf on the 3.18 kernel + your patch; I have attached
the report to this mail.



- Original message -

From: Christoph Hellwig h...@infradead.org
To: Alexandre DERUMIER aderum...@odiso.com
Cc: Christoph Hellwig h...@infradead.org, Ceph Devel ceph-devel@vger.kernel.org
Sent: Monday, November 3, 2014 12:08:07
Subject: Re: krbd blk-mq support ?

Hi Alexandre, 

can you try the patch below instead of the previous three patches? 
This one uses a per-request work struct to allow for more concurrency. 


Re: krbd blk-mq support ?

2014-10-31 Thread Alexandre DERUMIER
filestore_xattr_use_omap ? 

Oh, I think it's a mistake (I just copy-pasted some configs from Somnath).
But I'm using XFS, so I don't think I need it.






- Original message -

From: Stefan Priebe - Profihost AG s.pri...@profihost.ag
To: Alexandre DERUMIER aderum...@odiso.com
Cc: Haomai Wang haomaiw...@gmail.com, Sage Weil s...@newdream.net, Christoph Hellwig h...@infradead.org, Ceph Devel ceph-devel@vger.kernel.org
Sent: Friday, October 31, 2014 06:39:22
Subject: Re: krbd blk-mq support ?


Hi, 


why do you use 


filestore_xattr_use_omap ? 

Stefan 


Excuse my typos, sent from my mobile phone.

On 31.10.2014 at 06:04, Alexandre DERUMIER aderum...@odiso.com wrote:



filestore_xattr_use_omap 


Re: krbd blk-mq support ?

2014-10-30 Thread Alexandre DERUMIER
Hmm, this is probably the messenger.c worker then that is feeding messages 
to the network. How many OSDs do you have? It should be able to scale 
with the number of OSDs. 

Thanks Sage for your reply.

Currently 6 OSDs (SSD) on the test platform.

But I can reach 2x 50000 iops on the same rbd volume with 2 clients on 2 different
hosts.
Do you think the messenger.c worker can be the bottleneck in this case?


I'll try to add more OSDs next week; if it scales, that's very good news!







- Original message -

From: Sage Weil s...@newdream.net
To: Alexandre DERUMIER aderum...@odiso.com
Cc: Christoph Hellwig h...@infradead.org, Ceph Devel ceph-devel@vger.kernel.org
Sent: Wednesday, October 29, 2014 16:00:56
Subject: Re: krbd blk-mq support ?

On Wed, 29 Oct 2014, Alexandre DERUMIER wrote: 
 Oh, that's without the blk-mq patch? 
 
 Yes, sorry, I don't how to use perf with a custom compiled kernel. 
 (Usualy I'm using perf from debian, with linux-tools package provided with 
 the debian kernel package) 
 
 Either way the profile doesn't really sum up to a fully used up cpu. 
 
 But I see mostly same behaviour with or without blk-mq patch, I have always 1 
 kworker at around 97-100%cpu (1core) for 5iops. 
 
 I had also tried to map the rbd volume with nocrc, it's going to 6iops 
 with same kworker at around 97-100%cpu 

Hmm, this is probably the messenger.c worker then that is feeding messages 
to the network. How many OSDs do you have? It should be able to scale 
with the number of OSDs. 

sage 


 
 
 
 - Mail original - 
 
 De: Christoph Hellwig h...@infradead.org 
 ?: Alexandre DERUMIER aderum...@odiso.com 
 Cc: Ceph Devel ceph-devel@vger.kernel.org 
 Envoy?: Mardi 28 Octobre 2014 19:07:25 
 Objet: Re: krbd blk-mq support ? 
 
 On Mon, Oct 27, 2014 at 11:00:46AM +0100, Alexandre DERUMIER wrote: 
  Can you do a perf report -ag and then a perf report to see where these 
  cycles are spent? 
  
  Yes, sure. 
  
  I have attached the perf report to this mail. 
  (This is with kernel 3.14, don't have access to my 3.18 host for now) 
 
 Oh, that's without the blk-mq patch? 
 
 Either way the profile doesn't really sum up to a fully used up 
 cpu. Sage, Alex - are there any ordring constraints in the rbd client? 
 If not we could probably aim for per-cpu queues using blk-mq and a 
 socket per cpu or similar. 


Re: krbd blk-mq support ?

2014-10-30 Thread Alexandre DERUMIER
I'll try to add more OSD next week, if it's scale it's a very good news !

I just tried to add 2 more OSDs.

I can now reach 2x 70000 iops on 2 client nodes (vs 2x 50000 previously),

and kworker CPU usage is also lower (84% vs 97%).
(I don't understand exactly why.)

So, thanks for the help everybody!





- Original message -

From: Alexandre DERUMIER aderum...@odiso.com
To: Sage Weil s...@newdream.net
Cc: Christoph Hellwig h...@infradead.org, Ceph Devel ceph-devel@vger.kernel.org
Sent: Thursday, October 30, 2014 09:11:11
Subject: Re: krbd blk-mq support ?

Hmm, this is probably the messenger.c worker then that is feeding messages 
to the network. How many OSDs do you have? It should be able to scale 
with the number of OSDs. 

Thanks Sage for your reply. 

Currently 6 OSD (ssd) on the test platform. 

But I can reach 2x 5iops on same rbd volume with 2 clients on 2 differents 
host. 
Do you think messenger.c worker can be the bottleneck in this case ? 


I'll try to add more OSD next week, if it's scale it's a very good news ! 







- Mail original - 

De: Sage Weil s...@newdream.net 
À: Alexandre DERUMIER aderum...@odiso.com 
Cc: Christoph Hellwig h...@infradead.org, Ceph Devel 
ceph-devel@vger.kernel.org 
Envoyé: Mercredi 29 Octobre 2014 16:00:56 
Objet: Re: krbd blk-mq support ? 

On Wed, 29 Oct 2014, Alexandre DERUMIER wrote: 
 Oh, that's without the blk-mq patch? 
 
 Yes, sorry, I don't how to use perf with a custom compiled kernel. 
 (Usualy I'm using perf from debian, with linux-tools package provided with 
 the debian kernel package) 
 
 Either way the profile doesn't really sum up to a fully used up cpu. 
 
 But I see mostly same behaviour with or without blk-mq patch, I have always 1 
 kworker at around 97-100%cpu (1core) for 5iops. 
 
 I had also tried to map the rbd volume with nocrc, it's going to 6iops 
 with same kworker at around 97-100%cpu 

Hmm, this is probably the messenger.c worker then that is feeding messages 
to the network. How many OSDs do you have? It should be able to scale 
with the number of OSDs. 

sage 


 
 
 
 - Mail original - 
 
 De: Christoph Hellwig h...@infradead.org 
 ?: Alexandre DERUMIER aderum...@odiso.com 
 Cc: Ceph Devel ceph-devel@vger.kernel.org 
 Envoy?: Mardi 28 Octobre 2014 19:07:25 
 Objet: Re: krbd blk-mq support ? 
 
 On Mon, Oct 27, 2014 at 11:00:46AM +0100, Alexandre DERUMIER wrote: 
  Can you do a perf report -ag and then a perf report to see where these 
  cycles are spent? 
  
  Yes, sure. 
  
  I have attached the perf report to this mail. 
  (This is with kernel 3.14, don't have access to my 3.18 host for now) 
 
 Oh, that's without the blk-mq patch? 
 
 Either way the profile doesn't really sum up to a fully used up 
 cpu. Sage, Alex - are there any ordring constraints in the rbd client? 
 If not we could probably aim for per-cpu queues using blk-mq and a 
 socket per cpu or similar. 


Re: krbd blk-mq support ?

2014-10-30 Thread Haomai Wang
Could you describe more about the 2x 70000 iops?
So you mean 8 OSDs, each backed by an SSD, can achieve 140000 iops?
Is it read or write? Could you give the fio options?

On Fri, Oct 31, 2014 at 12:01 AM, Alexandre DERUMIER
aderum...@odiso.com wrote:
I'll try to add more OSD next week, if it's scale it's a very good news !

 I just tried to add 2 more osds,

 I can now reach 2x 7 iops on 2 client nodes (vs 2 x 5 previously).

 and kworker cpu usage is also lower (84% vs 97%).
 (don't understand why exactly)

 So, Thanks for help everybody !





 - Mail original -

 De: Alexandre DERUMIER aderum...@odiso.com
 À: Sage Weil s...@newdream.net
 Cc: Christoph Hellwig h...@infradead.org, Ceph Devel 
 ceph-devel@vger.kernel.org
 Envoyé: Jeudi 30 Octobre 2014 09:11:11
 Objet: Re: krbd blk-mq support ?

Hmm, this is probably the messenger.c worker then that is feeding messages
to the network. How many OSDs do you have? It should be able to scale
with the number of OSDs.

 Thanks Sage for your reply.

 Currently 6 OSD (ssd) on the test platform.

 But I can reach 2x 5iops on same rbd volume with 2 clients on 2 
 differents host.
 Do you think messenger.c worker can be the bottleneck in this case ?


 I'll try to add more OSD next week, if it's scale it's a very good news !







 - Mail original -

 De: Sage Weil s...@newdream.net
 À: Alexandre DERUMIER aderum...@odiso.com
 Cc: Christoph Hellwig h...@infradead.org, Ceph Devel 
 ceph-devel@vger.kernel.org
 Envoyé: Mercredi 29 Octobre 2014 16:00:56
 Objet: Re: krbd blk-mq support ?

 On Wed, 29 Oct 2014, Alexandre DERUMIER wrote:
 Oh, that's without the blk-mq patch?

 Yes, sorry, I don't how to use perf with a custom compiled kernel.
 (Usualy I'm using perf from debian, with linux-tools package provided with 
 the debian kernel package)

 Either way the profile doesn't really sum up to a fully used up cpu.

 But I see mostly same behaviour with or without blk-mq patch, I have always 
 1 kworker at around 97-100%cpu (1core) for 5iops.

 I had also tried to map the rbd volume with nocrc, it's going to 6iops 
 with same kworker at around 97-100%cpu

 Hmm, this is probably the messenger.c worker then that is feeding messages
 to the network. How many OSDs do you have? It should be able to scale
 with the number of OSDs.

 sage





 - Mail original -

 De: Christoph Hellwig h...@infradead.org
 ?: Alexandre DERUMIER aderum...@odiso.com
 Cc: Ceph Devel ceph-devel@vger.kernel.org
 Envoy?: Mardi 28 Octobre 2014 19:07:25
 Objet: Re: krbd blk-mq support ?

 On Mon, Oct 27, 2014 at 11:00:46AM +0100, Alexandre DERUMIER wrote:
  Can you do a perf report -ag and then a perf report to see where these
  cycles are spent?
 
  Yes, sure.
 
  I have attached the perf report to this mail.
  (This is with kernel 3.14, don't have access to my 3.18 host for now)

 Oh, that's without the blk-mq patch?

 Either way the profile doesn't really sum up to a fully used up
 cpu. Sage, Alex - are there any ordring constraints in the rbd client?
 If not we could probably aim for per-cpu queues using blk-mq and a
 socket per cpu or similar.



-- 
Best Regards,

Wheat


Re: krbd blk-mq support ?

2014-10-30 Thread Alexandre DERUMIER
Could you describe more about 2x7 iops?
So you mean 8 OSD each backend with SSD can achieve with 14w iops?

It's a small rbd (10G), so most reads hit the buffer cache.
But yes, it's able to deliver 140000 iops with 8 OSDs. (I also checked the stats in
the ceph cluster to be sure,
and I'm not CPU bound on the OSD nodes.)
 2014-10-31 05:58:34.231037 mon.0 [INF] pgmap v7109: 1264 pgs: 1264 
 active+clean; 165 GB data, 109 GB used, 6226 GB / 6335 GB avail; 560 MB/s 
 rd, 140 kop/s


here the ceph.conf of osd nodes

[global]
fsid = c29f4643-9577-4671-ae25-59ad14550aba
auth_cluster_required = none
auth_service_required = none
auth_client_required = none
filestore_xattr_use_omap = true

   debug lockdep = 0/0
debug context = 0/0
debug crush = 0/0
debug buffer = 0/0
debug timer = 0/0
debug journaler = 0/0
debug osd = 0/0
debug optracker = 0/0
debug objclass = 0/0
debug filestore = 0/0
debug journal = 0/0
debug ms = 0/0
debug monc = 0/0
debug tp = 0/0
debug auth = 0/0
debug finisher = 0/0
debug heartbeatmap = 0/0
debug perfcounter = 0/0
debug asok = 0/0
debug throttle = 0/0

osd_op_threads = 5
filestore_op_threads = 4


osd_op_num_threads_per_shard = 1
osd_op_num_shards = 25
filestore_fd_cache_size = 64
filestore_fd_cache_shards = 32
osd_enable_op_tracker = false

 

is it read or write? could you give fio options?
random read 4K

Here the fio config.

[global]
ioengine=aio
invalidate=1
rw=randread
bs=4K
direct=1
numjobs=1
group_reporting=1
size=10G

[test1]
iodepth=64
filename=/dev/rbd/test/test


On 1 client node, I can't reach more than 50000 iops with 6 OSDs or 70000 iops with
8 OSDs.
(I had tried increasing numjobs to have more fio processes, or using 2 different
rbd volumes at the same time,
 but performance is the same.)

 2014-10-31 05:57:30.078348 mon.0 [INF] pgmap v7070: 1264 pgs: 1264 
 active+clean; 165 GB data, 109 GB used, 6226 GB / 6335 GB avail; 290 MB/s 
 rd, 74572 op/s


But if I launch the same fio test on another client node, I can reach the same
70000 iops at the same time.


 2014-10-31 05:58:34.231037 mon.0 [INF] pgmap v7109: 1264 pgs: 1264 
 active+clean; 165 GB data, 109 GB used, 6226 GB / 6335 GB avail; 560 MB/s 
 rd, 140 kop/s


- Original message -

From: Haomai Wang haomaiw...@gmail.com
To: Alexandre DERUMIER aderum...@odiso.com
Cc: Sage Weil s...@newdream.net, Christoph Hellwig h...@infradead.org, Ceph Devel ceph-devel@vger.kernel.org
Sent: Thursday, October 30, 2014 18:05:26
Subject: Re: krbd blk-mq support ?

Could you describe more about 2x7 iops? 
So you mean 8 OSD each backend with SSD can achieve with 14w iops? 
is it read or write? could you give fio options? 

On Fri, Oct 31, 2014 at 12:01 AM, Alexandre DERUMIER 
aderum...@odiso.com wrote: 
I'll try to add more OSD next week, if it's scale it's a very good news ! 
 
 I just tried to add 2 more osds, 
 
 I can now reach 2x 7 iops on 2 client nodes (vs 2 x 5 previously). 
 
 and kworker cpu usage is also lower (84% vs 97%). 
 (don't understand why exactly) 
 
 So, Thanks for help everybody ! 
 
 
 
 
 
 - Mail original - 
 
 De: Alexandre DERUMIER aderum...@odiso.com 
 À: Sage Weil s...@newdream.net 
 Cc: Christoph Hellwig h...@infradead.org, Ceph Devel 
 ceph-devel@vger.kernel.org 
 Envoyé: Jeudi 30 Octobre 2014 09:11:11 
 Objet: Re: krbd blk-mq support ? 
 
Hmm, this is probably the messenger.c worker then that is feeding messages 
to the network. How many OSDs do you have? It should be able to scale 
with the number of OSDs. 
 
 Thanks Sage for your reply. 
 
 Currently 6 OSD (ssd) on the test platform. 
 
 But I can reach 2x 5iops on same rbd volume with 2 clients on 2 
 differents host. 
 Do you think messenger.c worker can be the bottleneck in this case ? 
 
 
 I'll try to add more OSD next week, if it's scale it's a very good news ! 
 
 
 
 
 
 
 
 - Mail original - 
 
 De: Sage Weil s...@newdream.net 
 À: Alexandre DERUMIER aderum...@odiso.com 
 Cc: Christoph Hellwig h...@infradead.org, Ceph Devel 
 ceph-devel@vger.kernel.org 
 Envoyé: Mercredi 29 Octobre 2014 16:00:56 
 Objet: Re: krbd blk-mq support ? 
 
 On Wed, 29 Oct 2014, Alexandre DERUMIER wrote: 
 Oh, that's without the blk-mq patch? 
 
 Yes, sorry, I don't how to use perf with a custom compiled kernel. 
 (Usualy I'm using perf from debian, with linux-tools package provided with 
 the debian kernel package) 
 
 Either way the profile doesn't really sum up to a fully used up cpu. 
 
 But I see mostly same behaviour with or without blk-mq patch, I have always 
 1 kworker at around 97-100%cpu (1core) for 5iops. 
 
 I had also tried to map the rbd volume with nocrc, it's going to 6iops 
 with same kworker at around 97-100%cpu 
 
 Hmm, this is probably the messenger.c worker then that is feeding messages 
 to the network. How many

Re: krbd blk-mq support ?

2014-10-29 Thread Alexandre DERUMIER
Oh, that's without the blk-mq patch?

Yes, sorry, I don't know how to use perf with a custom compiled kernel.
(Usually I'm using perf from Debian, with the linux-tools package provided with the
Debian kernel package.)

Either way the profile doesn't really sum up to a fully used up cpu.

But I see mostly the same behaviour with or without the blk-mq patch: I always have 1
kworker at around 97-100% cpu (1 core) for 50000 iops.

I had also tried to map the rbd volume with nocrc; it goes to 60000 iops with the
same kworker at around 97-100% cpu.



- Original message -

From: Christoph Hellwig h...@infradead.org
To: Alexandre DERUMIER aderum...@odiso.com
Cc: Ceph Devel ceph-devel@vger.kernel.org
Sent: Tuesday, October 28, 2014 19:07:25
Subject: Re: krbd blk-mq support ?

On Mon, Oct 27, 2014 at 11:00:46AM +0100, Alexandre DERUMIER wrote: 
 Can you do a perf report -ag and then a perf report to see where these 
 cycles are spent? 
 
 Yes, sure. 
 
 I have attached the perf report to this mail. 
 (This is with kernel 3.14, don't have access to my 3.18 host for now) 

Oh, that's without the blk-mq patch? 

Either way the profile doesn't really sum up to a fully used up 
cpu. Sage, Alex - are there any ordring constraints in the rbd client? 
If not we could probably aim for per-cpu queues using blk-mq and a 
socket per cpu or similar. 


Re: krbd blk-mq support ?

2014-10-28 Thread Christoph Hellwig
On Mon, Oct 27, 2014 at 11:00:46AM +0100, Alexandre DERUMIER wrote:
 Can you do a perf report -ag and then a perf report to see where these
 cycles are spent?
 
 Yes, sure.
 
 I have attached the perf report to this mail.
 (This is with kernel 3.14, don't have access to my 3.18  host for now)

Oh, that's without the blk-mq patch?

Either way the profile doesn't really sum up to a fully used up
cpu.  Sage, Alex - are there any ordering constraints in the rbd client?
If not we could probably aim for per-cpu queues using blk-mq and a
socket per cpu or similar.
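
As a purely illustrative sketch of that idea (not a patch; the per-CPU socket side would have to happen in libceph and is not shown), "per-cpu queues" at the blk-mq level would essentially mean asking for one hardware queue per CPU in the tag set:

        /* hypothetical: one blk-mq hardware queue per online CPU, so each
         * CPU submits on its own context instead of funneling everything
         * through a single kworker */
        rbd_dev->tag_set.nr_hw_queues = num_online_cpus();
        rbd_dev->tag_set.queue_depth = 128;     /* illustrative */
        err = blk_mq_alloc_tag_set(&rbd_dev->tag_set);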


Re: krbd blk-mq support ?

2014-10-28 Thread Alex Elder

On 10/28/2014 01:07 PM, Christoph Hellwig wrote:

On Mon, Oct 27, 2014 at 11:00:46AM +0100, Alexandre DERUMIER wrote:

Can you do a perf report -ag and then a perf report to see where these
cycles are spent?


Yes, sure.

I have attached the perf report to this mail.
(This is with kernel 3.14, don't have access to my 3.18  host for now)


Oh, that's without the blk-mq patch?

Either way the profile doesn't really sum up to a fully used up
cpu.  Sage, Alex - are there any ordring constraints in the rbd client?


I don't remember off hand.

In libceph I recall going to great lengths to retain the original
order of requests when they got re-sent after a connection reset.

I'll go look at the code a bit and see if I can refresh my memory
(though Sage may answer before I do).

-Alex


If not we could probably aim for per-cpu queues using blk-mq and a
socket per cpu or similar.


Re: krbd blk-mq support ?

2014-10-28 Thread Alex Elder

On 10/28/2014 01:07 PM, Christoph Hellwig wrote:

On Mon, Oct 27, 2014 at 11:00:46AM +0100, Alexandre DERUMIER wrote:

Can you do a perf report -ag and then a perf report to see where these
cycles are spent?


Yes, sure.

I have attached the perf report to this mail.
(This is with kernel 3.14, don't have access to my 3.18  host for now)


Oh, that's without the blk-mq patch?

Either way the profile doesn't really sum up to a fully used up
cpu.  Sage, Alex - are there any ordring constraints in the rbd client?
If not we could probably aim for per-cpu queues using blk-mq and a
socket per cpu or similar.


First, a disclaimer--I haven't really been following this discussion
very closely.

For an rbd image request (which is what gets created from requests
from the block queue), the order of completion doesn't matter, and
although the object requests are submitted in order that shouldn't
be required either.

The image request is broken into one or more object requests (usually
just one) and they are treated as a unit.  When the last object request
of a set for an image request has completed, the image request is
treated as completed.

I hope that helps.  If not, ask again a different way...

-Alex
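
(In rough pseudo-C, the completion model described above looks like the following; the field and function names are illustrative, not the actual rbd.c code:)

/* illustrative sketch only -- not the real rbd.c implementation */
static void rbd_img_obj_done(struct rbd_img_request *img_request,
                struct rbd_obj_request *obj_request, int result)
{
        /* object requests may complete in any order... */
        if (result)
                img_request->result = result;

        /* ...the image request ends only when its last object request is done */
        if (++img_request->completed_count == img_request->obj_request_count)
                blk_end_request_all(img_request->rq, img_request->result);
}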




Re: krbd blk-mq support ?

2014-10-27 Thread Alexandre DERUMIER
Have you tried mapping different images on the same m/c with 'noshare' map 
option ?

Oh, I didn't know about this option.

I found 1 reference here:

http://lists.ceph.com/pipermail/ceph-users-ceph.com/2013-September/034213.html

With noshare each mapped image will appear as a separate client instance,
which means it will have its own session with the monitors and its own TCP
connections to the OSDs.  It may be a viable workaround for now but in
general I would not recommend it.

So it should help with multiple rbds.
Do you know why Sage doesn't recommend it in that mail?





- Original message -

From: Somnath Roy somnath@sandisk.com
To: Alexandre DERUMIER aderum...@odiso.com, Christoph Hellwig h...@infradead.org
Cc: Ceph Devel ceph-devel@vger.kernel.org
Sent: Sunday, October 26, 2014 20:08:42
Subject: RE: krbd blk-mq support ?

Alexandre, 
Have you tried mapping different images on the same m/c with 'noshare' map 
option ? 
If not, it will not scale with increasing number of images (and thus mapped 
rbds) on a single m/c as they will share the same connection to cluster. 

Thanks  Regards 
Somnath 

-Original Message- 
From: ceph-devel-ow...@vger.kernel.org 
[mailto:ceph-devel-ow...@vger.kernel.org] On Behalf Of Alexandre DERUMIER 
Sent: Sunday, October 26, 2014 6:46 AM 
To: Christoph Hellwig 
Cc: Ceph Devel 
Subject: Re: krbd blk-mq support ? 

Hi, 

some news: 

I have applied patches succefully on top of 3.18-rc1 kernel. 

But don't seem to help is my case. 
(I think that blk-mq is working because I don't see any io schedulers on rbd 
devices, as blk-mq don't support them actually). 

My main problem is that I can't reach more than around 5iops on 1 machine, 

and the problem seem to be the kworker process stuck at 100% of 1core. 

I had tried multiple fio process, on differents rbd devices at the same time, 
and I'm always limited à 5iops. 

I'm sure that the ceph cluster is not the bottleneck, because if I launch 
another fio on another node at the same time, 

I can reach 5iops on each node, and both are limited by the kworker 
process. 


That's why I thinked that blk-mq could help, but it don't seem to be the case. 


Is this kworker cpu limitation a known bug ? 

Regards, 

Alexandre 

- Mail original - 

De: Alexandre DERUMIER aderum...@odiso.com 
À: Christoph Hellwig h...@infradead.org 
Cc: Ceph Devel ceph-devel@vger.kernel.org 
Envoyé: Vendredi 24 Octobre 2014 14:27:47 
Objet: Re: krbd blk-mq support ? 

If you're willing to experiment give the patches below a try, not that 
I don't have a ceph test cluster available, so the conversion is 
untestested. 

Ok, Thanks ! I'll try them and see If I can improve qemu performance on a 
single drive with multiqueues. 

- Mail original - 

De: Christoph Hellwig h...@infradead.org 
À: Alexandre DERUMIER aderum...@odiso.com 
Cc: Ceph Devel ceph-devel@vger.kernel.org 
Envoyé: Vendredi 24 Octobre 2014 12:55:01 
Objet: Re: krbd blk-mq support ? 

If you're willing to experiment give the patches below a try, not that I don't 
have a ceph test cluster available, so the conversion is untestested. 


Re: krbd blk-mq support ?

2014-10-27 Thread Christoph Hellwig
On Sun, Oct 26, 2014 at 02:46:03PM +0100, Alexandre DERUMIER wrote:
 Hi,
 
 some news:
 
 I have applied patches succefully on top of 3.18-rc1 kernel.
 
 But don't seem to help is my case.
 (I think that blk-mq is working because I don't see any io schedulers on rbd 
 devices, as blk-mq don't support them actually).
 
 My main problem is that I can't reach more than around 5iops on 1 machine,
 
 and the problem seem to be the kworker process stuck at 100% of 1core.

Can you do a perf report -ag and then a perf report to see where these
cycles are spent?



Re: krbd blk-mq support ?

2014-10-27 Thread Alexandre DERUMIER
Hi Somnath,

I have just tried with 2 rbd volumes (rbd map -o noshare rbdvolume -p pool)
(kernel 3.14),
then a fio benchmark on both volumes at the same time,
but it doesn't seem to help.

I still have the kworker process at 100%, and iops are 25000 iops on each rbd
volume.

- Original message -

From: Somnath Roy somnath@sandisk.com
To: Alexandre DERUMIER aderum...@odiso.com, Christoph Hellwig h...@infradead.org
Cc: Ceph Devel ceph-devel@vger.kernel.org
Sent: Sunday, October 26, 2014 20:08:42
Subject: RE: krbd blk-mq support ?

Alexandre, 
Have you tried mapping different images on the same m/c with 'noshare' map 
option ? 
If not, it will not scale with increasing number of images (and thus mapped 
rbds) on a single m/c as they will share the same connection to cluster. 

Thanks  Regards 
Somnath 

-Original Message- 
From: ceph-devel-ow...@vger.kernel.org 
[mailto:ceph-devel-ow...@vger.kernel.org] On Behalf Of Alexandre DERUMIER 
Sent: Sunday, October 26, 2014 6:46 AM 
To: Christoph Hellwig 
Cc: Ceph Devel 
Subject: Re: krbd blk-mq support ? 

Hi, 

some news: 

I have applied patches succefully on top of 3.18-rc1 kernel. 

But don't seem to help is my case. 
(I think that blk-mq is working because I don't see any io schedulers on rbd 
devices, as blk-mq don't support them actually). 

My main problem is that I can't reach more than around 5iops on 1 machine, 

and the problem seem to be the kworker process stuck at 100% of 1core. 

I had tried multiple fio process, on differents rbd devices at the same time, 
and I'm always limited à 5iops. 

I'm sure that the ceph cluster is not the bottleneck, because if I launch 
another fio on another node at the same time, 

I can reach 5iops on each node, and both are limited by the kworker 
process. 


That's why I thinked that blk-mq could help, but it don't seem to be the case. 


Is this kworker cpu limitation a known bug ? 

Regards, 

Alexandre 

- Mail original - 

De: Alexandre DERUMIER aderum...@odiso.com 
À: Christoph Hellwig h...@infradead.org 
Cc: Ceph Devel ceph-devel@vger.kernel.org 
Envoyé: Vendredi 24 Octobre 2014 14:27:47 
Objet: Re: krbd blk-mq support ? 

If you're willing to experiment give the patches below a try, not that 
I don't have a ceph test cluster available, so the conversion is 
untestested. 

Ok, Thanks ! I'll try them and see If I can improve qemu performance on a 
single drive with multiqueues. 

- Mail original - 

De: Christoph Hellwig h...@infradead.org 
À: Alexandre DERUMIER aderum...@odiso.com 
Cc: Ceph Devel ceph-devel@vger.kernel.org 
Envoyé: Vendredi 24 Octobre 2014 12:55:01 
Objet: Re: krbd blk-mq support ? 

If you're willing to experiment give the patches below a try, not that I don't 
have a ceph test cluster available, so the conversion is untestested. 


Re: krbd blk-mq support ?

2014-10-26 Thread Alexandre DERUMIER
Hi,

some news:

I have applied the patches successfully on top of the 3.18-rc1 kernel.

But they don't seem to help in my case.
(I think that blk-mq is working because I don't see any I/O schedulers on the rbd
devices, as blk-mq doesn't support them currently.)

My main problem is that I can't reach more than around 50000 iops on 1 machine,

and the problem seems to be the kworker process stuck at 100% of 1 core.

I had tried multiple fio processes, on different rbd devices at the same time,
and I'm always limited to 50000 iops.

I'm sure that the ceph cluster is not the bottleneck, because if I launch
another fio on another node at the same time,

I can reach 50000 iops on each node, and both are limited by the kworker process.


That's why I thought that blk-mq could help, but it doesn't seem to be the case.


Is this kworker cpu limitation a known bug?

Regards,

Alexandre

- Original message -

From: Alexandre DERUMIER aderum...@odiso.com
To: Christoph Hellwig h...@infradead.org
Cc: Ceph Devel ceph-devel@vger.kernel.org
Sent: Friday, October 24, 2014 14:27:47
Subject: Re: krbd blk-mq support ?

If you're willing to experiment give the patches below a try, not that 
I don't have a ceph test cluster available, so the conversion is 
untestested. 

Ok, Thanks ! I'll try them and see If I can improve qemu performance on a 
single drive with multiqueues. 

- Original message -

From: Christoph Hellwig h...@infradead.org
To: Alexandre DERUMIER aderum...@odiso.com
Cc: Ceph Devel ceph-devel@vger.kernel.org
Sent: Friday, October 24, 2014 12:55:01
Subject: Re: krbd blk-mq support ?

If you're willing to experiment give the patches below a try, not that 
I don't have a ceph test cluster available, so the conversion is 
untestested. 


RE: krbd blk-mq support ?

2014-10-26 Thread Somnath Roy
Alexandre,
Have you tried mapping different images on the same m/c with the 'noshare' map
option?
If not, it will not scale with an increasing number of images (and thus mapped
rbds) on a single m/c, as they will share the same connection to the cluster.

Thanks & Regards
Somnath

-Original Message-
From: ceph-devel-ow...@vger.kernel.org 
[mailto:ceph-devel-ow...@vger.kernel.org] On Behalf Of Alexandre DERUMIER
Sent: Sunday, October 26, 2014 6:46 AM
To: Christoph Hellwig
Cc: Ceph Devel
Subject: Re: krbd blk-mq support ?

Hi,

some news:

I have applied patches succefully on top of 3.18-rc1 kernel.

But don't seem to help is my case.
(I think that blk-mq is working because I don't see any io schedulers on rbd 
devices, as blk-mq don't support them actually).

My main problem is that I can't reach more than around 5iops on 1 machine,

and the problem seem to be the kworker process stuck at 100% of 1core.

I had tried multiple fio process, on differents rbd devices at the same time, 
and I'm always limited à 5iops.

I'm sure that the ceph cluster is not the bottleneck, because if I launch 
another fio on another node at the same time,

I can reach 5iops on each node, and both are limited by the kworker process.


That's why I thinked that blk-mq could help, but it don't seem to be the case.


Is this kworker cpu limitation a known bug ?

Regards,

Alexandre

- Mail original -

De: Alexandre DERUMIER aderum...@odiso.com
À: Christoph Hellwig h...@infradead.org
Cc: Ceph Devel ceph-devel@vger.kernel.org
Envoyé: Vendredi 24 Octobre 2014 14:27:47
Objet: Re: krbd blk-mq support ?

If you're willing to experiment give the patches below a try, not that
I don't have a ceph test cluster available, so the conversion is
untestested.

Ok, Thanks ! I'll try them and see If I can improve qemu performance on a 
single drive with multiqueues.

- Mail original -

De: Christoph Hellwig h...@infradead.org
À: Alexandre DERUMIER aderum...@odiso.com
Cc: Ceph Devel ceph-devel@vger.kernel.org
Envoyé: Vendredi 24 Octobre 2014 12:55:01
Objet: Re: krbd blk-mq support ?

If you're willing to experiment give the patches below a try, not that I don't 
have a ceph test cluster available, so the conversion is untestested.






krbd blk-mq support ?

2014-10-24 Thread Alexandre DERUMIER
Hi,

I would like to know if it's planned to add blk-mq (block multiqueue, from
kernel 3.17) support to krbd?

I think it could help single-threaded workloads (including qemu) reach more
iops.

I found some small discussion about it here:
http://permalink.gmane.org/gmane.comp.file-systems.ceph.devel/20584

But no news since then.

Regards,

Alexandre



Re: krbd blk-mq support ?

2014-10-24 Thread Ilya Dryomov
On Fri, Oct 24, 2014 at 11:54 AM, Alexandre DERUMIER
aderum...@odiso.com wrote:
 Hi,

 I would like to known if it's planned to add blk-mq (block multiqueue from 
 kernel 3.17)  support to krbd ?

 I think it could help single threaded workload (including qemu) to reach more 
 iops.

 I find some small discussion about it here:
 http://permalink.gmane.org/gmane.comp.file-systems.ceph.devel/20584

 But no news since then.

There are no concrete plans as of now.  For 3.19 and 3.20 the main goal
is to get fancy striping (support for custom striping modes) in and
then get rid of the "kernel layering is EXPERIMENTAL!" warning.

krbd is a network block device, so I don't think we will gain anything
significant in the performance department.  blk-mq was mentioned
because it lifts some of the implementation restrictions the current
infrastructure imposes on drivers.

Thanks,

Ilya


Re: krbd blk-mq support ?

2014-10-24 Thread Christoph Hellwig
If you're willing to experiment give the patches below a try; note that
I don't have a ceph test cluster available, so the conversion is
untested.

From 00668f00afc6f0cfbce05d1186116469c1f3f9b3 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig h...@lst.de
Date: Fri, 24 Oct 2014 11:53:36 +0200
Subject: blk-mq: handle single queue case in blk_mq_hctx_next_cpu

Don't duplicate the code to handle the not cpu bounce case in the
caller, do it inside blk_mq_hctx_next_cpu instead.

Signed-off-by: Christoph Hellwig h...@lst.de
---
 block/blk-mq.c | 34 +-
 1 file changed, 13 insertions(+), 21 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 68929ba..eaaedea 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -760,10 +760,11 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
  */
 static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
 {
-       int cpu = hctx->next_cpu;
+       if (hctx->queue->nr_hw_queues == 1)
+               return WORK_CPU_UNBOUND;
 
        if (--hctx->next_cpu_batch <= 0) {
-               int next_cpu;
+               int cpu = hctx->next_cpu, next_cpu;
 
                next_cpu = cpumask_next(hctx->next_cpu, hctx->cpumask);
                if (next_cpu >= nr_cpu_ids)
@@ -771,9 +772,11 @@ static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
 
                hctx->next_cpu = next_cpu;
                hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
+
+               return cpu;
        }
 
-       return cpu;
+       return hctx->next_cpu;
 }
 
 void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
@@ -781,16 +784,13 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
        if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state)))
                return;
 
-       if (!async && cpumask_test_cpu(smp_processor_id(), hctx->cpumask))
+       if (!async && cpumask_test_cpu(smp_processor_id(), hctx->cpumask)) {
                __blk_mq_run_hw_queue(hctx);
-       else if (hctx->queue->nr_hw_queues == 1)
-               kblockd_schedule_delayed_work(&hctx->run_work, 0);
-       else {
-               unsigned int cpu;
-
-               cpu = blk_mq_hctx_next_cpu(hctx);
-               kblockd_schedule_delayed_work_on(cpu, &hctx->run_work, 0);
+               return;
        }
+
+       kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
+                       &hctx->run_work, 0);
 }
 
 void blk_mq_run_queues(struct request_queue *q, bool async)
@@ -888,16 +888,8 @@ static void blk_mq_delay_work_fn(struct work_struct *work)
 
 void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
 {
-       unsigned long tmo = msecs_to_jiffies(msecs);
-
-       if (hctx->queue->nr_hw_queues == 1)
-               kblockd_schedule_delayed_work(&hctx->delay_work, tmo);
-       else {
-               unsigned int cpu;
-
-               cpu = blk_mq_hctx_next_cpu(hctx);
-               kblockd_schedule_delayed_work_on(cpu, &hctx->delay_work, tmo);
-       }
+       kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
+                       &hctx->delay_work, msecs_to_jiffies(msecs));
 }
 EXPORT_SYMBOL(blk_mq_delay_queue);
 
-- 
1.9.1

From 6002e20c4d2b150fcbe82a7bc45c90d30cb61b78 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig h...@lst.de
Date: Fri, 24 Oct 2014 12:04:07 +0200
Subject: blk-mq: allow direct dispatch to a driver specific workqueue

We have various block drivers that need to execute long term blocking
operations during I/O submission like file system or network I/O.

Currently these drivers just queue up work to an internal workqueue
from their request_fn.  With blk-mq we can make sure they always get
called on their own workqueue directly for I/O submission by:

 1) adding a flag to prevent inline submission of I/O, and
 2) allowing the driver to pass in a workqueue in the tag_set that
will be used instead of kblockd.

Signed-off-by: Christoph Hellwig h...@lst.de
---
 block/blk-core.c   |  2 +-
 block/blk-mq.c | 12 +---
 block/blk.h|  1 +
 include/linux/blk-mq.h |  4 
 4 files changed, 15 insertions(+), 4 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 0421b53..7f7249f 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -61,7 +61,7 @@ struct kmem_cache *blk_requestq_cachep;
 /*
  * Controlling structure to kblockd
  */
-static struct workqueue_struct *kblockd_workqueue;
+struct workqueue_struct *kblockd_workqueue;
 
 void blk_queue_congestion_threshold(struct request_queue *q)
 {
diff --git a/block/blk-mq.c b/block/blk-mq.c
index eaaedea..cea2f96 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -784,12 +784,13 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
        if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state)))
                return;
 
-       if (!async && cpumask_test_cpu(smp_processor_id(), hctx->cpumask)) {
+       if (!async && !(hctx->flags & BLK_MQ_F_WORKQUEUE) &&
+
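
(The quoted patch is truncated above. Going by the description, a driver like rbd would then opt in roughly as sketched below; the exact name of the workqueue field in blk_mq_tag_set is not visible in the quoted hunks, so this is an assumption about the proposed interface, not verified code:)

        /* sketch only: skip inline ->queue_rq() submission and let blk-mq
         * use the driver's own workqueue instead of kblockd, per the patch
         * description above (the .wq field name is assumed) */
        rbd_dev->tag_set.flags |= BLK_MQ_F_WORKQUEUE;
        rbd_dev->tag_set.wq = rbd_dev->rq_wq;

        /* ->queue_rq() would then always be called from that workqueue, so
         * it may block, and rbd would no longer need to bounce every request
         * to its own per-request work item */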