Re: krbd blk-mq support ?
On Thu, Nov 13, 2014 at 10:44:18AM +0100, Alexandre DERUMIER wrote: Did you manage to get those numbers? Not yet, I'll try next week. What's the result? I'd really like to get rid of old request drivers as much as possible. -- To unsubscribe from this list: send the line unsubscribe ceph-devel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: krbd blk-mq support ?
Hi Christoph, I have redone bench, but I think I don't have enough ios/osd. I'm stuck around 12iops randread 4k with or without your patch. (But I don't see any speed regression) I'm going to have a bigger full ssd production cluster in the coming months, So I'll redo the tests when I'm ready. Regards, Alexandre - Mail original - De: Christoph Hellwig h...@infradead.org À: aderumier aderum...@odiso.com Cc: ceph-devel ceph-devel@vger.kernel.org Envoyé: Mercredi 10 Décembre 2014 15:05:18 Objet: Re: krbd blk-mq support ? On Thu, Nov 13, 2014 at 10:44:18AM +0100, Alexandre DERUMIER wrote: Did you manage to get those numbers? Not yet, I'll try next week. What's the result? I'd really like to get rid of old request drivers as much as possible. -- To unsubscribe from this list: send the line unsubscribe ceph-devel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: krbd blk-mq support ?
On Tue, Nov 04, 2014 at 08:19:32AM +0100, Alexandre DERUMIER wrote: Now : 3.18 kernel + your patch : 12 iops 3.10 kernel : 8iops I'll try 3.18 kernel without your patch to compare. Did you manage to get those numbers? -- To unsubscribe from this list: send the line unsubscribe ceph-devel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: krbd blk-mq support ?
Hi Alexandre, can you try the patch below instead of the previous three patches? This one uses a per-request work struct to allow for more concurrency. diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 0a54c58..b981096 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -38,6 +38,7 @@ #include linux/kernel.h #include linux/device.h #include linux/module.h +#include linux/blk-mq.h #include linux/fs.h #include linux/blkdev.h #include linux/slab.h @@ -343,7 +344,6 @@ struct rbd_device { struct list_headrq_queue; /* incoming rq queue */ spinlock_t lock; /* queue, flags, open_count */ struct workqueue_struct *rq_wq; - struct work_struct rq_work; struct rbd_image_header header; unsigned long flags; /* possibly lock protected */ @@ -361,6 +361,9 @@ struct rbd_device { atomic_tparent_ref; struct rbd_device *parent; + /* Block layer tags. */ + struct blk_mq_tag_set tag_set; + /* protects updating the header */ struct rw_semaphore header_rwsem; @@ -1816,7 +1819,8 @@ static void rbd_osd_req_callback(struct ceph_osd_request *osd_req, /* * We support a 64-bit length, but ultimately it has to be -* passed to blk_end_request(), which takes an unsigned int. +* passed to the block layer, which just supports a 32-bit +* length field. 
*/ obj_request-xferred = osd_req-r_reply_op_len[0]; rbd_assert(obj_request-xferred (u64)UINT_MAX); @@ -2280,7 +2284,10 @@ static bool rbd_img_obj_end_request(struct rbd_obj_request *obj_request) more = obj_request-which img_request-obj_request_count - 1; } else { rbd_assert(img_request-rq != NULL); - more = blk_end_request(img_request-rq, result, xferred); + + more = blk_update_request(img_request-rq, result, xferred); + if (!more) + __blk_mq_end_request(img_request-rq, result); } return more; @@ -3305,8 +3312,10 @@ out: return ret; } -static void rbd_handle_request(struct rbd_device *rbd_dev, struct request *rq) +static void rbd_queue_workfn(struct work_struct *work) { + struct request *rq = blk_mq_rq_from_pdu(work); + struct rbd_device *rbd_dev = rq-q-queuedata; struct rbd_img_request *img_request; struct ceph_snap_context *snapc = NULL; u64 offset = (u64)blk_rq_pos(rq) SECTOR_SHIFT; @@ -3314,6 +3323,13 @@ static void rbd_handle_request(struct rbd_device *rbd_dev, struct request *rq) enum obj_operation_type op_type; u64 mapping_size; int result; + + if (rq-cmd_type != REQ_TYPE_FS) { + dout(%s: non-fs request type %d\n, __func__, + (int) rq-cmd_type); + result = -EIO; + goto err; + } if (rq-cmd_flags REQ_DISCARD) op_type = OBJ_OP_DISCARD; @@ -3353,6 +3369,8 @@ static void rbd_handle_request(struct rbd_device *rbd_dev, struct request *rq) goto err_rq; } + blk_mq_start_request(rq); + if (offset length U64_MAX - offset + 1) { rbd_warn(rbd_dev, bad request range (%llu~%llu), offset, length); @@ -3406,53 +3424,18 @@ err_rq: obj_op_name(op_type), length, offset, result); if (snapc) ceph_put_snap_context(snapc); - blk_end_request_all(rq, result); +err: + blk_mq_end_request(rq, result); } -static void rbd_request_workfn(struct work_struct *work) +static int rbd_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq, + bool last) { - struct rbd_device *rbd_dev = - container_of(work, struct rbd_device, rq_work); - struct request *rq, *next; - LIST_HEAD(requests); - - 
spin_lock_irq(rbd_dev-lock); /* rq-q-queue_lock */ - list_splice_init(rbd_dev-rq_queue, requests); - spin_unlock_irq(rbd_dev-lock); - - list_for_each_entry_safe(rq, next, requests, queuelist) { - list_del_init(rq-queuelist); - rbd_handle_request(rbd_dev, rq); - } -} + struct rbd_device *rbd_dev = rq-q-queuedata; + struct work_struct *work = blk_mq_rq_to_pdu(rq); -/* - * Called with q-queue_lock held and interrupts disabled, possibly on - * the way to schedule(). Do not sleep here! - */ -static void rbd_request_fn(struct request_queue *q) -{ - struct rbd_device *rbd_dev = q-queuedata; - struct request *rq; - int queued = 0; - - rbd_assert(rbd_dev); - - while ((rq = blk_fetch_request(q))) { - /* Ignore any non-FS requests that filter through. */ - if (rq-cmd_type != REQ_TYPE_FS) { - dout(%s: non-fs request
Re: krbd blk-mq support ?
can you try the patch below instead of the previous three patches? Sure, I'll try tomorrow. - Mail original - De: Christoph Hellwig h...@infradead.org À: Alexandre DERUMIER aderum...@odiso.com Cc: Christoph Hellwig h...@infradead.org, Ceph Devel ceph-devel@vger.kernel.org Envoyé: Lundi 3 Novembre 2014 12:08:07 Objet: Re: krbd blk-mq support ? Hi Alexandre, can you try the patch below instead of the previous three patches? This one uses a per-request work struct to allow for more concurrency. diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 0a54c58..b981096 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -38,6 +38,7 @@ #include linux/kernel.h #include linux/device.h #include linux/module.h +#include linux/blk-mq.h #include linux/fs.h #include linux/blkdev.h #include linux/slab.h @@ -343,7 +344,6 @@ struct rbd_device { struct list_head rq_queue; /* incoming rq queue */ spinlock_t lock; /* queue, flags, open_count */ struct workqueue_struct *rq_wq; - struct work_struct rq_work; struct rbd_image_header header; unsigned long flags; /* possibly lock protected */ @@ -361,6 +361,9 @@ struct rbd_device { atomic_t parent_ref; struct rbd_device *parent; + /* Block layer tags. */ + struct blk_mq_tag_set tag_set; + /* protects updating the header */ struct rw_semaphore header_rwsem; @@ -1816,7 +1819,8 @@ static void rbd_osd_req_callback(struct ceph_osd_request *osd_req, /* * We support a 64-bit length, but ultimately it has to be - * passed to blk_end_request(), which takes an unsigned int. + * passed to the block layer, which just supports a 32-bit + * length field. 
*/ obj_request-xferred = osd_req-r_reply_op_len[0]; rbd_assert(obj_request-xferred (u64)UINT_MAX); @@ -2280,7 +2284,10 @@ static bool rbd_img_obj_end_request(struct rbd_obj_request *obj_request) more = obj_request-which img_request-obj_request_count - 1; } else { rbd_assert(img_request-rq != NULL); - more = blk_end_request(img_request-rq, result, xferred); + + more = blk_update_request(img_request-rq, result, xferred); + if (!more) + __blk_mq_end_request(img_request-rq, result); } return more; @@ -3305,8 +3312,10 @@ out: return ret; } -static void rbd_handle_request(struct rbd_device *rbd_dev, struct request *rq) +static void rbd_queue_workfn(struct work_struct *work) { + struct request *rq = blk_mq_rq_from_pdu(work); + struct rbd_device *rbd_dev = rq-q-queuedata; struct rbd_img_request *img_request; struct ceph_snap_context *snapc = NULL; u64 offset = (u64)blk_rq_pos(rq) SECTOR_SHIFT; @@ -3314,6 +3323,13 @@ static void rbd_handle_request(struct rbd_device *rbd_dev, struct request *rq) enum obj_operation_type op_type; u64 mapping_size; int result; + + if (rq-cmd_type != REQ_TYPE_FS) { + dout(%s: non-fs request type %d\n, __func__, + (int) rq-cmd_type); + result = -EIO; + goto err; + } if (rq-cmd_flags REQ_DISCARD) op_type = OBJ_OP_DISCARD; @@ -3353,6 +3369,8 @@ static void rbd_handle_request(struct rbd_device *rbd_dev, struct request *rq) goto err_rq; } + blk_mq_start_request(rq); + if (offset length U64_MAX - offset + 1) { rbd_warn(rbd_dev, bad request range (%llu~%llu), offset, length); @@ -3406,53 +3424,18 @@ err_rq: obj_op_name(op_type), length, offset, result); if (snapc) ceph_put_snap_context(snapc); - blk_end_request_all(rq, result); +err: + blk_mq_end_request(rq, result); } -static void rbd_request_workfn(struct work_struct *work) +static int rbd_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq, + bool last) { - struct rbd_device *rbd_dev = - container_of(work, struct rbd_device, rq_work); - struct request *rq, *next; - LIST_HEAD(requests); - - 
spin_lock_irq(rbd_dev-lock); /* rq-q-queue_lock */ - list_splice_init(rbd_dev-rq_queue, requests); - spin_unlock_irq(rbd_dev-lock); - - list_for_each_entry_safe(rq, next, requests, queuelist) { - list_del_init(rq-queuelist); - rbd_handle_request(rbd_dev, rq); - } -} + struct rbd_device *rbd_dev = rq-q-queuedata; + struct work_struct *work = blk_mq_rq_to_pdu(rq); -/* - * Called with q-queue_lock held and interrupts disabled, possibly on - * the way to schedule(). Do not sleep here! - */ -static void rbd_request_fn(struct request_queue *q) -{ - struct rbd_device *rbd_dev = q-queuedata; - struct request *rq; - int queued = 0; - - rbd_assert(rbd_dev); - - while ((rq = blk_fetch_request(q))) { - /* Ignore any non-FS requests that filter through. */ - if (rq-cmd_type != REQ_TYPE_FS) { - dout(%s: non-fs request type %d\n, __func__, - (int) rq-cmd_type); - __blk_end_request_all(rq, 0); - continue; - } - - list_add_tail(rq-queuelist, rbd_dev-rq_queue); - queued++; - } - - if (queued) - queue_work(rbd_dev-rq_wq, rbd_dev-rq_work); + queue_work(rbd_dev-rq_wq, work); + return 0; } /* @@ -3513,6 +3496,7 @@ static void rbd_free_disk(struct rbd_device *rbd_dev) del_gendisk(disk); if (disk-queue
Re: krbd blk-mq support ?
Hi again, I have good news, I finally solved my problem ! Simply with installing irqbalance #apt-get install irqbalance So maybe the problem was at the nic/network level. Now : 3.18 kernel + your patch : 12 iops 3.10 kernel : 8iops I'll try 3.18 kernel without your patch to compare. - Mail original - De: Alexandre DERUMIER aderum...@odiso.com À: Christoph Hellwig h...@infradead.org Cc: Ceph Devel ceph-devel@vger.kernel.org Envoyé: Mardi 4 Novembre 2014 07:57:19 Objet: Re: krbd blk-mq support ? Hi Christoph, I had tried your patch, but no improvement for my problem. I have always a kworker near 100% on 1core. I have finally be able to do perf on 3.18 kernel + your patch, I have attached the report in this mail. - Mail original - De: Christoph Hellwig h...@infradead.org À: Alexandre DERUMIER aderum...@odiso.com Cc: Christoph Hellwig h...@infradead.org, Ceph Devel ceph-devel@vger.kernel.org Envoyé: Lundi 3 Novembre 2014 12:08:07 Objet: Re: krbd blk-mq support ? Hi Alexandre, can you try the patch below instead of the previous three patches? This one uses a per-request work struct to allow for more concurrency. diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 0a54c58..b981096 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -38,6 +38,7 @@ #include linux/kernel.h #include linux/device.h #include linux/module.h +#include linux/blk-mq.h #include linux/fs.h #include linux/blkdev.h #include linux/slab.h @@ -343,7 +344,6 @@ struct rbd_device { struct list_head rq_queue; /* incoming rq queue */ spinlock_t lock; /* queue, flags, open_count */ struct workqueue_struct *rq_wq; - struct work_struct rq_work; struct rbd_image_header header; unsigned long flags; /* possibly lock protected */ @@ -361,6 +361,9 @@ struct rbd_device { atomic_t parent_ref; struct rbd_device *parent; + /* Block layer tags. 
*/ + struct blk_mq_tag_set tag_set; + /* protects updating the header */ struct rw_semaphore header_rwsem; @@ -1816,7 +1819,8 @@ static void rbd_osd_req_callback(struct ceph_osd_request *osd_req, /* * We support a 64-bit length, but ultimately it has to be - * passed to blk_end_request(), which takes an unsigned int. + * passed to the block layer, which just supports a 32-bit + * length field. */ obj_request-xferred = osd_req-r_reply_op_len[0]; rbd_assert(obj_request-xferred (u64)UINT_MAX); @@ -2280,7 +2284,10 @@ static bool rbd_img_obj_end_request(struct rbd_obj_request *obj_request) more = obj_request-which img_request-obj_request_count - 1; } else { rbd_assert(img_request-rq != NULL); - more = blk_end_request(img_request-rq, result, xferred); + + more = blk_update_request(img_request-rq, result, xferred); + if (!more) + __blk_mq_end_request(img_request-rq, result); } return more; @@ -3305,8 +3312,10 @@ out: return ret; } -static void rbd_handle_request(struct rbd_device *rbd_dev, struct request *rq) +static void rbd_queue_workfn(struct work_struct *work) { + struct request *rq = blk_mq_rq_from_pdu(work); + struct rbd_device *rbd_dev = rq-q-queuedata; struct rbd_img_request *img_request; struct ceph_snap_context *snapc = NULL; u64 offset = (u64)blk_rq_pos(rq) SECTOR_SHIFT; @@ -3314,6 +3323,13 @@ static void rbd_handle_request(struct rbd_device *rbd_dev, struct request *rq) enum obj_operation_type op_type; u64 mapping_size; int result; + + if (rq-cmd_type != REQ_TYPE_FS) { + dout(%s: non-fs request type %d\n, __func__, + (int) rq-cmd_type); + result = -EIO; + goto err; + } if (rq-cmd_flags REQ_DISCARD) op_type = OBJ_OP_DISCARD; @@ -3353,6 +3369,8 @@ static void rbd_handle_request(struct rbd_device *rbd_dev, struct request *rq) goto err_rq; } + blk_mq_start_request(rq); + if (offset length U64_MAX - offset + 1) { rbd_warn(rbd_dev, bad request range (%llu~%llu), offset, length); @@ -3406,53 +3424,18 @@ err_rq: obj_op_name(op_type), length, offset, result); if 
(snapc) ceph_put_snap_context(snapc); - blk_end_request_all(rq, result); +err: + blk_mq_end_request(rq, result); } -static void rbd_request_workfn(struct work_struct *work) +static int rbd_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq, + bool last) { - struct rbd_device *rbd_dev = - container_of(work, struct rbd_device, rq_work); - struct request *rq, *next; - LIST_HEAD(requests); - - spin_lock_irq(rbd_dev-lock); /* rq-q-queue_lock */ - list_splice_init(rbd_dev-rq_queue, requests); - spin_unlock_irq(rbd_dev-lock); - - list_for_each_entry_safe(rq, next, requests, queuelist) { - list_del_init(rq-queuelist); - rbd_handle_request(rbd_dev, rq); - } -} + struct rbd_device *rbd_dev = rq-q-queuedata; + struct work_struct *work = blk_mq_rq_to_pdu(rq); -/* - * Called with q-queue_lock held and interrupts disabled, possibly on - * the way to schedule(). Do not sleep here! - */ -static void rbd_request_fn(struct request_queue *q
Re: krbd blk-mq support ?
filestore_xattr_use_omap ? Oh, I think it's a mistake (Just copy paste some configs from sommath). But I'm using xfs, so I don't think I need it - Mail original - De: Stefan Priebe - Profihost AG s.pri...@profihost.ag À: Alexandre DERUMIER aderum...@odiso.com Cc: Haomai Wang haomaiw...@gmail.com, Sage Weil s...@newdream.net, Christoph Hellwig h...@infradead.org, Ceph Devel ceph-devel@vger.kernel.org Envoyé: Vendredi 31 Octobre 2014 06:39:22 Objet: Re: krbd blk-mq support ? Hi, why do you use filestore_xattr_use_omap ? Stefan Excuse my typo s ent from my mobile phone. Am 31.10.2014 um 06:04 schrieb Alexandre DERUMIER aderum...@odiso.com : filestore_xattr_use_omap -- To unsubscribe from this list: send the line unsubscribe ceph-devel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: krbd blk-mq support ?
Hmm, this is probably the messenger.c worker then that is feeding messages to the network. How many OSDs do you have? It should be able to scale with the number of OSDs. Thanks Sage for your reply. Currently 6 OSD (ssd) on the test platform. But I can reach 2x 5iops on same rbd volume with 2 clients on 2 differents host. Do you think messenger.c worker can be the bottleneck in this case ? I'll try to add more OSD next week, if it's scale it's a very good news ! - Mail original - De: Sage Weil s...@newdream.net À: Alexandre DERUMIER aderum...@odiso.com Cc: Christoph Hellwig h...@infradead.org, Ceph Devel ceph-devel@vger.kernel.org Envoyé: Mercredi 29 Octobre 2014 16:00:56 Objet: Re: krbd blk-mq support ? On Wed, 29 Oct 2014, Alexandre DERUMIER wrote: Oh, that's without the blk-mq patch? Yes, sorry, I don't how to use perf with a custom compiled kernel. (Usualy I'm using perf from debian, with linux-tools package provided with the debian kernel package) Either way the profile doesn't really sum up to a fully used up cpu. But I see mostly same behaviour with or without blk-mq patch, I have always 1 kworker at around 97-100%cpu (1core) for 5iops. I had also tried to map the rbd volume with nocrc, it's going to 6iops with same kworker at around 97-100%cpu Hmm, this is probably the messenger.c worker then that is feeding messages to the network. How many OSDs do you have? It should be able to scale with the number of OSDs. sage - Mail original - De: Christoph Hellwig h...@infradead.org ?: Alexandre DERUMIER aderum...@odiso.com Cc: Ceph Devel ceph-devel@vger.kernel.org Envoy?: Mardi 28 Octobre 2014 19:07:25 Objet: Re: krbd blk-mq support ? On Mon, Oct 27, 2014 at 11:00:46AM +0100, Alexandre DERUMIER wrote: Can you do a perf report -ag and then a perf report to see where these cycles are spent? Yes, sure. I have attached the perf report to this mail. (This is with kernel 3.14, don't have access to my 3.18 host for now) Oh, that's without the blk-mq patch? 
Either way the profile doesn't really sum up to a fully used up cpu. Sage, Alex - are there any ordring constraints in the rbd client? If not we could probably aim for per-cpu queues using blk-mq and a socket per cpu or similar. -- To unsubscribe from this list: send the line unsubscribe ceph-devel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line unsubscribe ceph-devel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: krbd blk-mq support ?
I'll try to add more OSD next week, if it's scale it's a very good news ! I just tried to add 2 more osds, I can now reach 2x 7 iops on 2 client nodes (vs 2 x 5 previously). and kworker cpu usage is also lower (84% vs 97%). (don't understand why exactly) So, Thanks for help everybody ! - Mail original - De: Alexandre DERUMIER aderum...@odiso.com À: Sage Weil s...@newdream.net Cc: Christoph Hellwig h...@infradead.org, Ceph Devel ceph-devel@vger.kernel.org Envoyé: Jeudi 30 Octobre 2014 09:11:11 Objet: Re: krbd blk-mq support ? Hmm, this is probably the messenger.c worker then that is feeding messages to the network. How many OSDs do you have? It should be able to scale with the number of OSDs. Thanks Sage for your reply. Currently 6 OSD (ssd) on the test platform. But I can reach 2x 5iops on same rbd volume with 2 clients on 2 differents host. Do you think messenger.c worker can be the bottleneck in this case ? I'll try to add more OSD next week, if it's scale it's a very good news ! - Mail original - De: Sage Weil s...@newdream.net À: Alexandre DERUMIER aderum...@odiso.com Cc: Christoph Hellwig h...@infradead.org, Ceph Devel ceph-devel@vger.kernel.org Envoyé: Mercredi 29 Octobre 2014 16:00:56 Objet: Re: krbd blk-mq support ? On Wed, 29 Oct 2014, Alexandre DERUMIER wrote: Oh, that's without the blk-mq patch? Yes, sorry, I don't how to use perf with a custom compiled kernel. (Usualy I'm using perf from debian, with linux-tools package provided with the debian kernel package) Either way the profile doesn't really sum up to a fully used up cpu. But I see mostly same behaviour with or without blk-mq patch, I have always 1 kworker at around 97-100%cpu (1core) for 5iops. I had also tried to map the rbd volume with nocrc, it's going to 6iops with same kworker at around 97-100%cpu Hmm, this is probably the messenger.c worker then that is feeding messages to the network. How many OSDs do you have? It should be able to scale with the number of OSDs. 
sage - Mail original - De: Christoph Hellwig h...@infradead.org ?: Alexandre DERUMIER aderum...@odiso.com Cc: Ceph Devel ceph-devel@vger.kernel.org Envoy?: Mardi 28 Octobre 2014 19:07:25 Objet: Re: krbd blk-mq support ? On Mon, Oct 27, 2014 at 11:00:46AM +0100, Alexandre DERUMIER wrote: Can you do a perf report -ag and then a perf report to see where these cycles are spent? Yes, sure. I have attached the perf report to this mail. (This is with kernel 3.14, don't have access to my 3.18 host for now) Oh, that's without the blk-mq patch? Either way the profile doesn't really sum up to a fully used up cpu. Sage, Alex - are there any ordring constraints in the rbd client? If not we could probably aim for per-cpu queues using blk-mq and a socket per cpu or similar. -- To unsubscribe from this list: send the line unsubscribe ceph-devel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line unsubscribe ceph-devel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line unsubscribe ceph-devel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: krbd blk-mq support ?
Could you describe more about 2x7 iops? So you mean 8 OSD each backend with SSD can achieve with 14w iops? is it read or write? could you give fio options? On Fri, Oct 31, 2014 at 12:01 AM, Alexandre DERUMIER aderum...@odiso.com wrote: I'll try to add more OSD next week, if it's scale it's a very good news ! I just tried to add 2 more osds, I can now reach 2x 7 iops on 2 client nodes (vs 2 x 5 previously). and kworker cpu usage is also lower (84% vs 97%). (don't understand why exactly) So, Thanks for help everybody ! - Mail original - De: Alexandre DERUMIER aderum...@odiso.com À: Sage Weil s...@newdream.net Cc: Christoph Hellwig h...@infradead.org, Ceph Devel ceph-devel@vger.kernel.org Envoyé: Jeudi 30 Octobre 2014 09:11:11 Objet: Re: krbd blk-mq support ? Hmm, this is probably the messenger.c worker then that is feeding messages to the network. How many OSDs do you have? It should be able to scale with the number of OSDs. Thanks Sage for your reply. Currently 6 OSD (ssd) on the test platform. But I can reach 2x 5iops on same rbd volume with 2 clients on 2 differents host. Do you think messenger.c worker can be the bottleneck in this case ? I'll try to add more OSD next week, if it's scale it's a very good news ! - Mail original - De: Sage Weil s...@newdream.net À: Alexandre DERUMIER aderum...@odiso.com Cc: Christoph Hellwig h...@infradead.org, Ceph Devel ceph-devel@vger.kernel.org Envoyé: Mercredi 29 Octobre 2014 16:00:56 Objet: Re: krbd blk-mq support ? On Wed, 29 Oct 2014, Alexandre DERUMIER wrote: Oh, that's without the blk-mq patch? Yes, sorry, I don't how to use perf with a custom compiled kernel. (Usualy I'm using perf from debian, with linux-tools package provided with the debian kernel package) Either way the profile doesn't really sum up to a fully used up cpu. But I see mostly same behaviour with or without blk-mq patch, I have always 1 kworker at around 97-100%cpu (1core) for 5iops. 
I had also tried to map the rbd volume with nocrc, it's going to 6iops with same kworker at around 97-100%cpu Hmm, this is probably the messenger.c worker then that is feeding messages to the network. How many OSDs do you have? It should be able to scale with the number of OSDs. sage - Mail original - De: Christoph Hellwig h...@infradead.org ?: Alexandre DERUMIER aderum...@odiso.com Cc: Ceph Devel ceph-devel@vger.kernel.org Envoy?: Mardi 28 Octobre 2014 19:07:25 Objet: Re: krbd blk-mq support ? On Mon, Oct 27, 2014 at 11:00:46AM +0100, Alexandre DERUMIER wrote: Can you do a perf report -ag and then a perf report to see where these cycles are spent? Yes, sure. I have attached the perf report to this mail. (This is with kernel 3.14, don't have access to my 3.18 host for now) Oh, that's without the blk-mq patch? Either way the profile doesn't really sum up to a fully used up cpu. Sage, Alex - are there any ordring constraints in the rbd client? If not we could probably aim for per-cpu queues using blk-mq and a socket per cpu or similar. -- To unsubscribe from this list: send the line unsubscribe ceph-devel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line unsubscribe ceph-devel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line unsubscribe ceph-devel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- Best Regards, Wheat -- To unsubscribe from this list: send the line unsubscribe ceph-devel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: krbd blk-mq support ?
Could you describe more about 2x7 iops? So you mean 8 OSD each backend with SSD can achieve with 14w iops? It's a small rbd (10G), so mostly read hit the buffer cache. But yes, it's able to deliver 14iops with 8 osd. (I check also stats in ceph cluster to be sure). (and I'm not cpu bound on osd nodes) 2014-10-31 05:58:34.231037 mon.0 [INF] pgmap v7109: 1264 pgs: 1264 active+clean; 165 GB data, 109 GB used, 6226 GB / 6335 GB avail; 560 MB/s rd, 140 kop/s here the ceph.conf of osd nodes [global] fsid = c29f4643-9577-4671-ae25-59ad14550aba auth_cluster_required = none auth_service_required = none auth_client_required = none filestore_xattr_use_omap = true debug lockdep = 0/0 debug context = 0/0 debug crush = 0/0 debug buffer = 0/0 debug timer = 0/0 debug journaler = 0/0 debug osd = 0/0 debug optracker = 0/0 debug objclass = 0/0 debug filestore = 0/0 debug journal = 0/0 debug ms = 0/0 debug monc = 0/0 debug tp = 0/0 debug auth = 0/0 debug finisher = 0/0 debug heartbeatmap = 0/0 debug perfcounter = 0/0 debug asok = 0/0 debug throttle = 0/0 osd_op_threads = 5 filestore_op_threads = 4 osd_op_num_threads_per_shard = 1 osd_op_num_shards = 25 filestore_fd_cache_size = 64 filestore_fd_cache_shards = 32 osd_enable_op_tracker = false is it read or write? could you give fio options? random read 4K Here the fio config. [global] ioengine=aio invalidate=1 rw=randread bs=4K direct=1 numjobs=1 group_reporting=1 size=10G [test1] iodepth=64 filename=/dev/rbd/test/test On 1 client node, I can't reach more than 5iops with 6osd or 7iops with 8 osd. (I had try to increasing numjobs to have more fio process or with 2 differents rbd volume at the same time, but performance is the same). 2014-10-31 05:57:30.078348 mon.0 [INF] pgmap v7070: 1264 pgs: 1264 active+clean; 165 GB data, 109 GB used, 6226 GB / 6335 GB avail; 290 MB/s rd, 74572 op/s But If I launch same fio test on another client node, I can reach same 7iops at the same time. 
2014-10-31 05:58:34.231037 mon.0 [INF] pgmap v7109: 1264 pgs: 1264 active+clean; 165 GB data, 109 GB used, 6226 GB / 6335 GB avail; 560 MB/s rd, 140 kop/s - Mail original - De: Haomai Wang haomaiw...@gmail.com À: Alexandre DERUMIER aderum...@odiso.com Cc: Sage Weil s...@newdream.net, Christoph Hellwig h...@infradead.org, Ceph Devel ceph-devel@vger.kernel.org Envoyé: Jeudi 30 Octobre 2014 18:05:26 Objet: Re: krbd blk-mq support ? Could you describe more about 2x7 iops? So you mean 8 OSD each backend with SSD can achieve with 14w iops? is it read or write? could you give fio options? On Fri, Oct 31, 2014 at 12:01 AM, Alexandre DERUMIER aderum...@odiso.com wrote: I'll try to add more OSD next week, if it's scale it's a very good news ! I just tried to add 2 more osds, I can now reach 2x 7 iops on 2 client nodes (vs 2 x 5 previously). and kworker cpu usage is also lower (84% vs 97%). (don't understand why exactly) So, Thanks for help everybody ! - Mail original - De: Alexandre DERUMIER aderum...@odiso.com À: Sage Weil s...@newdream.net Cc: Christoph Hellwig h...@infradead.org, Ceph Devel ceph-devel@vger.kernel.org Envoyé: Jeudi 30 Octobre 2014 09:11:11 Objet: Re: krbd blk-mq support ? Hmm, this is probably the messenger.c worker then that is feeding messages to the network. How many OSDs do you have? It should be able to scale with the number of OSDs. Thanks Sage for your reply. Currently 6 OSD (ssd) on the test platform. But I can reach 2x 5iops on same rbd volume with 2 clients on 2 differents host. Do you think messenger.c worker can be the bottleneck in this case ? I'll try to add more OSD next week, if it's scale it's a very good news ! - Mail original - De: Sage Weil s...@newdream.net À: Alexandre DERUMIER aderum...@odiso.com Cc: Christoph Hellwig h...@infradead.org, Ceph Devel ceph-devel@vger.kernel.org Envoyé: Mercredi 29 Octobre 2014 16:00:56 Objet: Re: krbd blk-mq support ? On Wed, 29 Oct 2014, Alexandre DERUMIER wrote: Oh, that's without the blk-mq patch? 
Yes, sorry, I don't how to use perf with a custom compiled kernel. (Usualy I'm using perf from debian, with linux-tools package provided with the debian kernel package) Either way the profile doesn't really sum up to a fully used up cpu. But I see mostly same behaviour with or without blk-mq patch, I have always 1 kworker at around 97-100%cpu (1core) for 5iops. I had also tried to map the rbd volume with nocrc, it's going to 6iops with same kworker at around 97-100%cpu Hmm, this is probably the messenger.c worker then that is feeding messages to the network. How many
Re: krbd blk-mq support ?
Oh, that's without the blk-mq patch? Yes, sorry, I don't how to use perf with a custom compiled kernel. (Usualy I'm using perf from debian, with linux-tools package provided with the debian kernel package) Either way the profile doesn't really sum up to a fully used up cpu. But I see mostly same behaviour with or without blk-mq patch, I have always 1 kworker at around 97-100%cpu (1core) for 5iops. I had also tried to map the rbd volume with nocrc, it's going to 6iops with same kworker at around 97-100%cpu - Mail original - De: Christoph Hellwig h...@infradead.org À: Alexandre DERUMIER aderum...@odiso.com Cc: Ceph Devel ceph-devel@vger.kernel.org Envoyé: Mardi 28 Octobre 2014 19:07:25 Objet: Re: krbd blk-mq support ? On Mon, Oct 27, 2014 at 11:00:46AM +0100, Alexandre DERUMIER wrote: Can you do a perf report -ag and then a perf report to see where these cycles are spent? Yes, sure. I have attached the perf report to this mail. (This is with kernel 3.14, don't have access to my 3.18 host for now) Oh, that's without the blk-mq patch? Either way the profile doesn't really sum up to a fully used up cpu. Sage, Alex - are there any ordring constraints in the rbd client? If not we could probably aim for per-cpu queues using blk-mq and a socket per cpu or similar. -- To unsubscribe from this list: send the line unsubscribe ceph-devel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: krbd blk-mq support ?
On Mon, Oct 27, 2014 at 11:00:46AM +0100, Alexandre DERUMIER wrote: Can you do a perf report -ag and then a perf report to see where these cycles are spent? Yes, sure. I have attached the perf report to this mail. (This is with kernel 3.14, don't have access to my 3.18 host for now) Oh, that's without the blk-mq patch? Either way the profile doesn't really sum up to a fully used up cpu. Sage, Alex - are there any ordring constraints in the rbd client? If not we could probably aim for per-cpu queues using blk-mq and a socket per cpu or similar. -- To unsubscribe from this list: send the line unsubscribe ceph-devel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: krbd blk-mq support ?
On 10/28/2014 01:07 PM, Christoph Hellwig wrote: On Mon, Oct 27, 2014 at 11:00:46AM +0100, Alexandre DERUMIER wrote: Can you do a perf report -ag and then a perf report to see where these cycles are spent? Yes, sure. I have attached the perf report to this mail. (This is with kernel 3.14, don't have access to my 3.18 host for now) Oh, that's without the blk-mq patch? Either way the profile doesn't really sum up to a fully used up cpu. Sage, Alex - are there any ordring constraints in the rbd client? I don't remember off hand. In libceph I recall going to great lengths to retain the original order of requests when they got re-sent after a connection reset. I'll go look at the code a bit and see if I can refresh my memory (though Sage may answer before I do). -Alex If not we could probably aim for per-cpu queues using blk-mq and a socket per cpu or similar. -- To unsubscribe from this list: send the line unsubscribe ceph-devel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line unsubscribe ceph-devel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: krbd blk-mq support ?
On 10/28/2014 01:07 PM, Christoph Hellwig wrote: On Mon, Oct 27, 2014 at 11:00:46AM +0100, Alexandre DERUMIER wrote: Can you do a perf report -ag and then a perf report to see where these cycles are spent? Yes, sure. I have attached the perf report to this mail. (This is with kernel 3.14, don't have access to my 3.18 host for now) Oh, that's without the blk-mq patch? Either way the profile doesn't really sum up to a fully used up cpu. Sage, Alex - are there any ordring constraints in the rbd client? If not we could probably aim for per-cpu queues using blk-mq and a socket per cpu or similar. First, a disclaimer--I haven't really been following this discussion very closely. For an rbd image request (which is what gets created from requests from the block queue), the order of completion doesn't matter, and although the object requests are submitted in order that shouldn't be required either. The image request is broken into one or more object requests (usually just one) and they are treated as a unit. When the last object request of a set for an image request has completed, the image request is treated as completed. I hope that helps. If not, ask again a different way... -Alex -- To unsubscribe from this list: send the line unsubscribe ceph-devel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line unsubscribe ceph-devel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: krbd blk-mq support ?
Have you tried mapping different images on the same m/c with 'noshare' map option ? Oh, I didn't known about his option. I found 1 reference here: http://lists.ceph.com/pipermail/ceph-users-ceph.com/2013-September/034213.html With noshare each mapped image will appear as a separate client instance, which means it will have it's own session with teh monitors and own TCP connections to the OSDs. It may be a viable workaround for now but in general I would not recommend it. So it should help with multiple rbd. Do you known why Sage don't recommend it in this mail ? - Mail original - De: Somnath Roy somnath@sandisk.com À: Alexandre DERUMIER aderum...@odiso.com, Christoph Hellwig h...@infradead.org Cc: Ceph Devel ceph-devel@vger.kernel.org Envoyé: Dimanche 26 Octobre 2014 20:08:42 Objet: RE: krbd blk-mq support ? Alexandre, Have you tried mapping different images on the same m/c with 'noshare' map option ? If not, it will not scale with increasing number of images (and thus mapped rbds) on a single m/c as they will share the same connection to cluster. Thanks Regards Somnath -Original Message- From: ceph-devel-ow...@vger.kernel.org [mailto:ceph-devel-ow...@vger.kernel.org] On Behalf Of Alexandre DERUMIER Sent: Sunday, October 26, 2014 6:46 AM To: Christoph Hellwig Cc: Ceph Devel Subject: Re: krbd blk-mq support ? Hi, some news: I have applied patches succefully on top of 3.18-rc1 kernel. But don't seem to help is my case. (I think that blk-mq is working because I don't see any io schedulers on rbd devices, as blk-mq don't support them actually). My main problem is that I can't reach more than around 5iops on 1 machine, and the problem seem to be the kworker process stuck at 100% of 1core. I had tried multiple fio process, on differents rbd devices at the same time, and I'm always limited à 5iops. 
I'm sure that the ceph cluster is not the bottleneck, because if I launch another fio on another node at the same time, I can reach 5iops on each node, and both are limited by the kworker process. That's why I thinked that blk-mq could help, but it don't seem to be the case. Is this kworker cpu limitation a known bug ? Regards, Alexandre - Mail original - De: Alexandre DERUMIER aderum...@odiso.com À: Christoph Hellwig h...@infradead.org Cc: Ceph Devel ceph-devel@vger.kernel.org Envoyé: Vendredi 24 Octobre 2014 14:27:47 Objet: Re: krbd blk-mq support ? If you're willing to experiment give the patches below a try, not that I don't have a ceph test cluster available, so the conversion is untestested. Ok, Thanks ! I'll try them and see If I can improve qemu performance on a single drive with multiqueues. - Mail original - De: Christoph Hellwig h...@infradead.org À: Alexandre DERUMIER aderum...@odiso.com Cc: Ceph Devel ceph-devel@vger.kernel.org Envoyé: Vendredi 24 Octobre 2014 12:55:01 Objet: Re: krbd blk-mq support ? If you're willing to experiment give the patches below a try, not that I don't have a ceph test cluster available, so the conversion is untestested. -- To unsubscribe from this list: send the line unsubscribe ceph-devel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line unsubscribe ceph-devel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html PLEASE NOTE: The information contained in this electronic mail message is intended only for the use of the designated recipient(s) named above. If the reader of this message is not the intended recipient, you are hereby notified that you have received this message in error and that any review, dissemination, distribution, or copying of this message is strictly prohibited. 
If you have received this communication in error, please notify the sender by telephone or e-mail (as shown above) immediately and destroy any and all copies of this message in your possession (whether hard copies or electronically stored copies). -- To unsubscribe from this list: send the line unsubscribe ceph-devel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: krbd blk-mq support ?
On Sun, Oct 26, 2014 at 02:46:03PM +0100, Alexandre DERUMIER wrote: Hi, some news: I have applied patches successfully on top of 3.18-rc1 kernel. But don't seem to help is my case. (I think that blk-mq is working because I don't see any io schedulers on rbd devices, as blk-mq don't support them actually). My main problem is that I can't reach more than around 5iops on 1 machine, and the problem seem to be the kworker process stuck at 100% of 1core. Can you do a perf record -ag and then a perf report to see where these cycles are spent? -- To unsubscribe from this list: send the line unsubscribe ceph-devel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: krbd blk-mq support ?
Hi Somnath, I have just tried with 2 rbd volumes with (rbd map -o noshare rbdvolume -p pool) (kernel 3.14), then a fio benchmark on both volumes at the same time but I don't seem to help. I have always the kworker process at 100%, and iops are 25000iops on each rbd volume. - Mail original - De: Somnath Roy somnath@sandisk.com À: Alexandre DERUMIER aderum...@odiso.com, Christoph Hellwig h...@infradead.org Cc: Ceph Devel ceph-devel@vger.kernel.org Envoyé: Dimanche 26 Octobre 2014 20:08:42 Objet: RE: krbd blk-mq support ? Alexandre, Have you tried mapping different images on the same m/c with 'noshare' map option ? If not, it will not scale with increasing number of images (and thus mapped rbds) on a single m/c as they will share the same connection to cluster. Thanks Regards Somnath -Original Message- From: ceph-devel-ow...@vger.kernel.org [mailto:ceph-devel-ow...@vger.kernel.org] On Behalf Of Alexandre DERUMIER Sent: Sunday, October 26, 2014 6:46 AM To: Christoph Hellwig Cc: Ceph Devel Subject: Re: krbd blk-mq support ? Hi, some news: I have applied patches succefully on top of 3.18-rc1 kernel. But don't seem to help is my case. (I think that blk-mq is working because I don't see any io schedulers on rbd devices, as blk-mq don't support them actually). My main problem is that I can't reach more than around 5iops on 1 machine, and the problem seem to be the kworker process stuck at 100% of 1core. I had tried multiple fio process, on differents rbd devices at the same time, and I'm always limited à 5iops. I'm sure that the ceph cluster is not the bottleneck, because if I launch another fio on another node at the same time, I can reach 5iops on each node, and both are limited by the kworker process. That's why I thinked that blk-mq could help, but it don't seem to be the case. Is this kworker cpu limitation a known bug ? 
Regards, Alexandre - Mail original - De: Alexandre DERUMIER aderum...@odiso.com À: Christoph Hellwig h...@infradead.org Cc: Ceph Devel ceph-devel@vger.kernel.org Envoyé: Vendredi 24 Octobre 2014 14:27:47 Objet: Re: krbd blk-mq support ? If you're willing to experiment give the patches below a try, not that I don't have a ceph test cluster available, so the conversion is untestested. Ok, Thanks ! I'll try them and see If I can improve qemu performance on a single drive with multiqueues. - Mail original - De: Christoph Hellwig h...@infradead.org À: Alexandre DERUMIER aderum...@odiso.com Cc: Ceph Devel ceph-devel@vger.kernel.org Envoyé: Vendredi 24 Octobre 2014 12:55:01 Objet: Re: krbd blk-mq support ? If you're willing to experiment give the patches below a try, not that I don't have a ceph test cluster available, so the conversion is untestested. -- To unsubscribe from this list: send the line unsubscribe ceph-devel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line unsubscribe ceph-devel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html PLEASE NOTE: The information contained in this electronic mail message is intended only for the use of the designated recipient(s) named above. If the reader of this message is not the intended recipient, you are hereby notified that you have received this message in error and that any review, dissemination, distribution, or copying of this message is strictly prohibited. If you have received this communication in error, please notify the sender by telephone or e-mail (as shown above) immediately and destroy any and all copies of this message in your possession (whether hard copies or electronically stored copies). 
-- To unsubscribe from this list: send the line unsubscribe ceph-devel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: krbd blk-mq support ?
Hi, some news: I have applied patches succefully on top of 3.18-rc1 kernel. But don't seem to help is my case. (I think that blk-mq is working because I don't see any io schedulers on rbd devices, as blk-mq don't support them actually). My main problem is that I can't reach more than around 5iops on 1 machine, and the problem seem to be the kworker process stuck at 100% of 1core. I had tried multiple fio process, on differents rbd devices at the same time, and I'm always limited à 5iops. I'm sure that the ceph cluster is not the bottleneck, because if I launch another fio on another node at the same time, I can reach 5iops on each node, and both are limited by the kworker process. That's why I thinked that blk-mq could help, but it don't seem to be the case. Is this kworker cpu limitation a known bug ? Regards, Alexandre - Mail original - De: Alexandre DERUMIER aderum...@odiso.com À: Christoph Hellwig h...@infradead.org Cc: Ceph Devel ceph-devel@vger.kernel.org Envoyé: Vendredi 24 Octobre 2014 14:27:47 Objet: Re: krbd blk-mq support ? If you're willing to experiment give the patches below a try, not that I don't have a ceph test cluster available, so the conversion is untestested. Ok, Thanks ! I'll try them and see If I can improve qemu performance on a single drive with multiqueues. - Mail original - De: Christoph Hellwig h...@infradead.org À: Alexandre DERUMIER aderum...@odiso.com Cc: Ceph Devel ceph-devel@vger.kernel.org Envoyé: Vendredi 24 Octobre 2014 12:55:01 Objet: Re: krbd blk-mq support ? If you're willing to experiment give the patches below a try, not that I don't have a ceph test cluster available, so the conversion is untestested. 
-- To unsubscribe from this list: send the line unsubscribe ceph-devel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line unsubscribe ceph-devel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
RE: krbd blk-mq support ?
Alexandre, Have you tried mapping different images on the same m/c with 'noshare' map option ? If not, it will not scale with increasing number of images (and thus mapped rbds) on a single m/c as they will share the same connection to cluster. Thanks Regards Somnath -Original Message- From: ceph-devel-ow...@vger.kernel.org [mailto:ceph-devel-ow...@vger.kernel.org] On Behalf Of Alexandre DERUMIER Sent: Sunday, October 26, 2014 6:46 AM To: Christoph Hellwig Cc: Ceph Devel Subject: Re: krbd blk-mq support ? Hi, some news: I have applied patches succefully on top of 3.18-rc1 kernel. But don't seem to help is my case. (I think that blk-mq is working because I don't see any io schedulers on rbd devices, as blk-mq don't support them actually). My main problem is that I can't reach more than around 5iops on 1 machine, and the problem seem to be the kworker process stuck at 100% of 1core. I had tried multiple fio process, on differents rbd devices at the same time, and I'm always limited à 5iops. I'm sure that the ceph cluster is not the bottleneck, because if I launch another fio on another node at the same time, I can reach 5iops on each node, and both are limited by the kworker process. That's why I thinked that blk-mq could help, but it don't seem to be the case. Is this kworker cpu limitation a known bug ? Regards, Alexandre - Mail original - De: Alexandre DERUMIER aderum...@odiso.com À: Christoph Hellwig h...@infradead.org Cc: Ceph Devel ceph-devel@vger.kernel.org Envoyé: Vendredi 24 Octobre 2014 14:27:47 Objet: Re: krbd blk-mq support ? If you're willing to experiment give the patches below a try, not that I don't have a ceph test cluster available, so the conversion is untestested. Ok, Thanks ! I'll try them and see If I can improve qemu performance on a single drive with multiqueues. 
- Mail original - De: Christoph Hellwig h...@infradead.org À: Alexandre DERUMIER aderum...@odiso.com Cc: Ceph Devel ceph-devel@vger.kernel.org Envoyé: Vendredi 24 Octobre 2014 12:55:01 Objet: Re: krbd blk-mq support ? If you're willing to experiment give the patches below a try, not that I don't have a ceph test cluster available, so the conversion is untestested. -- To unsubscribe from this list: send the line unsubscribe ceph-devel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line unsubscribe ceph-devel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html PLEASE NOTE: The information contained in this electronic mail message is intended only for the use of the designated recipient(s) named above. If the reader of this message is not the intended recipient, you are hereby notified that you have received this message in error and that any review, dissemination, distribution, or copying of this message is strictly prohibited. If you have received this communication in error, please notify the sender by telephone or e-mail (as shown above) immediately and destroy any and all copies of this message in your possession (whether hard copies or electronically stored copies).
krbd blk-mq support ?
Hi, I would like to know if it's planned to add blk-mq (block multiqueue from kernel 3.17) support to krbd ? I think it could help single threaded workload (including qemu) to reach more iops. I find some small discussion about it here: http://permalink.gmane.org/gmane.comp.file-systems.ceph.devel/20584 But no news since then. Regards, Alexandre -- To unsubscribe from this list: send the line unsubscribe ceph-devel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: krbd blk-mq support ?
On Fri, Oct 24, 2014 at 11:54 AM, Alexandre DERUMIER aderum...@odiso.com wrote: Hi, I would like to known if it's planned to add blk-mq (block multiqueue from kernel 3.17) support to krbd ? I think it could help single threaded workload (including qemu) to reach more iops. I find some small discussion about it here: http://permalink.gmane.org/gmane.comp.file-systems.ceph.devel/20584 But no news since then. There are no concrete plans as of now. For 3.19 and 3.20 the main goal is to get fancy striping (support for custom striping modes) in and then get rid of kernel "layering is EXPERIMENTAL!" warning. krbd is a network block device, so I don't think we will gain anything significant in the performance department. blk-mq was mentioned because it lifts some of the implementation restrictions the current infrastructure imposes on drivers. Thanks, Ilya -- To unsubscribe from this list: send the line unsubscribe ceph-devel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: krbd blk-mq support ?
If you're willing to experiment give the patches below a try, not that I don't have a ceph test cluster available, so the conversion is untestested. From 00668f00afc6f0cfbce05d1186116469c1f3f9b3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig h...@lst.de Date: Fri, 24 Oct 2014 11:53:36 +0200 Subject: blk-mq: handle single queue case in blk_mq_hctx_next_cpu Don't duplicate the code to handle the not cpu bounce case in the caller, do it inside blk_mq_hctx_next_cpu instead. Signed-off-by: Christoph Hellwig h...@lst.de --- block/blk-mq.c | 34 +- 1 file changed, 13 insertions(+), 21 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 68929ba..eaaedea 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -760,10 +760,11 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) */ static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx) { - int cpu = hctx-next_cpu; + if (hctx-queue-nr_hw_queues == 1) + return WORK_CPU_UNBOUND; if (--hctx-next_cpu_batch = 0) { - int next_cpu; + int cpu = hctx-next_cpu, next_cpu; next_cpu = cpumask_next(hctx-next_cpu, hctx-cpumask); if (next_cpu = nr_cpu_ids) @@ -771,9 +772,11 @@ static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx) hctx-next_cpu = next_cpu; hctx-next_cpu_batch = BLK_MQ_CPU_WORK_BATCH; + + return cpu; } - return cpu; + return hctx-next_cpu; } void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) @@ -781,16 +784,13 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) if (unlikely(test_bit(BLK_MQ_S_STOPPED, hctx-state))) return; - if (!async cpumask_test_cpu(smp_processor_id(), hctx-cpumask)) + if (!async cpumask_test_cpu(smp_processor_id(), hctx-cpumask)) { __blk_mq_run_hw_queue(hctx); - else if (hctx-queue-nr_hw_queues == 1) - kblockd_schedule_delayed_work(hctx-run_work, 0); - else { - unsigned int cpu; - - cpu = blk_mq_hctx_next_cpu(hctx); - kblockd_schedule_delayed_work_on(cpu, hctx-run_work, 0); + return; } + + 
kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx), + hctx-run_work, 0); } void blk_mq_run_queues(struct request_queue *q, bool async) @@ -888,16 +888,8 @@ static void blk_mq_delay_work_fn(struct work_struct *work) void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs) { - unsigned long tmo = msecs_to_jiffies(msecs); - - if (hctx-queue-nr_hw_queues == 1) - kblockd_schedule_delayed_work(hctx-delay_work, tmo); - else { - unsigned int cpu; - - cpu = blk_mq_hctx_next_cpu(hctx); - kblockd_schedule_delayed_work_on(cpu, hctx-delay_work, tmo); - } + kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx), + hctx-delay_work, msecs_to_jiffies(msecs)); } EXPORT_SYMBOL(blk_mq_delay_queue); -- 1.9.1 From 6002e20c4d2b150fcbe82a7bc45c90d30cb61b78 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig h...@lst.de Date: Fri, 24 Oct 2014 12:04:07 +0200 Subject: blk-mq: allow direct dispatch to a driver specific workqueue We have various block drivers that need to execute long term blocking operations during I/O submission like file system or network I/O. Currently these drivers just queue up work to an internal workqueue from their request_fn. With blk-mq we can make sure they always get called on their own workqueue directly for I/O submission by: 1) adding a flag to prevent inline submission of I/O, and 2) allowing the driver to pass in a workqueue in the tag_set that will be used instead of kblockd. 
Signed-off-by: Christoph Hellwig h...@lst.de --- block/blk-core.c | 2 +- block/blk-mq.c | 12 +--- block/blk.h| 1 + include/linux/blk-mq.h | 4 4 files changed, 15 insertions(+), 4 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 0421b53..7f7249f 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -61,7 +61,7 @@ struct kmem_cache *blk_requestq_cachep; /* * Controlling structure to kblockd */ -static struct workqueue_struct *kblockd_workqueue; +struct workqueue_struct *kblockd_workqueue; void blk_queue_congestion_threshold(struct request_queue *q) { diff --git a/block/blk-mq.c b/block/blk-mq.c index eaaedea..cea2f96 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -784,12 +784,13 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) if (unlikely(test_bit(BLK_MQ_S_STOPPED, hctx-state))) return; - if (!async cpumask_test_cpu(smp_processor_id(), hctx-cpumask)) { + if (!async !(hctx-flags BLK_MQ_F_WORKQUEUE) +