Re: [PATCH] rbd: use writefull op for object size writes

2015-10-16 Thread Alex Elder
On 10/07/2015 12:02 PM, Ilya Dryomov wrote:
> This covers only the simplest case - an object-size write - but it's
> still useful in tiering setups where EC is used for the base tier, as
> the writefull op can be proxied, saving an object promotion.
> 
> Even though updating ceph_osdc_new_request() to allow writefull should
> just be a matter of fixing an assert, I didn't do it because its only
> user is cephfs.  All other sites were updated.
> 
> Reflects ceph.git commit 7bfb7f9025a8ee0d2305f49bf0336d2424da5b5b.
> 
> Signed-off-by: Ilya Dryomov 

Looks good to me.

Reviewed-by: Alex Elder 

> ---
>  drivers/block/rbd.c   |  9 +++--
>  net/ceph/osd_client.c | 13 +
>  2 files changed, 16 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
> index 04e69b4df664..cd00e4653e49 100644
> --- a/drivers/block/rbd.c
> +++ b/drivers/block/rbd.c
> @@ -1863,9 +1863,11 @@ static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
>   rbd_osd_read_callback(obj_request);
>   break;
>   case CEPH_OSD_OP_SETALLOCHINT:
> - rbd_assert(osd_req->r_ops[1].op == CEPH_OSD_OP_WRITE);
> + rbd_assert(osd_req->r_ops[1].op == CEPH_OSD_OP_WRITE ||
> +osd_req->r_ops[1].op == CEPH_OSD_OP_WRITEFULL);
>   /* fall through */
>   case CEPH_OSD_OP_WRITE:
> + case CEPH_OSD_OP_WRITEFULL:
>   rbd_osd_write_callback(obj_request);
>   break;
>   case CEPH_OSD_OP_STAT:
> @@ -2401,7 +2403,10 @@ static void rbd_img_obj_request_fill(struct rbd_obj_request *obj_request,
>   opcode = CEPH_OSD_OP_ZERO;
>   }
>   } else if (op_type == OBJ_OP_WRITE) {
> - opcode = CEPH_OSD_OP_WRITE;
> + if (!offset && length == object_size)
> + opcode = CEPH_OSD_OP_WRITEFULL;
> + else
> + opcode = CEPH_OSD_OP_WRITE;
>   osd_req_op_alloc_hint_init(osd_request, num_ops,
>   object_size, object_size);
>   num_ops++;
> diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
> index 80b94e37c94a..f79ccac6699f 100644
> --- a/net/ceph/osd_client.c
> +++ b/net/ceph/osd_client.c
> @@ -285,6 +285,7 @@ static void osd_req_op_data_release(struct ceph_osd_request *osd_req,
>   switch (op->op) {
>   case CEPH_OSD_OP_READ:
>   case CEPH_OSD_OP_WRITE:
> + case CEPH_OSD_OP_WRITEFULL:
>   ceph_osd_data_release(&op->extent.osd_data);
>   break;
>   case CEPH_OSD_OP_CALL:
> @@ -485,13 +486,14 @@ void osd_req_op_extent_init(struct ceph_osd_request *osd_req,
>   size_t payload_len = 0;
>  
>   BUG_ON(opcode != CEPH_OSD_OP_READ && opcode != CEPH_OSD_OP_WRITE &&
> -opcode != CEPH_OSD_OP_ZERO && opcode != CEPH_OSD_OP_TRUNCATE);
> +opcode != CEPH_OSD_OP_WRITEFULL && opcode != CEPH_OSD_OP_ZERO &&
> +opcode != CEPH_OSD_OP_TRUNCATE);
>  
>   op->extent.offset = offset;
>   op->extent.length = length;
>   op->extent.truncate_size = truncate_size;
>   op->extent.truncate_seq = truncate_seq;
> - if (opcode == CEPH_OSD_OP_WRITE)
> + if (opcode == CEPH_OSD_OP_WRITE || opcode == CEPH_OSD_OP_WRITEFULL)
>   payload_len += length;
>  
>   op->payload_len = payload_len;
> @@ -670,9 +672,11 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req,
>   break;
>   case CEPH_OSD_OP_READ:
>   case CEPH_OSD_OP_WRITE:
> + case CEPH_OSD_OP_WRITEFULL:
>   case CEPH_OSD_OP_ZERO:
>   case CEPH_OSD_OP_TRUNCATE:
> - if (src->op == CEPH_OSD_OP_WRITE)
> + if (src->op == CEPH_OSD_OP_WRITE ||
> + src->op == CEPH_OSD_OP_WRITEFULL)
>   request_data_len = src->extent.length;
>   dst->extent.offset = cpu_to_le64(src->extent.offset);
>   dst->extent.length = cpu_to_le64(src->extent.length);
> @@ -681,7 +685,8 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req,
>   dst->extent.truncate_seq =
>   cpu_to_le32(src->extent.truncate_seq);
>   osd_data = &src->extent.osd_data;
> - if (src->op == CEPH_OSD_OP_WRITE)
> + if (src->op == CEPH_OSD_OP_WRITE ||
> + src->op == CEPH_OSD_OP_WRITEFULL)
>   ceph_osdc_msg_data_add(req->r_request, osd_data);
>   else
>   ceph_osdc_msg_data_add(req->r_reply, osd_data);
> 
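
Just to spell out the rbd-side behaviour change for anyone skimming the
diff: only a write that starts at offset 0 and covers exactly one object
is turned into a writefull.  A standalone restatement of that condition
(the enum and helper below are made up for illustration; 4M is just the
default rbd object size):

#include <stdio.h>
#include <stdint.h>

enum op { OP_WRITE, OP_WRITEFULL };

static enum op pick_opcode(uint64_t offset, uint64_t length,
			   uint64_t object_size)
{
	/* mirrors the check added in rbd_img_obj_request_fill() */
	if (!offset && length == object_size)
		return OP_WRITEFULL;
	return OP_WRITE;
}

int main(void)
{
	const uint64_t obj_size = 4 * 1024 * 1024;

	printf("%d\n", pick_opcode(0, obj_size, obj_size));           /* WRITEFULL */
	printf("%d\n", pick_opcode(0, obj_size / 2, obj_size));       /* WRITE */
	printf("%d\n", pick_opcode(4096, obj_size - 4096, obj_size)); /* WRITE */
	return 0;
}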



Re: [PATCH] rbd: use writefull op for object size writes

2015-10-07 Thread Ilya Dryomov
On Wed, Oct 7, 2015 at 5:36 PM, Alex Elder  wrote:
> On 10/07/2015 12:02 PM, Ilya Dryomov wrote:
>> This covers only the simplest case - an object-size write - but it's
>> still useful in tiering setups where EC is used for the base tier, as
>> the writefull op can be proxied, saving an object promotion.
>>
>> Even though updating ceph_osdc_new_request() to allow writefull should
>> just be a matter of fixing an assert, I didn't do it because its only
>> user is cephfs.  All other sites were updated.
>>
>> Reflects ceph.git commit 7bfb7f9025a8ee0d2305f49bf0336d2424da5b5b.
>
> I haven't looked at this at all.  But can you give me a
> short explanation of what "writefull" is?
>
> Full object write?

Well, in a way.  It replaces the previous data; you can think of it as
an atomic truncate to 0 followed by a write from offset 0.  So it
always writes (replaces) entire objects.
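
If it helps, the userspace analogue is librados' rados_write_full(),
which takes no offset precisely because it always replaces the whole
object.  Rough, untested sketch (pool and object names are made up):

/*
 * rados_write() updates len bytes at a given offset and leaves the rest
 * of the object intact; rados_write_full() makes the object exactly buf.
 */
#include <string.h>
#include <rados/librados.h>

int main(void)
{
	rados_t cluster;
	rados_ioctx_t io;
	const char buf[] = "new object contents";
	int ret;

	if (rados_create(&cluster, NULL) < 0)            /* client.admin */
		return 1;
	rados_conf_read_file(cluster, NULL);             /* default ceph.conf */
	if (rados_connect(cluster) < 0)
		return 1;
	if (rados_ioctx_create(cluster, "rbd", &io) < 0)
		return 1;

	/* partial update: data outside [4096, 4096 + len) is kept */
	ret = rados_write(io, "obj", buf, strlen(buf), 4096);

	/* full replace: object becomes exactly buf, old data is gone */
	ret = rados_write_full(io, "obj", buf, strlen(buf));

	rados_ioctx_destroy(io);
	rados_shutdown(cluster);
	return ret < 0 ? 1 : 0;
}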

Thanks,

Ilya