Re: Re: [PATCH 2/2] ceph: Implement writev/pwritev for sync operation.
Hi, Thank you for the patch. On 09/03/2013 04:52 PM, majianpeng wrote: For writev/pwritev sync-operatoin, ceph only do the first iov. It don't think other iovs.Now implement this. I divided the write-sync-operation into two functions.One for direct-write,other for none-direct-sync-write.This is because for none-direct-sync-write we can merge iovs to one.But for direct-write, we can't merge iovs. Signed-off-by: Jianpeng Ma majianp...@gmail.com --- fs/ceph/file.c | 328 +++-- 1 file changed, 248 insertions(+), 80 deletions(-) diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 7d6a3ee..42c97b3 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -533,17 +533,19 @@ static void ceph_sync_write_unsafe(struct ceph_osd_request *req, bool unsafe) } } + /* - * Synchronous write, straight from __user pointer or user pages (if - * O_DIRECT). + * Synchronous write, straight from __user pointer or user pages. * * If write spans object boundary, just do multiple writes. (For a * correct atomic write, we should e.g. take write locks on all * objects, rollback on failure, etc.) */ -static ssize_t ceph_sync_write(struct file *file, const char __user *data, - size_t left, loff_t pos, loff_t *ppos) +static ssize_t +ceph_sync_direct_write(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, size_t count) { +struct file *file = iocb-ki_filp; struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_fs_client *fsc = ceph_inode_to_client(inode); @@ -557,59 +559,55 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data, int written = 0; int flags; int check_caps = 0; -int page_align, io_align; -unsigned long buf_align; -int ret; +int page_align; +int ret, i; struct timespec mtime = CURRENT_TIME; -bool own_pages = false; +loff_t pos = iocb-ki_pos; if (ceph_snap(file_inode(file)) != CEPH_NOSNAP) return -EROFS; -dout(sync_write on file %p %lld~%u %s\n, file, pos, - (unsigned)left, (file-f_flags O_DIRECT) ? O_DIRECT : ); +dout(sync_direct_write on file %p %lld~%u\n, file, pos, + (unsigned)count); -ret = filemap_write_and_wait_range(inode-i_mapping, pos, pos + left); +ret = filemap_write_and_wait_range(inode-i_mapping, pos, pos + count); if (ret 0) return ret; ret = invalidate_inode_pages2_range(inode-i_mapping, pos PAGE_CACHE_SHIFT, -(pos + left) PAGE_CACHE_SHIFT); +(pos + count) PAGE_CACHE_SHIFT); if (ret 0) dout(invalidate_inode_pages2_range returned %d\n, ret); flags = CEPH_OSD_FLAG_ORDERSNAP | CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE; -if ((file-f_flags (O_SYNC|O_DIRECT)) == 0) -flags |= CEPH_OSD_FLAG_ACK; -else -num_ops++; /* Also include a 'startsync' command. */ +num_ops++; /* Also include a 'startsync' command. */ -/* - * we may need to do multiple writes here if we span an object - * boundary. this isn't atomic, unfortunately. :( - */ -more: -io_align = pos ~PAGE_MASK; -buf_align = (unsigned long)data ~PAGE_MASK; -len = left; +for (i = 0; i nr_segs count; i++) { POSIX requires that write syscall is atomic. I means we should allocate a single OSD request for all buffer segments that belong to the same object. I think we could not. For direct write, we use ceph_get_direct_page_vector to get pages. Given iov1 and iov2 are in the same object. But we can't make the pages of iov1/2 to join together. Because for ceph page_vector,it only record the offset of first page. Or am i missing something? Maybe we can use ceph pagelist but it will copy data. Thanks! Jianpeng Ma Regards Yan, Zheng
Re: [PATCH 2/2] ceph: Implement writev/pwritev for sync operation.
On 09/06/2013 08:46 AM, majianpeng wrote: Hi, Thank you for the patch. On 09/03/2013 04:52 PM, majianpeng wrote: For writev/pwritev sync-operatoin, ceph only do the first iov. It don't think other iovs.Now implement this. I divided the write-sync-operation into two functions.One for direct-write,other for none-direct-sync-write.This is because for none-direct-sync-write we can merge iovs to one.But for direct-write, we can't merge iovs. Signed-off-by: Jianpeng Ma majianp...@gmail.com --- fs/ceph/file.c | 328 +++-- 1 file changed, 248 insertions(+), 80 deletions(-) diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 7d6a3ee..42c97b3 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -533,17 +533,19 @@ static void ceph_sync_write_unsafe(struct ceph_osd_request *req, bool unsafe) } } + /* - * Synchronous write, straight from __user pointer or user pages (if - * O_DIRECT). + * Synchronous write, straight from __user pointer or user pages. * * If write spans object boundary, just do multiple writes. (For a * correct atomic write, we should e.g. take write locks on all * objects, rollback on failure, etc.) */ -static ssize_t ceph_sync_write(struct file *file, const char __user *data, - size_t left, loff_t pos, loff_t *ppos) +static ssize_t +ceph_sync_direct_write(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, size_t count) { + struct file *file = iocb-ki_filp; struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_fs_client *fsc = ceph_inode_to_client(inode); @@ -557,59 +559,55 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data, int written = 0; int flags; int check_caps = 0; - int page_align, io_align; - unsigned long buf_align; - int ret; + int page_align; + int ret, i; struct timespec mtime = CURRENT_TIME; - bool own_pages = false; + loff_t pos = iocb-ki_pos; if (ceph_snap(file_inode(file)) != CEPH_NOSNAP) return -EROFS; - dout(sync_write on file %p %lld~%u %s\n, file, pos, -(unsigned)left, (file-f_flags O_DIRECT) ? O_DIRECT : ); + dout(sync_direct_write on file %p %lld~%u\n, file, pos, +(unsigned)count); - ret = filemap_write_and_wait_range(inode-i_mapping, pos, pos + left); + ret = filemap_write_and_wait_range(inode-i_mapping, pos, pos + count); if (ret 0) return ret; ret = invalidate_inode_pages2_range(inode-i_mapping, pos PAGE_CACHE_SHIFT, - (pos + left) PAGE_CACHE_SHIFT); + (pos + count) PAGE_CACHE_SHIFT); if (ret 0) dout(invalidate_inode_pages2_range returned %d\n, ret); flags = CEPH_OSD_FLAG_ORDERSNAP | CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE; - if ((file-f_flags (O_SYNC|O_DIRECT)) == 0) - flags |= CEPH_OSD_FLAG_ACK; - else - num_ops++; /* Also include a 'startsync' command. */ + num_ops++; /* Also include a 'startsync' command. */ - /* -* we may need to do multiple writes here if we span an object -* boundary. this isn't atomic, unfortunately. :( -*/ -more: - io_align = pos ~PAGE_MASK; - buf_align = (unsigned long)data ~PAGE_MASK; - len = left; + for (i = 0; i nr_segs count; i++) { POSIX requires that write syscall is atomic. I means we should allocate a single OSD request for all buffer segments that belong to the same object. I think we could not. For direct write, we use ceph_get_direct_page_vector to get pages. Given iov1 and iov2 are in the same object. But we can't make the pages of iov1/2 to join together. Because for ceph page_vector,it only record the offset of first page. Or am i missing something? Maybe we can use ceph pagelist but it will copy data. I'm wrong with the direct IO case (ext4 doesn't guarantee atomicity in direct write). But please keep buffered write atomic. Regards Yan, Zheng -- To unsubscribe from this list: send the line unsubscribe ceph-devel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/2] ceph: Implement writev/pwritev for sync operation.
Hi, Thank you for the patch. On 09/03/2013 04:52 PM, majianpeng wrote: For writev/pwritev sync-operatoin, ceph only do the first iov. It don't think other iovs.Now implement this. I divided the write-sync-operation into two functions.One for direct-write,other for none-direct-sync-write.This is because for none-direct-sync-write we can merge iovs to one.But for direct-write, we can't merge iovs. Signed-off-by: Jianpeng Ma majianp...@gmail.com --- fs/ceph/file.c | 328 +++-- 1 file changed, 248 insertions(+), 80 deletions(-) diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 7d6a3ee..42c97b3 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -533,17 +533,19 @@ static void ceph_sync_write_unsafe(struct ceph_osd_request *req, bool unsafe) } } + /* - * Synchronous write, straight from __user pointer or user pages (if - * O_DIRECT). + * Synchronous write, straight from __user pointer or user pages. * * If write spans object boundary, just do multiple writes. (For a * correct atomic write, we should e.g. take write locks on all * objects, rollback on failure, etc.) */ -static ssize_t ceph_sync_write(struct file *file, const char __user *data, -size_t left, loff_t pos, loff_t *ppos) +static ssize_t +ceph_sync_direct_write(struct kiocb *iocb, const struct iovec *iov, +unsigned long nr_segs, size_t count) { + struct file *file = iocb-ki_filp; struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_fs_client *fsc = ceph_inode_to_client(inode); @@ -557,59 +559,55 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data, int written = 0; int flags; int check_caps = 0; - int page_align, io_align; - unsigned long buf_align; - int ret; + int page_align; + int ret, i; struct timespec mtime = CURRENT_TIME; - bool own_pages = false; + loff_t pos = iocb-ki_pos; if (ceph_snap(file_inode(file)) != CEPH_NOSNAP) return -EROFS; - dout(sync_write on file %p %lld~%u %s\n, file, pos, - (unsigned)left, (file-f_flags O_DIRECT) ? O_DIRECT : ); + dout(sync_direct_write on file %p %lld~%u\n, file, pos, + (unsigned)count); - ret = filemap_write_and_wait_range(inode-i_mapping, pos, pos + left); + ret = filemap_write_and_wait_range(inode-i_mapping, pos, pos + count); if (ret 0) return ret; ret = invalidate_inode_pages2_range(inode-i_mapping, pos PAGE_CACHE_SHIFT, - (pos + left) PAGE_CACHE_SHIFT); + (pos + count) PAGE_CACHE_SHIFT); if (ret 0) dout(invalidate_inode_pages2_range returned %d\n, ret); flags = CEPH_OSD_FLAG_ORDERSNAP | CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE; - if ((file-f_flags (O_SYNC|O_DIRECT)) == 0) - flags |= CEPH_OSD_FLAG_ACK; - else - num_ops++; /* Also include a 'startsync' command. */ + num_ops++; /* Also include a 'startsync' command. */ - /* - * we may need to do multiple writes here if we span an object - * boundary. this isn't atomic, unfortunately. :( - */ -more: - io_align = pos ~PAGE_MASK; - buf_align = (unsigned long)data ~PAGE_MASK; - len = left; + for (i = 0; i nr_segs count; i++) { POSIX requires that write syscall is atomic. I means we should allocate a single OSD request for all buffer segments that belong to the same object. Regards Yan, Zheng + void __user *data = iov[i].iov_base; + size_t left; - snapc = ci-i_snap_realm-cached_context; - vino = ceph_vino(inode); - req = ceph_osdc_new_request(fsc-client-osdc, ci-i_layout, - vino, pos, len, num_ops, - CEPH_OSD_OP_WRITE, flags, snapc, - ci-i_truncate_seq, ci-i_truncate_size, - false); - if (IS_ERR(req)) - return PTR_ERR(req); + left = min(count, iov[i].iov_len); +more: + page_align = (unsigned long)data ~PAGE_MASK; + len = left; + + snapc = ci-i_snap_realm-cached_context; + vino = ceph_vino(inode); + req = ceph_osdc_new_request(fsc-client-osdc, ci-i_layout, + vino, pos, len, num_ops, + CEPH_OSD_OP_WRITE, flags, snapc, + ci-i_truncate_seq, + ci-i_truncate_size, +
Re: [PATCH 2/2] ceph: Implement writev/pwritev for sync operation.
Hi, Thank you for the patch. On 09/03/2013 04:52 PM, majianpeng wrote: For writev/pwritev sync-operatoin, ceph only do the first iov. It don't think other iovs.Now implement this. I divided the write-sync-operation into two functions.One for direct-write,other for none-direct-sync-write.This is because for none-direct-sync-write we can merge iovs to one.But for direct-write, we can't merge iovs. Signed-off-by: Jianpeng Ma majianp...@gmail.com --- fs/ceph/file.c | 328 +++-- 1 file changed, 248 insertions(+), 80 deletions(-) diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 7d6a3ee..42c97b3 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -533,17 +533,19 @@ static void ceph_sync_write_unsafe(struct ceph_osd_request *req, bool unsafe) } } + /* - * Synchronous write, straight from __user pointer or user pages (if - * O_DIRECT). + * Synchronous write, straight from __user pointer or user pages. * * If write spans object boundary, just do multiple writes. (For a * correct atomic write, we should e.g. take write locks on all * objects, rollback on failure, etc.) */ -static ssize_t ceph_sync_write(struct file *file, const char __user *data, -size_t left, loff_t pos, loff_t *ppos) +static ssize_t +ceph_sync_direct_write(struct kiocb *iocb, const struct iovec *iov, +unsigned long nr_segs, size_t count) { + struct file *file = iocb-ki_filp; struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_fs_client *fsc = ceph_inode_to_client(inode); @@ -557,59 +559,55 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data, int written = 0; int flags; int check_caps = 0; - int page_align, io_align; - unsigned long buf_align; - int ret; + int page_align; + int ret, i; struct timespec mtime = CURRENT_TIME; - bool own_pages = false; + loff_t pos = iocb-ki_pos; if (ceph_snap(file_inode(file)) != CEPH_NOSNAP) return -EROFS; - dout(sync_write on file %p %lld~%u %s\n, file, pos, - (unsigned)left, (file-f_flags O_DIRECT) ? O_DIRECT : ); + dout(sync_direct_write on file %p %lld~%u\n, file, pos, + (unsigned)count); - ret = filemap_write_and_wait_range(inode-i_mapping, pos, pos + left); + ret = filemap_write_and_wait_range(inode-i_mapping, pos, pos + count); if (ret 0) return ret; ret = invalidate_inode_pages2_range(inode-i_mapping, pos PAGE_CACHE_SHIFT, - (pos + left) PAGE_CACHE_SHIFT); + (pos + count) PAGE_CACHE_SHIFT); if (ret 0) dout(invalidate_inode_pages2_range returned %d\n, ret); flags = CEPH_OSD_FLAG_ORDERSNAP | CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE; - if ((file-f_flags (O_SYNC|O_DIRECT)) == 0) - flags |= CEPH_OSD_FLAG_ACK; - else - num_ops++; /* Also include a 'startsync' command. */ + num_ops++; /* Also include a 'startsync' command. */ - /* - * we may need to do multiple writes here if we span an object - * boundary. this isn't atomic, unfortunately. :( - */ -more: - io_align = pos ~PAGE_MASK; - buf_align = (unsigned long)data ~PAGE_MASK; - len = left; + for (i = 0; i nr_segs count; i++) { POSIX requires that write syscall is atomic. I means we should allocate a single OSD request for all buffer segments that belong to the same object. Regards Yan, Zheng + void __user *data = iov[i].iov_base; + size_t left; - snapc = ci-i_snap_realm-cached_context; - vino = ceph_vino(inode); - req = ceph_osdc_new_request(fsc-client-osdc, ci-i_layout, - vino, pos, len, num_ops, - CEPH_OSD_OP_WRITE, flags, snapc, - ci-i_truncate_seq, ci-i_truncate_size, - false); - if (IS_ERR(req)) - return PTR_ERR(req); + left = min(count, iov[i].iov_len); +more: + page_align = (unsigned long)data ~PAGE_MASK; + len = left; + + snapc = ci-i_snap_realm-cached_context; + vino = ceph_vino(inode); + req = ceph_osdc_new_request(fsc-client-osdc, ci-i_layout, + vino, pos, len, num_ops, + CEPH_OSD_OP_WRITE, flags, snapc, + ci-i_truncate_seq, + ci-i_truncate_size, +