Hi all, The attached patch takes care of a long-overdue (my bad) cleanup: it consolidates the I/O paths for read and write into one common routine, and the I/O paths for readv and writev into another. The aio read and write paths also need to be consolidated into a common routine, but that is not done in this patch. I figured I might as well do this now, since the readx and writex system calls will require similar consolidation. Note: this patch does not (yet) consolidate the readv/writev paths to make use of the read/write path. Comments welcome. Thanks, Murali
Index: src/apps/admin/pvfs2-ping.c =================================================================== RCS file: /anoncvs/pvfs2/src/apps/admin/pvfs2-ping.c,v retrieving revision 1.47 diff -u -r1.47 pvfs2-ping.c --- src/apps/admin/pvfs2-ping.c 25 May 2006 18:14:14 -0000 1.47 +++ src/apps/admin/pvfs2-ping.c 26 May 2006 02:59:13 -0000 @@ -223,9 +223,12 @@ PVFS_error_details_free(error_details); - /* if we hit this point, then everything is ok */ - printf(" Ok; root handle is owned by exactly one server.\n"); - printf("\n"); + if (!err) + { + /* if we hit this point, then everything is ok */ + printf(" Ok; root handle is owned by exactly one server.\n"); + printf("\n"); + } PVFS_sys_finalize(); Index: src/kernel/linux-2.6/file.c =================================================================== RCS file: /anoncvs/pvfs2/src/kernel/linux-2.6/file.c,v retrieving revision 1.114 diff -u -r1.114 file.c --- src/kernel/linux-2.6/file.c 4 Apr 2006 15:04:28 -0000 1.114 +++ src/kernel/linux-2.6/file.c 26 May 2006 02:59:20 -0000 @@ -17,6 +17,13 @@ #include <linux/fs.h> #include <linux/pagemap.h> +enum { + IO_READ = 0, + IO_WRITE = 1, + IO_READV = 0, + IO_WRITEV = 1, +}; + extern struct list_head pvfs2_request_list; extern spinlock_t pvfs2_request_list_lock; extern wait_queue_head_t pvfs2_request_list_waitq; @@ -101,53 +108,127 @@ return ret; } -/** Read data from a specified offset in a file (referenced by inode). - * Data may be placed either in a user or kernel buffer. - */ -ssize_t pvfs2_inode_read( - struct inode *inode, - char __user *buf, - size_t count, - loff_t *offset, - int copy_to_user, - loff_t readahead_size) +struct rw_options { + int type; + /* sigh.. we will never pass sparse type checks.. 
*/ + char *buf; + size_t count; + loff_t *offset; + union { + struct { + struct inode *inode; + int copy_to_user; + loff_t readahead_size; + } read; + struct { + struct file *file; + } write; + } io; +}; + +static ssize_t do_read_write(struct rw_options *rw) { - int ret = -1; - size_t each_count = 0, amt_complete = 0; - size_t total_count = 0; pvfs2_kernel_op_t *new_op = NULL; int buffer_index = -1; - char *current_buf = buf; - loff_t original_offset = *offset; - pvfs2_inode_t *pvfs2_inode = PVFS2_I(inode); + struct inode *inode; + pvfs2_inode_t *pvfs2_inode = NULL; + char *current_buf = NULL; + size_t count; + loff_t *offset; + ssize_t ret; + ssize_t total_count; + char *fnstr = NULL; + size_t readahead_size; + int copy_to_user; + struct file *file; - if (copy_to_user && (!access_ok(VERIFY_WRITE, buf, count))) - return -EFAULT; + total_count = 0; + ret = -EINVAL; + file = NULL; + inode = NULL; + if (!rw) + goto out; + count = rw->count; + current_buf = (char *) rw->buf; + if (!current_buf) + goto out; + offset = rw->offset; + if (!offset) + goto out; + if (rw->type == IO_READ) + { + inode = rw->io.read.inode; + if (!inode) + goto out; + file = NULL; + copy_to_user = rw->io.read.copy_to_user; + ret = -EFAULT; + if (copy_to_user && + !access_ok(VERIFY_WRITE, (char __user *) current_buf, count)) + goto out; + fnstr = "pvfs2_file_read"; + readahead_size = rw->io.read.readahead_size; + } + else + { + file = rw->io.write.file; + copy_to_user = 1; + readahead_size = 0; + if (!file) + goto out; + inode = file->f_dentry->d_inode; + if (!inode) + goto out; + fnstr = "pvfs2_file_write"; + ret = -EFAULT; + if (!access_ok(VERIFY_READ, (char __user *) current_buf, count)) + goto out; + if(file->f_pos > i_size_read(inode)) + { + i_size_write(inode, file->f_pos); + } + /* perform generic linux kernel tests for sanity of write arguments */ + /* NOTE: this is particularly helpful in handling fsize rlimit properly */ +#ifdef PVFS2_LINUX_KERNEL_2_4 + ret = 
pvfs2_precheck_file_write(file, inode, &count, offset); +#else + ret = generic_write_checks(file, offset, &count, S_ISBLK(inode->i_mode)); +#endif + if (ret != 0 || count == 0) + { + pvfs2_print("pvfs2_file_write: failed generic argument checks.\n"); + goto out; + } + pvfs2_print("%s: proceeding with offset : %ld, size %ld\n", + fnstr, (unsigned long) *offset, (unsigned long) count); + } + pvfs2_inode = PVFS2_I(inode); while(total_count < count) { + size_t each_count, amt_complete; + new_op = op_alloc(); if (!new_op) { - return -ENOMEM; + ret = -ENOMEM; + goto out; } new_op->upcall.type = PVFS2_VFS_OP_FILE_IO; new_op->upcall.req.io.async_vfs_io = PVFS_VFS_SYNC_IO; /* synchronous I/O */ new_op->upcall.req.io.readahead_size = readahead_size; - new_op->upcall.req.io.io_type = PVFS_IO_READ; + new_op->upcall.req.io.io_type = + (rw->type == IO_READ) ? PVFS_IO_READ : PVFS_IO_WRITE; new_op->upcall.req.io.refn = pvfs2_inode->refn; ret = pvfs_bufmap_get(&buffer_index); if (ret < 0) { - pvfs2_error("pvfs2_inode_read: pvfs_bufmap_get() " - "failure (%d)\n", ret); - op_release(new_op); - *offset = original_offset; - return ret; + pvfs2_error("do_read_write: pvfs_bufmap_get() " + "failure (%ld)\n", (long) ret); + goto out; } - /* how much to transfer in this loop iteration */ each_count = (((count - total_count) > pvfs_bufmap_size_query()) ? 
pvfs_bufmap_size_query() : (count - total_count)); @@ -155,9 +236,18 @@ new_op->upcall.req.io.buf_index = buffer_index; new_op->upcall.req.io.count = each_count; new_op->upcall.req.io.offset = *offset; - + if (rw->type == IO_WRITE) + { + /* copy data from application */ + ret = pvfs_bufmap_copy_from_user(buffer_index, current_buf, each_count); + if(ret < 0) + { + pvfs2_print("%s: Failed to copy user buffer.\n", fnstr); + goto out; + } + } ret = service_operation( - new_op, "pvfs2_inode_read", PVFS2_OP_RETRY_COUNT, + new_op, fnstr, PVFS2_OP_RETRY_COUNT, get_interruptible_flag(inode)); if (ret < 0) @@ -170,53 +260,54 @@ termination unless we've got debugging turned on, as this can happen regularly (i.e. ctrl-c) */ - if(ret == -EINTR) + if (ret == -EINTR) { - pvfs2_print("pvfs2_inode_read: returning error %d\n", ret); + pvfs2_print("%s: returning error %ld\n", fnstr, (long) ret); } else { pvfs2_error( - "pvfs2_inode_read: error reading from handle %llu, " - "\n -- returning %d \n", - llu(pvfs2_ino_to_handle(inode->i_ino)), ret); + "%s: error writing to handle %llu, " + "-- returning %ld\n", + fnstr, + llu(pvfs2_ino_to_handle(inode->i_ino)), + (long) ret); } - return ret; + goto out; } - - /* copy data out to destination */ - if (new_op->downcall.resp.io.amt_complete) + if (rw->type == IO_READ) { - if (copy_to_user) - { - ret = pvfs_bufmap_copy_to_user( - current_buf, buffer_index, - new_op->downcall.resp.io.amt_complete); - } - else + /* copy data out to destination */ + if (new_op->downcall.resp.io.amt_complete) { - ret = pvfs_bufmap_copy_to_kernel( - current_buf, buffer_index, - new_op->downcall.resp.io.amt_complete); - } - - if (ret) - { - pvfs2_print("Failed to copy user buffer.\n"); - /* put error code in downcall so that handle_io_error() - * preserves properly - */ - new_op->downcall.status = ret; - handle_io_error(); - return(ret); + if (copy_to_user) + { + ret = pvfs_bufmap_copy_to_user( + current_buf, buffer_index, + new_op->downcall.resp.io.amt_complete); + 
} + else + { + ret = pvfs_bufmap_copy_to_kernel( + current_buf, buffer_index, + new_op->downcall.resp.io.amt_complete); + } + if (ret) + { + pvfs2_print("Failed to copy user buffer.\n"); + /* put error code in downcall so that handle_io_error() + * preserves properly + */ + new_op->downcall.status = ret; + handle_io_error(); + goto out; + } } } - current_buf += new_op->downcall.resp.io.amt_complete; *offset += new_op->downcall.resp.io.amt_complete; total_count += new_op->downcall.resp.io.amt_complete; amt_complete = new_op->downcall.resp.io.amt_complete; - /* tell the device file owner waiting on I/O that this read has completed and it can return now. in this exact case, on @@ -224,9 +315,10 @@ after this. */ wake_up_device_for_return(new_op); + new_op = NULL; pvfs_bufmap_put(buffer_index); - - /* if we got a short read, fall out and return what we + buffer_index = -1; + /* if we got a short read/write, fall out and return what we * got so far */ if (amt_complete < each_count) @@ -234,12 +326,45 @@ break; } } + if (total_count > 0) { + ret = total_count; + } +out: + if (new_op) + op_release(new_op); + if (buffer_index >= 0) + pvfs_bufmap_put(buffer_index); + if (ret > 0 && file != NULL && inode != NULL) + { +#ifdef HAVE_TOUCH_ATIME + touch_atime(file->f_vfsmnt, file->f_dentry); +#else + update_atime(inode); +#endif + } + return ret; +} - /* - NOTE: for this special case, op is freed by devreq_writev and - *not* here. - */ - return(total_count); +/** Read data from a specified offset in a file (referenced by inode). + * Data may be placed either in a user or kernel buffer. 
+ */ +ssize_t pvfs2_inode_read( + struct inode *inode, + char __user *buf, + size_t count, + loff_t *offset, + int copy_to_user, + loff_t readahead_size) +{ + struct rw_options rw; + rw.type = IO_READ; + rw.buf = buf; + rw.count = count; + rw.offset = offset; + rw.io.read.inode = inode; + rw.io.read.copy_to_user = copy_to_user; + rw.io.read.readahead_size = readahead_size; + return do_read_write(&rw); } /** Read data from a specified offset in a file into a user buffer. @@ -268,160 +393,13 @@ size_t count, loff_t *offset) { - int ret = -1; - pvfs2_kernel_op_t *new_op = NULL; - char __user *current_buf = (char __user *)buf; - loff_t original_offset = *offset; - int buffer_index = -1; - size_t each_count = 0, total_count = 0; - struct inode *inode = file->f_dentry->d_inode; - pvfs2_inode_t *pvfs2_inode = PVFS2_I(inode); - size_t amt_complete = 0; - - pvfs2_print("pvfs2_file_write: called on %s [f_pos %ld off %ld size %ld]\n", - (file && file->f_dentry && file->f_dentry->d_name.name ? - (char *)file->f_dentry->d_name.name : "UNKNOWN"), - (unsigned long) file->f_pos, - (unsigned long) *offset, (unsigned long) count); - - if (!access_ok(VERIFY_READ, buf, count)) - return -EFAULT; - - if(file->f_pos > i_size_read(inode)) - { - i_size_write(inode, file->f_pos); - } - - /* perform generic linux kernel tests for sanity of write arguments */ - /* NOTE: this is particularly helpful in handling fsize rlimit properly */ -#ifdef PVFS2_LINUX_KERNEL_2_4 - ret = pvfs2_precheck_file_write(file, inode, &count, offset); -#else - ret = generic_write_checks(file, offset, &count, S_ISBLK(inode->i_mode)); -#endif - if (ret != 0 || count == 0) - { - pvfs2_print("pvfs2_file_write: failed generic argument checks.\n"); - return(ret); - } - - pvfs2_print("pvfs2_file_write: proceeding with offset : %ld, size %ld\n", - (unsigned long) *offset, (unsigned long) count); - - while(total_count < count) - { - new_op = op_alloc(); - if (!new_op) - { - *offset = original_offset; - return -ENOMEM; - } - 
- new_op->upcall.type = PVFS2_VFS_OP_FILE_IO; - new_op->upcall.req.io.async_vfs_io = PVFS_VFS_SYNC_IO; /* synchronous I/O */ - new_op->upcall.req.io.io_type = PVFS_IO_WRITE; - new_op->upcall.req.io.refn = pvfs2_inode->refn; - - pvfs2_print("pvfs2_file_write: writing %d bytes at offset %lu (%lu)\n", - (int)count, (unsigned long)file->f_pos, (unsigned long)*offset); - - ret = pvfs_bufmap_get(&buffer_index); - if (ret < 0) - { - pvfs2_error("pvfs2_file_write: pvfs_bufmap_get() " - "failure (%d)\n", ret); - op_release(new_op); - *offset = original_offset; - return ret; - } - - /* how much to transfer in this loop iteration */ - each_count = (((count - total_count) > pvfs_bufmap_size_query()) ? - pvfs_bufmap_size_query() : (count - total_count)); - - new_op->upcall.req.io.buf_index = buffer_index; - new_op->upcall.req.io.count = each_count; - new_op->upcall.req.io.offset = *offset; - - /* copy data from application */ - ret = pvfs_bufmap_copy_from_user( - buffer_index, current_buf, each_count); - if(ret < 0) - { - pvfs2_print("Failed to copy user buffer.\n"); - op_release(new_op); - pvfs_bufmap_put(buffer_index); - *offset = original_offset; - return ret; - } - - ret = service_operation( - new_op, "pvfs2_file_write", PVFS2_OP_RETRY_COUNT, - get_interruptible_flag(inode)); - - if (ret < 0) - { - /* this macro is defined in pvfs2-kernel.h */ - handle_io_error(); - - /* - don't write an error to syslog on signaled operation - termination unless we've got debugging turned on, as - this can happen regularly (i.e. ctrl-c) - */ - if (ret == -EINTR) - { - pvfs2_print("pvfs2_file_write: returning error %d\n", ret); - } - else - { - pvfs2_error( - "pvfs2_file_write: error writing to handle %llu, " - "FILE: %s\n -- returning %d\n", - llu(pvfs2_ino_to_handle(inode->i_ino)), - (file && file->f_dentry && file->f_dentry->d_name.name ? 
- (char *)file->f_dentry->d_name.name : "UNKNOWN"), - ret); - } - *offset = original_offset; - return ret; - } - - current_buf += new_op->downcall.resp.io.amt_complete; - pvfs2_print("amt_complete = %ld\n", (unsigned long) new_op->downcall.resp.io.amt_complete); - *offset += new_op->downcall.resp.io.amt_complete; - total_count += new_op->downcall.resp.io.amt_complete; - amt_complete = new_op->downcall.resp.io.amt_complete; - - /* - tell the device file owner waiting on I/O that this read has - completed and it can return now. in this exact case, on - wakeup the device will free the op, so we *cannot* touch it - after this. - */ - wake_up_device_for_return(new_op); - pvfs_bufmap_put(buffer_index); - - /* if we got a short write, fall out and return what we got so - * far TODO: define semantics here- kind of depends on pvfs2 - * semantics that don't really exist yet - */ - if (amt_complete < each_count) - { - break; - } - } - pvfs2_print("pvfs2_file_write: pos at the end was %lu(%lu)\n", - (unsigned long) *offset, (unsigned long) file->f_pos); - if (total_count) - { -#ifdef HAVE_TOUCH_ATIME - touch_atime(file->f_vfsmnt, file->f_dentry); -#else - update_atime(inode); -#endif - } - return total_count; + struct rw_options rw; + rw.type = IO_WRITE; + rw.buf = (char *) buf; + rw.count = count; + rw.offset = offset; + rw.io.write.file = file; + return do_read_write(&rw); } /* @@ -563,37 +541,38 @@ } -/** Reads data to several contiguous user buffers (an iovec) from a file at a - * specified offset. 
- */ -static ssize_t pvfs2_file_readv( - struct file *file, - const struct iovec *iov, - unsigned long nr_segs, - loff_t *offset) +static ssize_t do_readv_writev(int type, struct file *file, + const struct iovec *iov, unsigned long nr_segs, loff_t *offset) { - int ret = -1; - pvfs2_kernel_op_t *new_op = NULL; - struct iovec *iovecptr = NULL, *ptr = NULL; - loff_t original_offset = *offset; - int buffer_index = -1; + ssize_t ret; + unsigned int to_free; + unsigned long seg; + ssize_t total_count, count; + size_t each_count; struct inode *inode = file->f_dentry->d_inode; pvfs2_inode_t *pvfs2_inode = PVFS2_I(inode); - size_t amt_complete = 0; - size_t total_count = 0, count = 0, each_count = 0; - unsigned int seg, to_free = 0; unsigned long new_nr_segs = 0, max_new_nr_segs = 0; - unsigned int seg_count, *seg_array = NULL; - + unsigned int seg_count = 0, *seg_array = NULL; + struct iovec *iovecptr = NULL, *ptr = NULL; + pvfs2_kernel_op_t *new_op = NULL; + int buffer_index = -1; + size_t amt_complete = 0; + char *fnstr = (type == IO_READV) ? 
"pvfs2_file_readv" : "pvfs2_file_writev"; - /* Calculate the total length to read by adding up the length of each io - * segment */ + ret = -EINVAL; + total_count = 0; + count = 0; + to_free = 0; + /* + * Calculate the total length to read/write by adding up the + * lengths of each io segment + */ for (seg = 0; seg < nr_segs; seg++) { const struct iovec *iv = &iov[seg]; count += iv->iov_len; if (unlikely((ssize_t)(count|iv->iov_len) < 0)) - return -EINVAL; + goto out; if (total_count + iv->iov_len < pvfs_bufmap_size_query()) { total_count += iv->iov_len; @@ -602,7 +581,22 @@ else { total_count = (total_count + iv->iov_len - pvfs_bufmap_size_query()); - max_new_nr_segs+=2; + max_new_nr_segs += 2; + } + } + if (type == IO_WRITEV) + { + /* perform generic linux kernel tests for sanity of write arguments */ + /* NOTE: this is particularly helpful in handling fsize rlimit properly */ +#ifdef PVFS2_LINUX_KERNEL_2_4 + ret = pvfs2_precheck_file_write(file, inode, &count, offset); +#else + ret = generic_write_checks(file, offset, &count, S_ISBLK(inode->i_mode)); +#endif + if (ret != 0 || count == 0) + { + pvfs2_print("%s: failed generic argument checks.\n", fnstr); + goto out; } } total_count = 0; @@ -625,12 +619,12 @@ &new_nr_segs, &iovecptr, /* OUT */ &seg_count, &seg_array) /* OUT */ ) < 0) { - pvfs2_error("Failed to split iovecs to satisfy larger " - " than blocksize readv request %d\n", ret); - return ret; + pvfs2_error("%s: Failed to split iovecs to satisfy larger " + " than blocksize readv/writev request %d\n", fnstr, ret); + goto out; } - pvfs2_print("pvfs_file_readv: Splitting iovecs from %lu to %lu [max_new %lu]\n", - nr_segs, new_nr_segs, max_new_nr_segs); + pvfs2_print("%s: Splitting iovecs from %lu to %lu [max_new %lu]\n", + fnstr, nr_segs, new_nr_segs, max_new_nr_segs); /* We must free seg_array and iovecptr */ to_free = 1; } @@ -646,53 +640,48 @@ to_free = 0; } ptr = iovecptr; - pvfs2_print("pvfs2_file_readv reading [EMAIL PROTECTED]", (int) count, 
lld(*offset)); - pvfs2_print("pvfs2_file_readv: new_nr_segs: %lu, seg_count: %u\n", - new_nr_segs, seg_count); + + pvfs2_print("%s [EMAIL PROTECTED]", fnstr, (int) count, *offset); + pvfs2_print("%s: new_nr_segs: %lu, seg_count: %u\n", + fnstr, new_nr_segs, seg_count); +#ifdef PVFS2_KERNEL_DEBUG for (seg = 0; seg < new_nr_segs; seg++) { - pvfs2_print("pvfs2_file_readv: %d) %p to %p [%d bytes]\n", + pvfs2_print("%s: %d) %p to %p [%d bytes]\n", + fnstr, seg + 1, iovecptr[seg].iov_base, iovecptr[seg].iov_base + iovecptr[seg].iov_len, (int) iovecptr[seg].iov_len); } for (seg = 0; seg < seg_count; seg++) { - pvfs2_print("pvfs2_file_readv: %d) %u\n", seg + 1, seg_array[seg]); + pvfs2_print("%s: %d) %u\n", fnstr, seg + 1, seg_array[seg]); } +#endif seg = 0; while (total_count < count) { new_op = op_alloc(); if (!new_op) { - *offset = original_offset; - if (to_free) { - kfree(iovecptr); - kfree(seg_array); - } - return -ENOMEM; + ret = -ENOMEM; + goto out; } - new_op->upcall.type = PVFS2_VFS_OP_FILE_IO; new_op->upcall.req.io.async_vfs_io = PVFS_VFS_SYNC_IO; /* synchronous I/O */ /* disable read-ahead */ new_op->upcall.req.io.readahead_size = 0; - new_op->upcall.req.io.io_type = PVFS_IO_READ; + new_op->upcall.req.io.io_type = + (type == IO_READV) ? 
PVFS_IO_READ : PVFS_IO_WRITE; new_op->upcall.req.io.refn = pvfs2_inode->refn; + /* get a shared buffer index */ ret = pvfs_bufmap_get(&buffer_index); if (ret < 0) { - pvfs2_error("pvfs2_file_readv: pvfs_bufmap_get() " - "failure (%d)\n", ret); - op_release(new_op); - *offset = original_offset; - if (to_free) { - kfree(iovecptr); - kfree(seg_array); - } - return ret; + pvfs2_error("%s: pvfs_bufmap_get() " + "failure (%d)\n", fnstr, ret); + goto out; } /* how much to transfer in this loop iteration */ @@ -702,10 +691,28 @@ new_op->upcall.req.io.buf_index = buffer_index; new_op->upcall.req.io.count = each_count; new_op->upcall.req.io.offset = *offset; - - ret = service_operation( - new_op, "pvfs2_file_readv", PVFS2_OP_RETRY_COUNT, - get_interruptible_flag(inode)); + if (type == IO_WRITEV) + { + /* + * copy data from application by pulling it out of the iovec. + * Number of segments to copy so that we don't overflow the block-size + * is set in seg_array[], and ptr points to the appropriate + * beginning of the iovec from where data needs to be copied out, + * and each_count indicates the size in bytes that needs to be pulled + * out. */ + pvfs2_print("%s nr_segs %u, offset: %llu each_count: %d\n", + fnstr, seg_array[seg], *offset, (int) each_count); + ret = pvfs_bufmap_copy_iovec_from_user( + buffer_index, ptr, seg_array[seg], each_count); + if (ret < 0) + { + pvfs2_error("%s: Failed to copy user buffer. Please make sure " + "that the pvfs2-client is running. 
%d\n", fnstr, ret); + goto out; + } + } + ret = service_operation(new_op, fnstr, + PVFS2_OP_RETRY_COUNT, get_interruptible_flag(inode)); if (ret < 0) { @@ -719,50 +726,48 @@ */ if (ret == -EINTR) { - pvfs2_print("pvfs2_file_readv: returning error %d\n", ret); + pvfs2_print("%s: returning error %d\n", fnstr, ret); } else { pvfs2_error( - "pvfs2_file_readv: error writing to handle %llu, " + "%s: error on handle %llu, " "FILE: %s\n -- returning %d\n", - llu(pvfs2_ino_to_handle(inode->i_ino)), + fnstr, llu(pvfs2_ino_to_handle(inode->i_ino)), (file && file->f_dentry && file->f_dentry->d_name.name ? (char *)file->f_dentry->d_name.name : "UNKNOWN"), ret); } - *offset = original_offset; - if (to_free) { - kfree(seg_array); - kfree(iovecptr); - } - return ret; + goto out; } - pvfs2_print("pvfs2_file_readv nr_segs %u, offset: %llu each_count:%d\n", - (int) seg_array[seg], *offset, (int) each_count); - /* - * copy data to application by pushing it out to the iovec. - * Number of segments to copy so that we don't - * overflow the block-size is set in seg_array[], and - * ptr points to the appropriate beginning of the - * iovec from where data needs to be copied to, and - * new_op->downcall.resp.io.amt_complete indicates - * the size in bytes that needs to be pushed out - */ - if (new_op->downcall.resp.io.amt_complete) + + if (type == IO_READV) { - ret = pvfs_bufmap_copy_to_user_iovec( - buffer_index, ptr, seg_array[seg], - new_op->downcall.resp.io.amt_complete); - if (ret < 0) + pvfs2_print("%s: nr_segs %u, offset: %llu each_count:%d\n", + fnstr, (int) seg_array[seg], *offset, (int) each_count); + /* + * copy data to application by pushing it out to the iovec. 
+ * Number of segments to copy so that we don't + * overflow the block-size is set in seg_array[], and + * ptr points to the appropriate beginning of the + * iovec from where data needs to be copied to, and + * new_op->downcall.resp.io.amt_complete indicates + * the size in bytes that needs to be pushed out + */ + if (new_op->downcall.resp.io.amt_complete) { - pvfs2_error("Failed to copy user buffer. Please make sure " - "that the pvfs2-client is running.\n"); - /* put error codes in downcall so that handle_io_error() - * preserves it properly */ - new_op->downcall.status = ret; - handle_io_error(); - return(ret); + ret = pvfs_bufmap_copy_to_user_iovec(buffer_index, ptr, seg_array[seg], + new_op->downcall.resp.io.amt_complete); + if (ret < 0) + { + pvfs2_error("Failed to copy user buffer. Please make sure " + "that the pvfs2-client is running.\n"); + /* put error codes in downcall so that handle_io_error() + * preserves it properly */ + new_op->downcall.status = ret; + handle_io_error(); + goto out; + } } } /* advance the iovec pointer */ @@ -779,22 +784,54 @@ after this. */ wake_up_device_for_return(new_op); + new_op = NULL; pvfs_bufmap_put(buffer_index); + buffer_index = -1; - /* if we got a short write, fall out and return what we got so - * far + /* if we got a short I/O operations, + * fall out and return what we got so far */ if (amt_complete < each_count) { break; } } - - if (to_free) { + if (total_count > 0) + { + ret = total_count; + } +out: + if (new_op) + op_release(new_op); + if (buffer_index >= 0) + pvfs_bufmap_put(buffer_index); + if (to_free) + { kfree(iovecptr); kfree(seg_array); } - return total_count; + if (ret > 0 && file != NULL && inode != NULL) + { +#ifdef HAVE_TOUCH_ATIME + touch_atime(file->f_vfsmnt, file->f_dentry); +#else + update_atime(inode); +#endif + } + return ret; +} + + +/** Reads data to several contiguous user buffers (an iovec) from a file at a + * specified offset. 
+ */ +static ssize_t pvfs2_file_readv( + struct file *file, + const struct iovec *iov, + unsigned long nr_segs, + loff_t *offset) +{ + return do_readv_writev(IO_READV, file, iov, nr_segs, offset); } @@ -807,248 +844,7 @@ unsigned long nr_segs, loff_t *offset) { - int ret = -1; - pvfs2_kernel_op_t *new_op = NULL; - struct iovec *iovecptr = NULL, *ptr = NULL; - loff_t original_offset = *offset; - int buffer_index = -1; - struct inode *inode = file->f_dentry->d_inode; - pvfs2_inode_t *pvfs2_inode = PVFS2_I(inode); - size_t amt_complete = 0; - size_t total_count = 0, count = 0, each_count = 0; - unsigned int seg, to_free = 0; - unsigned long new_nr_segs = 0, max_new_nr_segs = 0; - unsigned int seg_count, *seg_array = NULL; - - - /* Calculate the total length to write by adding up the length of each io - * segment */ - for (seg = 0; seg < nr_segs; seg++) - { - const struct iovec *iv = &iov[seg]; - count += iv->iov_len; - if (unlikely((ssize_t)(count|iv->iov_len) < 0)) - return -EINVAL; - if (total_count + iv->iov_len < pvfs_bufmap_size_query()) - { - total_count += iv->iov_len; - max_new_nr_segs++; - } - else { - total_count = (total_count + iv->iov_len - pvfs_bufmap_size_query()); - max_new_nr_segs+=2; - } - - } - /* perform generic linux kernel tests for sanity of write arguments */ - /* NOTE: this is particularly helpful in handling fsize rlimit properly */ -#ifdef PVFS2_LINUX_KERNEL_2_4 - ret = pvfs2_precheck_file_write(file, inode, &count, offset); -#else - ret = generic_write_checks(file, offset, &count, S_ISBLK(inode->i_mode)); -#endif - if (ret != 0 || count == 0) - { - pvfs2_print("pvfs2_file_writev: failed generic argument checks.\n"); - return(ret); - } - - total_count = 0; - /* - * if the total size of data transfer requested is greater than - * the kernel-set blocksize of PVFS2, then we split the iovecs - * such that no iovec description straddles this block size - * limitation. 
- */ - if (count > pvfs_bufmap_size_query()) - { - /* - * Split up the given iovec description such that - * no iovec descriptor straddles over the block-size limitation. - * This makes us our job easier to stage the I/O. - * In addition, this function will also compute an array with seg_count - * entries that will store the number of segments that straddle the - * block-size boundaries. - */ - if ((ret = split_iovecs(max_new_nr_segs, nr_segs, iov, /* IN */ - &new_nr_segs, &iovecptr, /* OUT */ - &seg_count, &seg_array) /* OUT */ ) < 0) - { - pvfs2_error("Failed to split iovecs to satisfy larger than blocksize writev request %d\n", ret); - return ret; - } - pvfs2_print("pvfs_file_writev: Splitting iovecs from %lu to %lu [max_new %lu]\n", - nr_segs, new_nr_segs, max_new_nr_segs); - /* We must free seg_array and iovecptr */ - to_free = 1; - } - else { - /* Number of segments dont change! */ - new_nr_segs = nr_segs; - /* use the given iovec description */ - iovecptr = (struct iovec *) iov; - /* There is only 1 element in the seg_array */ - seg_count = 1; - /* and its value is the number of segments passed in */ - seg_array = (unsigned int *) &nr_segs; - /* We dont have to free up anything */ - to_free = 0; - } - ptr = iovecptr; - pvfs2_print("pvfs2_file_writev writing [EMAIL PROTECTED]", (int) count, *offset); - pvfs2_print("pvfs2_file_writev: new_nr_segs: %lu, seg_count: %u\n", - new_nr_segs, seg_count); - for (seg = 0; seg < new_nr_segs; seg++) - { - pvfs2_print("pvfs2_file_writev: %d) %p to %p [%d bytes]\n", - seg + 1, iovecptr[seg].iov_base, - iovecptr[seg].iov_base + iovecptr[seg].iov_len, - (int) iovecptr[seg].iov_len); - } - for (seg = 0; seg < seg_count; seg++) - { - pvfs2_print("pvfs2_file_writev: %d) %u\n", seg + 1, seg_array[seg]); - } - seg = 0; - while (total_count < count) - { - new_op = op_alloc(); - if (!new_op) - { - *offset = original_offset; - if (to_free) { - kfree(iovecptr); - kfree(seg_array); - } - return -ENOMEM; - } - - new_op->upcall.type = 
PVFS2_VFS_OP_FILE_IO; - new_op->upcall.req.io.async_vfs_io = PVFS_VFS_SYNC_IO; /* synchronous I/O */ - new_op->upcall.req.io.io_type = PVFS_IO_WRITE; - new_op->upcall.req.io.refn = pvfs2_inode->refn; - - ret = pvfs_bufmap_get(&buffer_index); - if (ret < 0) - { - pvfs2_error("pvfs2_file_writev: pvfs_bufmap_get() " - "failure (%d)\n", ret); - op_release(new_op); - *offset = original_offset; - if (to_free) { - kfree(iovecptr); - kfree(seg_array); - } - return ret; - } - - /* how much to transfer in this loop iteration */ - each_count = (((count - total_count) > pvfs_bufmap_size_query()) ? - pvfs_bufmap_size_query() : (count - total_count)); - - new_op->upcall.req.io.buf_index = buffer_index; - new_op->upcall.req.io.count = each_count; - new_op->upcall.req.io.offset = *offset; - pvfs2_print("pvfs2_file_writev nr_segs %u, offset: %llu each_count: %d\n", - seg_array[seg], *offset, (int) each_count); - - /* - * copy data from application by pulling it out of the iovec. - * Number of segments to copy so that we don't overflow the block-size - * is set in seg_array[], and ptr points to the appropriate - * beginning of the iovec from where data needs to be copied out, - * and each_count indicates the size in bytes that needs to be pulled - * out. */ - ret = pvfs_bufmap_copy_iovec_from_user( - buffer_index, ptr, seg_array[seg], each_count); - if (ret < 0) - { - pvfs2_error("Failed to copy user buffer. Please make sure " - "that the pvfs2-client is running. 
%d\n", ret); - op_release(new_op); - pvfs_bufmap_put(buffer_index); - *offset = original_offset; - if (to_free) { - kfree(seg_array); - kfree(iovecptr); - } - return ret; - } - - ret = service_operation( - new_op, "pvfs2_file_writev", PVFS2_OP_RETRY_COUNT, - get_interruptible_flag(inode)); - - if (ret < 0) - { - /* this macro is defined in pvfs2-kernel.h */ - handle_io_error(); - - /* - don't write an error to syslog on signaled operation - termination unless we've got debugging turned on, as - this can happen regularly (i.e. ctrl-c) - */ - if (ret == -EINTR) - { - pvfs2_print("pvfs2_file_writev: returning error %d\n", ret); - } - else - { - pvfs2_error( - "pvfs2_file_writev: error writing to handle %llu, " - "FILE: %s\n -- returning %d\n", - llu(pvfs2_ino_to_handle(inode->i_ino)), - (file && file->f_dentry && file->f_dentry->d_name.name ? - (char *)file->f_dentry->d_name.name : "UNKNOWN"), - ret); - } - *offset = original_offset; - if (to_free) { - kfree(seg_array); - kfree(iovecptr); - } - return ret; - } - /* advance the iovec pointer */ - ptr += seg_array[seg]; - seg++; - *offset += new_op->downcall.resp.io.amt_complete; - total_count += new_op->downcall.resp.io.amt_complete; - amt_complete = new_op->downcall.resp.io.amt_complete; - - /* - tell the device file owner waiting on I/O that this read has - completed and it can return now. in this exact case, on - wakeup the device will free the op, so we *cannot* touch it - after this. 
- */ - wake_up_device_for_return(new_op); - pvfs_bufmap_put(buffer_index); - - /* if we got a short write, fall out and return what we got so - * far TODO: define semantics here- kind of depends on pvfs2 - * semantics that don't really exist yet - */ - if (amt_complete < each_count) - { - break; - } - } - - if (to_free) { - kfree(iovecptr); - kfree(seg_array); - } - if (total_count) - { -#ifdef HAVE_TOUCH_ATIME - touch_atime(file->f_vfsmnt, file->f_dentry); -#else - update_atime(inode); -#endif - } - return total_count; + return do_readv_writev(IO_WRITEV, file, iov, nr_segs, offset); } #ifdef HAVE_AIO_VFS_SUPPORT Index: src/kernel/linux-2.6/pvfs2-kernel.h =================================================================== RCS file: /anoncvs/pvfs2/src/kernel/linux-2.6/pvfs2-kernel.h,v retrieving revision 1.116 diff -u -r1.116 pvfs2-kernel.h --- src/kernel/linux-2.6/pvfs2-kernel.h 19 May 2006 21:37:11 -0000 1.116 +++ src/kernel/linux-2.6/pvfs2-kernel.h 26 May 2006 02:59:21 -0000 @@ -752,8 +752,9 @@ { \ wake_up_device_for_return(new_op); \ } \ + new_op = NULL; \ pvfs_bufmap_put(buffer_index); \ - *offset = original_offset; \ + buffer_index = -1; \ } while(0) #ifdef HAVE_AIO_VFS_SUPPORT
_______________________________________________ Pvfs2-developers mailing list Pvfs2-developers@beowulf-underground.org http://www.beowulf-underground.org/mailman/listinfo/pvfs2-developers