Re: [take33 10/10] kevent: Kevent based AIO (aio_sendfile()/aio_sendfile_path()).

2007-01-17 Thread Suparna Bhattacharya
> +static int kaio_add_kevent(int fd, struct kaio_req *req)
> +{
> +	struct ukevent uk;
> +	struct file *file;
> +	struct kevent_user *u;
> +	int err, need_fput = 0;
> +	u32 *cnt;
> +
> +	file = fget_light(fd, &need_fput);
> +	if (!file) {
> +		err = -EBADF;
> +		goto err_out;
> +	}
> +
> +	if (file->f_op != &kevent_user_fops) {
> +		err = -EINVAL;
> +		goto err_out_fput;
> +	}
> +
> +	u = file->private_data;
> +
> +	memset(&uk, 0, sizeof(struct ukevent));
> +
> +	uk.event = KEVENT_MASK_ALL;
> +	uk.type = KEVENT_AIO;
> +
> +	preempt_disable();
> +	uk.id.raw[0] = smp_processor_id();
> +	cnt = &__get_cpu_var(kaio_req_counter);
> +	uk.id.raw[1] = *cnt;
> +	*cnt = *cnt + 1;
> +	preempt_enable();
> +
> +	uk.req_flags = KEVENT_REQ_ONESHOT | KEVENT_REQ_ALWAYS_QUEUE;
> +	uk.ptr = req;
> +
> +	err = kevent_user_add_ukevent(&uk, u);
> +	if (err)
> +		goto err_out_fput;
> +
> +	kevent_user_get(u);
> +
> +	fput_light(file, need_fput);
> +
> +	return 0;
> +
> +err_out_fput:
> +	fput_light(file, need_fput);
> +err_out:
> +	return err;
> +}
> +
> +static void kaio_destructor(struct kaio_req *req)
> +{
> +	struct kaio_private *priv = req->priv;
> +	struct socket *sock = priv->dptr;
> +	struct file *file = priv->sptr;
> +
> +	fput(file);
> +	sockfd_put(sock);
> +
> +	kevent_storage_ready(&priv->kevent_user->st, NULL, KEVENT_MASK_ALL);
> +	kevent_user_put(priv->kevent_user);
> +
> +	kmem_cache_free(kaio_priv_cache, req->priv);
> +}
> +
> +static struct kaio_req *kaio_sendfile(int kevent_fd, int sock_fd,
> +		struct file *file, off_t offset, size_t count)
> +{
> +	struct kaio_req *req;
> +	struct socket *sock;
> +	struct kaio_private *priv;
> +	int err;
> +
> +	sock = sockfd_lookup(sock_fd, &err);
> +	if (!sock)
> +		goto err_out_exit;
> +
> +	priv = kmem_cache_alloc(kaio_priv_cache, GFP_KERNEL);
> +	if (!priv)
> +		goto err_out_sput;
> +
> +	priv->sptr = file;
> +	priv->dptr = sock;
> +	priv->offset = offset;
> +	priv->count = min_t(u64, i_size_read(file->f_dentry->d_inode), count);
> +	priv->processed = offset;
> +	priv->limit = 128;
> +
> +	req = kaio_add_call(NULL, &kaio_read_send_pages, -1, GFP_KERNEL);
> +	if (!req)
> +		goto err_out_free;
> +
> +	req->destructor = kaio_destructor;
> +	req->priv = priv;
> +
> +	err = kaio_add_kevent(kevent_fd, req);
> +
> +	dprintk("%s: req: %p, priv: %p, call: %p [%p], offset: %Lu, processed: %Lu, count: %Lu, err: %d.\n",
> +			__func__, req, priv, &kaio_read_send_pages,
> +			kaio_read_send_pages, priv->offset, priv->processed,
> +			priv->count, err);
> +
> +	if (err)
> +		goto err_out_remove;
> +
> +	kaio_schedule_req(req);
> +
> +	return req;
> +
> +err_out_remove:
> +	/* It is safe to just free the object since it is guaranteed that
> +	 * it was not queued for processing.
> +	 */
> +	kmem_cache_free(kaio_req_cache, req);
> +err_out_free:
> +	kmem_cache_free(kaio_priv_cache, priv);
> +err_out_sput:
> +	sockfd_put(sock);
> +err_out_exit:
> +	return NULL;
> +}
> +
> +asmlinkage long sys_aio_sendfile(int kevent_fd, int sock_fd, int in_fd,
> +		off_t offset, size_t count)
> +{
> +	struct kaio_req *req;
> +	struct file *file;
> +	int err;
> +
> +	file = fget(in_fd);
> +	if (!file) {
> +		err = -EBADF;
> +		goto err_out_exit;
> +	}
> +
> +	req = kaio_sendfile(kevent_fd, sock_fd, file, offset, count);
> +	if (!req) {
> +		err = -EINVAL;
> +		goto err_out_fput;
> +	}
> +
> +	return (long)req;
> +
> +err_out_fput:
> +	fput(file);
> +err_out_exit:
> +	return err;
> +}
> +
> +asmlinkage long sys_aio_sendfile_path(int kevent_fd, int sock_fd,
> +		char __user *filename, off_t offset, size_t count)
> +{
> +	char *tmp = getname(filename);
> +	int fd = PTR_ERR(tmp);
> +	int flags = O_RDONLY, err;
> +	struct nameidata nd;
> +	struct file *file;
> +	struct kaio_req *req;
> +
> +	if (force_o_largefile())
> +		flags = O_LARGEFILE;
> +
> +	if (IS_ERR(tmp)) {

Re: [take33 10/10] kevent: Kevent based AIO (aio_sendfile()/aio_sendfile_path()).

2007-01-18 Thread Suparna Bhattacharya
On Wed, Jan 17, 2007 at 05:39:51PM +0300, Evgeniy Polyakov wrote:
> On Wed, Jan 17, 2007 at 07:21:42PM +0530, Suparna Bhattacharya ([EMAIL PROTECTED]) wrote:
> >
> > Since you are implementing new APIs here, have you considered doing an
> > aio_sendfilev to be able to send a header with the data?
> 
> It is doable, but why do people not like corking?
> With Linux's sub-microsecond syscall overhead it is a better and more
> flexible solution, isn't it?
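
(For context, the corking in question is the TCP_CORK socket option; a minimal
userspace sketch of that approach - error handling omitted, and sock_fd,
file_fd, header, header_len, offset and count merely placeholders - is:)

	#include <netinet/in.h>
	#include <netinet/tcp.h>
	#include <sys/sendfile.h>
	#include <sys/socket.h>
	#include <unistd.h>

	int on = 1, off = 0;

	/* hold back partial frames until the cork is removed */
	setsockopt(sock_fd, IPPROTO_TCP, TCP_CORK, &on, sizeof(on));

	write(sock_fd, header, header_len);		/* queue the header */
	sendfile(sock_fd, file_fd, &offset, count);	/* queue the file data */

	/* uncork: header and data are flushed together in full-sized segments */
	setsockopt(sock_fd, IPPROTO_TCP, TCP_CORK, &off, sizeof(off));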

That is what I used to think as well. However ...

The problem as I understand it now is not about bunching data together, but
about ensuring some sort of atomicity between the header and the data when
there can be multiple outstanding aio requests on the same socket - i.e.
ensuring strict ordering, with no other data slipping in between, when the
data to be sent is not already in cache and another sendfile or aio write
request comes in for the same socket in the meantime - and doing so without
having to lock the socket while reading data from disk.

There are alternative ways to address this; aio_sendfilev is one of the options
I have heard people requesting.
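
To make that concrete, one hypothetical shape for such an interface - purely
illustrative, the structure and argument names below are not from any posted
patch - would be a header/trailer-vector variant of the proposed aio_sendfile():

	/* illustrative only - no such structure exists in the patchset */
	struct aio_hdtr {
		const struct iovec __user	*headers;	/* sent before the file data */
		int				hdr_cnt;
		const struct iovec __user	*trailers;	/* sent after the file data */
		int				trl_cnt;
	};

	asmlinkage long sys_aio_sendfilev(int kevent_fd, int sock_fd, int in_fd,
			struct aio_hdtr __user *hdtr,
			off_t offset, size_t count);

The point being that the kernel could then guarantee that header, file data
and trailer hit the socket as one ordered unit, without the caller corking
the socket or serialising other writers.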

Regards
Suparna

> 
> I'm not saying - 'no, there will not be any *v variants', just getting
> more info.
> 
> > Regards
> > Suparna
> 
> --
>   Evgeniy Polyakov

-- 
Suparna Bhattacharya ([EMAIL PROTECTED])
Linux Technology Center
IBM Software Lab, India



Re: [3/4] kevent: AIO, aio_sendfile() implementation.

2006-07-31 Thread Suparna Bhattacharya
On Thu, Jul 27, 2006 at 11:44:23AM -0700, Ulrich Drepper wrote:
> Badari Pulavarty wrote:
> > Before we spend too much time cleaning up and merging into mainline -
> > I would like an agreement that what we add is good enough for glibc
> > POSIX AIO.
> 
> I haven't seen a description of the interface so far.  Would be good if

Did Sébastien's mail with the description help?

> it existed.  But I briefly mentioned one quirk in the interface about
> which Suparna wasn't sure whether it's implemented/implementable in the
> current interface.
> 
> If a lio_listio call is made, the individual requests are handled just as
> if they'd been issued separately.  I.e., the notification specified in the
> individual aiocb is performed when the specific request is done.  Then,
> once all requests are done, another notification is made, this time
> controlled by the sigevent parameter of lio_listio.

Looking at the code in the lio kernel patch, this should already be covered:

	if (iocb->ki_signo)
		__aio_send_signal(iocb);

+	if (iocb->ki_lio)
+		lio_check(iocb->ki_lio);

That is, it first checks the notification in the individual iocb, and then
the one for the LIO.
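
For reference, the userspace semantics being described map onto a lio_listio()
call roughly like the following sketch (error handling omitted; the fd and
buffers are placeholders):

	#include <aio.h>
	#include <signal.h>
	#include <string.h>

	/* two reads, each with its own per-request sigevent, plus one
	 * list-wide sigevent that fires only when both have completed */
	static void submit_two(int fd, char *a, char *b, size_t len)
	{
		static struct aiocb cb[2];
		struct aiocb *list[2] = { &cb[0], &cb[1] };
		struct sigevent all_done;

		memset(cb, 0, sizeof(cb));
		memset(&all_done, 0, sizeof(all_done));

		cb[0].aio_fildes = fd;
		cb[0].aio_buf = a;
		cb[0].aio_nbytes = len;
		cb[0].aio_offset = 0;
		cb[0].aio_lio_opcode = LIO_READ;
		cb[0].aio_sigevent.sigev_notify = SIGEV_SIGNAL;	/* per-request notification */
		cb[0].aio_sigevent.sigev_signo = SIGRTMIN;

		cb[1] = cb[0];
		cb[1].aio_buf = b;
		cb[1].aio_offset = len;

		all_done.sigev_notify = SIGEV_SIGNAL;		/* whole-list notification */
		all_done.sigev_signo = SIGRTMIN + 1;

		lio_listio(LIO_NOWAIT, list, 2, &all_done);
	}

Each cb[i] notification corresponds to the __aio_send_signal() path above, and
all_done corresponds to the lio_check() path.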

> 
> 
> Another feature which I always wanted: the current lio_listio call
> returns in blocking mode only if all requests are done.  In non-blocking
> mode it returns immediately and the program needs to poll the aiocbs.
> What is needed is something in the middle.  For instance, if multiple
> read requests are issued the program might be able to start working as
> soon as one request is satisfied.  I.e., a call similar to lio_listio
> would be nice which also takes another parameter specifying how many of
> the NENT aiocbs have to finish before the call returns.

I imagine the kernel could enable this by incorporating this additional
parameter for IOCB_CMD_GROUP in the ABI (in the default case this should be the
same as the total number of iocbs submitted to lio_listio). Now, should the
at-least-NENT check apply only to LIO_WAIT, or also to the LIO_NOWAIT
notification case?

BTW, the native io_getevents does support a min_nr wakeup already, except that
it applies to any iocb on the io_context, and not just a given lio_listio call.
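
For comparison, the existing min_nr behaviour looks like this with libaio
(sketch only; it waits on the whole io_context, not on one listio group):

	#include <libaio.h>

	/* block until at least 'min_done' of the outstanding iocbs on this
	 * context have completed, harvesting up to 128 of them */
	static int wait_for_some(io_context_t ctx, long min_done)
	{
		struct io_event events[128];

		return io_getevents(ctx, min_done, 128, events, NULL);
	}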

Regards
Suparna


-- 
Suparna Bhattacharya ([EMAIL PROTECTED])
Linux Technology Center
IBM Software Lab, India



Kernel patches enabling better POSIX AIO (Was Re: [3/4] kevent: AIO, aio_sendfile)

2006-08-12 Thread Suparna Bhattacharya
> > [...] in include/linux/aio_abi.h
> > 
> >   A struct lio_event is added in include/linux/aio.h
> > 
> >   A struct lio_event *ki_lio is added to struct iocb in include/linux/aio.h
> 
> So you have a pointer in the structure for the individual requests.  I
> assume you use the atomic counter to trigger the final delivery.  I
> further assume that if lio_wait is set the calling thread is suspended
> until all requests are handled and that the final notification in this
> case means that thread gets woken.
> 
> This is all fine.
> 
> But how do you pass the requests to the kernel?  If you have a new
> lio_listio-like syscall it'll be easy.  But I haven't seen anything like
> this mentioned.
> 
> The alternative is to pass the requests one-by-one in which case I don't
> see how you create the reference to the lio_listio control block.  This
> approach seems to be slower.

The way it works (and better ideas are welcome) is that, since the io_submit()
syscall already accepts an array of iocbs[], no new syscall was introduced.
To implement lio_listio, one has to set up such an array, with the first iocb
in the array having the special (new) grouping opcode of IOCB_CMD_GROUP which
specifies the sigev notification to be associated with group completion
(a NULL value of the sigev notification pointer would imply equivalent of
LIO_WAIT). The following iocbs in the array should correspond to the set of
listio aiocbs. Whenever it encounters an IOCB_CMD_GROUP iocb opcode, the
kernel would interpret all subsequent iocbs[] submitted in the same
io_submit() call to be associated with the same lio control block. 

Does that clarify?

Would an example help?
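
Roughly, such a submission could be set up as in the sketch below, using the
raw kernel ABI iocb layout from linux/aio_abi.h (the field used to carry the
group sigevent pointer is only a guess at the ABI, and the per-request iocb
setup is abbreviated):

	struct iocb group, a, b, c;
	struct iocb *list[4] = { &group, &a, &b, &c };
	struct sigevent group_ev;	/* group-completion notification */

	memset(&group, 0, sizeof(group));
	group.aio_lio_opcode = IOCB_CMD_GROUP;		/* first iocb: grouping only */
	group.aio_buf = (__u64)(unsigned long)&group_ev;	/* group sigev; field choice illustrative,
								 * NULL here would mean LIO_WAIT */

	/* the remaining iocbs are the ordinary listio requests */
	a.aio_lio_opcode = IOCB_CMD_PREAD;
	b.aio_lio_opcode = IOCB_CMD_PREAD;
	c.aio_lio_opcode = IOCB_CMD_PWRITE;
	/* ... aio_fildes, aio_buf, aio_nbytes, aio_offset for a, b, c ... */

	io_submit(ctx, 4, list);	/* kernel ties a, b and c to one lio control block */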

> 
> If all requests are passed at once, do you have the equivalent of
> LIO_NOP entries?
> 

Good question - we do have an IOCB_CMD_NOOP defined, and I seem to even
recall a patch that implemented it, but am wondering if it ever got merged.
Ben/Zach?

> 
> How can we support the extension where we wait for a number of requests
> which need not be all of them.  I.e., I submit N requests and want to be
> notified when at least M (M <= N) notified.  I am not yet clear about
> the actual semantics we should implement (e.g., do we send another
> notification after the first one?) but it's something which IMO should
> be taken into account in the design.
> 

My thought here was that it should be possible to include M as a parameter
to the IOCB_CMD_GROUP opcode iocb, and thus incorporated in the lio control
block ... then whatever semantics are agreed upon can be implemented.

> 
> Finally, and this is very important, does your code send out the
> individual requests' notifications and then, at the end, the lio_listio
> completion?  I think Suparna wrote this is the case but I want to make sure.

Sébastien, could you confirm?

> 
> 
> Overall, this looks much better than the old code.  If the answers to my
> questions show that the behavior is compatible with the POSIX AIO code
> I'm certainly very much in favor of adding the kernel code.

Thanks a lot for looking through this !
Let us know what you think about the listio interface ... hopefully the
other issues are mostly simple to resolve.

Regards
Suparna

> 
> -- 
> ➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖
> 



-- 
Suparna Bhattacharya ([EMAIL PROTECTED])
Linux Technology Center
IBM Software Lab, India



Re: Kernel patches enabling better POSIX AIO (Was Re: [3/4] kevent: AIO, aio_sendfile)

2006-08-14 Thread Suparna Bhattacharya
On Sat, Aug 12, 2006 at 12:10:35PM -0700, Ulrich Drepper wrote:
> Suparna Bhattacharya wrote:
> > I am wondering about that too. IIRC, the IO_NOTIFY_* constants are not
> > part of the ABI, but only internal to the kernel implementation. I think
> > Zach had suggested inferring THREAD_ID notification if the pid specified
> > is not zero. But I don't see why ->sigev_notify couldn't be used directly
> > (just like the POSIX timers code does), thus doing away with the
> > new constants altogether. Sébastien/Laurent, do you recall?
> 
> I suggest to model the implementation after the timer code which does
> exactly what we need.

Agreed.
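
For the record, the relevant pattern in the timer code is good_sigevent() in
kernel/posix-timers.c; an AIO-side equivalent would presumably look something
like the sketch below (function name and details are mine, not from any patch):

	/* resolve the notification target straight from sigev_notify,
	 * so no new IO_NOTIFY_* constants are needed in the ABI */
	static struct task_struct *aio_good_sigevent(sigevent_t *event)
	{
		struct task_struct *task = current->group_leader;

		if (event->sigev_notify & SIGEV_THREAD_ID) {
			task = find_task_by_pid(event->sigev_notify_thread_id);
			if (!task || task->tgid != current->tgid)
				return NULL;	/* must stay within the submitting process */
		}

		if (event->sigev_notify != SIGEV_NONE &&
		    (event->sigev_signo <= 0 || event->sigev_signo > SIGRTMAX))
			return NULL;

		return task;	/* keep a task reference rather than a bare pid */
	}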

> 
> 
> > I'm guessing they are being used for validation of permissions at the time
> > of sending the signal, but maybe saving the task pointer in the iocb instead
> > of the pid would suffice?
> 
> Why should any verification be necessary?  The requests are generated in
> the same process which will receive the notification.  Even if the POSIX
> process (aka, kernel process group) changes the IDs the notifications
> should be sent.  The key is that notifications cannot be sent to another
> POSIX process.

Is there a (remote) possibility that the thread could have died and its
pid got reused by a new thread in another process? Or is there a mechanism
that prevents such a possibility from arising (not just in the NPTL library,
but at the kernel level)?

I think the timer code saves a reference to the task pointer instead of
the pid, which is what I was suggesting above (instead of the euid checks),
as a way to avoid the above situation.

> 
> Adding this as a feature just makes things so much more complicated.
> 
> 
> > So I think the
> > intended behaviour is as you describe it should be
> 
> Then the documentation needs to be adjusted.

*Nod*

> 
> 
> > The way it works (and better ideas are welcome) is that, since the
> > io_submit() syscall already accepts an array of iocbs[], no new syscall
> > was introduced. To implement lio_listio, one has to set up such an array,
> > with the first iocb in the array having the special (new) grouping opcode
> > of IOCB_CMD_GROUP which specifies the sigev notification to be associated
> > with group completion (a NULL value of the sigev notification pointer
> > would imply equivalent of LIO_WAIT).
> 
> OK, this seems OK.  We have to construct the iocb arrays dynamically anyway.
> 
> 
> > My thought here was that it should be possible to include M as a parameter
> > to the IOCB_CMD_GROUP opcode iocb, and thus incorporated in the lio control
> > block ... then whatever semantics are agreed upon can be implemented.
> 
> If you have room for the parameter this is fine.  For the beginning we
> can enforce the number to be the same as the total number of requests.
> 

Sounds good.

> 
> > Let us know what you think about the listio interface ... hopefully the
> > other issues are mostly simple to resolve.
> 
> It should be fine and I would support adding all this assuming the
> normal file support (as opposed to direct I/O only) is added, too.

OK. I updated my patchset against 2.6.18-rc3 just after OLS.

> 
> 
> But I have one last question: sockets, pipes and the like are already
> supported, right?  If this is not the case we have a problem with the
> currently proposed  lio_listio interface.

AIO for pipes should not be a problem - Chris Mason had a patch, so we can
just bring it up to the current levels, possibly with some additional
improvements.

I'm not sure what would be the right thing to do for the sockets case. While
we could put together a patch for basic aio_read/write (based on the same
model used for files), given the whole ongoing kevent effort, its not yet
clear to me what would make the most sense ...  

Ben had a patch to do a fallback to kernel threads for AIO operations that
are not yet supported natively. I had some concerns about the approach, but
I guess he had intended it as an interim path for cases like this.

Suggestions would be much appreciated - DaveM, Ingo, Andrew?

Regards
Suparna

> 
> -- 
> ➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖
> 



-- 
Suparna Bhattacharya ([EMAIL PROTECTED])
Linux Technology Center
IBM Software Lab, India
