Re: [PATCH 3/3] change sematics of read flag

2005-02-01 Thread Mitch Williams


On Tue, 1 Feb 2005, Greg KH wrote:

>
> Ick, no.  Pulled back out, as it doesn't even compile :(
>

Ick, indeed.  Yet another victim of my sneaky attempt to whack one large
patch up into three small patches.  Let's just forget this patch ever
existed, since it doesn't add any new functionality and probably shrinks
the running kernel by a good three bytes.

And I'll quit trying to be sneaky.

Thanks again for your help and patience on this stuff.

-Mitch
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH] reject sysfs writes with nonzero position

2005-02-01 Thread Mitch Williams
This patch causes an error to be returned if the user attempts to write a
sysfs file at a nonzero offset.  Because sysfs store methods do not
support a position parameter, the behavior in this case would be
unpredictable.  Thus, we return an error, because at least now we're
consistent.

This patch generated from 2.6.11-rc1.

Now with tabs.

Signed-off-by:  Mitch Williams <[EMAIL PROTECTED]>

diff -urpN -X dontdiff linux-2.6.11-clean/fs/sysfs/file.c 
linux-2.6.11/fs/sysfs/file.c
--- linux-2.6.11-clean/fs/sysfs/file.c  2004-12-24 13:33:50.0 -0800
+++ linux-2.6.11/fs/sysfs/file.c2005-01-25 10:47:15.0 -0800
@@ -232,6 +232,8 @@ sysfs_write_file(struct file *file, cons
 {
struct sysfs_buffer * buffer = file->private_data;

+   if (*ppos > 0)
+   return -EIO;
down(&buffer->sem);
count = fill_write_buffer(buffer,buf,count);
if (count > 0)
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 1/3] disallow seeks and appends on sysfs files

2005-01-21 Thread Mitch Williams
This patch causes sysfs to return errors if the caller attempts to append
to or seek on a sysfs file.

Generated from 2.6.11-rc1.

Signed-off-by: Mitch Williams <[EMAIL PROTECTED]>

diff -uprN -X dontdiff linux-2.6.11-clean/fs/sysfs/file.c 
linux-2.6.11/fs/sysfs/file.c
--- linux-2.6.11-clean/fs/sysfs/file.c  2004-12-24 13:33:50.0 -0800
+++ linux-2.6.11/fs/sysfs/file.c2005-01-21 13:09:21.0 -0800
@@ -281,6 +281,11 @@ static int check_perm(struct inode * ino
if (!ops)
goto Eaccess;

+   /* Is the file is open for append?  Sorry, we don't do that. */
+   if (file->f_flags & O_APPEND) {
+   goto Einval;
+   }
+
/* File needs write support.
 * The inode's perms must say it's ok,
 * and we must have a store method.
@@ -312,6 +302,10 @@ static int check_perm(struct inode * ino
file->private_data = buffer;
} else
error = -ENOMEM;
+
+   /*  Set mode bits to disallow seeking.  */
+   file->f_mode &= ~(FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE);
+
goto Done;

  Einval:
@@ -368,7 +343,7 @@ static int sysfs_release(struct inode *
 struct file_operations sysfs_file_operations = {
.read   = sysfs_read_file,
.write  = sysfs_write_file,
-   .llseek = generic_file_llseek,
+   .llseek = no_llseek,
.open   = sysfs_open_file,
.release= sysfs_release,
 };
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 2/3] buffer writes to sysfs

2005-01-21 Thread Mitch Williams
This patch buffers writes to sysfs files and flushes them to the kobject
owner when the file is closed.

Generated from 2.6.11-rc1.

Signed-off-by:  Mitch Williams <[EMAIL PROTECTED]>

diff -uprN -X dontdiff linux-2.6.11-clean/fs/sysfs/file.c 
linux-2.6.11/fs/sysfs/file.c
--- linux-2.6.11-clean/fs/sysfs/file.c  2004-12-24 13:33:50.0 -0800
+++ linux-2.6.11/fs/sysfs/file.c2005-01-21 13:09:21.0 -0800
@@ -62,6 +62,7 @@ struct sysfs_buffer {
struct sysfs_ops* ops;
struct semaphoresem;
int read_filled;
+   int needs_write_flush;
 };


@@ -178,7 +178,8 @@ out:
  */

 static int
-fill_write_buffer(struct sysfs_buffer * buffer, const char __user * buf, 
size_t count)
+fill_write_buffer(struct sysfs_buffer *buffer, const char __user * buf,
+ size_t count, size_t pos)
 {
int error;

@@ -187,10 +189,11 @@ fill_write_buffer(struct sysfs_buffer *
if (!buffer->page)
return -ENOMEM;

-   if (count >= PAGE_SIZE)
-   count = PAGE_SIZE - 1;
+   if (count + pos > PAGE_SIZE)
+   count = (PAGE_SIZE - 1) - pos;
error = copy_from_user(buffer->page,buf,count);
-   buffer->needs_read_fill = 1;
+   buffer->needs_write_flush = 1;
+buffer->count = pos + count;
return error ? -EFAULT : count;
 }

@@ -206,13 +209,13 @@ fill_write_buffer(struct sysfs_buffer *
  */

 static int
-flush_write_buffer(struct dentry * dentry, struct sysfs_buffer * buffer, 
size_t count)
+flush_write_buffer(struct dentry * dentry, struct sysfs_buffer * buffer)
 {
struct attribute * attr = to_attr(dentry);
struct kobject * kobj = to_kobj(dentry->d_parent);
struct sysfs_ops * ops = buffer->ops;

-   return ops->store(kobj,attr,buffer->page,count);
+   return ops->store(kobj,attr, buffer->page, buffer->count);
 }


@@ -225,12 +228,9 @@ flush_write_buffer(struct dentry * dentr
  *
  * Similar to sysfs_read_file(), though working in the opposite direction.
  * We allocate and fill the data from the user in fill_write_buffer(),
- * then push it to the kobject in flush_write_buffer().
- * There is no easy way for us to know if userspace is only doing a partial
- * write, so we don't support them. We expect the entire buffer to come
- * on the first write.
- * Hint: if you're writing a value, first read the file, modify only the
- * the value you're changing, then write entire buffer back.
+ * but don't push it to the kobject until the file is closed.
+ *  This allows for buffered writes, but unfortunately also hides error
+ *  codes returned individual store functions until close time.
  */

 static ssize_t
@@ -238,10 +238,10 @@ sysfs_write_file(struct file *file, cons
 {
struct sysfs_buffer * buffer = file->private_data;

+   if (*ppos >= PAGE_SIZE)
+   return -ENOSPC;
down(&buffer->sem);
-   count = fill_write_buffer(buffer,buf,count);
-   if (count > 0)
-   count = flush_write_buffer(file->f_dentry,buffer,count);
+   count = fill_write_buffer(buffer, buf, count, *ppos);
if (count > 0)
*ppos += count;
up(&buffer->sem);
@@ -337,6 +337,17 @@ static int sysfs_release(struct inode *
struct attribute * attr = to_attr(filp->f_dentry);
struct module * owner = attr->owner;
struct sysfs_buffer * buffer = filp->private_data;
+int ret;
+
+   /* If data has been written to the file, then flush it
+* back to the kobject's store function here.
+*/
+   if (buffer && kobj) {
+   down(&buffer->sem);
+   if (buffer->needs_write_flush)
+   ret = flush_write_buffer(filp->f_dentry, buffer);
+   up(&buffer->sem);
+   }

if (kobj)
kobject_put(kobj);
@@ -348,7 +352,10 @@ static int sysfs_release(struct inode *
free_page((unsigned long)buffer->page);
kfree(buffer);
}
-   return 0;
+   /* If flush_write_buffer returned an error, pass it up.
+* Otherwise, return success.
+*/
+   return (ret < 0 ? ret : 0);
 }

 struct file_operations sysfs_file_operations = {
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 3/3] change sematics of read flag

2005-01-21 Thread Mitch Williams
This patch reverses the semantics of the read fill flag, getting rid of an
extra assignment at allocation time.

Generated from 2.6.11-rc1.

Signed-off-by:  Mitch Williams <[EMAIL PROTECTED]>

diff -uprN -X dontdiff linux-2.6.11-clean/fs/sysfs/file.c 
linux-2.6.11/fs/sysfs/file.c
--- linux-2.6.11-clean/fs/sysfs/file.c  2004-12-24 13:33:50.0 -0800
+++ linux-2.6.11/fs/sysfs/file.c2005-01-21 13:09:21.0 -0800
@@ -61,7 +61,7 @@ struct sysfs_buffer {
char* page;
struct sysfs_ops* ops;
struct semaphoresem;
-   int needs_read_fill;
+   int read_filled;
 };


@@ -89,7 +89,7 @@ static int fill_read_buffer(struct dentr
return -ENOMEM;

count = ops->show(kobj,attr,buffer->page);
-   buffer->needs_read_fill = 0;
+   buffer->read_filled = 1;
BUG_ON(count > (ssize_t)PAGE_SIZE);
if (count >= 0)
buffer->count = count;
@@ -154,7 +154,7 @@ sysfs_read_file(struct file *file, char
ssize_t retval = 0;

down(&buffer->sem);
-   if (buffer->needs_read_fill) {
+   if (!buffer->read_filled) {
if ((retval = fill_read_buffer(file->f_dentry,buffer)))
goto out;
}
@@ -308,7 +307,6 @@ static int check_perm(struct inode * ino
if (buffer) {
memset(buffer,0,sizeof(struct sysfs_buffer));
init_MUTEX(&buffer->sem);
-   buffer->needs_read_fill = 1;
buffer->ops = ops;
file->private_data = buffer;
} else
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


re: sysfs patches

2005-01-21 Thread Mitch Williams
My apologies -- I appear to have sent the patches out in reverse order.
Please apply patch 3 before the other two.

This is the first time I've used our automated tools to make small patches
out of big ones, but I think I have it figured out now.

Thanks for your patience.

-Mitch Williams

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 1/3] disallow seeks and appends on sysfs files

2005-01-24 Thread Mitch Williams


On Sat, 22 Jan 2005, Greg KH wrote:

>
> On Fri, Jan 21, 2005 at 02:49:39PM -0800, Mitch Williams wrote:
> > This patch causes sysfs to return errors if the caller attempts to
> append
> > to or seek on a sysfs file.
>
> And what happens to it today if you do either of these?
>
> Also, isn't this two different things?
>
Appending and seeking are obviously two different operations, but the
result is the same to the sysfs file system.  Because the store method
doesn't have an offset argument, it must assume that all writes are based
from the beginning of the buffer.

So if your sysfs file contains "123" and you do
   echo "45" >> mysysfsfile
instead of the expected "12345", you end up with "45" in the file with no
errors.  Opening the file, seeking, and writing gives the same type of
behavior, with no errors.

This patch just sets a few flags to make sure that errors are returned
when this behavior is seen.  Logically then, the two "features" do the
same thing (set flags), and prevent the same behavior (writing wrong
contents without error).  Hence, I grouped them into one patch.

However, if you want two even simpler patches, I'm willing to comply.  Of
the three patches I sent, this is the most important to me.


>
> Please, no {} for one line if statements.  Like the one above it :)

I'll be glad to fix this and resubmit.  I prefer to not have braces
either, but I've seen a bunch of places in the kernel where people do it,
so I really wasn't sure which was right.  It's not really called out in
the coding style doc either.
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 2/3] buffer writes to sysfs

2005-01-24 Thread Mitch Williams


On Sat, 22 Jan 2005, Greg KH wrote:

>
> On Fri, Jan 21, 2005 at 02:52:29PM -0800, Mitch Williams wrote:
> > This patch buffers writes to sysfs files and flushes them to the
> kobject
> > owner when the file is closed.
>
> Why?  This breaks the way things work today, right?
>
> What is this patch trying to fix?
>

To be honest, I'm a bit ambivalent about this patch.  I wrote this code in
response to a bug filed by our test lab.  If you write a bunch (e.g. > 1K)
of data to a sysfs file, the c library splits it up into multiple writes
of 1K or less.  Because the kobject store method doesn't support offsets,
and because the call to copy_from_user doesn't honor offsets, you end up
with multiple calls to the store method, with incorrect results and no
error code.

Thus, this patch, which coalesces all writes to the file (up to PAGE_SIZE)
into one write.

To the typical user, there's really no difference in behavior, unless you
are writing a ton of data into the file.  Of course, there's the obvious
question of why you'd want to do so...

If you don't like this patch, I would say that there are a couple of
alternatives:

First, we could fix the call to copy_from_user to honor offsets and allow
multiple writes of the same data to sysfs.  In other words, writing a long
block of data to a sysfs file would result in multiple calls to the store
method, each one containing more data in the buffer.  Thus, the first 1K
of data may be processed up to 4 times if the data fills the buffer.

Second, we could just have sysfs_write_file return an error anytime *ppos
is nonzero.  It's quick and dirty but at least the caller would get some
sort of error.

The patch as presented has the disadvantage of hiding possible error codes
from the store method util the file is closed, but has the advantage of
allowing > 1K writes with only a single call to the store method.  Would
it make a difference in the real world?  I don't know, but I know it'll
make our test lab guys happy.



> > +  * Otherwise, return success.
> > +  */
> > + return (ret < 0 ? ret : 0);
>
> I think this comment is not needed.  Actually, what's wrong with just:
> return ret;
In this case, ret could contain a positive result.  Since a lot of
user-space apps just look for a nonzero result from close(), it seemed
safer to suppres positive return codes in the case of success.

Thanks for your patience with this set of patches.

-Mitch
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 1/2] Disallow appends to sysfs files

2005-01-25 Thread Mitch Williams
This patch causes an error to be returned if the caller attempts to open a
sysfs file in append mode.

This patch applies cleanly to 2.6.11-rc1.

Signed-off-by: Mitch Williams <[EMAIL PROTECTED]>

diff -urpN -X dontdiff linux-2.6.11-clean/fs/sysfs/file.c 
linux-2.6.11/fs/sysfs/file.c
--- linux-2.6.11-clean/fs/sysfs/file.c  2004-12-24 13:33:50.0 -0800
+++ linux-2.6.11/fs/sysfs/file.c2005-01-24 16:26:21.0 -0800
@@ -275,6 +275,11 @@ static int check_perm(struct inode * ino
if (!ops)
goto Eaccess;

+   /* Is the file is open for append?  Sorry, we don't do that. */
+   if (file->f_flags & O_APPEND) {
+   goto Einval;
+   }
+
/* File needs write support.
 * The inode's perms must say it's ok,
 * and we must have a store method.

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 2/2] Disallow seeks on sysfs files.

2005-01-25 Thread Mitch Williams
This patch causes an error return if the user attempts to seek on a sysfs
file.

The patch was generated from 2.6.11-rc1.

Signed-off-by:  Mitch Williams <[EMAIL PROTECTED]>

diff -urpN -X dontdiff linux-2.6.11-clean/fs/sysfs/file.c 
linux-2.6.11/fs/sysfs/file.c
--- linux-2.6.11-clean/fs/sysfs/file.c  2004-12-24 13:33:50.0 -0800
+++ linux-2.6.11/fs/sysfs/file.c2005-01-25 10:28:25.0 -0800
@@ -307,6 +307,10 @@ static int check_perm(struct inode * ino
file->private_data = buffer;
} else
error = -ENOMEM;
+
+   /*  Set mode bits to disallow seeking.  */
+   file->f_mode &= ~(FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE);
+
goto Done;

  Einval:
@@ -349,7 +353,7 @@ static int sysfs_release(struct inode *
 struct file_operations sysfs_file_operations = {
.read   = sysfs_read_file,
.write  = sysfs_write_file,
-   .llseek = generic_file_llseek,
+   .llseek = no_llseek,
.open   = sysfs_open_file,
.release= sysfs_release,
 };
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 1/2] Disallow appends to sysfs files

2005-01-25 Thread Mitch Williams


On Tue, 25 Jan 2005, Josh Boyer wrote:
>
> Could you drop the braces?  Coding style thing.
>
Yeah, I'll drop and resubmit.  I've got so many different things going on
with these patches, I'm getting confused.  I know Greg asked for this same
change a while back but it got lost.

Resubmittal will commence in a few minutes.

-Mitch
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 2/3] buffer writes to sysfs

2005-01-25 Thread Mitch Williams


On Mon, 24 Jan 2005, Greg KH wrote:
>
> Who is trying to send > 1K to a sysfs file?  Remember, sysfs files are
> for 1 value only.  If you consider > 1K a "single value" please point me
> to that part of the kernel that does that.
>
> > To the typical user, there's really no difference in behavior, unless
> you
> > are writing a ton of data into the file.  Of course, there's the
> obvious
> > question of why you'd want to do so...
>
> Exactly, you should not be doing that anyway.  So, because of that, I
> really don't want/like this patch.


OK, I've had a day to think about this, and I think I have a good answer
now.

Leaving aside the issue of how big a 'single object' is, we still have to
consider the possibility that a user _will_ indeed someday try to write 4K
(or more) to a sysfs file.  It's just going to happen.  And right now, the
kernel's behavior in that event is unpredictable, because we don't know
how the c library is going to buffer this write.

Right now, on my FC3 box, writing a large amount of data to a sysfs file
results in multiple writes of 1K to the file.  What my store method sees
then is multiple calls, each with 1K of data.  Each time, I have to assume
what I see is the complete contents of the write, and I have to process it
as such.

So if my sysfs file contains FOO, and the user writes BAR followed by 3k
of garbage, I'm not going to end up with FOO, or even BAR, but I'll end up
with whatever garbage I see at the beginning of the third 1K write.

The real problem is not that I get wrong values -- my store method should
handle this -- but that there are no errors returned from this operation.
The only way the user can tell that something is wrong is if a) I write a
message to the log telling what I did in my store method, and b) the user
checks the log.

My original write buffering patch fixes this problem, and allows up to 4K
to be consolidated into a single call to the store method.  It doesn't
seem to affect normal operation of my test system (nor those in our test
lab), but does hide error code returns from store methods.  And I can see
why you would be disinclined to accept such a patch.

While we may want to consider the possibility that a 'single object' may
someday grow large (crypto key maybe?), I can live without write buffering
right now.

But at the very least, we still need to handle this failure case.   I've
tested the following patch and it does resolve the issue.  However, it now
limits the size of sysfs writes to the size of the c library's buffer.

----

This patch returns an error code if the user trys to write data to a sysfs
file at any offset other than 0.

Signed-off-by:  Mitch Williams <[EMAIL PROTECTED]>

diff -urpN -X dontdiff linux-2.6.11-clean/fs/sysfs/file.c 
linux-2.6.11/fs/sysfs/file.c
--- linux-2.6.11-clean/fs/sysfs/file.c  2004-12-24 13:33:50.0 -0800
+++ linux-2.6.11/fs/sysfs/file.c2005-01-25 10:47:15.0 -0800
@@ -232,6 +232,8 @@ sysfs_write_file(struct file *file, cons
 {
struct sysfs_buffer * buffer = file->private_data;

+if (*ppos > 0)
+return -EIO;
down(&buffer->sem);
count = fill_write_buffer(buffer,buf,count);
if (count > 0)
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Resubmittal [PATCH 1/2] Disallow appends to sysfs files

2005-01-25 Thread Mitch Williams
This patch returns an error code if the caller attempts to open a sysfs
file for appending.

Generated from 2.6.11-rc2.

Signed-off-by:  Mitch Williams <[EMAIL PROTECTED]>

diff -urpN -X dontdiff linux-2.6.11-clean/fs/sysfs/file.c 
linux-2.6.11/fs/sysfs/file.c
--- linux-2.6.11-clean/fs/sysfs/file.c  2004-12-24 13:33:50.0 -0800
+++ linux-2.6.11/fs/sysfs/file.c2005-01-25 14:59:33.0 -0800
@@ -275,6 +275,15 @@ static int check_perm(struct inode * ino
if (!ops)
goto Eaccess;

+/* Return error if the file is open for append.
+ * Sysfs can't support append because the kobject
+ * store methods don't take an offset into the buffer
+ * as an argument.  They end up thinking the appended
+ * data is the entire contents of the file.
+ */
+   if (file->f_flags & O_APPEND)
+   goto Einval;
+
/* File needs write support.
 * The inode's perms must say it's ok,
 * and we must have a store method.
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 2.6.21-rc4] Flush writes to MSI-X table

2007-03-22 Thread Mitch Williams
Because both MSI-X interrupt messages and MSI-X table writes are posted,
it's possible for them to cross while in-flight.  This results in
interrupts being received long after the kernel thinks they're disabled,
and in interrupts being sent to stale vectors after rebalancing.

This patch performs a read flush after writes to the MSI-X table for
enable/disable and rebalancing operations.  Because this is an expensive
operation, we do not perform the read flush after mask/unmask
operations.  Hardware which supports MSI-X typically also supports some
sort of interrupt moderation, so a read-flush is not necessary for
mask/unmask operations.

This patch has been validated with (unreleased) network hardware which
uses MSI-X.

Signed-off-by: Mitch Williams <[EMAIL PROTECTED]>

diff -urpN -X dontdiff linux-2.6.21-rc4-clean/arch/i386/kernel/io_apic.c 
linux-2.6.21-rc4/arch/i386/kernel/io_apic.c
--- linux-2.6.21-rc4-clean/arch/i386/kernel/io_apic.c   2007-03-19 
16:16:30.0 -0700
+++ linux-2.6.21-rc4/arch/i386/kernel/io_apic.c 2007-03-19 16:24:05.0 
-0700
@@ -2594,6 +2594,8 @@ static void set_msi_irq_affinity(unsigne
  */
 static struct irq_chip msi_chip = {
.name   = "PCI-MSI",
+   .enable = enable_msi_irq,
+   .disable= disable_msi_irq,
.unmask = unmask_msi_irq,
.mask   = mask_msi_irq,
.ack= ack_ioapic_irq,
diff -urpN -X dontdiff linux-2.6.21-rc4-clean/arch/ia64/kernel/msi_ia64.c 
linux-2.6.21-rc4/arch/ia64/kernel/msi_ia64.c
--- linux-2.6.21-rc4-clean/arch/ia64/kernel/msi_ia64.c  2007-03-19 
16:16:30.0 -0700
+++ linux-2.6.21-rc4/arch/ia64/kernel/msi_ia64.c2007-03-19 
16:24:05.0 -0700
@@ -121,6 +121,8 @@ static int ia64_msi_retrigger_irq(unsign
  */
 static struct irq_chip ia64_msi_chip = {
.name   = "PCI-MSI",
+   .enable = enable_msi_irq,
+   .disable= disable_msi_irq,
.mask   = mask_msi_irq,
.unmask = unmask_msi_irq,
.ack= ia64_ack_msi_irq,
diff -urpN -X dontdiff linux-2.6.21-rc4-clean/arch/ia64/sn/kernel/msi_sn.c 
linux-2.6.21-rc4/arch/ia64/sn/kernel/msi_sn.c
--- linux-2.6.21-rc4-clean/arch/ia64/sn/kernel/msi_sn.c 2007-03-19 
16:16:30.0 -0700
+++ linux-2.6.21-rc4/arch/ia64/sn/kernel/msi_sn.c   2007-03-19 
16:24:05.0 -0700
@@ -224,6 +224,8 @@ static int sn_msi_retrigger_irq(unsigned
 
 static struct irq_chip sn_msi_chip = {
.name   = "PCI-MSI",
+   .enable = enable_msi_irq,
+   .disable= disable_msi_irq,
.mask   = mask_msi_irq,
.unmask = unmask_msi_irq,
.ack= sn_ack_msi_irq,
diff -urpN -X dontdiff linux-2.6.21-rc4-clean/arch/x86_64/kernel/io_apic.c 
linux-2.6.21-rc4/arch/x86_64/kernel/io_apic.c
--- linux-2.6.21-rc4-clean/arch/x86_64/kernel/io_apic.c 2007-03-19 
16:16:31.0 -0700
+++ linux-2.6.21-rc4/arch/x86_64/kernel/io_apic.c   2007-03-21 
12:44:51.0 -0700
@@ -1942,6 +1942,7 @@ static void set_msi_irq_affinity(unsigne
if (cpus_empty(tmp))
return;
 
+   msix_flush_writes(irq);
if (assign_irq_vector(irq, mask))
return;
 
@@ -1956,6 +1957,7 @@ static void set_msi_irq_affinity(unsigne
msg.address_lo |= MSI_ADDR_DEST_ID(dest);
 
write_msi_msg(irq, &msg);
+   msix_flush_writes(irq);
irq_desc[irq].affinity = mask;
 }
 #endif /* CONFIG_SMP */
@@ -1966,6 +1968,8 @@ static void set_msi_irq_affinity(unsigne
  */
 static struct irq_chip msi_chip = {
.name   = "PCI-MSI",
+   .enable = enable_msi_irq,
+   .disable= disable_msi_irq,
.unmask = unmask_msi_irq,
.mask   = mask_msi_irq,
.ack= ack_apic_edge,
diff -urpN -X dontdiff linux-2.6.21-rc4-clean/drivers/pci/msi.c 
linux-2.6.21-rc4/drivers/pci/msi.c
--- linux-2.6.21-rc4-clean/drivers/pci/msi.c2007-03-19 16:16:32.0 
-0700
+++ linux-2.6.21-rc4/drivers/pci/msi.c  2007-03-21 12:44:51.0 -0700
@@ -68,6 +68,29 @@ static void msix_set_enable(struct pci_d
}
 }
 
+void msix_flush_writes(unsigned int irq)
+{
+   struct msi_desc *entry;
+
+   entry = get_irq_msi(irq);
+   BUG_ON(!entry || !entry->dev);
+   switch (entry->msi_attrib.type) {
+   case PCI_CAP_ID_MSI:
+   /* nothing to do */
+   break;
+   case PCI_CAP_ID_MSIX:
+   {
+   int offset = entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE +
+   PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET;
+   readl(entry->mask_base + offset);
+   break;
+   }
+   default:
+   BUG();
+   break;
+   }
+}
+
 static void msi_set_mask_bit(unsigned int irq, int flag)
 {
struct ms

[PATCH 2.6.20.3] Flush writes to MSI-X table

2007-03-22 Thread Mitch Williams
Because both MSI-X interrupt messages and MSI-X table writes are posted,
it's possible for them to cross while in-flight.  This results in
interrupts being received long after the kernel thinks they're disabled,
and in interrupts being sent to stale vectors after rebalancing.

This patch performs a read flush after writes to the MSI-X table for
enable/disable and rebalancing operations.  Because this is an expensive
operation, we do not perform the read flush after mask/unmask
operations.  Hardware which supports MSI-X typically also supports some
sort of interrupt moderation, so a read-flush is not necessary for
mask/unmask operations.

This patch has been validated with (unreleased) network hardware which
uses MSI-X.

Signed-off-by: Mitch Williams <[EMAIL PROTECTED]>

diff -urpN -X dontdiff linux-2.6.20.3-clean/arch/i386/kernel/io_apic.c 
linux-2.6.20.3/arch/i386/kernel/io_apic.c
--- linux-2.6.20.3-clean/arch/i386/kernel/io_apic.c 2007-02-04 
10:44:54.0 -0800
+++ linux-2.6.20.3/arch/i386/kernel/io_apic.c   2007-03-22 10:33:47.0 
-0700
@@ -2597,6 +2597,8 @@ static void set_msi_irq_affinity(unsigne
  */
 static struct irq_chip msi_chip = {
.name   = "PCI-MSI",
+   .enable = enable_msi_irq,
+   .disable= disable_msi_irq,
.unmask = unmask_msi_irq,
.mask   = mask_msi_irq,
.ack= ack_ioapic_irq,
diff -urpN -X dontdiff linux-2.6.20.3-clean/arch/ia64/kernel/msi_ia64.c 
linux-2.6.20.3/arch/ia64/kernel/msi_ia64.c
--- linux-2.6.20.3-clean/arch/ia64/kernel/msi_ia64.c2007-02-04 
10:44:54.0 -0800
+++ linux-2.6.20.3/arch/ia64/kernel/msi_ia64.c  2007-03-22 10:33:47.0 
-0700
@@ -116,6 +116,8 @@ static int ia64_msi_retrigger_irq(unsign
  */
 static struct irq_chip ia64_msi_chip = {
.name   = "PCI-MSI",
+   .enable = enable_msi_irq,
+   .disable= disable_msi_irq,
.mask   = mask_msi_irq,
.unmask = unmask_msi_irq,
.ack= ia64_ack_msi_irq,
diff -urpN -X dontdiff linux-2.6.20.3-clean/arch/ia64/sn/kernel/msi_sn.c 
linux-2.6.20.3/arch/ia64/sn/kernel/msi_sn.c
--- linux-2.6.20.3-clean/arch/ia64/sn/kernel/msi_sn.c   2007-02-04 
10:44:54.0 -0800
+++ linux-2.6.20.3/arch/ia64/sn/kernel/msi_sn.c 2007-03-22 10:33:47.0 
-0700
@@ -216,6 +216,8 @@ static int sn_msi_retrigger_irq(unsigned
 
 static struct irq_chip sn_msi_chip = {
.name   = "PCI-MSI",
+   .enable = enable_msi_irq,
+   .disable= disable_msi_irq,
.mask   = mask_msi_irq,
.unmask = unmask_msi_irq,
.ack= sn_ack_msi_irq,
diff -urpN -X dontdiff linux-2.6.20.3-clean/arch/x86_64/kernel/io_apic.c 
linux-2.6.20.3/arch/x86_64/kernel/io_apic.c
--- linux-2.6.20.3-clean/arch/x86_64/kernel/io_apic.c   2007-02-04 
10:44:54.0 -0800
+++ linux-2.6.20.3/arch/x86_64/kernel/io_apic.c 2007-03-22 10:36:03.0 
-0700
@@ -1923,6 +1923,7 @@ static void set_msi_irq_affinity(unsigne
 
cpus_and(mask, tmp, CPU_MASK_ALL);
 
+   msix_flush_writes(irq);
vector = assign_irq_vector(irq, mask, &tmp);
if (vector < 0)
return;
@@ -1937,6 +1938,7 @@ static void set_msi_irq_affinity(unsigne
msg.address_lo |= MSI_ADDR_DEST_ID(dest);
 
write_msi_msg(irq, &msg);
+   msix_flush_writes(irq);
set_native_irq_info(irq, mask);
 }
 #endif /* CONFIG_SMP */
@@ -1947,6 +1949,8 @@ static void set_msi_irq_affinity(unsigne
  */
 static struct irq_chip msi_chip = {
.name   = "PCI-MSI",
+   .enable = enable_msi_irq,
+   .disable= disable_msi_irq,
.unmask = unmask_msi_irq,
.mask   = mask_msi_irq,
.ack= ack_apic_edge,
diff -urpN -X dontdiff linux-2.6.20.3-clean/drivers/pci/msi.c 
linux-2.6.20.3/drivers/pci/msi.c
--- linux-2.6.20.3-clean/drivers/pci/msi.c  2007-02-04 10:44:54.0 
-0800
+++ linux-2.6.20.3/drivers/pci/msi.c2007-03-22 10:33:47.0 -0700
@@ -40,6 +40,29 @@ static int msi_cache_init(void)
return 0;
 }
 
+void msix_flush_writes(unsigned int irq)
+{
+   struct msi_desc *entry;
+
+   entry = msi_desc[irq];
+   BUG_ON(!entry || !entry->dev);
+   switch (entry->msi_attrib.type) {
+   case PCI_CAP_ID_MSI:
+   /* nothing to do */
+   break;
+   case PCI_CAP_ID_MSIX:
+   {
+   int offset = entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE +
+   PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET;
+   readl(entry->mask_base + offset);
+   break;
+   }
+   default:
+   BUG();
+   break;
+   }
+}
+
 static void msi_set_mask_bit(unsigned int irq, int flag)
 {
struct msi_desc *entry;
@@ -161,6 +

[PATCH 2.6.21-rc5] MSI: read-flush MSI-X table

2007-03-26 Thread Mitch Williams
This patch fixes a kernel bug which is triggered when using the
irqbalance daemon with MSI-X hardware.

Because both MSI-X interrupt messages and MSI-X table writes are posted,
it's possible for them to cross while in-flight.  This results in
interrupts being received long after the kernel thinks they're disabled,
and in interrupts being sent to stale vectors after rebalancing.

This patch performs a read flush after writes to the MSI-X table for
enable/disable and rebalancing operations.  Because this is an expensive
operation, we do not perform the read flush after mask/unmask
operations.  Hardware which supports MSI-X typically also supports some
sort of interrupt moderation, so a read-flush is not necessary for
mask/unmask operations.

This patch has been validated with (unreleased) network hardware which
uses MSI-X.

Generated from 2.6.21-rc4; applies cleanly to 2.6.21-rc5.

Signed-off-by: Mitch Williams <[EMAIL PROTECTED]>

diff -urpN -X dontdiff linux-2.6.21-rc4-clean/arch/i386/kernel/io_apic.c 
linux-2.6.21-rc4/arch/i386/kernel/io_apic.c
--- linux-2.6.21-rc4-clean/arch/i386/kernel/io_apic.c   2007-03-19 
16:16:30.0 -0700
+++ linux-2.6.21-rc4/arch/i386/kernel/io_apic.c 2007-03-19 16:24:05.0 
-0700
@@ -2594,6 +2594,8 @@ static void set_msi_irq_affinity(unsigne
  */
 static struct irq_chip msi_chip = {
.name   = "PCI-MSI",
+   .enable = enable_msi_irq,
+   .disable= disable_msi_irq,
.unmask = unmask_msi_irq,
.mask   = mask_msi_irq,
.ack= ack_ioapic_irq,
diff -urpN -X dontdiff linux-2.6.21-rc4-clean/arch/ia64/kernel/msi_ia64.c 
linux-2.6.21-rc4/arch/ia64/kernel/msi_ia64.c
--- linux-2.6.21-rc4-clean/arch/ia64/kernel/msi_ia64.c  2007-03-19 
16:16:30.0 -0700
+++ linux-2.6.21-rc4/arch/ia64/kernel/msi_ia64.c2007-03-19 
16:24:05.0 -0700
@@ -121,6 +121,8 @@ static int ia64_msi_retrigger_irq(unsign
  */
 static struct irq_chip ia64_msi_chip = {
.name   = "PCI-MSI",
+   .enable = enable_msi_irq,
+   .disable= disable_msi_irq,
.mask   = mask_msi_irq,
.unmask = unmask_msi_irq,
.ack= ia64_ack_msi_irq,
diff -urpN -X dontdiff linux-2.6.21-rc4-clean/arch/ia64/sn/kernel/msi_sn.c 
linux-2.6.21-rc4/arch/ia64/sn/kernel/msi_sn.c
--- linux-2.6.21-rc4-clean/arch/ia64/sn/kernel/msi_sn.c 2007-03-19 
16:16:30.0 -0700
+++ linux-2.6.21-rc4/arch/ia64/sn/kernel/msi_sn.c   2007-03-19 
16:24:05.0 -0700
@@ -224,6 +224,8 @@ static int sn_msi_retrigger_irq(unsigned
 
 static struct irq_chip sn_msi_chip = {
.name   = "PCI-MSI",
+   .enable = enable_msi_irq,
+   .disable= disable_msi_irq,
.mask   = mask_msi_irq,
.unmask = unmask_msi_irq,
.ack= sn_ack_msi_irq,
diff -urpN -X dontdiff linux-2.6.21-rc4-clean/arch/x86_64/kernel/io_apic.c 
linux-2.6.21-rc4/arch/x86_64/kernel/io_apic.c
--- linux-2.6.21-rc4-clean/arch/x86_64/kernel/io_apic.c 2007-03-19 
16:16:31.0 -0700
+++ linux-2.6.21-rc4/arch/x86_64/kernel/io_apic.c   2007-03-21 
12:44:51.0 -0700
@@ -1942,6 +1942,7 @@ static void set_msi_irq_affinity(unsigne
if (cpus_empty(tmp))
return;
 
+   msix_flush_writes(irq);
if (assign_irq_vector(irq, mask))
return;
 
@@ -1956,6 +1957,7 @@ static void set_msi_irq_affinity(unsigne
msg.address_lo |= MSI_ADDR_DEST_ID(dest);
 
write_msi_msg(irq, &msg);
+   msix_flush_writes(irq);
irq_desc[irq].affinity = mask;
 }
 #endif /* CONFIG_SMP */
@@ -1966,6 +1968,8 @@ static void set_msi_irq_affinity(unsigne
  */
 static struct irq_chip msi_chip = {
.name   = "PCI-MSI",
+   .enable = enable_msi_irq,
+   .disable= disable_msi_irq,
.unmask = unmask_msi_irq,
.mask   = mask_msi_irq,
.ack= ack_apic_edge,
diff -urpN -X dontdiff linux-2.6.21-rc4-clean/drivers/pci/msi.c 
linux-2.6.21-rc4/drivers/pci/msi.c
--- linux-2.6.21-rc4-clean/drivers/pci/msi.c2007-03-19 16:16:32.0 
-0700
+++ linux-2.6.21-rc4/drivers/pci/msi.c  2007-03-21 12:44:51.0 -0700
@@ -68,6 +68,29 @@ static void msix_set_enable(struct pci_d
}
 }
 
+void msix_flush_writes(unsigned int irq)
+{
+   struct msi_desc *entry;
+
+   entry = get_irq_msi(irq);
+   BUG_ON(!entry || !entry->dev);
+   switch (entry->msi_attrib.type) {
+   case PCI_CAP_ID_MSI:
+   /* nothing to do */
+   break;
+   case PCI_CAP_ID_MSIX:
+   {
+   int offset = entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE +
+   PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET;
+   readl(entry->mask_base + offset);
+   break;

[PATCH 2.6.21-rc5] MSI: read-flush MSI-X table (rev 2)

2007-03-29 Thread Mitch Williams
This patch fixes a kernel bug which is triggered when using the
irqbalance daemon with MSI-X hardware.

Because both MSI-X interrupt messages and MSI-X table writes are posted,
it's possible for them to cross while in-flight.  This results in
interrupts being received long after the kernel thinks they're disabled,
and in interrupts being sent to stale vectors after rebalancing.

This patch performs a read flush after writes to the MSI-X table for
mask/unmask and rebalancing operations.

This patch has been validated with (unreleased) network hardware which
uses MSI-X.

Revised with input from Eric Biederman.

Signed-off-by: Mitch Williams <[EMAIL PROTECTED]>

diff -urpN -X dontdiff linux-2.6.21-rc5-clean/arch/i386/kernel/io_apic.c 
linux-2.6.21-rc5/arch/i386/kernel/io_apic.c
--- linux-2.6.21-rc5-clean/arch/i386/kernel/io_apic.c   2007-03-28 
10:05:22.0 -0700
+++ linux-2.6.21-rc5/arch/i386/kernel/io_apic.c 2007-03-28 10:19:14.0 
-0700
@@ -2584,6 +2584,7 @@ static void set_msi_irq_affinity(unsigne
msg.address_lo |= MSI_ADDR_DEST_ID(dest);
 
write_msi_msg(irq, &msg);
+   msix_flush_writes(irq);
irq_desc[irq].affinity = mask;
 }
 #endif /* CONFIG_SMP */
diff -urpN -X dontdiff linux-2.6.21-rc5-clean/arch/ia64/kernel/msi_ia64.c 
linux-2.6.21-rc5/arch/ia64/kernel/msi_ia64.c
--- linux-2.6.21-rc5-clean/arch/ia64/kernel/msi_ia64.c  2007-03-28 
10:05:22.0 -0700
+++ linux-2.6.21-rc5/arch/ia64/kernel/msi_ia64.c2007-03-28 
09:34:27.0 -0700
@@ -60,6 +60,7 @@ static void ia64_set_msi_irq_affinity(un
msg.address_lo = addr;
 
write_msi_msg(irq, &msg);
+   msix_flush_writes(irq);
irq_desc[irq].affinity = cpu_mask;
 }
 #endif /* CONFIG_SMP */
diff -urpN -X dontdiff linux-2.6.21-rc5-clean/arch/ia64/sn/kernel/msi_sn.c 
linux-2.6.21-rc5/arch/ia64/sn/kernel/msi_sn.c
--- linux-2.6.21-rc5-clean/arch/ia64/sn/kernel/msi_sn.c 2007-03-28 
10:05:22.0 -0700
+++ linux-2.6.21-rc5/arch/ia64/sn/kernel/msi_sn.c   2007-03-28 
09:34:06.0 -0700
@@ -204,6 +204,7 @@ static void sn_set_msi_irq_affinity(unsi
msg.address_lo = (u32)(bus_addr & 0x);
 
write_msi_msg(irq, &msg);
+   msix_flush_writes(irq);
irq_desc[irq].affinity = cpu_mask;
 }
 #endif /* CONFIG_SMP */
diff -urpN -X dontdiff linux-2.6.21-rc5-clean/arch/x86_64/kernel/io_apic.c 
linux-2.6.21-rc5/arch/x86_64/kernel/io_apic.c
--- linux-2.6.21-rc5-clean/arch/x86_64/kernel/io_apic.c 2007-03-28 
10:05:22.0 -0700
+++ linux-2.6.21-rc5/arch/x86_64/kernel/io_apic.c   2007-03-28 
10:18:52.0 -0700
@@ -1956,6 +1956,7 @@ static void set_msi_irq_affinity(unsigne
msg.address_lo |= MSI_ADDR_DEST_ID(dest);
 
write_msi_msg(irq, &msg);
+   msix_flush_writes(irq);
irq_desc[irq].affinity = mask;
 }
 #endif /* CONFIG_SMP */
diff -urpN -X dontdiff linux-2.6.21-rc5-clean/drivers/pci/msi.c 
linux-2.6.21-rc5/drivers/pci/msi.c
--- linux-2.6.21-rc5-clean/drivers/pci/msi.c2007-03-28 10:05:24.0 
-0700
+++ linux-2.6.21-rc5/drivers/pci/msi.c  2007-03-28 09:21:34.0 -0700
@@ -68,6 +68,29 @@ static void msix_set_enable(struct pci_d
}
 }
 
+void msix_flush_writes(unsigned int irq)
+{
+   struct msi_desc *entry;
+
+   entry = get_irq_msi(irq);
+   BUG_ON(!entry || !entry->dev);
+   switch (entry->msi_attrib.type) {
+   case PCI_CAP_ID_MSI:
+   /* nothing to do */
+   break;
+   case PCI_CAP_ID_MSIX:
+   {
+   int offset = entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE +
+   PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET;
+   readl(entry->mask_base + offset);
+   break;
+   }
+   default:
+   BUG();
+   break;
+   }
+}
+
 static void msi_set_mask_bit(unsigned int irq, int flag)
 {
struct msi_desc *entry;
@@ -186,11 +209,13 @@ void write_msi_msg(unsigned int irq, str
 void mask_msi_irq(unsigned int irq)
 {
msi_set_mask_bit(irq, 1);
+   msix_flush_writes(irq);
 }
 
 void unmask_msi_irq(unsigned int irq)
 {
msi_set_mask_bit(irq, 0);
+   msix_flush_writes(irq);
 }
 
 static int msi_free_irq(struct pci_dev* dev, int irq);
diff -urpN -X dontdiff linux-2.6.21-rc5-clean/include/linux/msi.h 
linux-2.6.21-rc5/include/linux/msi.h
--- linux-2.6.21-rc5-clean/include/linux/msi.h  2007-03-28 10:05:25.0 
-0700
+++ linux-2.6.21-rc5/include/linux/msi.h2007-03-28 09:21:51.0 
-0700
@@ -12,6 +12,7 @@ extern void mask_msi_irq(unsigned int ir
 extern void unmask_msi_irq(unsigned int irq);
 extern void read_msi_msg(unsigned int irq, struct msi_msg *msg);
 extern void write_msi_msg(unsigned int irq, struct msi_msg *msg);
+extern void msix_flush_writes(unsigned int irq);
 
 struct msi_desc {
struct {
-
To unsubscribe from this list: send the line "unsubscribe linux-kerne

[PATCH 2.6.21-rc5] Flush MSI-X table writes (rev 3)

2007-03-30 Thread Mitch Williams
This patch fixes a kernel bug which is triggered when using the
irqbalance daemon with MSI-X hardware.

Because both MSI-X interrupt messages and MSI-X table writes are posted,
it's possible for them to cross while in-flight.  This results in
interrupts being received long after the kernel thinks they're disabled,
and in interrupts being sent to stale vectors after rebalancing.

This patch performs a read flush after writes to the MSI-X table for
mask and unmask operations.  Since the SMP affinity is set while
the interrupt is masked, and since it's unmasked immediately after,
no additional flushes are required in the various affinity setting
routines.

This patch has been validated with (unreleased) network hardware which
uses MSI-X.

Revised with input from Eric Biederman.

Signed-off-by: Mitch Williams <[EMAIL PROTECTED]>

diff -urpN -X dontdiff linux-2.6.21-rc5-clean/drivers/pci/msi.c 
linux-2.6.21-rc5/drivers/pci/msi.c
--- linux-2.6.21-rc5-clean/drivers/pci/msi.c2007-03-28 10:05:24.0 
-0700
+++ linux-2.6.21-rc5/drivers/pci/msi.c  2007-03-28 09:21:34.0 -0700
@@ -68,6 +68,29 @@ static void msix_set_enable(struct pci_d
}
 }
 
+static void msix_flush_writes(unsigned int irq)
+{
+   struct msi_desc *entry;
+
+   entry = get_irq_msi(irq);
+   BUG_ON(!entry || !entry->dev);
+   switch (entry->msi_attrib.type) {
+   case PCI_CAP_ID_MSI:
+   /* nothing to do */
+   break;
+   case PCI_CAP_ID_MSIX:
+   {
+   int offset = entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE +
+   PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET;
+   readl(entry->mask_base + offset);
+   break;
+   }
+   default:
+   BUG();
+   break;
+   }
+}
+
 static void msi_set_mask_bit(unsigned int irq, int flag)
 {
struct msi_desc *entry;
@@ -186,11 +209,13 @@ void write_msi_msg(unsigned int irq, str
 void mask_msi_irq(unsigned int irq)
 {
msi_set_mask_bit(irq, 1);
+   msix_flush_writes(irq);
 }
 
 void unmask_msi_irq(unsigned int irq)
 {
msi_set_mask_bit(irq, 0);
+   msix_flush_writes(irq);
 }
 
 static int msi_free_irq(struct pci_dev* dev, int irq);
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 2.6.20.4]

2007-03-30 Thread Mitch Williams
This patch fixes a kernel bug which is triggered when using the
irqbalance daemon with MSI-X hardware.

Because both MSI-X interrupt messages and MSI-X table writes are posted,
it's possible for them to cross while in-flight.  This results in
interrupts being received long after the kernel thinks they're disabled,
and in interrupts being sent to stale vectors after rebalancing.

This patch performs a read flush after writes to the MSI-X table for
mask and unmask operations.  Since the SMP affinity is set while
the interrupt is masked, and since it's unmasked immediately after,
no additional flushes are required in the various affinity setting
routines.

This patch has been validated with (unreleased) network hardware which
uses MSI-X.

Revised with input from Eric Biederman.

diff -urpN -X dontdiff linux-2.6.20.4-clean/drivers/pci/msi.c 
linux-2.6.20.4/drivers/pci/msi.c
--- linux-2.6.20.4-clean/drivers/pci/msi.c  2007-02-04 10:44:54.0 
-0800
+++ linux-2.6.20.4/drivers/pci/msi.c2007-03-30 10:51:46.0 -0700
@@ -40,6 +40,29 @@ static int msi_cache_init(void)
return 0;
 }
 
+static void msix_flush_writes(unsigned int irq)
+{
+   struct msi_desc *entry;
+
+   entry = msi_desc[irq];
+   BUG_ON(!entry || !entry->dev);
+   switch (entry->msi_attrib.type) {
+   case PCI_CAP_ID_MSI:
+   /* nothing to do */
+   break;
+   case PCI_CAP_ID_MSIX:
+   {
+   int offset = entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE +
+   PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET;
+   readl(entry->mask_base + offset);
+   break;
+   }
+   default:
+   BUG();
+   break;
+   }
+}
+
 static void msi_set_mask_bit(unsigned int irq, int flag)
 {
struct msi_desc *entry;
@@ -154,11 +177,13 @@ void write_msi_msg(unsigned int irq, str
 void mask_msi_irq(unsigned int irq)
 {
msi_set_mask_bit(irq, 1);
+   msix_flush_writes(irq);
 }
 
 void unmask_msi_irq(unsigned int irq)
 {
msi_set_mask_bit(irq, 0);
+   msix_flush_writes(irq);
 }
 
 static int msi_free_irq(struct pci_dev* dev, int irq);
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 2.6.20.4]

2007-03-30 Thread Mitch Williams
On Fri, 2007-03-30 at 11:56 -0700, Mitch Williams wrote:
> This patch fixes a kernel bug which is triggered when using the
> irqbalance daemon with MSI-X hardware.
> 
Grrr.  Evolution cut-n-sometimes-paste feature bit me.  Will resend with
a proper subject line.
-Mitch
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 2.6.20.4] Flush MSI-X table writes (rev 3)

2007-03-30 Thread Mitch Williams
This patch fixes a kernel bug which is triggered when using the
irqbalance daemon with MSI-X hardware.

Because both MSI-X interrupt messages and MSI-X table writes are posted,
it's possible for them to cross while in-flight.  This results in
interrupts being received long after the kernel thinks they're disabled,
and in interrupts being sent to stale vectors after rebalancing.

This patch performs a read flush after writes to the MSI-X table for
mask and unmask operations.  Since the SMP affinity is set while
the interrupt is masked, and since it's unmasked immediately after,
no additional flushes are required in the various affinity setting
routines.

This patch has been validated with (unreleased) network hardware which
uses MSI-X.

Revised with input from Eric Biederman.

Signed-off-by: Mitch Williams <[EMAIL PROTECTED]>

diff -urpN -X dontdiff linux-2.6.20.4-clean/drivers/pci/msi.c 
linux-2.6.20.4/drivers/pci/msi.c
--- linux-2.6.20.4-clean/drivers/pci/msi.c  2007-02-04 10:44:54.0 
-0800
+++ linux-2.6.20.4/drivers/pci/msi.c2007-03-30 10:51:46.0 -0700
@@ -40,6 +40,29 @@ static int msi_cache_init(void)
return 0;
 }
 
+static void msix_flush_writes(unsigned int irq)
+{
+   struct msi_desc *entry;
+
+   entry = msi_desc[irq];
+   BUG_ON(!entry || !entry->dev);
+   switch (entry->msi_attrib.type) {
+   case PCI_CAP_ID_MSI:
+   /* nothing to do */
+   break;
+   case PCI_CAP_ID_MSIX:
+   {
+   int offset = entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE +
+   PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET;
+   readl(entry->mask_base + offset);
+   break;
+   }
+   default:
+   BUG();
+   break;
+   }
+}
+
 static void msi_set_mask_bit(unsigned int irq, int flag)
 {
struct msi_desc *entry;
@@ -154,11 +177,13 @@ void write_msi_msg(unsigned int irq, str
 void mask_msi_irq(unsigned int irq)
 {
msi_set_mask_bit(irq, 1);
+   msix_flush_writes(irq);
 }
 
 void unmask_msi_irq(unsigned int irq)
 {
msi_set_mask_bit(irq, 0);
+   msix_flush_writes(irq);
 }
 
 static int msi_free_irq(struct pci_dev* dev, int irq);
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/