date:20070423

On Mon, Apr 23, 2007 at 09:34:55PM -0700, Jeremy Fitzhardinge wrote:
> 
> Could you give netfront an overall review as well?  I know you're
> already pretty familiar with it, but if you could cast a fresh eye over
> it, that would be helpful.

Sure thing.  I'll look over it soon.

Actually there is one thing I'd like to see changed first up: I noticed
that you've stripped out the checksum hack which is in the main Xen tree.
We actually have the code in net-2.6.22 (which is also in mm) that lets
you use CHECKSUM_PARTIAL on received packets without having to do that
hack.

Here's the patch that I've been testing so far.  It's against the Xen
source, but should be easy to adapt to your version as well.

I just thought about this again, and in fact we need this change for
correctness as well as performance.  Because not setting ip_summed
to CHECKSUM_PARTIAL in netfront is not going to stop netback from
sending CHECKSUM_PARTIAL packets to us.  If these packets are then
routed/bridged back to netback, they'll have the wrong checksum.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <[EMAIL PROTECTED]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
--
diff -ur linux-2.6.20.noarch/drivers/xen/core/skbuff.c 
linux-2.6.20.i686/drivers/xen/core/skbuff.c
--- linux-2.6.20.noarch/drivers/xen/core/skbuff.c   2007-04-03 
15:26:15.0 +1000
+++ linux-2.6.20.i686/drivers/xen/core/skbuff.c 2007-03-30 21:06:20.0 
+1000
@@ -9,6 +9,10 @@
 #include 
 #include 
 #include 
+#include 
+#include 
+#include 
+#include 
 #include 
 #include 
 #include 
@@ -78,6 +82,37 @@
return skb;
 }
 
+int skb_checksum_setup(struct sk_buff *skb)
+{
+   if (skb->protocol != htons(ETH_P_IP))
+   goto out;
+   skb->h.raw = (unsigned char *)skb->nh.iph + 4*skb->nh.iph->ihl;
+   if (skb->h.raw >= skb->tail)
+   goto out;
+   switch (skb->nh.iph->protocol) {
+   case IPPROTO_TCP:
+   skb->csum_offset = offsetof(struct tcphdr, check);
+   break;
+   case IPPROTO_UDP:
+   skb->csum_offset = offsetof(struct udphdr, check);
+   break;
+   default:
+   if (net_ratelimit())
+   printk(KERN_ERR "Attempting to checksum a non-"
+  "TCP/UDP packet, dropping a protocol"
+  " %d packet", skb->nh.iph->protocol);
+   goto out;
+   }
+   if ((skb->h.raw + skb->csum_offset + 2) > skb->tail)
+   goto out;
+   skb->ip_summed = CHECKSUM_PARTIAL;
+
+   return 0;
+out:
+   return -EPROTO;
+}
+EXPORT_SYMBOL(skb_checksum_setup);
+
 static void skbuff_ctor(void *buf, struct kmem_cache *cachep, unsigned long 
unused)
 {
int order = 0;
diff -ur linux-2.6.20.noarch/drivers/xen/netback/loopback.c 
linux-2.6.20.i686/drivers/xen/netback/loopback.c
--- linux-2.6.20.noarch/drivers/xen/netback/loopback.c  2007-04-03 
15:26:15.0 +1000
+++ linux-2.6.20.i686/drivers/xen/netback/loopback.c2007-03-30 
21:01:02.0 +1000
@@ -149,16 +149,6 @@
np->stats.rx_bytes += skb->len;
np->stats.rx_packets++;
 
-   if (skb->ip_summed == CHECKSUM_PARTIAL) {
-   /* Defer checksum calculation. */
-   skb->proto_csum_blank = 1;
-   /* Must be a local packet: assert its integrity. */
-   skb->proto_data_valid = 1;
-   }
-
-   skb->ip_summed = skb->proto_data_valid ?
-   CHECKSUM_UNNECESSARY : CHECKSUM_NONE;
-
skb->pkt_type = PACKET_HOST; /* overridden by eth_type_trans() */
skb->protocol = eth_type_trans(skb, dev);
skb->dev  = dev;
diff -ur linux-2.6.20.noarch/drivers/xen/netback/netback.c 
linux-2.6.20.i686/drivers/xen/netback/netback.c
--- linux-2.6.20.noarch/drivers/xen/netback/netback.c   2007-04-03 
15:26:15.0 +1000
+++ linux-2.6.20.i686/drivers/xen/netback/netback.c 2007-03-31 
21:07:48.0 +1000
@@ -293,7 +293,6 @@
/* Copy only the header fields we use in this driver. */
nskb->dev = skb->dev;
nskb->ip_summed = skb->ip_summed;
-   nskb->proto_data_valid = skb->proto_data_valid;
dev_kfree_skb(skb);
skb = nskb;
}
@@ -666,9 +665,11 @@
id = meta[npo.meta_cons].id;
flags = nr_frags ? NETRXF_more_data : 0;
 
-   if (skb->ip_summed == CHECKSUM_PARTIAL) /* local packet? */
+   if (skb->ip_summed == CHECKSUM_PARTIAL)
+   /* local packet? */
flags |= NETRXF_csum_blank | NETRXF_data_validated;
-   else if (skb->proto_data_valid) /* remote but checksummed? */
+   else if (skb->ip_summed == CHECKSUM_UNNECESSARY)
+   /* remote but checksummed? */
flags

Re: BUG: Null pointer dereference in fs/open.c

On Tue, 24 Apr 2007 05:44:42 + (GMT) William Heimbigner <[EMAIL PROTECTED]> 
wrote:

> On Mon, 23 Apr 2007, Andrew Morton wrote:
> > On Tue, 24 Apr 2007 05:10:04 + (GMT) William Heimbigner <[EMAIL 
> > PROTECTED]> wrote:
> >
> >>> --- a/drivers/block/pktcdvd.c~packet-fix-error-handling
> >>> +++ a/drivers/block/pktcdvd.c
> >>> @@ -777,7 +777,8 @@ static int pkt_generic_packet(struct pkt
> >>>   rq->cmd_flags |= REQ_QUIET;
> >>>
> >>>   blk_execute_rq(rq->q, pd->bdev->bd_disk, rq, 0);
> >>> - ret = rq->errors;
> >>> + if (rq->errors)
> >>> + ret = -EIO;
> >>> out:
> >>>   blk_put_request(rq);
> >>>   return ret;
> >>> _
> >>
> >> This patch fixes (or conceals?) the oops.
> >>
> >
> > Fixes.  But does the packet driver actually work OK for you?  Writes
> > files and stuff like that?
> >
> Short answer, no.
> 
> Long answer:
> # pktsetup 0 /dev/hdc
>
> ...
>
> [11508.520800] pktcdvd: pkt_get_last_written failed
> 
> # mkudffs /dev/pktcdvd/0
> [11539.953560] pktcdvd: pkt_get_last_written failed
> trying to change type of multiple extents
> 
> I get the same error with /dev/hdd as well (hdc and hdd are both dvd 
> burners, hdd has a cd-rw and hdc had a dvd-rw)

Yes, I get the same on a sata (piix) dvd burner.

We need to work out who is setting rq->errors and why - should be pretty
simple.  I'll take a look at that after I've nailed one of these other bugs
over here.

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH] openprom: Switch to ref counting PCI API

2007-04-23 Thread David Miller

From: Alan Cox <[EMAIL PROTECTED]>
Date: Mon, 23 Apr 2007 14:58:40 +0100

> Signed-off-by: Alan Cox <[EMAIL PROTECTED]>

Applied, thanks Alan.
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [patch] oom: kill all threads that share mm with killed task

On Mon, 23 Apr 2007, David Rientjes wrote:

> oom_kill_task() calls __oom_kill_task() to OOM kill a selected task.
> When finding other threads that share an mm with that task, we need to
> kill those individual threads and not the same one.

Obvious fix. It was broken by

http://git.kernel.org/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commitdiff;h=f2a2a7108aa0039ba7a5fe7a0d2ecef2219a7584
Dec 7. So it's in 2.6.20 and later. Candidate for stable?

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [patch] oom: kill all threads that share mm with killed task

2007-04-23 Thread William Lee Irwin III

On Mon, Apr 23, 2007 at 09:36:13PM -0700, David Rientjes wrote:
> oom_kill_task() calls __oom_kill_task() to OOM kill a selected task.
> When finding other threads that share an mm with that task, we need to
> kill those individual threads and not the same one.

ISTR shooting down something of this form a while back. I suppose it
got resurrected.

Acked-by: William Irwin <[EMAIL PROTECTED]>


-- wli
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH] powerpc pseries eeh: Convert to kthread API

Benjamin Herrenschmidt <[EMAIL PROTECTED]> writes:

>> Further in general it doesn't make sense to grab a module reference
>> and call that sufficient because we would like to request that the
>> module exits.
>
> Which is, btw, I think a total misdesign of our module stuff, but heh, I
> remember that led to some flamewars back then...
>
> Like anything else, modules should have separated the entrypoints for
>
>  - Initiating a removal request
>  - Releasing the module
>
> The former is when the user did "rmmod"; it can unregister things from
> subsystems, etc... (and can fail if the driver decides to refuse removal
> requests when it's busy doing things or whatever policy that module wants
> to implement).
>
> The latter is called when all references to the modules have been
> dropped, it's a bit like the kref "release" (and could be implemented as
> one).
>
> If we had done that (simple) thing back then, module refcounting would
> have been much less of a problem... I remember some reasons why that was
> veto'ed but I didn't and still don't agree.

The basic point is because a thread can terminate sooner if we have an
explicit request to stop, we need that in the design.

Because we need to find the threads to request that they stop we need to
have some way to track them.

Since we need to have some way to track them having an explicit data
structure that the callers manage seems to make sense.

Eric
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: BUG: Null pointer dereference in fs/open.c

2007-04-23 Thread William Heimbigner


On Mon, 23 Apr 2007, Andrew Morton wrote:

On Tue, 24 Apr 2007 05:10:04 + (GMT) William Heimbigner <[EMAIL PROTECTED]> 
wrote:


--- a/drivers/block/pktcdvd.c~packet-fix-error-handling
+++ a/drivers/block/pktcdvd.c
@@ -777,7 +777,8 @@ static int pkt_generic_packet(struct pkt
rq->cmd_flags |= REQ_QUIET;

blk_execute_rq(rq->q, pd->bdev->bd_disk, rq, 0);
-   ret = rq->errors;
+   if (rq->errors)
+   ret = -EIO;
out:
blk_put_request(rq);
return ret;
_


This patch fixes (or conceals?) the oops.



Fixes.  But does the packet driver actually work OK for you?  Writes
files and stuff like that?


Short answer, no.

Long answer:
# pktsetup 0 /dev/hdc
[11508.006818] =
[11508.028248] [ INFO: possible recursive locking detected ]
[11508.044413] 2.6.21-rc7-git5 #23
[11508.053818] -
[11508.069989] vol_id/4315 is trying to acquire lock:
[11508.084332]  (>bd_mutex){--..}, at: [] 
do_open+0x4f/0x2c0

[11508.104867]
[11508.104868] but task is already holding lock:
[11508.122359]  (>bd_mutex){--..}, at: [] 
do_open+0x4f/0x2c0

[11508.142862]
[11508.142863] other info that might help us debug this:
[11508.162460] 2 locks held by vol_id/4315:
[11508.174212]  #0:  (>bd_mutex){--..}, at: [] 
do_open+0x4f/0x2c0
[11508.196066]  #1:  (_mutex#2){--..}, at: [] 
mutex_lock+0x1c/0x20

[11508.217720]
[11508.217721] stack backtrace:
[11508.230821]  [] show_trace_log_lvl+0x1a/0x30
[11508.246255]  [] show_trace+0x12/0x20
[11508.259619]  [] dump_stack+0x16/0x20
[11508.272974]  [] __lock_acquire+0xbc0/0x1040
[11508.288157]  [] lock_acquire+0x70/0x90
[11508.302035]  [] mutex_lock_nested+0x7e/0x2e0
[11508.317475]  [] do_open+0x4f/0x2c0
[11508.330314]  [] __blkdev_get+0x79/0x90
[11508.344189]  [] blkdev_get+0x15/0x20
[11508.357554]  [] pkt_open+0xb7/0xd80
[11508.370651]  [] do_open+0x85/0x2c0
[11508.383491]  [] blkdev_open+0x33/0x70
[11508.397107]  [] __dentry_open+0xf4/0x220
[11508.411509]  [] nameidata_to_filp+0x35/0x40
[11508.426684]  [] do_filp_open+0x49/0x50
[11508.440567]  [] do_sys_open+0x47/0xd0
[11508.454188]  [] sys_open+0x1c/0x20
[11508.467023]  [] sysenter_past_esp+0x5f/0x99
[11508.482202]  ===
[11508.520800] pktcdvd: pkt_get_last_written failed

# mkudffs /dev/pktcdvd/0
[11539.953560] pktcdvd: pkt_get_last_written failed
trying to change type of multiple extents

I get the same error with /dev/hdd as well (hdc and hdd are both dvd 
burners, hdd has a cd-rw and hdc had a dvd-rw)



William Heimbigner
[EMAIL PROTECTED]
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH 5/12] get_unmapped_area handles MAP_FIXED on i386

Handle MAP_FIXED in i386 hugetlb_get_unmapped_area(), just call
prepare_hugepage_range.

Signed-off-by: Benjamin Herrenschmidt <[EMAIL PROTECTED]>
Acked-by: William Irwin <[EMAIL PROTECTED]>

 arch/i386/mm/hugetlbpage.c |6 ++
 1 file changed, 6 insertions(+)

Index: linux-cell/arch/i386/mm/hugetlbpage.c
===
--- linux-cell.orig/arch/i386/mm/hugetlbpage.c  2007-03-22 16:08:12.0 
+1100
+++ linux-cell/arch/i386/mm/hugetlbpage.c   2007-03-22 16:14:19.0 
+1100
@@ -367,6 +367,12 @@ hugetlb_get_unmapped_area(struct file *f
if (len > TASK_SIZE)
return -ENOMEM;
 
+   if (flags & MAP_FIXED) {
+   if (prepare_hugepage_range(addr, len, pgoff))
+   return -EINVAL;
+   return addr;
+   }
+
if (addr) {
addr = ALIGN(addr, HPAGE_SIZE);
vma = find_vma(mm, addr);
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH 4/12] get_unmapped_area handles MAP_FIXED on frv

Handle MAP_FIXED in arch_get_unmapped_area on frv. Trivial case, just
return the address.

Signed-off-by: Benjamin Herrenschmidt <[EMAIL PROTECTED]>

 arch/frv/mm/elf-fdpic.c |4 
 1 file changed, 4 insertions(+)

Index: linux-cell/arch/frv/mm/elf-fdpic.c
===
--- linux-cell.orig/arch/frv/mm/elf-fdpic.c 2007-03-22 15:00:50.0 
+1100
+++ linux-cell/arch/frv/mm/elf-fdpic.c  2007-03-22 15:01:06.0 +1100
@@ -64,6 +64,10 @@ unsigned long arch_get_unmapped_area(str
if (len > TASK_SIZE)
return -ENOMEM;
 
+   /* handle MAP_FIXED */
+   if (flags & MAP_FIXED)
+   return addr;
+
/* only honour a hint if we're not going to clobber something doing so 
*/
if (addr) {
addr = PAGE_ALIGN(addr);
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH 9/12] get_unmapped_area handles MAP_FIXED on x86_64

Handle MAP_FIXED in x86_64 arch_get_unmapped_area(), simple case, just
return the address as passed in

Signed-off-by: Benjamin Herrenschmidt <[EMAIL PROTECTED]>

 arch/x86_64/kernel/sys_x86_64.c |3 +++
 1 file changed, 3 insertions(+)

Index: linux-cell/arch/x86_64/kernel/sys_x86_64.c
===
--- linux-cell.orig/arch/x86_64/kernel/sys_x86_64.c 2007-03-22 
16:10:10.0 +1100
+++ linux-cell/arch/x86_64/kernel/sys_x86_64.c  2007-03-22 16:11:06.0 
+1100
@@ -93,6 +93,9 @@ arch_get_unmapped_area(struct file *filp
unsigned long start_addr;
unsigned long begin, end;

+   if (flags & MAP_FIXED)
+   return addr;
+
find_start_end(flags, , ); 
 
if (len > end)
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH 8/12] get_unmapped_area handles MAP_FIXED on sparc64

Handle MAP_FIXED in hugetlb_get_unmapped_area on sparc64
by just using prepare_hugepage_range()

Signed-off-by: Benjamin Herrenschmidt <[EMAIL PROTECTED]>
Acked-by: William Irwin <[EMAIL PROTECTED]>

 arch/sparc64/mm/hugetlbpage.c |6 ++
 1 file changed, 6 insertions(+)

Index: linux-cell/arch/sparc64/mm/hugetlbpage.c
===
--- linux-cell.orig/arch/sparc64/mm/hugetlbpage.c   2007-03-22 
16:12:57.0 +1100
+++ linux-cell/arch/sparc64/mm/hugetlbpage.c2007-03-22 16:15:33.0 
+1100
@@ -175,6 +175,12 @@ hugetlb_get_unmapped_area(struct file *f
if (len > task_size)
return -ENOMEM;
 
+   if (flags & MAP_FIXED) {
+   if (prepare_hugepage_range(addr, len, pgoff))
+   return -EINVAL;
+   return addr;
+   }
+
if (addr) {
addr = ALIGN(addr, HPAGE_SIZE);
vma = find_vma(mm, addr);
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH 7/12] get_unmapped_area handles MAP_FIXED on parisc

Handle MAP_FIXED in parisc arch_get_unmapped_area(), just return the
address. We might want to also check for possible cache aliasing
issues now that we get called in that case (like ARM or MIPS),
leave a comment for the maintainers to pick up.

Signed-off-by: Benjamin Herrenschmidt <[EMAIL PROTECTED]>

 arch/parisc/kernel/sys_parisc.c |5 +
 1 file changed, 5 insertions(+)

Index: linux-cell/arch/parisc/kernel/sys_parisc.c
===
--- linux-cell.orig/arch/parisc/kernel/sys_parisc.c 2007-03-22 
15:28:05.0 +1100
+++ linux-cell/arch/parisc/kernel/sys_parisc.c  2007-03-22 15:29:08.0 
+1100
@@ -106,6 +106,11 @@ unsigned long arch_get_unmapped_area(str
 {
if (len > TASK_SIZE)
return -ENOMEM;
+   /* Might want to check for cache aliasing issues for MAP_FIXED case
+* like ARM or MIPS ??? --BenH.
+*/
+   if (flags & MAP_FIXED)
+   return addr;
if (!addr)
addr = TASK_UNMAPPED_BASE;
 
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH 6/12] get_unmapped_area handles MAP_FIXED on ia64

Handle MAP_FIXED in ia64 arch_get_unmapped_area and
hugetlb_get_unmapped_area(), just call prepare_hugepage_range
in the latter and is_hugepage_only_range() in the former.

Signed-off-by: Benjamin Herrenschmidt <[EMAIL PROTECTED]>
Acked-by: William Irwin <[EMAIL PROTECTED]>

 arch/ia64/kernel/sys_ia64.c |7 +++
 arch/ia64/mm/hugetlbpage.c  |8 
 2 files changed, 15 insertions(+)

Index: linux-cell/arch/ia64/kernel/sys_ia64.c
===
--- linux-cell.orig/arch/ia64/kernel/sys_ia64.c 2007-03-22 15:10:45.0 
+1100
+++ linux-cell/arch/ia64/kernel/sys_ia64.c  2007-03-22 15:10:47.0 
+1100
@@ -33,6 +33,13 @@ arch_get_unmapped_area (struct file *fil
if (len > RGN_MAP_LIMIT)
return -ENOMEM;
 
+   /* handle fixed mapping: prevent overlap with huge pages */
+   if (flags & MAP_FIXED) {
+   if (is_hugepage_only_range(mm, addr, len))
+   return -EINVAL;
+   return addr;
+   }
+
 #ifdef CONFIG_HUGETLB_PAGE
if (REGION_NUMBER(addr) == RGN_HPAGE)
addr = 0;
Index: linux-cell/arch/ia64/mm/hugetlbpage.c
===
--- linux-cell.orig/arch/ia64/mm/hugetlbpage.c  2007-03-22 15:12:32.0 
+1100
+++ linux-cell/arch/ia64/mm/hugetlbpage.c   2007-03-22 15:12:39.0 
+1100
@@ -148,6 +148,14 @@ unsigned long hugetlb_get_unmapped_area(
return -ENOMEM;
if (len & ~HPAGE_MASK)
return -EINVAL;
+
+   /* Handle MAP_FIXED */
+   if (flags & MAP_FIXED) {
+   if (prepare_hugepage_range(addr, len, pgoff))
+   return -EINVAL;
+   return addr;
+   }
+
/* This code assumes that RGN_HPAGE != 0. */
if ((REGION_NUMBER(addr) != RGN_HPAGE) || (addr & (HPAGE_SIZE - 1)))
addr = HPAGE_REGION_BASE;
 
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH 10/12] get_unmapped_area handles MAP_FIXED in hugetlbfs

Generic hugetlb_get_unmapped_area() now handles MAP_FIXED by just
calling prepare_hugepage_range()

Signed-off-by: Benjamin Herrenschmidt <[EMAIL PROTECTED]>
Acked-by: William Irwin <[EMAIL PROTECTED]>

 fs/hugetlbfs/inode.c |6 ++
 1 file changed, 6 insertions(+)

Index: linux-cell/fs/hugetlbfs/inode.c
===
--- linux-cell.orig/fs/hugetlbfs/inode.c2007-03-22 16:12:56.0 
+1100
+++ linux-cell/fs/hugetlbfs/inode.c 2007-03-22 16:16:02.0 +1100
@@ -115,6 +115,12 @@ hugetlb_get_unmapped_area(struct file *f
if (len > TASK_SIZE)
return -ENOMEM;
 
+   if (flags & MAP_FIXED) {
+   if (prepare_hugepage_range(addr, len, pgoff))
+   return -EINVAL;
+   return addr;
+   }
+
if (addr) {
addr = ALIGN(addr, HPAGE_SIZE);
vma = find_vma(mm, addr);
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH 12/12] get_unmapped_area doesn't need hugetlbfs hacks anymore

Remove the hugetlbfs specific hacks in toplevel get_unmapped_area() now
that all archs and hugetlbfs itself do the right thing for both cases.

Signed-off-by: Benjamin Herrenschmidt <[EMAIL PROTECTED]>
Acked-by: William Irwin <[EMAIL PROTECTED]>

 mm/mmap.c |   16 
 1 file changed, 16 deletions(-)

Index: linux-cell/mm/mmap.c
===
--- linux-cell.orig/mm/mmap.c   2007-04-12 12:14:46.0 +1000
+++ linux-cell/mm/mmap.c2007-04-12 12:14:47.0 +1000
@@ -1381,22 +1381,6 @@ get_unmapped_area(struct file *file, uns
if (addr & ~PAGE_MASK)
return -EINVAL;
 
-   if (file && is_file_hugepages(file))  {
-   /*
-* Check if the given range is hugepage aligned, and
-* can be made suitable for hugepages.
-*/
-   ret = prepare_hugepage_range(addr, len, pgoff);
-   } else {
-   /*
-* Ensure that a normal request is not falling in a
-* reserved hugepage range.  For some archs like IA-64,
-* there is a separate region for hugepages.
-*/
-   ret = is_hugepage_only_range(current->mm, addr, len);
-   }
-   if (ret)
-   return -EINVAL;
return addr;
 }
 
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH 3/12] get_unmapped_area handles MAP_FIXED on arm

ARM already had a case for MAP_FIXED in arch_get_unmapped_area() though
it was not called before. Fix the comment to reflect that it will now
be called.

Signed-off-by: Benjamin Herrenschmidt <[EMAIL PROTECTED]>

 arch/arm/mm/mmap.c |3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

Index: linux-cell/arch/arm/mm/mmap.c
===
--- linux-cell.orig/arch/arm/mm/mmap.c  2007-03-22 14:59:51.0 +1100
+++ linux-cell/arch/arm/mm/mmap.c   2007-03-22 15:00:01.0 +1100
@@ -49,8 +49,7 @@ arch_get_unmapped_area(struct file *filp
 #endif
 
/*
-* We should enforce the MAP_FIXED case.  However, currently
-* the generic kernel code doesn't allow us to handle this.
+* We enforce the MAP_FIXED case.
 */
if (flags & MAP_FIXED) {
if (aliasing && flags & MAP_SHARED && addr & (SHMLBA - 1))
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH 11/12] get_unmapped_area handles MAP_FIXED in generic code

generic arch_get_unmapped_area() now handles MAP_FIXED. Now that
all implementations have been fixed, change the toplevel
get_unmapped_area() to call into arch or drivers for the MAP_FIXED
case.

Signed-off-by: Benjamin Herrenschmidt <[EMAIL PROTECTED]>

 mm/mmap.c |   25 +++--
 1 file changed, 15 insertions(+), 10 deletions(-)

Index: linux-cell/mm/mmap.c
===
--- linux-cell.orig/mm/mmap.c   2007-03-22 16:29:22.0 +1100
+++ linux-cell/mm/mmap.c2007-03-22 16:30:06.0 +1100
@@ -1199,6 +1199,9 @@ arch_get_unmapped_area(struct file *filp
if (len > TASK_SIZE)
return -ENOMEM;
 
+   if (flags & MAP_FIXED)
+   return addr;
+
if (addr) {
addr = PAGE_ALIGN(addr);
vma = find_vma(mm, addr);
@@ -1272,6 +1275,9 @@ arch_get_unmapped_area_topdown(struct fi
if (len > TASK_SIZE)
return -ENOMEM;
 
+   if (flags & MAP_FIXED)
+   return addr;
+
/* requesting a specific address */
if (addr) {
addr = PAGE_ALIGN(addr);
@@ -1360,22 +1366,21 @@ get_unmapped_area(struct file *file, uns
unsigned long pgoff, unsigned long flags)
 {
unsigned long ret;
+   unsigned long (*get_area)(struct file *, unsigned long,
+ unsigned long, unsigned long, unsigned long);
 
-   if (!(flags & MAP_FIXED)) {
-   unsigned long (*get_area)(struct file *, unsigned long, 
unsigned long, unsigned long, unsigned long);
-
-   get_area = current->mm->get_unmapped_area;
-   if (file && file->f_op && file->f_op->get_unmapped_area)
-   get_area = file->f_op->get_unmapped_area;
-   addr = get_area(file, addr, len, pgoff, flags);
-   if (IS_ERR_VALUE(addr))
-   return addr;
-   }
+   get_area = current->mm->get_unmapped_area;
+   if (file && file->f_op && file->f_op->get_unmapped_area)
+   get_area = file->f_op->get_unmapped_area;
+   addr = get_area(file, addr, len, pgoff, flags);
+   if (IS_ERR_VALUE(addr))
+   return addr;
 
if (addr > TASK_SIZE - len)
return -ENOMEM;
if (addr & ~PAGE_MASK)
return -EINVAL;
+
if (file && is_file_hugepages(file))  {
/*
 * Check if the given range is hugepage aligned, and
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH 1/12] get_unmapped_area handles MAP_FIXED on powerpc

Handle MAP_FIXED in powerpc's arch_get_unmapped_area() in all 3
implementations of it.

Signed-off-by: Benjamin Herrenschmidt <[EMAIL PROTECTED]>
Acked-by: William Irwin <[EMAIL PROTECTED]>

 arch/powerpc/mm/hugetlbpage.c |   21 +
 1 file changed, 21 insertions(+)

Index: linux-cell/arch/powerpc/mm/hugetlbpage.c
===
--- linux-cell.orig/arch/powerpc/mm/hugetlbpage.c   2007-04-24 
15:10:17.0 +1000
+++ linux-cell/arch/powerpc/mm/hugetlbpage.c2007-04-24 15:28:11.0 
+1000
@@ -566,6 +566,13 @@ unsigned long arch_get_unmapped_area(str
if (len > TASK_SIZE)
return -ENOMEM;
 
+   /* handle fixed mapping: prevent overlap with huge pages */
+   if (flags & MAP_FIXED) {
+   if (is_hugepage_only_range(mm, addr, len))
+   return -EINVAL;
+   return addr;
+   }
+
if (addr) {
addr = PAGE_ALIGN(addr);
vma = find_vma(mm, addr);
@@ -641,6 +648,13 @@ arch_get_unmapped_area_topdown(struct fi
if (len > TASK_SIZE)
return -ENOMEM;
 
+   /* handle fixed mapping: prevent overlap with huge pages */
+   if (flags & MAP_FIXED) {
+   if (is_hugepage_only_range(mm, addr, len))
+   return -EINVAL;
+   return addr;
+   }
+
/* dont allow allocations above current base */
if (mm->free_area_cache > base)
mm->free_area_cache = base;
@@ -823,6 +837,13 @@ unsigned long hugetlb_get_unmapped_area(
/* Paranoia, caller should have dealt with this */
BUG_ON((addr + len)  < addr);
 
+   /* Handle MAP_FIXED */
+   if (flags & MAP_FIXED) {
+   if (prepare_hugepage_range(addr, len, pgoff))
+   return -EINVAL;
+   return addr;
+   }
+
if (test_thread_flag(TIF_32BIT)) {
curareas = current->mm->context.low_htlb_areas;
 
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH 2/12] get_unmapped_area handles MAP_FIXED on alpha

Handle MAP_FIXED in alpha's arch_get_unmapped_area(), simple case, just
return the address as passed in

Signed-off-by: Benjamin Herrenschmidt <[EMAIL PROTECTED]>

 arch/alpha/kernel/osf_sys.c |3 +++
 1 file changed, 3 insertions(+)

Index: linux-cell/arch/alpha/kernel/osf_sys.c
===
--- linux-cell.orig/arch/alpha/kernel/osf_sys.c 2007-03-22 14:58:33.0 
+1100
+++ linux-cell/arch/alpha/kernel/osf_sys.c  2007-03-22 14:58:44.0 
+1100
@@ -1267,6 +1267,9 @@ arch_get_unmapped_area(struct file *filp
if (len > limit)
return -ENOMEM;
 
+   if (flags & MAP_FIXED)
+   return addr;
+
/* First, see if the given suggestion fits.
 
   The OSF/1 loader (/sbin/loader) relies on us returning an
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH 0/12] Pass MAP_FIXED down to get_unmapped_area

This is a "first step" as there are still cleanups to be done in various
areas touched by that code but I think it's probably good to go as is and
at least enables me to implement what I need for PowerPC.

(Andrew, this is also candidate for 2.6.22 since I haven't had any real
objection, mostly suggestion for improving further, which I'll try to
do later, and I have further powerpc patches that rely on this).

The current get_unmapped_area code calls the f_ops->get_unmapped_area or
the arch one (via the mm) only when MAP_FIXED is not passed. That makes
it impossible for archs to impose proper constraints on regions of the
virtual address space. To work around that, get_unmapped_area() then
calls some hugetlbfs specific hacks.

This causes several problems, among others:

 - It makes it impossible for a driver or filesystem to do the same thing
that hugetlbfs does (for example, to allow a driver to use larger page
sizes to map external hardware) if that requires applying a constraint
on the addresses (constraining that mapping in certain regions and other
mappings out of those regions).

 - Some archs like arm, mips, sparc, sparc64, sh and sh64 already want
MAP_FIXED to be passed down in order to deal with aliasing issues.
The code is there to handle it... but is never called.

This series of patches moves the logic to handle MAP_FIXED down to the
various arch/driver get_unmapped_area() implementations, and then changes
the generic code to always call them. The hugetlbfs hacks then disappear
from the generic code.

Since I need to do some special 64K pages mappings for SPEs on cell, I need
to work around the first problem at least. I have further patches thus
implementing a "slices" layer that handles multiple page sizes through
slices of the address space for use by hugetlbfs, the SPE code, and possibly
others, but it requires that series of patches first.

There is still a potential (but not practical) issue due to the fact that
filesystems/drivers implementing g_u_a will effectively bypass all arch
checks. This is not an issue in practice as the only filesystems/drivers
using that hook are doing so for arch specific purposes in the first place.

There is also a problem with mremap that will completely bypass all arch
checks. I'll try to address that separately, I'm not 100% certain yet how,
possibly by making it not work when the vma has a file whose f_ops has a
get_unmapped_area callback, and by making it use is_hugepage_only_range()
before expanding into a new area.

Also, I want to turn is_hugepage_only_range() into a more generic
is_normal_page_range() as that's really what it will end up meaning
when used in stack grow, brk grow and mremap.

None of the above "issues" however are introduced by this patch, they are
already there, so I think the patch can go in for 2.6.22.

(Patch is against Linus current git, I'll give a go at -mm asap)

Cheers,
Ben.

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: BUG: Null pointer dereference in fs/open.c

On Tue, 24 Apr 2007 05:10:04 + (GMT) William Heimbigner <[EMAIL PROTECTED]> 
wrote:

> > --- a/drivers/block/pktcdvd.c~packet-fix-error-handling
> > +++ a/drivers/block/pktcdvd.c
> > @@ -777,7 +777,8 @@ static int pkt_generic_packet(struct pkt
> > rq->cmd_flags |= REQ_QUIET;
> >
> > blk_execute_rq(rq->q, pd->bdev->bd_disk, rq, 0);
> > -   ret = rq->errors;
> > +   if (rq->errors)
> > +   ret = -EIO;
> > out:
> > blk_put_request(rq);
> > return ret;
> > _
> 
> This patch fixes (or conceals?) the oops.
> 

Fixes.  But does the packet driver actually work OK for you?  Writes
files and stuff like that?
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH] Make new setting of panic_on_oom

2007-04-23 Thread Yasunori Goto


I tested this patch. It worked well.
So, I fixed its description.

Please apply.


--

The current panic_on_oom may not work if there is a process using 
cpusets/mempolicy, because other nodes' memory may remain.
But some people want failover by panic ASAP even if cpusets/mempolicy are used.
This patch adds a new setting for that request.

This is tested on my ia64 box which has 3 nodes.

Please apply.

Signed-off-by: Yasunori Goto <[EMAIL PROTECTED]>
Signed-off-by: Benjamin LaHaise <[EMAIL PROTECTED]>


---
 Documentation/sysctl/vm.txt |   23 +--
 mm/oom_kill.c   |3 +++
 2 files changed, 20 insertions(+), 6 deletions(-)

Index: panic_on_oom2/Documentation/sysctl/vm.txt
===
--- panic_on_oom2.orig/Documentation/sysctl/vm.txt  2007-04-21 
12:39:09.0 +0900
+++ panic_on_oom2/Documentation/sysctl/vm.txt   2007-04-21 12:39:58.0 
+0900
@@ -197,11 +197,22 @@
 
 panic_on_oom
 
-This enables or disables panic on out-of-memory feature.  If this is set to 1,
-the kernel panics when out-of-memory happens.  If this is set to 0, the kernel
-will kill some rogue process, called oom_killer.  Usually, oom_killer can kill
-rogue processes and system will survive.  If you want to panic the system
-rather than killing rogue processes, set this to 1.
+This enables or disables panic on out-of-memory feature.
 
-The default value is 0.
+If this is set to 0, the kernel will kill some rogue process,
+called oom_killer.  Usually, oom_killer can kill rogue processes and
+system will survive.
+
+If this is set to 1, the kernel panics when out-of-memory happens.
+However, if a process limits using nodes by mempolicy/cpusets,
+and those nodes become memory exhaustion status, one process
+may be killed by oom-killer. No panic occurs in this case.
+Because other nodes' memory may be free. This means system total status
+may be not fatal yet.
 
+If this is set to 2, the kernel panics compulsorily even on the
+above-mentioned.
+
+The default value is 0.
+1 and 2 are for failover of clustering. Please select either
+according to your policy of failover.
Index: panic_on_oom2/mm/oom_kill.c
===
--- panic_on_oom2.orig/mm/oom_kill.c2007-04-21 12:39:09.0 +0900
+++ panic_on_oom2/mm/oom_kill.c 2007-04-21 12:40:31.0 +0900
@@ -409,6 +409,9 @@
show_mem();
}
 
+   if (sysctl_panic_on_oom == 2)
+   panic("out of memory. Compulsory panic_on_oom is selected.\n");
+
cpuset_lock();
read_lock(&tasklist_lock);
 

-- 
Yasunori Goto 


-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH] lazy freeing of memory through MADV_FREE

2007-04-23 Thread Rik van Riel


Paul Mackerras wrote:

Rik van Riel writes:


I guess we'll need to call tlb_remove_tlb_entry() inside the
MADV_FREE code to keep powerpc happy.


I don't see why; once ptep_test_and_clear_young has returned, the
entry in the hash table has already been removed. 


OK, so this one won't be necessary. Good to know that.

Andrew, it looks like things won't be that bad :)

--
Politics is the struggle between those who want to make their country
the best in the world, and those who believe it already is.  Each group
calls the other unpatriotic.
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: BUG: Null pointer dereference in fs/open.c

2007-04-23 Thread William Heimbigner


On Mon, 23 Apr 2007, Andrew Morton wrote:

On Tue, 24 Apr 2007 04:09:18 + (GMT) William Heimbigner <[EMAIL PROTECTED]> 
wrote:


This bug occurs in linux-2.6.20 and 2.6.21-rc7-git5, and does not occur in
linux-2.6.19-git22.

After running "pktsetup 0 /dev/hdd", I get (timestamps removed):

pktcdvd: pkt_get_last_written failed
BUG: unable to handle kernel NULL pointer dereference at virtual address 
000e
printing eip:
c0173f69
*pde = 
Oops:  [#1]
PREEMPT
Modules linked in: snd_ca0106 snd_ac97_codec ac97_bus 8139cp 8139too iTCO_wdt
CPU:0
EIP:0060:[]Not tainted VLI
EFLAGS: 00010203   (2.6.21-rc7-git5 #22)
EIP is at do_sys_open+0x59/0xd0
eax: 0002   ebx: 4020   ecx: 0001   edx: 0002
esi: df1e3000   edi: 0003   ebp: de17bfa4   esp: de17bf84
ds: 007b   es: 007b   fs: 00d8  gs: 0033  ss: 0068
Process vol_id (pid: 4273, ti=de17b000 task=df4143f0 task.ti=de17b000)
Stack:  c013d2a5 ff9c 0002 c059cea3 bfb6bf64 8000 b7f60ff4
de17bfb0 c017401c  de17b000 c01041c6 bfb6bf64 8000 
8000 b7f60ff4 bfb6a798 0005 007b 007b  0005
Call Trace:
  [] show_trace_log_lvl+0x1a/0x30
  [] show_stack_log_lvl+0xa9/0xd0
  [] show_registers+0x21c/0x3a0
  [] die+0x104/0x260
  [] do_page_fault+0x277/0x610
  [] error_code+0x74/0x7c
  [] sys_open+0x1c/0x20
  [] sysenter_past_esp+0x5f/0x99
  ===
Code: ff 85 c0 89 c7 78 77 8b 45 08 89 d9 89 f2 89 04 24 8b 45 e8 e8 69 ff
ff ff 3d 00 f0 ff ff 89 45 ec 77 71 8b 55 ec bb 20 00 00 40 <8b> 42 0c 8b
48 30 89 4d f0 0f b7 51 66 81 e2 00 f0 00 00 81 fa
EIP: [] do_sys_open+0x59/0xd0 SS:ESP 0068:de17bf84


Try this:

--- a/drivers/block/pktcdvd.c~packet-fix-error-handling
+++ a/drivers/block/pktcdvd.c
@@ -777,7 +777,8 @@ static int pkt_generic_packet(struct pkt
rq->cmd_flags |= REQ_QUIET;

blk_execute_rq(rq->q, pd->bdev->bd_disk, rq, 0);
-   ret = rq->errors;
+   if (rq->errors)
+   ret = -EIO;
out:
blk_put_request(rq);
return ret;
_


This patch fixes (or conceals?) the oops.




The packet driver was assuming that request.errors is an errno, but it
isn't - it's some sort of diagnostic bitfield thing.  Now why would the
packet driver have thought that?  Let's go read the comments:

unsigned short nr_hw_segments;

unsigned short ioprio;

void *special;
char *buffer;

int tag;
int errors;

int ref_count;


Well there's your root cause right there.


I don't know why this wasn't oopsing in earlier kernels.  Perhaps something
else is broken.  Please test this urgently.


There's a locking problem in there too.  `pktsetup 0 /dev/scd0' gives me

[   77.72] pktcdvd: writer pktcdvd0 mapped to sr0
[   77.86]
[   77.86] =
[   77.86] [ INFO: possible recursive locking detected ]
[   77.86] 2.6.21-rc7 #19
[   77.86] -
[   77.86] vol_id/2508 is trying to acquire lock:
[   77.86]  (>bd_mutex){--..}, at: [] do_open+0x5a/0x267
[   77.86]
[   77.86] but task is already holding lock:
[   77.86]  (>bd_mutex){--..}, at: [] do_open+0x5a/0x267
[   77.86]
[   77.86] other info that might help us debug this:
[   77.86] 2 locks held by vol_id/2508:
[   77.86]  #0:  (>bd_mutex){--..}, at: [] 
do_open+0x5a/0x267
[   77.86]  #1:  (_mutex#2){--..}, at: [] pkt_open+0x1a/0xcbc 
[pktcdvd]
[   77.86]
[   77.86] stack backtrace:
[   77.86]  [] __lock_acquire+0x11e/0xb3b
[   77.86]  [] __mutex_unlock_slowpath+0x109/0x113
[   77.86]  [] trace_hardirqs_on+0x11e/0x141
[   77.86]  [] lock_acquire+0x56/0x6e
[   77.86]  [] do_open+0x5a/0x267
[   77.86]  [] mutex_lock_nested+0xf4/0x24f
[   77.86]  [] do_open+0x5a/0x267
[   77.86]  [] kobj_lookup+0xda/0x104
[   77.86]  [] do_open+0x5a/0x267
[   77.86]  [] __blkdev_get+0x5b/0x66
[   77.86]  [] blkdev_get+0x12/0x14
[   77.86]  [] pkt_open+0x8d/0xcbc [pktcdvd]
[   77.86]  [] __d_lookup+0x66/0xed
[   77.86]  [] __d_lookup+0x66/0xed
[   77.86]  [] _atomic_dec_and_lock+0xd/0x2c
[   77.86]  [] _atomic_dec_and_lock+0xd/0x2c
[   77.86]  [] _atomic_dec_and_lock+0xd/0x2c
[   77.86]  [] cache_alloc_refill+0x4a/0x444
[   77.86]  [] kobj_lookup+0x33/0x104
[   77.86]  [] trace_hardirqs_on+0x11e/0x141
[   77.86]  [] do_open+0x5a/0x267
[   77.86]  [] __mutex_lock_slowpath+0x222/0x235
[   77.86]  [] mutex_lock_nested+0x23c/0x24f
[   77.86]  [] mark_held_locks+0x46/0x62
[   77.86]  [] mutex_lock_nested+0x23c/0x24f
[   77.86]  [] mutex_lock_nested+0x23c/0x24f
[   77.86]  [] trace_hardirqs_on+0x11e/0x141
[   77.86]  [] do_open+0x5a/0x267
[   77.86]  [] mutex_lock_nested+0x247/0x24f
[   77.86]  [] do_open+0x5a/0x267
[   77.86]  [] kobj_lookup+0xda/0x104
[   77.86]  []

Re: MMCv4 support (8-bit support missing)

2007-04-23 Thread Pierre Ossman

Madhusudhan c wrote:
> 
> Suppose a host controller is capable of supporting 8-bit and it tells
> the core that it can support 8-bit. Now the card that is plugged in
> might or might not support 8-bit based on the type of the card. There
> is no field in the ext_csd which will tell you what bus width the card
> can support.
> 

I've looked through the MMC 4.2 spec and I see nothing in it that even hints
that 8-bit support might be optional. So as it stands, the bus testing is still 
out.

Rgds
Pierre




signature.asc
Description: OpenPGP digital signature

Re: [PATCH] powerpc pseries eeh: Convert to kthread API


> Further in general it doesn't make sense to grab a module reference
> and call that sufficient because we would like to request that the
> module exits.

Which is, btw, I think a total misdesign of our module stuff, but heh, I
remember that lead to some flamewars back then...

Like anything else, modules should have separated the entrypoints for

 - Initiating a removal request
 - Releasing the module

The former is when the user did "rmmod"; it can unregister things from
subsystems, etc... (and can fail if the driver decides to refuse removal requests
when it's busy doing things or whatever policy that module wants to
implement).

The latter is called when all references to the modules have been
dropped, it's a bit like the kref "release" (and could be implemented as
one).

If we had done that (simple) thing back then, module refcounting would
have been much less of a problem... I remember some reasons why that was
veto'ed but I didn't and still don't agree.

Ben.


-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: BUG/CRASH help - 2.6.20 (fc5) - spinlock bad magic on CPU#0, kswapd

2007-04-23 Thread Robert Hancock


Ian Kirk wrote:

Hi,

I just wondered if the below type of crash was a known thing, and if
there are any obvious things I can do to prevent/fix it ?

BUG: spinlock bad magic on CPU#0, kswapd0/242 (Not tainted)
lock: c06c9380, .magic: c06c9380, .owner:
[] _raw_spin_lock+0x1a/0xd9
[] _spin_lock_irqsave+0x9/0xd
[] __wake_up+0x18/0x43
[] __wake_up_bit+0x2e/0x33
[] shrink_inactive_list+0x483/0x6c9


RAM issues are a possibility for this sort of problem, tried running 
Memtest86+?


--
Robert Hancock  Saskatoon, SK, Canada
To email, remove "nospam" from [EMAIL PROTECTED]
Home Page: http://www.roberthancock.com/

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: sendfile to nonblocking socket

2007-04-23 Thread Alex Vorona


David Schwartz пишет:

You have a misunderstanding about the semantics of 'sendfile'. The 'sendfile' 
function is just a more efficient version of a read followed by a write. If you 
did a read followed by a write, it would block as well (in the read).

DS
  
The sendfile function is not just a more efficient version of a read 
followed by a write.  It reads from one fd and writes to another at the 
same time. Please try to read 2G, and then write 2G - and see how much 
memory you will need and how much time you will lose while reading 
2G from disk, but not writing them to socket.  If you know more 
efficient method to transfer file from disk to network - please advise. 
Now all I want is really non-blocking sendfile. Currently sendfile is 
non-blocking on network, but not on disk i/o. And when I have network 
faster than disk - I get block.


Thank you,
Alex


-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH] powerpc pseries eeh: Convert to kthread API

Paul Mackerras <[EMAIL PROTECTED]> writes:

> Eric W. Biederman writes:
>
>> Well the basic problem is that for any piece of code that can be modular
>> we need a way to ensure all threads it has running are shutdown when we
>> remove the module.
>
> The EEH code can't be modular, and wouldn't make any sense to be
> modular, since it's part of the infrastructure for accessing PCI
> devices.

Agreed.  However most kthread users are modular and make sense to
be so we need to design to handle modular users.

I don't think the idiom of go fire off a thread to handle something
is specific to non-modular users.

Eric
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [REPORT] cfs-v4 vs sd-0.44

2007-04-23 Thread Arjan van de Ven


> Within reason, it's not the number of clients that X has that causes its 
> CPU bandwidth use to sky rocket and cause problems.  It's more to do 
> with what type of clients they are.  Most GUIs (even ones that are 
> constantly updating visual data (e.g. gkrellm -- I can open quite a 
> large number of these without increasing X's CPU usage very much)) cause 
> very little load on the X server.  The exceptions to this are the 


there are actually 2 and not just 1 "X server", and they are VERY VERY
different in behavior.

Case 1: Accelerated driver

If X talks to a decent enough card it supports well with acceleration,
it will be very rare for X itself to spend any kind of significant
amount of CPU time, all the really heavy stuff is done in hardware, and
asynchronously at that. A bit of batching will greatly improve system
performance in this case.

Case 2: Unaccelerated VESA

Some drivers in X, especially the VESA and NV drivers (which are quite
common, vesa is used on all hardware without a special driver nowadays),
have no or not enough acceleration to matter for modern desktops. This
means the CPU is doing all the heavy lifting, in the X program. In this
case even a simple "move the window a bit" becomes quite a bit of a CPU
hog already.

The cases are fundamentally different in behavior, because in the first
case, X hardly consumes the time it would get in any scheme, while in
the second case X really is CPU bound and will happily consume any CPU
time it can get.



-- 
if you want to mail me at work (you don't), use arjan (at) linux.intel.com
Test the interaction between Linux and your BIOS via 
http://www.linuxfirmwarekit.org

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: is there any generic GPIO chip framework like IRQ chips?

2007-04-23 Thread David Brownell

On Monday 23 April 2007, Paul Sokolovsky wrote:
> Hello David,
> 
> Thursday, April 19, 2007, 5:22:44 AM, you wrote:
> 
> >> >> > So, talking about what an (optional) implementation framework might
> >> >> > look like (and which could handle the SOC, FPGA, I2C, and MFD cases
> >> >> > I've looked at):
> >> 
> >> > See patches in following messages ... a preliminary "gpio_chip" core
> >> > for such a framework, plus example support for one SOC family's GPIOs,
> >> > and then updating one board's handling of GPIOs, including over I2C.
> >> 
> >> Just to compare, diffstats for GPIODEV:
>
> > Now, if they were functionally equivalent, such a comparison
> > would be less of an apples/oranges thing!
> 
> But of course they are functionally equivalent! They do the
> same thing - manage GPIOs. They even do it in very similar ways.

Functional equivalence means handling all the capabilities
defined in Documentation/gpio.txt ... they are defined
because without them, the programming interface wouldn't
cover significant capabilities folk asked for.

You omitted notions of:  spinlock safe/unsafe operation,
as needed to handle e.g. register access over I2C/SPI
versus spinlock-safe access; request/reserve, minimizing
inter-driver conflicts; direction setting, since it's most
often mutable.

It's no surprise when less-functional code is smaller...

> >>   it needs work - it doesn't adhere to your own
> >> optimization scheme by using lookup table instead of list.
> 
> > I thought it was more important to address the $SUBJECT first:
> 
> Well, is this your last argument why your implementation is
> better: GPIO chip framework is needed just because there's some random
> mail subject which mentions it?

I certainly pointed out that something like a gpio_chip would
be easy enough to do, back when the programming interfaces were
first discussed ... in the context of supporting more than the
initial implementations of platform-level SOC style GPIOs, after
things are under way.

Working on that was what I've called a "phase III" issue ... where
"phase I" was a programming interface that could be easily and
widely adopted, and "phase II" was the initial adoption (platforms
and drivers).  With "phase II" well under way, broadening the scope
of implementations now make sense.

> Let's recollect with what the discussion started: 

Depends which discussion you mean.  I was continuing previous
ones; you seemed to want to start a new one.

> proposal 
> from myself and other HH.org developers of the alternative implementation,

No, that original proposal was a *REPLACEMENT INTERFACE* along
with a fairly nasty C++ish implementation.  You then reposted that
same code to a wider audience after ignoring the initial feedback,
then later a revised version with minor changes.  (After my latest
updates were working on multiple boards, but before I posted them.)

> And we both understand well why we can't reach agreement here:
> we represent different communities with different needs. 

That's not my understanding at all.  Though you're right to
imply that the HH.org community seems to **feel** (for reasons
opaque to most people) that its embedded needs are for some
reason different from those of folk working more closely with
the kernel.org codebase.

(IMO it's a microcosm of a classic embedded product approach:
hack a bunch of stuff so it works, try throwing it upstream,
and only then notice that merging takes real work, with thought
for what other systems with related problems need.)

> >> So, now the most important question is what we all would get
> >> with your approach in the end.
> >> 
> >> So, if you could make sure gpiolib.c doesn't contain inefficient
> >> implementation,
> 
> > I can make it comparable to existing implementations that work
> > the same way ... e.g. AT91 and OMAP code.
> 
> Why would you compare it to such implementations?

Because the alternatives were *MUCH* lighter weight (much less
flexible), so any comparisons would be unfair!  :)

> I also hear shade of encouragement to keep elaborating
> and submitting GPIODEV in your words, but unfortunately that's not 
> really what I'd like. The whole argument was about having one good
> runtime-extensible framework, not two.

The thing that's potentially interesting in what you've said is
what an API using {gpio_chip, index} addressing would look like.
The extensibility would be through gpio_chip.

The notion has come up before (even from me!), and I recently
noticed some CRIS GPIO code looking something like that.

It's not clear such a thing is needed very often ... but as I've
structured things, it would be easy to add such a layer, should
it turn out to be necessary.  That's why "struct gpio_chip" is
such a relevant next step.

- Dave

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at

Re: sendfile to nonblocking socket

2007-04-23 Thread Alex Vorona


David Miller wrote:

From: voron <[EMAIL PROTECTED]>
Date: Tue, 24 Apr 2007 00:13:27 +0300

  
As I see, nonblocking mode is enabled - sendfile sends less than asked.  



The socket is marked as non-blocking, but the disk I/O is not.

It's blocking on the disk I/O not the socket part of the operation.


  

How can I tell the kernel not to block on disk I/O? I tried non-blocking on
the disk I/O fd, but it seems to be ignored. Strace with both nonblocking
disk fd and socket fd is attached
#non-blocking on socket fd enabled
04:34:07 ioctl(9, FIONBIO, [1]) = 0
#non-blocking on disk fd enabled
04:34:11 ioctl(12, FIONBIO, [1])= 0
#normal sendfile
04:34:07 sendfile(9, 12, [444282], 2147477638) = 812682
#32 seconds sendfile
04:34:11 sendfile(9, 12, [261474962], 2147476846) = 2144612230

Thank you,
Alex

04:34:04 write(3, "2007/04/24 04:34:04 [info] 32390"..., 85) = 85
04:34:04 epoll_wait(11, {{EPOLLIN, {u32=3618562064, u64=61614924423184}}}, 512, 
4294967295) = 1
04:34:07 accept(8, {sa_family=AF_INET, sin_port=htons(52673), 
sin_addr=inet_addr("192.168.78.1")}, [5056848310527066128]) = 9
04:34:07 ioctl(9, FIONBIO, [1]) = 0
04:34:07 epoll_ctl(11, EPOLL_CTL_ADD, 9, {EPOLLIN|EPOLLET, {u32=3618562369, 
u64=61614924423489}}) = 0
04:34:07 epoll_wait(11, {{EPOLLIN, {u32=3618562369, u64=61614924423489}}}, 512, 
60) = 1
04:34:07 recvfrom(9, "GET /3.tmp HTTP/1.0\r\nUser-Agent:"..., 1024, 0, NULL, 
NULL) = 147
04:34:07 open("/var/www/cacti/htdocs/3.tmp", O_RDONLY) = 12
04:34:07 fstat(12, {st_mode=S_IFREG|0644, st_size=4294967296, ...}) = 0
04:34:07 setsockopt(9, SOL_TCP, TCP_CORK, [1], 4) = 0
04:34:07 writev(9, [{"HTTP/1.1 200 OK\r\nServer: nginx/0"..., 262}], 1) = 262
04:34:07 ioctl(12, FIONBIO, [1])= 0
04:34:07 sendfile(9, 12, [0], 2147475456) = 20576
04:34:07 epoll_ctl(11, EPOLL_CTL_MOD, 9, {EPOLLIN|EPOLLOUT|EPOLLET, 
{u32=3618562369, u64=61614924423489}}) = 0
04:34:07 epoll_wait(11, {{EPOLLOUT, {u32=3618562369, u64=61614924423489}}}, 
512, 60) = 1
04:34:07 ioctl(12, FIONBIO, [1])= 0
04:34:07 sendfile(9, 12, [20576], 2147479456) = 55568
04:34:07 epoll_wait(11, {{EPOLLOUT, {u32=3618562369, u64=61614924423489}}}, 
512, 599975) = 1
04:34:07 ioctl(12, FIONBIO, [1])= 0
04:34:07 sendfile(9, 12, [76144], 2147477136) = 41676
04:34:07 epoll_wait(11, {{EPOLLOUT, {u32=3618562369, u64=61614924423489}}}, 
512, 599973) = 1
04:34:07 ioctl(12, FIONBIO, [1])= 0
04:34:07 sendfile(9, 12, [117820], 2147476420) = 83352
04:34:07 epoll_wait(11, {{EPOLLOUT, {u32=3618562369, u64=61614924423489}}}, 
512, 599972) = 1
04:34:07 ioctl(12, FIONBIO, [1])= 0
04:34:07 sendfile(9, 12, [201172], 2147479084) = 243110
04:34:07 epoll_wait(11, {{EPOLLOUT, {u32=3618562369, u64=61614924423489}}}, 
512, 599971) = 1
04:34:07 ioctl(12, FIONBIO, [1])= 0
04:34:07 sendfile(9, 12, [444282], 2147477638) = 812682
04:34:07 epoll_wait(11, {{EPOLLOUT, {u32=3618562369, u64=61614924423489}}}, 
512, 599968) = 1
04:34:07 ioctl(12, FIONBIO, [1])= 0
04:34:07 sendfile(9, 12, [1256964], 2147475964) = 7501680
04:34:07 epoll_wait(11, {{EPOLLOUT, {u32=3618562369, u64=61614924423489}}}, 
512, 599950) = 1
04:34:07 ioctl(12, FIONBIO, [1])= 0
04:34:07 sendfile(9, 12, [8758644], 2147478156) = 234274688
04:34:10 epoll_wait(11, {{EPOLLOUT, {u32=3618562369, u64=61614924423489}}}, 
512, 599846) = 1
04:34:10 ioctl(12, FIONBIO, [1])= 0
04:34:10 sendfile(9, 12, [24302], 2147478284) = 583464
04:34:10 epoll_wait(11, {{EPOLLOUT, {u32=3618562369, u64=61614924423489}}}, 
512, 60) = 1
04:34:10 ioctl(12, FIONBIO, [1])= 0
04:34:10 sendfile(9, 12, [243616796], 2147476452) = 16712076
04:34:11 epoll_wait(11, {{EPOLLOUT, {u32=3618562369, u64=61614924423489}}}, 
512, 50) = 1
04:34:11 ioctl(12, FIONBIO, [1])= 0
04:34:11 sendfile(9, 12, [260328872], 2147476056) = 1146090
04:34:11 epoll_wait(11, {{EPOLLOUT, {u32=3618562369, u64=61614924423489}}}, 
512, 599766) = 1
04:34:11 ioctl(12, FIONBIO, [1])= 0
04:34:11 sendfile(9, 12, [261474962], 2147476846) = 2144612230
04:34:43 epoll_wait(11, {{EPOLLOUT, {u32=3618562369, u64=61614924423489}}}, 
512, 599754) = 1
04:34:43 ioctl(12, FIONBIO, [1])= 0
04:34:43 sendfile(9, 12, [2406087192], 180104) = 611248
04:34:43 epoll_wait(11, {{EPOLLOUT, {u32=3618562369, u64=61614924423489}}}, 
512, 60) = 1
04:34:43 ioctl(12, FIONBIO, [1])= 0
04:34:43 sendfile(9, 12, [2406698440], 1888268856) = 1000890816
04:34:57 epoll_wait(11, {{EPOLLOUT, {u32=3618562369, u64=61614924423489}}}, 
512, 51) = 1
04:34:57 ioctl(12, FIONBIO, [1])= 0
04:34:57 sendfile(9, 12, [3407589256], 887378040) = 972440
04:34:58 epoll_wait(11, {{EPOLLOUT, {u32=3618562369, u64=61614924423489}}}, 
512, 60) = 1
04:34:58 ioctl(12, FIONBIO, [1])= 0
04:34:58 sendfile(9, 12, [3408561696], 886405600) = 861304
04:34:58 epoll_wait(11, {{EPOLLOUT, {u32=3618562369, u64=61614924423489}}}, 
512, 599977) = 1
04:34:58 ioctl(12, FIONBIO, [1])= 0

Re: [PATCH] lazy freeing of memory through MADV_FREE

2007-04-23 Thread Paul Mackerras

Rik van Riel writes:

> I guess we'll need to call tlb_remove_tlb_entry() inside the
> MADV_FREE code to keep powerpc happy.

I don't see why; once ptep_test_and_clear_young has returned, the
entry in the hash table has already been removed.  Adding the
tlb_remove_tlb_entry call certainly won't do anything on 64-bit
powerpc, since it expands to do {} while (0) there, and in fact it
won't do anything on 32-bit powerpc either.

Paul.
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [RFC] another scheduler beater

2007-04-23 Thread Mike Galbraith

On Mon, 2007-04-23 at 17:55 -0400, Bill Davidsen wrote:

> Based on just this script as load I would say renice on X isn't a good 
> thing. Based on one small test, I would say that renice of X in 
> conjunction with heavy disk i/o and a single fast scrolling xterm (think 
> kernel compile) seems to slow the raid6 thread measurably.

Yeah, I was testing a UP kernel yesterday.  It doesn't take much renice
to severely injure the competition with them having no place else to go.
Nice -19 is definitely overkill.

-Mike

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH] powerpc pseries eeh: Convert to kthread API

2007-04-23 Thread Paul Mackerras

Eric W. Biederman writes:

> Well the basic problem is that for any piece of code that can be modular
> we need a way to ensure all threads it has running are shutdown when we
> remove the module.

The EEH code can't be modular, and wouldn't make any sense to be
modular, since it's part of the infrastructure for accessing PCI
devices.

Paul.
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[patch] oom: kill all threads that share mm with killed task

2007-04-23 Thread David Rientjes

oom_kill_task() calls __oom_kill_task() to OOM kill a selected task.
When finding other threads that share an mm with that task, we need to
kill those individual threads and not the same one.

Cc: Andi Kleen <[EMAIL PROTECTED]>
Cc: Christoph Lameter <[EMAIL PROTECTED]>
Signed-off-by: David Rientjes <[EMAIL PROTECTED]>
---
 mm/oom_kill.c |2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/mm/oom_kill.c b/mm/oom_kill.c
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -333,7 +333,7 @@ static int oom_kill_task(struct task_struct *p)
 */
do_each_thread(g, q) {
if (q->mm == mm && q->tgid != p->tgid)
-   force_sig(SIGKILL, p);
+   force_sig(SIGKILL, q);
} while_each_thread(g, q);
 
return 0;
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 23/25] xen: Lockdep fixes for xen-netfront

2007-04-23 Thread Jeremy Fitzhardinge

Herbert Xu wrote:
> You don't need to disable BH in netif_poll since it's always called
> with BH disabled.
>   

Ah, yes, you mentioned that before.  I'll fix it up.

J
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 22/25] xen: xen-netfront: use skb.cb for storing private data

2007-04-23 Thread Jeremy Fitzhardinge

Herbert Xu wrote:
> Thanks Jeremy.  The patch looks good.

Could you give netfront an overall review as well?  I know you're
already pretty familiar with it, but if you could cast a fresh eye over
it, that would be helpful.

Thanks,
J
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH try #2] Return access error not ECHILD on security_task_wait failure

2007-04-23 Thread Roland McGrath

Not having tested it or anything, that looks good to me.


Thanks,
Roland
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 03/25] xen: Add nosegneg capability to the vsyscall page notes

2007-04-23 Thread Roland McGrath

> I have to admit I still don't really understand all this. Is it
> documented somewhere?

I have explained it in public more than once, but I don't know off hand
anywhere that was helpfully recorded.

> What does "hwcap 0 nosegneg" actually mean? What does the "0" mean here?

ldconfig is usually run at library install time. It reads ld.so.conf (and
its include files, usually found in /etc/ld.so.conf.d/*.conf). ldconfig
finds libraries on the disk and stores their names in ld.so.cache. For
libraries in "hwcap directories", it records in ld.so.cache a bitmask of
hwcap bits for each library, based on which hwcap names appeared in the
library file name. The hard-wired hwcap names are things such as "mmx" and
"sse2", a chosen subset of the AT_HWCAP bits the kernel provides.

When the dynamic linker is finding a library at runtime, it uses a match
from ld.so.cache unless none was found or LD_LIBRARY_PATH was set. When
there is no cache hit, it searches a directory path for the library's name.
The subset of hwcap names whose AT_HWCAP bits are set yields a list of
subdirectories to try under each directory in the path. To see the list:

$ LD_LIBRARY_PATH=/lib:/usr/lib LD_DEBUG=libs /bin/true
21491: find library=libc.so.6 [0]; searching
21491: search
path=/lib/tls/i686/sse2:/lib/tls/i686:/lib/tls/sse2:/lib/tls:/lib/i686/sse2:/lib/i686:/lib/sse2:/lib:/usr/lib/tls/i686/sse2:/usr/lib/tls/i686:/usr/lib/tls/sse2:/usr/lib/tls:/usr/lib/i686/sse2:/usr/lib/i686:/usr/lib/sse2:/usr/lib
(system search path)

(Here you can notice that "tls" is used as a pseudo-hwcap; it is in fact
hard-wired into the list if the dynamic linker supports ELF TLS, which all
recent ones do. Also you'll notice "i686" is not a hwcap bit, but is the
AT_PLATFORM string, which is treated similarly.)

The hwcap bitmasks in ld.so.cache are intended to make the single cache
file equivalent to this varying search path depending on runtime hwcap bits
set. A cache entry whose bitmask has bits not set at runtime is ignored.
Running "ldconfig -p | grep hwcap" (read-only, need not be root) will show
you any entries in your ld.so.cache that have an hwcap bitmask set.

The "hwcap" directive in ld.so.conf tells ldconfig to understand a new
hwcap name that is not in the hard-wired set. There is some number of
extra bits available; "hwcap 0" assigns the first extra bit, "hwcap 1" the
second, and so on. The name is what to use as a subdirectory name,
analogous to "sse2" et al.

On my system with an ld.so.conf.d file installed doing "hwcap 0 nosegneg":

$ ldconfig -p | grep libc.so.6
libc.so.6 (libc6, hwcap: 0x0018, OS ABI: Linux 2.6.9) =>
/lib/i686/nosegneg/libc.so.6
libc.so.6 (libc6, OS ABI: Linux 2.6.9) => /lib/libc.so.6

(There are two bits set because the "tls" pseudo-bit is also set.)

With this in ld.so.cache, the libc.so.6 lookup will find
/lib/i686/nosegneg/libc.so.6 first, but only if the hwcap bit is set.

> In the ELF note, what does the "nosegneg" string mean? How is it used?
> Is it compared to the "nosegneg" in ld.so.conf? How does this relate to
> the bitfields?

Each bit + string element in the note (there's just the one in what we
have) establishes for the dynamic linker at runtime the association between
the "extra" pseudo-hwcap bit number and the name. If that pseudo-hwcap is
enabled, then that string will figure into the directory search path as
"sse2" does in the example above. This string is never consulted when
looking in ld.so.cache.

The mask field in NOTE_KERNELCAP_BEGIN says which "extra" bits are enabled.
If the corresponding bit is not set here, then it's just like a hard-wired
hwcap bit like "sse2" when that bit was not set in AT_HWCAP. That is, a
cache lookup will ignore entries with that hwcap bit in their bitmask, and
that hwcap name will not be used in constructing the directory search path.
I put this bitmask in so that the kernel has the option of using a single
vDSO image for multiple different runtime configurations. It can simply
modify the bitmask in the image at setup time to disable some entries.

For example, a Xen-enabled kernel can use a single vDSO image (or a single
pair of int80/sysenter images), containing the "nosegneg" hwcap note. When
there is no need for it (native or hvm or 64-bit hv or whatever), it just
clears the mask word. If you actually do this, you'll want to modify the
NOTE_KERNELCAP_BEGIN macro to define a global label you can use with VDSO_SYM.

Thanks,
Roland
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Re: BUG: Null pointer dereference in fs/open.c

2007-04-23 Thread William Heimbigner

This bug occurs in linux-2.6.20 and 2.6.21-rc7-git5, and does not occur in 
linux-2.6.19-git22.


After running "pktsetup 0 /dev/hdd", I get (timestamps removed):

pktcdvd: pkt_get_last_written failed
BUG: unable to handle kernel NULL pointer dereference at virtual address 
000e
printing eip:
c0173f69
*pde = 
Oops:  [#1]
PREEMPT
Modules linked in: snd_ca0106 snd_ac97_codec ac97_bus 8139cp 8139too iTCO_wdt
CPU:0
EIP:0060:[]Not tainted VLI
EFLAGS: 00010203   (2.6.21-rc7-git5 #22)
EIP is at do_sys_open+0x59/0xd0
eax: 0002   ebx: 4020   ecx: 0001   edx: 0002
esi: df1e3000   edi: 0003   ebp: de17bfa4   esp: de17bf84
ds: 007b   es: 007b   fs: 00d8  gs: 0033  ss: 0068
Process vol_id (pid: 4273, ti=de17b000 task=df4143f0 task.ti=de17b000)
Stack:  c013d2a5 ff9c 0002 c059cea3 bfb6bf64 8000 b7f60ff4
   de17bfb0 c017401c  de17b000 c01041c6 bfb6bf64 8000 
   8000 b7f60ff4 bfb6a798 0005 007b 007b  0005
Call Trace:
 [] show_trace_log_lvl+0x1a/0x30
 [] show_stack_log_lvl+0xa9/0xd0
 [] show_registers+0x21c/0x3a0
 [] die+0x104/0x260
 [] do_page_fault+0x277/0x610
 [] error_code+0x74/0x7c
 [] sys_open+0x1c/0x20
 [] sysenter_past_esp+0x5f/0x99
 ===
Code: ff 85 c0 89 c7 78 77 8b 45 08 89 d9 89 f2 89 04 24 8b 45 e8 e8 69 ff 
ff ff 3d 00 f0 ff ff 89 45 ec 77 71 8b 55 ec bb 20 00 00 40 <8b> 42 0c 8b 
48 30 89 4d f0 0f b7 51 66 81 e2 00 f0 00 00 81 fa

EIP: [] do_sys_open+0x59/0xd0 SS:ESP 0068:de17bf84


from fs/open.c, comments added:
// do_sys_open is consistently called with dfd=0xff9c,
// filename="/dev/.tmp-254-0", flags=0x8000, mode=0)
long do_sys_open(int dfd, const char __user *filename, int flags, int mode)
{
char *tmp = getname(filename);
int fd = PTR_ERR(tmp);

if (!IS_ERR(tmp)) {
fd = get_unused_fd();
if (fd >= 0) {
// do_filp_open consistently returns 2, in this case
struct file *f = do_filp_open(dfd, tmp, flags, mode);
// IS_ERR always returns 0 for this command
if (IS_ERR(f)) {
put_unused_fd(fd);
fd = PTR_ERR(f);
} else {
// null pointer dereference occurs here
fsnotify_open(f->f_path.dentry);
fd_install(fd, f);
}
}
putname(tmp);
}
return fd;
}

I was able to workaround this, by testing if do_filp_open was returning 
2 or not, but obviously this is a very temporary solution to a very 
specific circumstance.


If there is any more information I can provide, let me know.
William Heimbigner
[EMAIL PROTECTED]
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: Permanent Kgdb integration into the kernel - lets get with it. (Dave: How do FreeBSD folks maintain the KGDB stub?)

2007-04-23 Thread Piet Delaney

On Sat, 2007-04-21 at 11:48 +0200, Andi Kleen wrote:
> > Lots of people want kgdb.  One person is famously less keen on it, but
> > we'll be able to talk him around, as long as the patches aren't daft.
> 
> The big question is if the kgdb developers seriously want mainline.
> At least in the past this definitely wasn't the case.

I haven't seen any email from kgdb developers saying they didn't want
kgdb to be part of mainline. 

Happen to have any e-mail demonstrating that? 

It appears to me that:

1. Jason Wessel is putting a lot of effort at that right now.

2. Tom Rini worked hard at this just a few months ago.

3. George Anzinger was working hard at this a year or two ago
   with the mm series and was likely disappointed when it wasn't
   put into the mainline. As I recall the reason Linus gave
   was that there were two competing patches and he wanted that
   to be resolved before integrating it into the mainline. So
   George worked with Amit at SourceForge over the past year
   or two and it's now integrated.

> 
> If they're not open to change requests from mainline reviewers we don't
> even need to bother to start the whole exercise.

What issues are there, or have there been, that you're referring to?

Once KGDB is part of KORG, can't its maintenance and support be
a kernel-wide responsibility? If someone breaks kgdb, shouldn't
that be backed out until the KORG developers fix the problem?
Centralizing the responsibility for KGDB seems like a mistake. I 
doubt the FreeBSD folks rip out the KGDB support if a kernel hacker
breaks KGDB and then leave a group of KGDB developers to sort out
the problem. Seems it should be caught as an mm patch, with Andrew tossing
out the patch if it breaks KGDB. Kgdb developers could try to give
Andrew a heads up if this occurs and he didn't notice it.

Once KGDB is integrated the maintenance should be minimal and changes
that break KGDB are likely best addressed by the developer that just
broke it. At least that's what I'd think is an optimal approach. Perhaps
Dave O'Brien could tell us how the FreeBSD folks take care of KGDB.

> 
> Just putting their stuff onto korg isn't enough.

Yep, and once it's integrated into korg it should finally become
a permanent part of the kernel and I suspect maintained by all
kernel developers. New KGDB features could be developed at SourceForge
but maintaining kernel coherence seems like a global responsibility.
Like running fault injection on your code before checking it in.

Maybe I'm totally out to lunch on this; perhaps Dave O'Brien
can straighten me out if I'm wrong or if the Linux kernel core
responsibility paradigm is incompatible with this.

I'd prefer Linux being just as good as NetBSD with Debugging support; 
current presentations like:

http://foss.in/2005/slides/netbsd-linux.pdf

show our current support as being much worse. Let's fix it.

You developed a kgdb proxy for Keith Owens kdb and I suspect you
would like to have KGDB being part of the kernel mainline as
long as it's done well. I doubt anyone would argue with that.

Perhaps it's possible to eventually setup KGDB so it can be 
debugged with kdb. Once KGDB is mainline there are plenty of
issues that can be addressed; for example taking a kernel
core dump after dropping into kgdb and having the registers
show up correctly in Dave Anderson's crash utility.

-piet

> 
> -Andi
-- 
Piet DelaneyPhone: (408) 200-5256
Blue Lane Technologies  Fax:   (408) 200-5299
10450 Bubb Rd.
Cupertino, Ca. 95014Email: [EMAIL PROTECTED]

signature.asc
Description: This is a digitally signed message part

Re: [REPORT] cfs-v4 vs sd-0.44

2007-04-23 Thread Peter Williams


Linus Torvalds wrote:


On Mon, 23 Apr 2007, Ingo Molnar wrote:
The "give scheduler money" transaction can be both an "implicit 
transaction" (for example when writing to UNIX domain sockets or 
blocking on a pipe, etc.), or it could be an "explicit transaction": 
sched_yield_to(). This latter i've already implemented for CFS, but it's 
much less useful than the really significant implicit ones, the ones 
which will help X.


Yes. It would be wonderful to get it working automatically, so please say 
something about the implementation..


The "perfect" situation would be that when somebody goes to sleep, any 
extra points it had could be given to whoever it woke up last. Note that 
for something like X, it means that the points are 100% ephemeral: it gets 
points when a client sends it a request, but it would *lose* the points 
again when it sends the reply!


So it would only accumulate "scheduling points" while multiple clients 
are actively waiting for it, which actually sounds like exactly the right 
thing. However, I don't really see how to do it well, especially since the 
kernel cannot actually match up the client that gave some scheduling 
points to the reply that X sends back.


There are subtle semantics with these kinds of things: especially if the 
scheduling points are only awarded when a process goes to sleep, if X is 
busy and continues to use the CPU (for another client), it wouldn't give 
any scheduling points back to clients and they really do accumulate with 
the server. Which again sounds like it would be exactly the right thing 
(both in the sense that the server that runs more gets more points, but 
also in the sense that we *only* give points at actual scheduling events).


But how do you actually *give/track* points? A simple "last woken up by 
this process" thing that triggers when it goes to sleep? It might work, 
but on the other hand, especially with more complex things (and networking 
tends to be pretty complex) the actual wakeup may be done by a software 
irq. Do we just say "it ran within the context of X, so we assume X was 
the one that caused it?" It probably would work, but we've generally tried 
very hard to avoid accessing "current" from interrupt context, including 
bh's.


Within reason, it's not the number of clients that X has that causes its 
CPU bandwidth use to sky rocket and cause problems.  It's more to do 
with what type of clients they are.  Most GUIs (even ones that are 
constantly updating visual data (e.g. gkrellm -- I can open quite a 
large number of these without increasing X's CPU usage very much)) cause 
very little load on the X server.  The exceptions to this are the 
various terminal emulators (e.g. xterm, gnome-terminal, etc.) when being 
used to run output intensive command line programs e.g. try "ls -lR /" 
in an xterm.  The other way (that I've noticed) X's CPU usage bandwidth 
sky rocket is when you grab a large window and wiggle it about a lot and 
hopefully this doesn't happen a lot so the problem that needs to be 
addressed is the one caused by text output on xterm and its ilk.


So I think that an elaborate scheme for distributing "points" between X 
and its clients would be overkill.  A good scheduler will make sure 
other tasks such as audio streamers get CPU when they need it with good 
responsiveness even when X takes off by giving them higher priority 
because their CPU bandwidth use is low.


The one problem that might still be apparent in these cases is the mouse 
becoming jerky while X is working like crazy to spew out text too fast 
for anyone to read.  But the only way to fix that is to give X more 
bandwidth but if it's already running at about 95% of a CPU that's 
unlikely to help.  To fix this you would probably need to modify X so 
that it knows re-rendering the cursor is more important than rendering 
text in an xterm.


In normal circumstances, the re-rendering of the mouse happens quickly 
enough for the user to experience good responsiveness because X's normal 
CPU use is low enough for it to be given high priority.


Just because the O(1) tried this model and failed doesn't mean that the 
model is bad.  O(1) was a flawed implementation of a good model.


Peter
PS Doing a kernel build in an xterm isn't an example of high enough 
output to cause a problem as (on my system) it only raises X's 
consumption from 0 to 2% to 2 to 5%.  The type of output that causes the 
problem is usually flying past too fast to read.

--
Peter Williams   [EMAIL PROTECTED]

"Learning, n. The kind of ignorance distinguishing the studious."
 -- Ambrose Bierce
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: 2.6.21-rc7: BUG: sleeping function called from invalid context at net/core/sock.c:1523

Jiri Kosina <[EMAIL PROTECTED]> wrote:
> 
> Hmm, *sigh*. I guess the patch below fixes the problem, but it is a 
> masterpiece in the field of ugliness. And I am not sure whether it is 
> completely correct either. Are there any immediate ideas for better 
> solution with respect to how struct sock locking works?

Please cc such patches to netdev.  Thanks.

> diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c
> index 71f5cfb..c5c93cd 100644
> --- a/net/bluetooth/hci_sock.c
> +++ b/net/bluetooth/hci_sock.c
> @@ -656,7 +656,10 @@ static int hci_sock_dev_event(struct notifier_block 
> *this, unsigned long event,
>/* Detach sockets from device */
>read_lock(&hci_sk_list.lock);
>sk_for_each(sk, node, &hci_sk_list.head) {
> -   lock_sock(sk);
> +   if (in_atomic())
> +   bh_lock_sock(sk);
> +   else
> +   lock_sock(sk);

This doesn't do what you think it does.  bh_lock_sock can still succeed
even with lock_sock held by someone else.

Does this need to occur immediately when an event occurs? If not I'd
suggest moving this into a workqueue.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <[EMAIL PROTECTED]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 23/25] xen: Lockdep fixes for xen-netfront

Jeremy Fitzhardinge <[EMAIL PROTECTED]> wrote:
>
> @@ -1212,10 +1212,10 @@ static int netif_poll(struct net_device 
>int pages_flipped = 0;
>int err;
> 
> -   spin_lock(&np->rx_lock);
> +   spin_lock_bh(&np->rx_lock);
> 
>if (unlikely(!netfront_carrier_ok(np))) {
> -   spin_unlock(&np->rx_lock);
> +   spin_unlock_bh(&np->rx_lock);

You don't need to disable BH in netif_poll since it's always called
with BH disabled.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <[EMAIL PROTECTED]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH] powerpc pseries eeh: Convert to kthread API

Benjamin Herrenschmidt <[EMAIL PROTECTED]> writes:

> Not sure... I can see places where I might want to spawn an arbitrary
> number of these without having to preallocate structures... and if I
> allocate on the fly, then I need a way to free that structure when the
> kthread is reaped which I don't think we have currently, do we ? (In
> fact, I could use that for other things too now that I'm thinking of
> it ... I might have a go at providing optional kthread destructors).

Well the basic problem is that for any piece of code that can be modular
we need a way to ensure all threads it has running are shutdown when we
remove the module.

Which means a fire and forget model however simple is unfortunately
the wrong thing.

Now we might be able to wrap this in some kind of manager construct,
so you don't have to manage each thread individually, but we still
have the problem of ensuring all of the threads exit when we terminate
the module.

Further in general it doesn't make sense to grab a module reference
and call that sufficient because we would like to request that the
module exits.

Eric
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH] lazy freeing of memory through MADV_FREE

On Mon, 23 Apr 2007 22:53:49 -0400 Rik van Riel <[EMAIL PROTECTED]> wrote:

> I don't see why we need the attached, but in case you find
> a good reason, here's my signed-off-by line for Andrew :)

Andrew is in a defensive crouch trying to work his way through all the bugs
he's been sent.  After I've managed to release 2.6.21-rc7-mm1 (say, December)
I expect I'll drop the MADV_FREE stuff, give you a run at creating a new
patch series.
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 10/10] mm: per device dirty threshold

On Friday April 20, [EMAIL PROTECTED] wrote:
> Scale writeback cache per backing device, proportional to its writeout speed.

So it works like this:

 We account for writeout in full pages.
 When a page has the Writeback flag cleared, we account that as a
 successfully retired write for the relevant bdi.
 By using floating averages we keep track of how many writes each bdi
 has retired 'recently' where the unit of time in which we understand
 'recently' is a single page written.

 We keep a floating average for each bdi, and a floating average for
 the total writeouts (that 'average' is, of course, 1.)

 Using these numbers we can calculate what fraction of 'recently'
 retired writes were retired by each bdi (get_writeout_scale).

 Multiplying this fraction by the system-wide number of pages that are
 allowed to be dirty before write-throttling, we get the number of
 pages that the bdi can have dirty before write-throttling the bdi.

 I note that the same fraction is *not* applied to background_thresh.
 Should it be?  I guess not - there would be interesting starting
 transients, as a bdi which had done no writeout would not be allowed
 any dirty pages, so background writeout would start immediately,
 which isn't what you want... or is it?

 For each bdi we also track the number of (dirty, writeback, unstable)
 pages and do not allow this to exceed the limit set for this bdi.

 The calculations involving 'reserve' in get_dirty_limits are a little
 confusing.  It looks like you are calculating how much total head-room
 there is for the bdi (pages that the system can still dirty - pages
 this bdi has dirty) and making sure the number returned in pbdi_dirty
 doesn't allow more than that to be used.  This is probably a
 reasonable thing to do but it doesn't feel like the right place.  I
 think get_dirty_limits should return the raw threshold, and
 balance_dirty_pages should do both tests - the bdi-local test and the
 system-wide test.

 Currently you have a rather odd situation where
+   if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh)
+   break;
 might include numbers obtained with bdi_stat_sum being compared with
 numbers obtained with bdi_stat.


 With these patches, the VM still (I think) assumes that each BDI has
 a reasonable queue limit, so that writeback_inodes will block on a
 full queue.  If a BDI has a very large queue, balance_dirty_pages
 will simply turn lots of DIRTY pages into WRITEBACK pages and then
 think "We've done our duty" without actually blocking at all.

 With the extra accounting that we now have, I would like to see
 balance_dirty_pages wait until RECLAIMABLE+WRITEBACK is
 actually less than 'threshold'.  This would probably mean that we
 would need to support per-bdi background_writeout to smooth things
 out.  Maybe that is fodder for another patch-set.

 You set:
+   vm_cycle_shift = 1 + ilog2(vm_total_pages);

 Can you explain that?  My experience is that scaling dirty limits
 with main memory isn't what we really want.  When you get machines
 with very large memory, the amount that you want to be dirty is more
 a function of the speed of your IO devices, rather than the amount
 of memory, otherwise you can sometimes see large filesystem lags
 ('sync' taking minutes?)

 I wonder if it makes sense to try to limit the dirty data for a bdi
 to the amount that it can write out in some period of time - maybe 3
 seconds.  Probably configurable.  You seem to have almost all the
 infrastructure in place to do that, and I think it could be a
 valuable feature.

 At least, I think vm_cycle_shift should be tied (loosely) to 
   dirty_ratio * vm_total_pages
 ??

On the whole, looks good!

Thanks,
NeilBrown
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: Remove open coded implementations of memclear_highpage flush

2007-04-23 Thread Satyam Sharma


On 4/24/07, Christoph Lameter <[EMAIL PROTECTED]> wrote:

On Tue, 24 Apr 2007, Satyam Sharma wrote:
> If I remember right, a very similar patchset was recently submitted
> that Andrew merged in -mm(?). It also renamed memclear_highpage_flush
> to something like zero_user_page (though I wonder how good a name that
> is considering it takes an offset and not the whole page) and
> deprecated the old name.

My latest tree from Andrew does not have any of this. URL of patch?


fs-deprecate-memclear_highpage_flush.patch (and friends, search for
zero_user_page) in
ftp://ftp.kernel.org/pub/linux/kernel/people/akpm/mm/broken-out-2007-04-11-02-24.tar.gz
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [mmc] alternative TI FM MMC/SD driver for 2.6.21-rc7

2007-04-23 Thread Alex Dubov

> 
> I am not in any way arguing that your driver architecture is wrong or that you
> should change anything. My point was simple. [tifm_sd] can only work with
> [tifm_7xx1]. If you add support for let's say [tifm_8xx2] in the future, which
> would have port offsets different that [tifm_7xx1], you would also need a
> completely new modules for slots (sd, ms, etc).
> 

Doesn't this constitute unbounded speculation? And then, what would you 
propose to do with
adapters that have SD support disabled? There are quite a few of those in the 
wild, as of right
now (SD support is provided by bundled SDHCI on such systems, if at all). 
Similar argument goes
for other media types as well - many controllers have xD support disabled too 
(I think you have
one of those - Sony really values its customers). After all, it is not healthy 
to have dead code
in the kernel.

On the other hand, if TI puts out a controller which is functionally identical, 
but has different
register map, it wouldn't be hard to refactor the code. 


__
Do You Yahoo!?
Tired of spam?  Yahoo! Mail has the best spam protection around 
http://mail.yahoo.com 
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH] lazy freeing of memory through MADV_FREE

2007-04-23 Thread Rik van Riel


Nick Piggin wrote:


What the tlb flush used to be able to assume is that the page
has been removed from the pagetables when they are put in the
tlb flush batch.


I think this is still the case, to a degree.  There should be
no harm in removing the TLB entries after the page table has
been unlocked, right?

Or is something like the attached really needed?

From what I can see, the page table lock should be enough
synchronization between unmap_mapping_range, MADV_FREE and
MADV_DONTNEED.

I don't see why we need the attached, but in case you find
a good reason, here's my signed-off-by line for Andrew :)

Signed-off-by: Rik van Riel <[EMAIL PROTECTED]>

--
Politics is the struggle between those who want to make their country
the best in the world, and those who believe it already is.  Each group
calls the other unpatriotic.
--- linux-2.6.20.x86_64/mm/memory.c.flushme	2007-04-23 22:26:06.0 -0400
+++ linux-2.6.20.x86_64/mm/memory.c	2007-04-23 22:42:06.0 -0400
@@ -628,6 +628,7 @@ static unsigned long zap_pte_range(struc
 long *zap_work, struct zap_details *details)
 {
 	struct mm_struct *mm = tlb->mm;
+	unsigned long start_addr = addr;
 	pte_t *pte;
 	spinlock_t *ptl;
 	int file_rss = 0;
@@ -726,6 +727,11 @@ static unsigned long zap_pte_range(struc
 
 	add_mm_rss(mm, file_rss, anon_rss);
 	arch_leave_lazy_mmu_mode();
+	if (details && details->madv_free) {
+		/* Protect against MADV_DONTNEED or unmap_mapping_range */
+		tlb_finish_mmu(tlb, start_addr, addr);
+		tlb = tlb_gather_mmu(mm, 0);
+	}
 	pte_unmap_unlock(pte - 1, ptl);
 
 	return addr;

Re: [PATCH] mm: PageLRU can be non-atomic bit operation

2007-04-23 Thread Nick Piggin

Hisashi Hifumi wrote:

At 22:42 07/04/23, Hugh Dickins wrote:
 >On Mon, 23 Apr 2007, Hisashi Hifumi wrote:
 >> >No.  The PG_lru flag bit is just one bit amongst many others:
 >> >what of concurrent operations changing other bits in that same
 >> >unsigned long e.g. trying to lock the page by setting PG_locked?
 >> >There are some places where such micro-optimizations can be made
 >> >(typically while first allocating the page); but in general, no.
 >>
 >> In i386 and x86_64, btsl is used to change page flag. In this case, if btsl
 >> without lock prefix
 >> set PG_locked and PG_lru flag concurrently, does only one operation
 >> succeed ?
 >
 >That's right: on an SMP machine, without the lock prefix, the operation
 >is no longer atomic: what's stored back may be missing the result of
 >one or the other of the racing operations.
 >

In the case that changing the same bit concurrently, lock prefix or other
spinlock is needed. But, I think that concurrent bit operation on 
different bits

is just like OR operation , so lock prefix is not needed.

AMD instruction manual says about bts that ,

"Copies a bit, specified by bit index in a register or 8-bit immediate 
value (second operand), from a bit
string (first operand), also called the bit base, to the carry flag (CF) 
of the rFLAGS register, and then

sets the bit in the bit string to 1."

BTS instruction is read-modify-write instruction on bit unit. So 
concurrent bit operation on different

bits may be possible.

No matter what actual instruction is used, the SetPageLRU operation (ie.
without the double underscore prefix) must be atomic, and the __SetPageLRU
operation *can* be non-atomic if that would be faster.

As Hugh points out, we must have atomic ops here, so changing the generic
code to use the __ version is wrong. However if there is a faster way that
i386 can perform the atomic variant, then doing so will speed up the generic
code without breaking other architectures.

--
SUSE Labs, Novell Inc.
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH] change kernel threads to ignore signals instead of blocking them

On Fri, 13 Apr 2007 11:31:16 +0400 Oleg Nesterov <[EMAIL PROTECTED]> wrote:

> On top of Eric's
> 
>   kthread-dont-depend-on-work-queues-take-2.patch
> 
> Currently kernel threads use sigprocmask(SIG_BLOCK) to protect against 
> signals.
> This doesn't prevent the signal delivery, this only blocks signal_wake_up().
> Every "killall -33 kthreadd" means a "struct siginfo" leak.
> 
> Change kthreadd_setup() to set all handlers to SIG_IGN instead of blocking 
> them
> (make a new helper ignore_signals() for that). If the kernel thread needs some
> signal, it should use allow_signal() anyway, and in that case it should not 
> use
> CLONE_SIGHAND.
> 
> Note that we can't change daemonize() (should die!) in the same way, because
> it can be used along with CLONE_SIGHAND. This means that allow_signal() still
> should unblock the signal to work correctly with daemonize()ed threads.
> 
> However, disallow_signal() doesn't block the signal any longer but ignores it.
> 
> NOTE: with or without this patch the kernel threads are not protected from
> handle_stop_signal(), this seems harmless, but not good.

I'm seeing 500 zombied instances of khelper (from udev startup).  It only
happens when the utrace patches are applied.  Presumably an interaction
between utrace and one of these kthread changes.

I'll drop utrace for now.  I don't think it's getting much help from being
in -mm at present and it's getting increasingly painful to keep it merged
against all the other stuff which is happening.

Roland, I'll squirt all the extra utrace patches which I have in your direction.
Please merge them or hang on to them for later on.
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH] powerpc pseries eeh: Convert to kthread API

On Mon, 2007-04-23 at 20:08 -0600, Eric W. Biederman wrote:
> Benjamin Herrenschmidt <[EMAIL PROTECTED]> writes:
> 
> >> The only reason for using threads here is to get the error recovery
> >> out of an interrupt context (where errors may be detected), and then,
> >> an hour later, decrement a counter (which is how we limit these to 
> >> 6 per hour). Thread reaping is "trivial", the thread just exits
> >> after an hour.
> >
> > In addition, it should be a thread and not done from within keventd
> > because :
> >
> >  - It can take a long time (well, relatively but still too long for a
> > work queue)
> >
> >  - The driver callbacks might need to use keventd or do flush_workqueue
> > to synchronize with their own workqueues when doing an internal
> > recovery.
> >
> >> Since these events are rare, I've no particular concern about
> >> performance or resource consumption. The current code seems 
> >> to work just fine. :-)
> >
> > I think moving to kthread's is cleaner (just a wrapper around kernel
> > threads that simplify dealing with reaping them out mostly) and I agree
> > with Christoph that it would be nice to be able to "fire off" kthreads
> > from interrupt context.. in many cases, we abuse work queues for things
> > that should really be done from kthreads instead (basically anything that
> > takes more than a couple hundred microsecs or so).
> 
> On that note does anyone have a problem if we manage the irq spawning
> safe kthreads the same way that we manage the work queue entries.
> 
> i.e. by a structure allocated by the caller?

Not sure... I can see places where I might want to spawn an arbitrary
number of these without having to preallocate structures... and if I
allocate on the fly, then I need a way to free that structure when the
kthread is reaped which I don't think we have currently, do we ? (In
fact, I could use that for other things too now that I'm thinking of
it ... I might have a go at providing optional kthread destructors).

Ben.


-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [ANNOUNCE][PATCH] Kcli - Kernel command line interface.

2007-04-23 Thread Satyam Sharma

Hi Matt,

On 4/24/07, Matt Ranon <[EMAIL PROTECTED]> wrote:

> The obvious question is: what's _wrong_ with doing all this in some
> cut-down userspace environment like busybox?  Why is this stuff better?
>
> Obviously some embedded developers have considered that option and
> have rejected it.  But we do need to be told, at length, why that
> decision was made.

There is nothing _wrong_ with doing it all in a cut-down userspace. It
is a matter of personal preference, culture, and the application. That
is what makes Linux so great, it is all about choice.

We are developing devices that don't have a user space, and we don't
see the point in including one just for debug purposes. We will not be
offended if Kcli is not included into the kernel mainline, nor if Kcli compels
people to call us stupid (as it already has) just because we are different
and some people don't understand us. We are firm believers that the
world, including the Linux kernel world,  would be a nasty place if there
was only _one_ way to do any given task. Additionally, we  are almost
certain that there will be others who think like we do, so we are reaching
out to them. We also feel compelled to give _something_ back to the
community that has given so much to us, and, for now, this is all we have.

I'm afraid you might've misunderstood the (rather caustic, sometimes)
general nature of comments on lkml :-) But I guess you only have
everything to gain if you use features that have been developed (and
are being *maintained* in the current kernel) that already do the kind
of stuff you want done.

You might have your reasons for being so anxious to avoid any
userspace at all, but quoting famous words, continuing to maintain
Kcli out-of-tree could soon turn out to be an act of
self-flagellation for you :-)
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH] mm: PageLRU can be non-atomic bit operation

2007-04-23 Thread KAMEZAWA Hiroyuki

On Tue, 24 Apr 2007 10:54:27 +0900
Hisashi Hifumi <[EMAIL PROTECTED]> wrote:
> In the case that changing the same bit concurrently, lock prefix or other
> spinlock is needed. But, I think that concurrent bit operation on different 
> bits
> is just like OR operation , so lock prefix is not needed.
> 
> AMD instruction manual says about bts that ,
> 
> "Copies a bit, specified by bit index in a register or 8-bit immediate 
> value (second operand), from a bit
> string (first operand), also called the bit base, to the carry flag (CF) of 
> the rFLAGS register, and then
> sets the bit in the bit string to 1."
> 
> BTS instruction is read-modify-write instruction on bit unit. So concurrent 
> bit operation on different
> bits may be possible.
> 
This is ia64's __set_bit() hehe..
==
static __inline__ void
__set_bit (int nr, volatile void *addr)
{
*((__u32 *) addr + (nr >> 5)) |= (1 << (nr & 31));
}
==

Bye.
-Kame

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: Remove open coded implementations of memclear_highpage flush

On Tue, 24 Apr 2007, Satyam Sharma wrote:

> On 4/24/07, Christoph Lameter <[EMAIL PROTECTED]> wrote:
> > There are a series of open coded reimplementation of memclear_highpage_flush
> > all over the page cache code. Call memclear_highpage_flush in those
> > locations.
> > Consolidates code and eases maintenance.
> 
> If I remember right, a very similar patchset was recently submitted
> that Andrew merged in -mm(?). It also renamed memclear_highpage_flush
> to something like zero_user_page (though I wonder how good a name that
> is considering it takes an offset and not the whole page) and
> deprecated the old name.

My latest tree from Andrew does not have any of this. URL of patch?
 
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: Remove open coded implementations of memclear_highpage flush

On Tue, 24 Apr 2007 07:49:45 +0530 "Satyam Sharma" <[EMAIL PROTECTED]> wrote:

> On 4/24/07, Christoph Lameter <[EMAIL PROTECTED]> wrote:
> > There are a series of open coded reimplementation of memclear_highpage_flush
> > all over the page cache code. Call memclear_highpage_flush in those 
> > locations.
> > Consolidates code and eases maintenance.
> 
> If I remember right, a very similar patchset was recently submitted
> that Andrew merged in -mm(?).

yup.

> It also renamed memclear_highpage_flush
> to something like zero_user_page (though I wonder how good a name that
> is considering it takes an offset and not the whole page)

It's not a great name, but the fact that you must provide it with `offset'
and `length' arguments rather clears up any confusion ;)

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: Remove open coded implementations of memclear_highpage flush

2007-04-23 Thread Satyam Sharma


On 4/24/07, Christoph Lameter <[EMAIL PROTECTED]> wrote:

There are a series of open coded reimplementation of memclear_highpage_flush
all over the page cache code. Call memclear_highpage_flush in those locations.
Consolidates code and eases maintenance.


If I remember right, a very similar patchset was recently submitted
that Andrew merged in -mm(?). It also renamed memclear_highpage_flush
to something like zero_user_page (though I wonder how good a name that
is considering it takes an offset and not the whole page) and
deprecated the old name.
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH] lazy freeing of memory through MADV_FREE

2007-04-23 Thread Nick Piggin


Rik van Riel wrote:

This should fix the MADV_FREE code for PPC's hashed tlb.

Signed-off-by: Rik van Riel <[EMAIL PROTECTED]>
---

Nick Piggin wrote:


Nick Piggin wrote:


3) because of this, we can treat any such accesses as
   happening simultaneously with the MADV_FREE and
   as illegal, aka undefined behaviour territory and
   we do not need to worry about them




Yes, but I'm wondering if it is legal in all architectures.




It's similar to trying to access memory during an munmap.

You may be able to for a short time, but it'll come back to
haunt you.



The question is whether the architecture specific tlb
flushing code will break or not.



I guess we'll need to call tlb_remove_tlb_entry() inside the
MADV_FREE code to keep powerpc happy.

Thanks for pointing this one out.


Even then we do.  Each invocation of zap_pte_range() only touches
one page table page, and it flushes the TLB before releasing the
page table lock.



What kernel are you looking at? -rc7 and rc6-mm1 don't, AFAIKS.



Oh dear.  I see it now...

The tlb end things inside zap_pte_range() are actually
noops and the actual tlb flush only happens inside
zap_page_range().

I guess the fact that munmap gets the mmap_sem for
writing should save us, though...


What about an unmap_mapping_range, or another MADV_FREE or
MADV_DONTNEED?






--- linux-2.6.20.x86_64/mm/memory.c.noppc   2007-04-23 21:50:09.0 
-0400
+++ linux-2.6.20.x86_64/mm/memory.c 2007-04-23 21:48:59.0 -0400
@@ -679,6 +679,7 @@ static unsigned long zap_pte_range(struc
}
ptep_test_and_clear_dirty(vma, addr, 
pte);
ptep_test_and_clear_young(vma, addr, 
pte);
+   tlb_remove_tlb_entry(tlb, pte, addr);
SetPageLazyFree(page);
if (PageActive(page))
deactivate_tail_page(page);



--
SUSE Labs, Novell Inc.
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH] powerpc pseries eeh: Convert to kthread API

Benjamin Herrenschmidt <[EMAIL PROTECTED]> writes:

>> The only reason for using threads here is to get the error recovery
>> out of an interrupt context (where errors may be detected), and then,
>> an hour later, decrement a counter (which is how we limit these to 
>> 6 per hour). Thread reaping is "trivial", the thread just exits
>> after an hour.
>
> In addition, it should be a thread and not done from within keventd
> because :
>
>  - It can take a long time (well, relatively but still too long for a
> work queue)
>
>  - The driver callbacks might need to use keventd or do flush_workqueue
> to synchronize with their own workqueues when doing an internal
> recovery.
>
>> Since these events are rare, I've no particular concern about
>> performance or resource consumption. The current code seems 
>> to work just fine. :-)
>
> I think moving to kthread's is cleaner (just a wrapper around kernel
> threads that simplify dealing with reaping them out mostly) and I agree
> with Christoph that it would be nice to be able to "fire off" kthreads
> from interrupt context.. in many cases, we abuse work queues for things
> that should really be done from kthreads instead (basically anything that
> takes more than a couple hundred microsecs or so).

On that note does anyone have a problem if we manage the irq spawning
safe kthreads the same way that we manage the work queue entries.

i.e. by a structure allocated by the caller?

Eric
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: AppArmor FAQ

2007-04-23 Thread Joshua Brindle


Crispin Cowan wrote:

David Wagner wrote:
  

James Morris  wrote:
  

[...] you can change the behavior of the application and then bypass 
policy entirely by utilizing any mechanism other than direct filesystem 
access: IPC, shared memory, Unix domain sockets, local IP networking, 
remote networking etc.

  

[...]
  


Just look at their code and their own description of AppArmor.

  

My gosh, you're right.  What the heck?  With all due respect to the
developers of AppArmor, I can't help thinking that that's pretty lame.
I think this raises substantial questions about the value of AppArmor.
What is the point of having a jail if it leaves gaping holes that
malicious code could use to escape?

And why isn't this documented clearly, with the implications fully
explained?

I would like to hear the AppArmor developers defend this design decision.
  


It was a simplicity trade off at the time, when AppArmor was mostly
aimed at servers, and there was no HAL or DBUS. Now it is definitely a
limitation that we are addressing. We are working on a mediation system
for what kind of IPC a confined process can do
http://forge.novell.com/pipermail/apparmor-dev/2007-April/000503.html

  
Except servers use IPC and need this access control as well. Without IPC 
and network restrictions you can't protect database servers, ldap 
servers, print servers, ssh agents, virus scanning servers, spam 
scanning servers, etc from attackers with knowledge of how to abuse the IPC.

When our IPC mediation system is code instead of vapor, it will also
appear here for review. Meanwhile, AppArmor does not make IPC security
any worse, confined processes are still subject to the usual Linux IPC
restrictions. AppArmor actually makes the IPC situation somewhat more
secure than stock Linux, e.g. normal DBUS deployment can be controlled
through file access permissions. But we are not claiming AppArmor to be
an IPC security enhancement, yet.
  
Without a security interface in DBUS similar to SELinux's, AppArmor won't 
be able to control who can talk to who across DBUS, only who can connect 
to DBUS directly.

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [REPORT] cfs-v4 vs sd-0.44

2007-04-23 Thread hui

On Mon, Apr 23, 2007 at 05:59:06PM -0700, Li, Tong N wrote:
> I don't know if we've discussed this or not. Since both CFS and SD claim
> to be fair, I'd like to hear more opinions on the fairness aspect of
> these designs. In areas such as OS, networking, and real-time, fairness,
> and its more general form, proportional fairness, are well-defined
> terms. In fact, perfect fairness is not feasible since it requires all
> runnable threads to be running simultaneously and scheduled with
> infinitesimally small quanta (like a fluid system). So to evaluate if a

Unfortunately, fairness is rather non-formal in this context and probably
isn't strictly desirable given how hack much of Linux userspace is. Until
there's a method of doing directed yields, like what Will has prescribed
a kind of allotment to thread doing work for another a completely strict
mechanism, it is probably problematic with regards to corner cases.

X for example is largely non-thread safe. Until they can get their xcb
framework in place and additional thread infrastructure to do hand off
properly, it's going to be difficult to schedule for it. It's well known to
be problematic.

You announced your scheduler without CCing any of the relevant people here
(and risk being completely ignored in lkml traffic):

http://lkml.org/lkml/2007/4/20/286

What is your opinion of both CFS and SDL ? How can your work be useful
to either scheduler mentioned or to the Linux kernel on its own ?

> I understand that via experiments we can show a design is reasonably
> fair in the common case, but IMHO, to claim that a design is fair, there
> needs to be some kind of formal analysis on the fairness bound, and this
> bound should be proven to be constant. Even if the bound is not
> constant, at least this analysis can help us better understand and
> predict the degree of fairness that users would experience (e.g., would
> the system be less fair if the number of threads increases? What happens
> if a large number of threads dynamically join and leave the system?).

Will has been thinking about this, but you have to also consider the
practicalities of your approach versus Con's and Ingo's.

I'm all for things like proportional scheduling and the extensions
needed to do it properly. It would be highly relevant to some version
of the -rt patch if not that patch directly.

bill

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH] lazy freeing of memory through MADV_FREE

2007-04-23 Thread Rik van Riel


This should fix the MADV_FREE code for PPC's hashed tlb.

Signed-off-by: Rik van Riel <[EMAIL PROTECTED]>
---

Nick Piggin wrote:

Nick Piggin wrote:


3) because of this, we can treat any such accesses as
   happening simultaneously with the MADV_FREE and
   as illegal, aka undefined behaviour territory and
   we do not need to worry about them



Yes, but I'm wondering if it is legal in all architectures.



It's similar to trying to access memory during an munmap.

You may be able to for a short time, but it'll come back to
haunt you.


The question is whether the architecture specific tlb
flushing code will break or not.


I guess we'll need to call tlb_remove_tlb_entry() inside the
MADV_FREE code to keep powerpc happy.

Thanks for pointing this one out.


Even then we do.  Each invocation of zap_pte_range() only touches
one page table page, and it flushes the TLB before releasing the
page table lock.


What kernel are you looking at? -rc7 and rc6-mm1 don't, AFAIKS.


Oh dear.  I see it now...

The tlb end things inside zap_pte_range() are actually
noops and the actual tlb flush only happens inside
zap_page_range().

I guess the fact that munmap gets the mmap_sem for
writing should save us, though...

--
Politics is the struggle between those who want to make their country
the best in the world, and those who believe it already is.  Each group
calls the other unpatriotic.
--- linux-2.6.20.x86_64/mm/memory.c.noppc	2007-04-23 21:50:09.0 -0400
+++ linux-2.6.20.x86_64/mm/memory.c	2007-04-23 21:48:59.0 -0400
@@ -679,6 +679,7 @@ static unsigned long zap_pte_range(struc
 	}
 	ptep_test_and_clear_dirty(vma, addr, pte);
 	ptep_test_and_clear_young(vma, addr, pte);
+	tlb_remove_tlb_entry(tlb, pte, addr);
 	SetPageLazyFree(page);
 	if (PageActive(page))
 		deactivate_tail_page(page);

Re: [PATCH] mm: PageLRU can be non-atomic bit operation

2007-04-23 Thread Hisashi Hifumi



At 22:42 07/04/23, Hugh Dickins wrote:
>On Mon, 23 Apr 2007, Hisashi Hifumi wrote:
>> >No.  The PG_lru flag bit is just one bit amongst many others:
>> >what of concurrent operations changing other bits in that same
>> >unsigned long e.g. trying to lock the page by setting PG_locked?
>> >There are some places where such micro-optimizations can be made
>> >(typically while first allocating the page); but in general, no.
>>
>> In i386 and x86_64, btsl is used to change page flag. In this case, if btsl
>> without lock prefix
>> set PG_locked and PG_lru flag concurrently, does only one operation
>> succeed ?
>
>That's right: on an SMP machine, without the lock prefix, the operation
>is no longer atomic: what's stored back may be missing the result of
>one or the other of the racing operations.
>

In the case that changing the same bit concurrently, lock prefix or other
spinlock is needed. But, I think that concurrent bit operation on different 
bits

is just like OR operation , so lock prefix is not needed.

AMD instruction manual says about bts that ,

"Copies a bit, specified by bit index in a register or 8-bit immediate 
value (second operand), from a bit
string (first operand), also called the bit base, to the carry flag (CF) of 
the rFLAGS register, and then

sets the bit in the bit string to 1."

BTS instruction is read-modify-write instruction on bit unit. So concurrent 
bit operation on different

bits may be possible.

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [report] renicing X, cfs-v5 vs sd-0.46

2007-04-23 Thread Gene Heskett

On Monday 23 April 2007, Niel Lambrechts wrote:
>Gene Heskett wrote:
>> This message prompted me to do some checking in re context switches
>> myself, and I've come to the conclusion that there could be a bug in
>> vmstat itself.
>
>Perhaps. perhaps not. :)
>
>> Run singly the context switching is reasonable even for a -19 niceness of
>> x, its only showing about 200 or so on the first loop of vmstat.  But
>> throw in the -n 1 arguments and it goes crazy on the second and subsequent
>> loops.
>
>man vmstat:
>"The first report produced gives averages since the last reboot.
>Additional reports  give information on a sampling period of length delay."

I missed that, concentrating on finding the method of telling it the delay I 
guess.

So then the next question is, over what period is that obviously lower figure 
being averaged over?  Certainly not over a 1 second period else it would then 
be much higher, as seen by the figures after the initial delay.  The time 
slice spec'd in /proc/sys/kernel/sched_granularity_ns, which here is 
currently 500 or 5 milliseconds?  If that was the case, the first answer 
would be in the area of 15, not 200.

So educate me, off list if you would like and have the time.

Thanks Niel.

-- 
Cheers, Gene
"There are four boxes to be used in defense of liberty:
 soap, ballot, jury, and ammo. Please use in that order."
-Ed Howdershelt (Author)
Sweet sixteen is beautiful Bess,
And her voice is changing -- from "No" to "Yes".
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

PROBLEM: Oops: 0002 [1] SMP

2007-04-23 Thread Thiago M.


[1] Summary:

Kernel Reports Oops: 0002 [1] SMP and the system becomes unstable

[2] Full Description:

Sometimes, randomly i get this Oops message and the system becomes
unstable. By unstable i mean all applications segmentation faults when i
execute (after the Oops). Sometimes X crashes, sometimes the machine
just reboots (the reboot might be other problem tho).

This happens with kernel 2.6.20 and with 2.6.21-rc7. Was happening with
2.6.20 so i tried 2.6.21-rc7 and this also happens.

[EMAIL PROTECTED]:/var/log$ uname -a
Linux sayao-desktop 2.6.21-rc7-sayao #2 SMP Mon Apr 16 22:11:36 BRT 2007
x86_64 GNU/Linux

Here is the log:

Apr 22 21:44:33 sayao-desktop kernel: [18641.553890] Unable to handle
kernel paging request at 3e82 RIP:
Apr 22 21:44:33 sayao-desktop kernel: [18641.553899]  [__alloc_skb
+188/321] __alloc_skb+0xbc/0x141
Apr 22 21:44:33 sayao-desktop kernel: [18641.553911] PGD 203027 PUD 0
Apr 22 21:44:33 sayao-desktop kernel: [18641.553915] Oops: 0002 [1] SMP
Apr 22 21:44:33 sayao-desktop kernel: [18641.553919] CPU 0
Apr 22 21:44:33 sayao-desktop kernel: [18641.553922] Modules linked in:
binfmt_misc rfcomm l2cap bluetooth i915 drm ppdev capability commoncap
acpi_cpufreq cpufreq_userspace cpufreq_stats cpufreq_conservative
cpufreq_ondemand cpufreq_powersave freq_table asus_acpi container sbs
i2c_ec i2c_core battery video dock ac button ipv6 lp fuse snd_hda_intel
snd_hda_codec snd_pcm_oss snd_mixer_oss snd_pcm snd_seq_dummy
snd_seq_oss snd_seq_midi snd_rawmidi snd_seq_midi_event snd_seq psmouse
snd_timer snd_seq_device snd parport_pc parport shpchp serio_raw pcspkr
soundcore snd_page_alloc iTCO_wdt iTCO_vendor_support pci_hotplug
intel_agp af_packet evdev tsdev ext3 jbd mbcache sg ide_cd cdrom sd_mod
ata_generic usbhid hid ata_piix libata scsi_mod e100 mii ehci_hcd
generic piix uhci_hcd usbcore thermal processor fan
Apr 22 21:44:33 sayao-desktop kernel: [18641.553997] Pid: 13805, comm:
evolution Not tainted 2.6.21-rc7-sayao #2
Apr 22 21:44:33 sayao-desktop kernel: [18641.554001] RIP:
0010:[__alloc_skb+188/321]  [__alloc_skb+188/321] __alloc_skb+0xbc/0x141
Apr 22 21:44:33 sayao-desktop kernel: [18641.554007] RSP:
0018:810033169bd8  EFLAGS: 00010246
Apr 22 21:44:33 sayao-desktop kernel: [18641.554011] RAX:
3e82 RBX: 0002 RCX: 
Apr 22 21:44:33 sayao-desktop kernel: [18641.554014] RDX:
 RSI:  RDI: 81003b1bfa50
Apr 22 21:44:33 sayao-desktop kernel: [18641.554017] RBP:
3e80 R08: 0002 R09: 
Apr 22 21:44:33 sayao-desktop kernel: [18641.554021] R10:
81003b1bf980 R11: 00d0 R12: 81003b1bf980
Apr 22 21:44:33 sayao-desktop kernel: [18641.554024] R13:
81003f2109c0 R14: 04d0 R15: 3e80
Apr 22 21:44:33 sayao-desktop kernel: [18641.554028] FS:
2ad334669ea0() GS:8052f000() knlGS:
Apr 22 21:44:33 sayao-desktop kernel: [18641.554032] CS:  0010 DS: 
ES:  CR0: 80050033
Apr 22 21:44:33 sayao-desktop kernel: [18641.554035] CR2:
3e82 CR3: 27bb3000 CR4: 06e0
Apr 22 21:44:33 sayao-desktop kernel: [18641.554039] Process evolution
(pid: 13805, threadinfo 810033168000, task 810009665000)
Apr 22 21:44:33 sayao-desktop kernel: [18641.554042] Stack:
09665000 81002f9f5080 3e80 
Apr 22 21:44:33 sayao-desktop kernel: [18641.554049]  04d0
810033169ce4 3e80 803a6d82
Apr 22 21:44:33 sayao-desktop kernel: [18641.554055]  
0206 80507110 81dadc50
Apr 22 21:44:33 sayao-desktop kernel: [18641.554061] Call Trace:
Apr 22 21:44:33 sayao-desktop kernel: [18641.554086]
[sock_alloc_send_skb+130/478] sock_alloc_send_skb+0x82/0x1de
Apr 22 21:44:33 sayao-desktop kernel: [18641.554126]
[unix_stream_sendmsg+392/880] unix_stream_sendmsg+0x188/0x370
Apr 22 21:44:33 sayao-desktop kernel: [18641.554181]  [sock_aio_write
+293/313] sock_aio_write+0x125/0x139
Apr 22 21:44:33 sayao-desktop kernel: [18641.554247]  [do_sync_write
+207/277] do_sync_write+0xcf/0x115
Apr 22 21:44:33 sayao-desktop kernel: [18641.554287]
[autoremove_wake_function+0/48] autoremove_wake_function+0x0/0x30
Apr 22 21:44:33 sayao-desktop kernel: [18641.554352]  [vfs_write
+228/348] vfs_write+0xe4/0x15c
Apr 22 21:44:33 sayao-desktop kernel: [18641.554369]  [sys_write+69/121]
sys_write+0x45/0x79
Apr 22 21:44:33 sayao-desktop kernel: [18641.554393]  [system_call
+126/131] system_call+0x7e/0x83
Apr 22 21:44:33 sayao-desktop kernel: [18641.554434] 
Apr 22 21:44:33 sayao-desktop kernel: [18641.554436] 
Apr 22 21:44:33 sayao-desktop kernel: [18641.554437] Code: c7 00 01 00
00 00 66 c7 40 04 00 00 66 c7 40 06 00 00 66 c7 
Apr 22 21:44:33 sayao-desktop kernel: [18641.554453] RIP  [__alloc_skb
+188/321] __alloc_skb+0xbc/0x141
Apr 22 21:44:33 sayao-desktop kernel: [18641.554458]  RSP

Apr 22 21:44:33

Re: [PATCH 22/25] xen: xen-netfront: use skb.cb for storing private data

On Mon, Apr 23, 2007 at 02:57:00PM -0700, Jeremy Fitzhardinge wrote:
> Netfront's use of nh.raw and h.raw for storing page+offset is a bit
> hinky, and it breaks with upcoming network stack updates which reduce
> these fields to sub-pointer sizes.  Fortunately, skb offers the "cb"
> field specifically for stashing this kind of info, so use it.
> 
> Signed-off-by: Jeremy Fitzhardinge <[EMAIL PROTECTED]>
> Cc: Herbert Xu <[EMAIL PROTECTED]>
> Cc: Chris Wright <[EMAIL PROTECTED]>
> Cc: Christian Limpach <[EMAIL PROTECTED]>

Thanks Jeremy.  The patch looks good.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <[EMAIL PROTECTED]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH] powerpc pseries eeh: Convert to kthread API


> The only reason for using threads here is to get the error recovery
> out of an interrupt context (where errors may be detected), and then,
> an hour later, decrement a counter (which is how we limit these to 
> 6 per hour). Thread reaping is "trivial", the thread just exits
> after an hour.

In addition, it should be a thread and not done from within keventd
because :

 - It can take a long time (well, relatively but still too long for a
work queue)

 - The driver callbacks might need to use keventd or do flush_workqueue
to synchronize with their own workqueues when doing an internal
recovery.

> Since these events are rare, I've no particular concern about
> performance or resource consumption. The current code seems 
> to work just fine. :-)

I think moving to kthread's is cleaner (just a wrapper around kernel
threads that simplify dealing with reaping them out mostly) and I agree
with Christoph that it would be nice to be able to "fire off" kthreads
from interrupt context.. in many cases, we abuse work queues for things
that should really be done from kthreads instead (basically anything that
takes more than a couple hundred microsecs or so).

Ben.


-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: ChunkFS - measuring cross-chunk references

2007-04-23 Thread Amit Gud

On Mon, 23 Apr 2007, Amit Gud wrote:

On Mon, 23 Apr 2007, Arjan van de Ven wrote:

>  The other thing which we should consider is that chunkfs really
>  requires a 64-bit inode number space, which means either we only allow

 does it?
 I'd think it needs a "chunk space" number and a 32 bit local inode
 number ;) (same for blocks)

For inodes, yes, either 64-bit inode or some field for the chunk id in which 
the inode is. But for block numbers, you don't. Because individual chunks 
manage part of the whole file system in an independent way. They have their 
block bitmaps starting at an offset. Inode bitmaps, however, remain the same.

In that sense, we also can do away without having chunk identifier encoded 
into inode number and chunkfs would still be fine with it. But we will 
then lose the inode uniqueness property, which could well be OK as it is with 
other file systems in which inode number is not sufficient for unique 
identification of an inode.

AG
--
May the source be with you.
http://www.cis.ksu.edu/~gud
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 03/25] xen: Add nosegneg capability to the vsyscall page notes

2007-04-23 Thread Jeremy Fitzhardinge

Roland McGrath wrote:
>> + * It should contain:
>> + *  hwcap 0 nosegneg
>> + * to match the mapping of bit to name that we give here.
>> 
>
> This needs to be "hwcap 1 nosegneg" to match:
>
>   
>> +NOTE_KERNELCAP_BEGIN(1, 2)
>> +NOTE_KERNELCAP(1, "nosegneg")
>> +NOTE_KERNELCAP_END
>> 
>
> The actual bits you are using should be fine.  (You're intentionally
> skipping bit 0 to work around old glibc bugs, which you might want to add
> to the comments.  Also a comment or perhaps using 1<<1 syntax would make it
> more clear that "2" is the bit mask containing bit 1 and that's why it has
> to be 2, and not because of some other magical property of 2.)  But if
> kernel packagers don't write the matching bit number in their ld.so.conf.d
> files, then ld.so.cache lookups won't work right.

I have to admit I still don't really understand all this.  Is it
documented somewhere?

What does "hwcap 0 nosegneg" actually mean?  What does the "0" mean here?

In the ELF note, what does the "nosegneg" string mean?  How is it used? 
Is it compared to the "nosegneg" in ld.so.conf?  How does this relate to
the bitfields?

Thanks,
J
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: Update the list information for kexec and kdump

Simon Horman <[EMAIL PROTECTED]> writes:

> On Mon, Apr 23, 2007 at 12:04:01PM -0600, Eric W. Biederman wrote:
>> Simon Horman <[EMAIL PROTECTED]> writes:
>> 
>> > Update the list information for kexec and kdump
>> >
>> > Signed-off-by: Simon Horman <[EMAIL PROTECTED]>
>> >
>> > --- 
>> > Is it too early for this change?
>> 
>> It looks like the new list is working, and isn't likely to get overwhelmed
>> with spam.  I don't know if everyone has switched over yet but we can
>> certainly update MAINTAINERS. 
>
> Last time I checked there were 28 people in the kexec@ list.
> This isn't everyone, but it is getting there.
>
> May I add an Acked-by you ?

Sure.

Eric
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 16/25] xen: Use the hvc console infrastructure for Xen console

2007-04-23 Thread Olof Johansson

On Mon, Apr 23, 2007 at 02:56:54PM -0700, Jeremy Fitzhardinge wrote:
> Implement a Xen back-end for hvc console.
> 
> From: Gerd Hoffmann <[EMAIL PROTECTED]>
> Signed-off-by: Jeremy Fitzhardinge <[EMAIL PROTECTED]>
> 
> ---
>  arch/i386/xen/Kconfig |1 
>  arch/i386/xen/events.c|3 -
>  drivers/Makefile  |3 +
>  drivers/xen/Makefile  |1 
>  drivers/xen/hvc-console.c |  134 
> +
>  include/xen/events.h  |1 
>  6 files changed, 142 insertions(+), 1 deletion(-)

If you move the driver to drivers/char/hvc_xen.c instead, you won't have to 
do...

> +#include "../char/hvc_console.h"

...this.

Other single-platform backend hvc drivers are under drivers/char already.


-Olof
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH try #2] Return access error not ECHILD on security_task_wait failure

2007-04-23 Thread James Morris

From: Roland McGrath <[EMAIL PROTECTED]>

wait* syscalls return -ECHILD even when an individual PID of a live child
was requested explicitly, when security_task_wait denies the operation.
This means that something like a broken SELinux policy can produce an
unexpected failure that looks just like a bug with wait or ptrace or
something.

This patch makes do_wait return -EACCES (or other appropriate
error returned from security_task_wait()) instead of -ECHILD if some
children were ruled out solely because security_task_wait failed.

Signed-off-by: James Morris <[EMAIL PROTECTED]>
---

Updated version, returns value from security_task_wait().


 kernel/exit.c |   17 +++--
 1 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/kernel/exit.c b/kernel/exit.c
index b55ed4c..9236924 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1033,6 +1033,8 @@ asmlinkage void sys_exit_group(int error_code)
 
 static int eligible_child(pid_t pid, int options, struct task_struct *p)
 {
+   int err;
+
if (pid > 0) {
if (p->pid != pid)
return 0;
@@ -1066,8 +1068,9 @@ static int eligible_child(pid_t pid, int options, struct 
task_struct *p)
if (delay_group_leader(p))
return 2;
 
-   if (security_task_wait(p))
-   return 0;
+   err = security_task_wait(p);
+   if (err)
+   return err;
 
return 1;
 }
@@ -1449,6 +1452,7 @@ static long do_wait(pid_t pid, int options, struct 
siginfo __user *infop,
DECLARE_WAITQUEUE(wait, current);
struct task_struct *tsk;
int flag, retval;
+   int allowed, denied;
 
add_wait_queue(>signal->wait_chldexit,);
 repeat:
@@ -1457,6 +1461,7 @@ repeat:
 * match our criteria, even if we are not able to reap it yet.
 */
flag = 0;
+   allowed = denied = 0;
current->state = TASK_INTERRUPTIBLE;
read_lock(_lock);
tsk = current;
@@ -1472,6 +1477,12 @@ repeat:
if (!ret)
continue;
 
+   if (unlikely(ret < 0)) {
+   denied = ret;
+   continue;
+   }
+   allowed = 1;
+
switch (p->state) {
case TASK_TRACED:
/*
@@ -1570,6 +1581,8 @@ check_continued:
goto repeat;
}
retval = -ECHILD;
+   if (unlikely(denied) && !allowed)
+   retval = denied;
 end:
current->state = TASK_RUNNING;
remove_wait_queue(>signal->wait_chldexit,);
-- 
1.5.0.6

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: Question about Reiser4

2007-04-23 Thread Theodore Tso

On Mon, Apr 23, 2007 at 05:31:29PM -0700, H. Peter Anvin wrote:
> Heh.  sys_read_tree() -- walk a directory tree and return it as a data 
> structure in memory :)

But maybe you don't want every single file in the directory, but some
subset of the files in the directory tree.  So before you know it:

sys_fs_sql("SELECT port,userid,daemon FROM /etc/inetd.conf.d "
"WHERE protocol=='tcp'", buf, sizeof(buf));

The question is where do you stop on the slippery slope, and is it
really all that harder than simply parsing a /etc/gitconfig or
/etc/e2fsck.conf file.  There are plenty of parsers or database
libraries already written, and many of them are quite efficient.  And
personally, I'd much rather edit a single /etc/gitconfig or
/etc/e2fsck.conf file using emacs than have to cd through 3 or 4
levels of directories to edit each 2-3 byte file one at a time.  But
to each their own

- Ted
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH] lazy freeing of memory through MADV_FREE

2007-04-23 Thread Nick Piggin


Rik van Riel wrote:

Use TLB batching for MADV_FREE.  Adds another 10-15% extra performance
to the MySQL sysbench results on my quad core system.

Signed-off-by: Rik van Riel <[EMAIL PROTECTED]>
---

Nick Piggin wrote:


3) because of this, we can treat any such accesses as
   happening simultaneously with the MADV_FREE and
   as illegal, aka undefined behaviour territory and
   we do not need to worry about them



Yes, but I'm wondering if it is legal in all architectures.



It's similar to trying to access memory during an munmap.

You may be able to for a short time, but it'll come back to
haunt you.


The question is whether the architecture specific tlb
flushing code will break or not.



4) because we flush the tlb before releasing the page
   table lock, other CPUs cannot remove this page from
   the address space - they will block on the page
   table lock before looking at this pte



We don't when the ptl is split.



Even then we do.  Each invocation of zap_pte_range() only touches
one page table page, and it flushes the TLB before releasing the
page table lock.


What kernel are you looking at? -rc7 and rc6-mm1 don't, AFAIKS.

--
SUSE Labs, Novell Inc.
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [ANNOUNCE][PATCH] Kcli - Kernel command line interface.

2007-04-23 Thread Matt Ranon

> (text reformatted to less than 80 cols.  Please, we'll get along a lot
> better if you don't send 1000-column emails)

Sorry. I am afraid we are from a different background, and so very
poorly versed in these things. My email client does not seem
to have an option to tell it to format in 80 cols. So, hopefully,
using CR, I am achieving the same effect. Let me know if
it doesn't work, and I will have to switch to a different email
client for conversing with the lkml.


> The obvious question is: what's _wrong_ with doing all this in some
> cut-down userspace environment like busybox?  Why is this stuff better?
> 
> Obviously some embedded developers have considered that option and
> have rejected it.  But we do need to be told, at length, why that
> decision was made.

There is nothing _wrong_ with doing it all in a cut-down userspace. It
is a matter of personal preference, culture, and the application. That
is what makes Linux so great, it is all about choice.

We are developing devices that don't have a user space, and we don't
see the point in including one just for debug purposes. We will not be 
offended if Kcli is not included into the kernel mainline, nor if Kcli compels
people to call us stupid (as it already has) just because we are different 
and some people don't understand us. We are firm believers that the 
world, including the Linux kernel world,  would be a nasty place if there 
was only _one_ way to do any given task. Additionally, we  are almost 
certain that there will be others who think like we do, so we are reaching 
out to them. We also feel compelled to give _something_ back to the 
community that has given so much to us, and, for now, this is all we have.

However, our reasons for Kcli are:
1) Our devices ship with no user space, and we want the development
environment to be as close as possible to the final product.
2) Getting debug information with user space calls require context 
switches and data copies, which changes the real time profile and can mask
bugs. 
3) To use user space, we would need cross compiled libc's, special
builds of gcc, root file systems, flash storage to store it all, and all 
sorts of things which make life a lot more complicated than it needs 
to be for us. We are quite capable of producing all these things, but,
we just don't see the point in it. Our way, we just have a gcc capable 
of cross compiling the kernel and it is so simple.
4) For us, it is the opposite argument. We would need to be convinced
that having user space is worth all the overhead. Not just CPU
overhead, but all the overheads.
5) We like it in the kernel, we find it to be warm and fuzzy. Whereas,
user space is a cold, dark, and rainy place, and we just don't want to
go there. :)

We do not claim to have come up with a _better_ way. We have just
created something that we feel would be useful to others.

MRanon.
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: SLUB: kmem_cache_destroy doesn't - version 2.

On Tue, 24 Apr 2007, Neil Brown wrote:

> kobject_set_name actually takes a format and arbitrary args and uses
> vsnprintf, so it has to make its own copy.

Ok then this should be fine...

SLAB: Fix sysfs directory handling

This fixes the problem that SLUB does not track the names of aliased
slabs by changing the way that SLUB manages the files in /sys/slab.

If the slab that is being operated on is not mergeable (usually the
case if we are debugging) then do not create any aliases. If an alias
exists that we conflict with then remove it before creating the
directory for the unmergeable slab. If there is a true slab cache there
and not an alias then we fail since there is a true duplication of
slab cache names. So debugging allows the detection of slab name
duplication as usual.

If the slab is mergeable then we create a directory with a unique name
created from the slab size, slab options and the pointer to the kmem_cache
structure (disambiguation). All names referring to the slabs will
then be created as symlinks to that unique name. These symlinks are
not going to be removed on kmem_cache_destroy() since we only carry
a counter for the number of aliases. If a new symlink is created
then it may just replace an existing one. This means that one can create
a gazillion slabs with the same name (if they all refer to mergeable
caches). It will only increase the alias count. So we have the potential
of not detecting duplicate slab names (there is actually no harm
done by doing that). We will detect the duplications as
soon as debugging is enabled because we will then no longer
generate symlinks and special unique names.

Signed-off-by: Christoph Lameter <[EMAIL PROTECTED]>

Index: linux-2.6.21-rc6/mm/slub.c
===
--- linux-2.6.21-rc6.orig/mm/slub.c 2007-04-23 13:08:41.0 -0700
+++ linux-2.6.21-rc6/mm/slub.c  2007-04-23 18:05:16.0 -0700
@@ -3307,16 +3307,68 @@ static struct kset_uevent_ops slab_ueven

 decl_subsys(slab, _ktype, _uevent_ops);

+#define ID_STR_LENGTH 64
+
+/* Create a unique string id for a slab cache:
+ * format
+ * :[flags-]size:[memory address of kmemcache]
+ */
+static char *create_unique_id(struct kmem_cache *s)
+{
+   char *name = kmalloc(ID_STR_LENGTH, GFP_KERNEL);
+   char *p = name;
+
+   BUG_ON(!name);
+
+   *p++ = ':';
+   /*
+* First flags affecting slabcache operations */
+   if (s->flags & SLAB_CACHE_DMA)
+   *p++ = 'd';
+   if (s->flags & SLAB_RECLAIM_ACCOUNT)
+   *p++ = 'a';
+   if (s->flags & SLAB_DESTROY_BY_RCU)
+   *p++ = 'r';\
+   /* Debug flags */
+   if (s->flags & SLAB_RED_ZONE)
+   *p++ = 'Z';
+   if (s->flags & SLAB_POISON)
+   *p++ = 'P';
+   if (s->flags & SLAB_STORE_USER)
+   *p++ = 'U';
+   if (p != name + 1)
+   *p++ = '-';
+   p += sprintf(p,"%07d:0x%p" ,s->size, s);
+   BUG_ON(p > name + ID_STR_LENGTH - 1);
+   return name;
+}
+
 static int sysfs_slab_add(struct kmem_cache *s)
 {
int err;
+   const char *name;

if (slab_state < SYSFS)
/* Defer until later */
return 0;

+   if (s->flags & SLUB_NEVER_MERGE) {
+   /*
+* Slabcache can never be merged so we can use the name proper.
+* This is typically the case for debug situations. In that
+* case we can catch duplicate names easily.
+*/
+   sysfs_remove_link(_subsys.kset.kobj, s->name);
+   name = s->name;
+   } else
+   /*
+* Create a unique name for the slab as a target
+* for the symlinks.
+*/
+   name = create_unique_id(s);
+
kobj_set_kset_s(s, slab_subsys);
-   kobject_set_name(>kobj, s->name);
+   kobject_set_name(>kobj, name);
kobject_init(>kobj);
err = kobject_add(>kobj);
if (err)
@@ -3326,6 +3378,10 @@ static int sysfs_slab_add(struct kmem_ca
if (err)
return err;
kobject_uevent(>kobj, KOBJ_ADD);
+   if (!(s->flags & SLUB_NEVER_MERGE)) {
+   sysfs_slab_alias(s, s->name);
+   kfree(name);
+   }
return 0;
 }

@@ -3351,9 +3407,14 @@ static int sysfs_slab_alias(struct kmem_
 {
struct saved_alias *al;

-   if (slab_state == SYSFS)
+   if (slab_state == SYSFS) {
+   /*
+* If we have a leftover link then remove it.
+*/
+   sysfs_remove_link(_subsys.kset.kobj, name);
return sysfs_create_link(_subsys.kset.kobj,
>kobj, name);
+   }

al = kmalloc(sizeof(struct saved_alias), GFP_KERNEL);
if (!al)
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body

RE: [REPORT] cfs-v4 vs sd-0.44

2007-04-23 Thread Li, Tong N

I don't know if we've discussed this or not. Since both CFS and SD claim
to be fair, I'd like to hear more opinions on the fairness aspect of
these designs. In areas such as OS, networking, and real-time, fairness,
and its more general form, proportional fairness, are well-defined
terms. In fact, perfect fairness is not feasible since it requires all
runnable threads to be running simultaneously and scheduled with
infinitesimally small quanta (like a fluid system). So to evaluate if a
new scheduling algorithm is fair, the common approach is to take the
ideal fair algorithm (often referred to as Generalized Processor
Scheduling or GPS) as a reference model and analyze if the new algorithm
can achieve a constant error bound (different error metrics also exist).
I understand that via experiments we can show a design is reasonably
fair in the common case, but IMHO, to claim that a design is fair, there
needs to be some kind of formal analysis on the fairness bound, and this
bound should be proven to be constant. Even if the bound is not
constant, at least this analysis can help us better understand and
predict the degree of fairness that users would experience (e.g., would
the system be less fair if the number of threads increases? What happens
if a large number of threads dynamically join and leave the system?).

  tong
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: AppArmor FAQ

2007-04-23 Thread Crispin Cowan

David Wagner wrote:
> James Morris  wrote:
>   
>> [...] you can change the behavior of the application and then bypass 
>> policy entirely by utilizing any mechanism other than direct filesystem 
>> access: IPC, shared memory, Unix domain sockets, local IP networking, 
>> remote networking etc.
>> 
> [...]
>   
>> Just look at their code and their own description of AppArmor.
>> 
> My gosh, you're right.  What the heck?  With all due respect to the
> developers of AppArmor, I can't help thinking that that's pretty lame.
> I think this raises substantial questions about the value of AppArmor.
> What is the point of having a jail if it leaves gaping holes that
> malicious code could use to escape?
>
> And why isn't this documented clearly, with the implications fully
> explained?
>
> I would like to hear the AppArmor developers defend this design decision.
>   
It was a simplicity trade off at the time, when AppArmor was mostly
aimed at servers, and there was no HAL or DBUS. Now it is definitely a
limitation that we are addressing. We are working on a mediation system
for what kind of IPC a confined process can do
http://forge.novell.com/pipermail/apparmor-dev/2007-April/000503.html

When our IPC mediation system is code instead of vapor, it will also
appear here for review. Meanwhile, AppArmor does not make IPC security
any worse, confined processes are still subject to the usual Linux IPC
restrictions. AppArmor actually makes the IPC situation somewhat more
secure than stock Linux, e.g. normal DBUS deployment can be controlled
through file access permissions. But we are not claiming AppArmor to be
an IPC security enhancement, yet.

The proposed set of patches is a self-contained access control system
for file system access, and we would like it reviewed as such. Current
AppArmor docs are quite explicit that AppArmor only mediates file access
and POSIX.1e capabilities.

Crispin

-- 
Crispin Cowan, Ph.D.   http://crispincowan.com/~crispin/
Director of Software Engineering   http://novell.com

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: Update the list information for kexec and kdump

2007-04-23 Thread Simon Horman

On Mon, Apr 23, 2007 at 12:04:01PM -0600, Eric W. Biederman wrote:
> Simon Horman <[EMAIL PROTECTED]> writes:
> 
> > Update the list information for kexec and kdump
> >
> > Signed-off-by: Simon Horman <[EMAIL PROTECTED]>
> >
> > --- 
> > Is it too early for this change?
> 
> It looks like the new list is working, and isn't likely to get overwhelmed
> with spam.  I don't know if everyone has switched over yet but we can
> certainly update MAINTAINERS. 

Last time I checked there were 28 people in the kexec@ list.
This isn't everyone, but it is getting there.

May I add an Acked-by you ?

> Eric
> 
> > Index: linux-2.6/MAINTAINERS
> > ===
> > --- linux-2.6.orig/MAINTAINERS  2007-04-23 17:34:30.0 +0900
> > +++ linux-2.6/MAINTAINERS   2007-04-23 17:34:47.0 +0900
> > @@ -1951,7 +1951,7 @@ P:Vivek Goyal
> >  M: [EMAIL PROTECTED]
> >  P: Haren Myneni
> >  M: [EMAIL PROTECTED]
> > -L: [EMAIL PROTECTED]
> > +L: [EMAIL PROTECTED]
> >  L: linux-kernel@vger.kernel.org
> >  W: http://lse.sourceforge.net/kdump/
> >  S: Maintained
> > @@ -2001,7 +2001,7 @@ P:Eric Biederman
> >  M: [EMAIL PROTECTED]
> >  W: http://www.xmission.com/~ebiederm/files/kexec/
> >  L: linux-kernel@vger.kernel.org
> > -L: [EMAIL PROTECTED]
> > +L: [EMAIL PROTECTED]
> >  S: Maintained
> >  
> >  KPROBES

-- 
Horms
  H: http://www.vergenet.net/~horms/
  W: http://www.valinux.co.jp/en/

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: SLUB: kmem_cache_destroy doesn't - version 2.

On Monday April 23, [EMAIL PROTECTED] wrote:
> On Tue, 24 Apr 2007, Neil Brown wrote:
> 
> > On Monday April 23, [EMAIL PROTECTED] wrote:
> > > Would this work? Contains a solution somewhat along the lines of your 
> > > thoughts on the subject.
> > > 
> > 
> > Concept seems sound.
> > Code needs a kfree of the name returned by create_unique_id, and I
> > think ID_STR_LENGTH needs to be at least 34.
> 
> Sysfs copies the string?

kobject_set_name copies the string, either into a small char array in
the kobject, or into kmalloced space.
kobject_set_name actually takes a format and arbitrary args and uses
vsnprintf, so it has to make its own copy.

NeilBrown
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: SLUB: kmem_cache_destroy doesn't - version 2.

On Tue, 24 Apr 2007, Neil Brown wrote:

> On Monday April 23, [EMAIL PROTECTED] wrote:
> > Would this work? Contains a solution somewhat along the lines of your 
> > thoughts on the subject.
> > 
> 
> Concept seems sound.
> Code needs a kfree of the name returned by create_unique_id, and I
> think ID_STR_LENGTH needs to be at least 34.

Sysfs copies the string?

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH] Return EPERM not ECHILD on security_task_wait failure

2007-04-23 Thread James Morris

On Mon, 23 Apr 2007, Roland McGrath wrote:

> As I said in some earlier discussion following my original patch, that
> would be fine with me.  I haven't coded up that variant, but it's simple
> enough.  Would you like to do it?

Sure.


-- 
James Morris
<[EMAIL PROTECTED]>
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: SLUB: kmem_cache_destroy doesn't - version 2.

On Monday April 23, [EMAIL PROTECTED] wrote:
> Would this work? Contains a solution somewhat along the lines of your 
> thoughts on the subject.
> 

Concept seems sound.
Code needs a kfree of the name returned by create_unique_id, and I
think ID_STR_LENGTH needs to be at least 34.
Maybe that should be allocated on the stack in sysfs_slab_add, rather
than using kmalloc/free.

Thanks,
NeilBrown
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: Question about Reiser4

2007-04-23 Thread H. Peter Anvin


Theodore Tso wrote:


Now, to be fair, there are probably a number of cases where 
open/lseek/readv/close and open/lseek/writev/close would be worth doing 
as a single system call.  The big problem as far as I can see involves 
EINTR handling; such a system call has serious restartability implications.


Sure, but Hans wants to change /etc/inetd.conf into /etc/inetd.conf.d,
where you have: /etc/inetd.conf.d/telnet/port,
/etc/inetd.conf.d/telnet/protocol, /etc/inetd.conf.d/telnet/wait,
/etc/inetd.conf.d/telnet/userid, /etc/inetd.conf.d/telnet/daemon,
etc. for each individual line in /etc/inetd.conf.  (And where each
file might only contain 2-4 characters each: i.e., "23", "tcp",
"root", etc.)

So it's not enough just to collapse open/pread/close into a single
system call; in order to gain back the performance squandered by all
of these itsy-bitsy tiny little files.  You want to collapse the
open/pread/close for many of these little files into a single system
call, hence Hans's insistence on sys_reiser4(); otherwise his scheme
doesn't work all that well at all.



Heh.  sys_read_tree() -- walk a directory tree and return it as a data 
structure in memory :)


-hpa
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH]Fix parsing kernelcore boot option for ia64

2007-04-23 Thread KAMEZAWA Hiroyuki

On Mon, 23 Apr 2007 19:32:46 +0100
[EMAIL PROTECTED] (Mel Gorman) wrote:

> > > I wasn't even aware of this kernelcore thing.  It's pretty nasty-looking. 
> > > yet another reminder that this code hasn't been properly reviewed in the
> > > past year or three.
> > 
> > Just now, I'm making memory-unplug patches with current MOVABLE_ZONE
> > code. So, I might be the first user of it on ia64.
> > 
> > Anyway, I'll try to fix it.
> > 
> 
> Can you review this patch and see does it fix the problem please? There
> was a second problem that showed up while testing this in relation to the
> bootmem allocator assumptions about zone boundary alignment. I'll follow up
> this mail with the patch in case you are seeing that problem.
> 
> Subject: Fix parsing kernelcore boot option V2
> cmdline_parse_kernelcore() should return the next pointer of boot option
> like memparse() doing. If not, it is cause of eternal loop on ia64 box.
> This patch is for 2.6.21-rc6-mm1. This patch changes the kernelcore command
> line parsing so that is compatible with both early_param() way of doing
> things and IA64.
> 
In my understanding, why ia64 doesn't use early_param() macro for mem= et al. 
is that 
it has to use mem= option at efi handling which is called before 
parse_early_param().

Current ia64's boot path is
 setup_arch()
-> efi handling -> parse_early_param() -> numa handling -> pgdat/zone init

kernelcore= option is just used at pgdat/zone initialization. (no arch 
dependent part...)

So I think just adding
==
early_param("kernelcore",cmdline_parse_kernelcore)
==
to ia64 is ok.

-Kame

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH] Return EPERM not ECHILD on security_task_wait failure

2007-04-23 Thread Roland McGrath

> On Thu, 15 Mar 2007, Roland McGrath wrote:
> 
> > This patch makes do_wait return -EPERM instead of -ECHILD if some
> > children were ruled out solely because security_task_wait failed.
> 
> What about using the return value from the security_task_wait hook (which 
> should be -EACCES) ?

As I said in some earlier discussion following my original patch, that
would be fine with me.  I haven't coded up that variant, but it's simple
enough.  Would you like to do it?


Thanks,
Roland
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: Question about Reiser4

2007-04-23 Thread H. Peter Anvin


Neil Brown wrote:


Or you could think outside the circle:
Store all your "small files" as symlinks, then use "symlink" to create
them and "readlink" to read them. (You would probably end up using
symlinkat and readlinkat).
Only one system call instead of three.
I guess you don't get meaningful permission bits then... I wonder if
that really matters.



For some applications, oh yes it does.

-hpa
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: Question about Reiser4

2007-04-23 Thread Theodore Tso

On Mon, Apr 23, 2007 at 04:53:03PM -0700, H. Peter Anvin wrote:
> Theodore Tso wrote:
> >
> >One of the big problems of using a filesystem as a DB is the system
> >call overheads.  If you use huge numbers of tiny files, then each
> >attempt to read an atom of information from the DB takes three system
> >calls --- an open(), read(), and close(), with all of the overheads in
> >terms of dentry and inode cache.
> >
> 
> Now, to be fair, there are probably a number of cases where 
> open/lseek/readv/close and open/lseek/writev/close would be worth doing 
> as a single system call.  The big problem as far as I can see involves 
> EINTR handling; such a system call has serious restartability implications.

Sure, but Hans wants to change /etc/inetd.conf into /etc/inetd.conf.d,
where you have: /etc/inetd.conf.d/telnet/port,
/etc/inetd.conf.d/telnet/protocol, /etc/inetd.conf.d/telnet/wait,
/etc/inetd.conf.d/telnet/userid, /etc/inetd.conf.d/telnet/daemon,
etc. for each individual line in /etc/inetd.conf.  (And where each
file might only contain 2-4 characters each: i.e., "23", "tcp",
"root", etc.)

So it's not enough just to collapse open/pread/close into a single
system call; in order to gain back the performance squandered by all
of these itsy-bitsy tiny little files.  You want to collapse the
open/pread/close for many of these little files into a single system
call, hence Hans's insistence on sys_reiser4(); otherwise his scheme
doesn't work all that well at all.

- Ted

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: Question about Reiser4

On Monday April 23, [EMAIL PROTECTED] wrote:
> Theodore Tso wrote:
> > 
> > One of the big problems of using a filesystem as a DB is the system
> > call overheads.  If you use huge numbers of tiny files, then each
> > attempt to read an atom of information from the DB takes three system
> > calls --- an open(), read(), and close(), with all of the overheads in
> > terms of dentry and inode cache.
> > 
> 
> Now, to be fair, there are probably a number of cases where 
> open/lseek/readv/close and open/lseek/writev/close would be worth doing 
> as a single system call.  The big problem as far as I can see involves 
> EINTR handling; such a system call has serious restartability implications.
> 
> Of course, there are Ingo's syslets...

Or you could think outside the circle:
Store all your "small files" as symlinks, then use "symlink" to create
them and "readlink" to read them. (You would probably end up using
symlinkat and readlinkat).
Only one system call instead of three.
I guess you don't get meaningful permission bits then... I wonder if
that really matters.

NeilBrown
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[patch 7/7] libata: send event when AN received

When we get an SDB FIS with the 'N' bit set, we should send
an event to user space to indicate that there has been a
media change.  This will be done via the block device. 

Signed-off-by: Kristen Carlson Accardi <[EMAIL PROTECTED]>
Index: 2.6-git/drivers/ata/ahci.c
===
--- 2.6-git.orig/drivers/ata/ahci.c
+++ 2.6-git/drivers/ata/ahci.c
@@ -1147,6 +1147,25 @@ static void ahci_host_intr(struct ata_po
return;
}
 
+   if (status & PORT_IRQ_SDB_FIS) {
+   /*
+* if this is an ATAPI device with AN turned on,
+* then we should interrogate the device to
+* determine the cause of the interrupt
+*
+* for AN - this we should check the SDB FIS
+* and find the I and N bits set
+*/
+   const u32 *f = pp->rx_fis + RX_FIS_SDB;
+
+   /* check the 'N' bit in word 0 of the FIS */
+   if (f[0] & (1 << 15)) {
+   int port_addr =  ((f[0] & 0x0f00) >> 8);
+   struct ata_device *adev = &ap->device[port_addr];
+   if (adev->flags & ATA_DFLAG_AN)
+   ata_scsi_media_change_notify(adev);
+   }
+   }
if (ap->sactive)
qc_active = readl(port_mmio + PORT_SCR_ACT);
else
Index: 2.6-git/include/linux/libata.h
===
--- 2.6-git.orig/include/linux/libata.h
+++ 2.6-git/include/linux/libata.h
@@ -737,6 +737,7 @@ extern void ata_host_init(struct ata_hos
 extern int ata_scsi_detect(struct scsi_host_template *sht);
 extern int ata_scsi_ioctl(struct scsi_device *dev, int cmd, void __user *arg);
 extern int ata_scsi_queuecmd(struct scsi_cmnd *cmd, void (*done)(struct 
scsi_cmnd *));
+extern void ata_scsi_media_change_notify(struct ata_device *atadev);
 extern void ata_sas_port_destroy(struct ata_port *);
 extern struct ata_port *ata_sas_port_alloc(struct ata_host *,
   struct ata_port_info *, struct 
Scsi_Host *);
Index: 2.6-git/drivers/ata/libata-scsi.c
===
--- 2.6-git.orig/drivers/ata/libata-scsi.c
+++ 2.6-git/drivers/ata/libata-scsi.c
@@ -3057,6 +3057,22 @@ static void ata_scsi_remove_dev(struct a
 }
 
 /**
+ * ata_scsi_media_change_notify - send media change event
+ * @atadev: Pointer to the disk device with media change event
+ *
+ * Tell the block layer to send a media change notification
+ * event.
+ *
+ * LOCKING:
+ * interrupt context, may not sleep.
+ */
+void ata_scsi_media_change_notify(struct ata_device *atadev)
+{
+   genhd_media_change_notify(atadev->sdev->disk);
+}
+EXPORT_SYMBOL_GPL(ata_scsi_media_change_notify);
+
+/**
  * ata_scsi_hotplug - SCSI part of hotplug
  * @work: Pointer to ATA port to perform SCSI hotplug on
  *

-- 
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[patch 2/7] genhd: expose AN to user space

Allow user space to determine if a disk supports Asynchronous Notification
of media changes.  This is done by adding a new sysfs file "capability_flags",
which is documented in Documentation/block/capability_flags.txt.  This sysfs file will export all
disk capabilities flags to user space.  We also define a new flag to define
the media change notification capability.

Signed-off-by: Kristen Carlson Accardi <[EMAIL PROTECTED]>

Index: 2.6-git/block/genhd.c
===
--- 2.6-git.orig/block/genhd.c
+++ 2.6-git/block/genhd.c
@@ -370,7 +370,10 @@ static ssize_t disk_size_read(struct gen
 {
return sprintf(page, "%llu\n", (unsigned long long)get_capacity(disk));
 }
-
+static ssize_t disk_capability_read(struct gendisk *disk, char *page)
+{
+   return sprintf(page, "%x\n", disk->flags);
+}
 static ssize_t disk_stats_read(struct gendisk * disk, char *page)
 {
preempt_disable();
@@ -413,6 +416,10 @@ static struct disk_attribute disk_attr_s
.attr = {.name = "size", .mode = S_IRUGO },
.show   = disk_size_read
 };
+static struct disk_attribute disk_attr_capability = {
+   .attr = {.name = "capability_flags", .mode = S_IRUGO },
+   .show   = disk_capability_read
+};
 static struct disk_attribute disk_attr_stat = {
.attr = {.name = "stat", .mode = S_IRUGO },
.show   = disk_stats_read
@@ -453,6 +460,7 @@ static struct attribute * default_attrs[
&disk_attr_removable.attr,
&disk_attr_size.attr,
&disk_attr_stat.attr,
+   &disk_attr_capability.attr,
 #ifdef CONFIG_FAIL_MAKE_REQUEST
_attr_fail.attr,
 #endif
Index: 2.6-git/include/linux/genhd.h
===
--- 2.6-git.orig/include/linux/genhd.h
+++ 2.6-git/include/linux/genhd.h
@@ -94,6 +94,7 @@ struct hd_struct {
 
 #define GENHD_FL_REMOVABLE 1
 #define GENHD_FL_DRIVERFS  2
+#define GENHD_FL_MEDIA_CHANGE_NOTIFY   4
 #define GENHD_FL_CD 8
 #define GENHD_FL_UP 16
 #define GENHD_FL_SUPPRESS_PARTITION_INFO   32
Index: 2.6-git/Documentation/block/capability_flags.txt
===
--- /dev/null
+++ 2.6-git/Documentation/block/capability_flags.txt
@@ -0,0 +1,15 @@
+Generic Block Device Capability Flags
+===
+This file documents the sysfs file block/<disk>/capability_flags
+
+capability_flags is a hex word indicating which capabilities a specific
+disk supports.  For more information on bits not listed here, see
+include/linux/genhd.h
+
+Capability Value
+---
+GENHD_FL_MEDIA_CHANGE_NOTIFY   4
+   When this bit is set, the disk supports Asynchronous Notification
+   of media change events.  These events will be broadcast to user
+   space via kernel uevent.
+

-- 
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[patch 6/7] SCSI: save disk in scsi_device

Give anyone who has access to scsi_device access to the genhd struct as well.

Signed-off-by: Kristen Carlson Accardi <[EMAIL PROTECTED]>
Index: 2.6-git/drivers/scsi/sd.c
===
--- 2.6-git.orig/drivers/scsi/sd.c
+++ 2.6-git/drivers/scsi/sd.c
@@ -1711,6 +1711,7 @@ static int sd_probe(struct device *dev)
 
dev_set_drvdata(dev, sdkp);
add_disk(gd);
+   sdp->disk = gd;
 
sdev_printk(KERN_NOTICE, sdp, "Attached scsi %sdisk %s\n",
sdp->removable ? "removable " : "", gd->disk_name);
Index: 2.6-git/drivers/scsi/sr.c
===
--- 2.6-git.orig/drivers/scsi/sr.c
+++ 2.6-git/drivers/scsi/sr.c
@@ -604,6 +604,7 @@ static int sr_probe(struct device *dev)
if (sdev->media_change_notify)
disk->flags |= GENHD_FL_MEDIA_CHANGE_NOTIFY;
add_disk(disk);
+   sdev->disk = disk;
 
sdev_printk(KERN_DEBUG, sdev,
"Attached scsi CD-ROM %s\n", cd->cdi.name);
Index: 2.6-git/include/scsi/scsi_device.h
===
--- 2.6-git.orig/include/scsi/scsi_device.h
+++ 2.6-git/include/scsi/scsi_device.h
@@ -138,7 +138,7 @@ struct scsi_device {
 
struct device   sdev_gendev;
struct class_device sdev_classdev;
-
+   struct gendisk  *disk;
struct execute_work ew; /* used to get process context on put */
 
enum scsi_device_state sdev_state;

-- 
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[patch 4/7] libata: expose AN to user space

If Asynchronous Notification of media change events is supported,
pass that information up to the SCSI layer.

Signed-off-by: Kristen Carlson Accardi <[EMAIL PROTECTED]>

Index: 2.6-git/drivers/ata/libata-scsi.c
===
--- 2.6-git.orig/drivers/ata/libata-scsi.c
+++ 2.6-git/drivers/ata/libata-scsi.c
@@ -899,6 +899,9 @@ static void ata_scsi_dev_config(struct s
blk_queue_max_hw_segments(q, q->max_hw_segments - 1);
}
 
+   if (dev->flags & ATA_DFLAG_AN)
+   sdev->media_change_notify = 1;
+
if (dev->flags & ATA_DFLAG_NCQ) {
int depth;
 

-- 
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[patch 0/7] Asynchronous Notification for ATAPI devices (v2)

This patch series implements Asynchronous Notification (AN) for SATA
ATAPI devices as defined in SATA 2.5 and AHCI 1.1 and higher.  Drives
which support this feature will send a notification when new media is
inserted and removed, preventing the need for user space to poll for
new media.  This support is exposed to user space via a flag that will
be set in /sys/block/sr*/capability_flags.  If the flag is set, user
space can disable polling for the new media, and the genhd driver will
send a KOBJ_CHANGE event with the envp set to MEDIA_CHANGE=1.

Note that this patch only implements support for directly attached
drives - AN with drives attached to a port multiplier requires 
additional changes.

Thanks!
Kristen

-- 
-- 
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[patch 3/7] scsi: expose AN to user space

Get media change notification capability from disk and pass this information
to genhd by setting appropriate flag.

Signed-off-by: Kristen Carlson Accardi <[EMAIL PROTECTED]>

Index: 2.6-git/drivers/scsi/sr.c
===
--- 2.6-git.orig/drivers/scsi/sr.c
+++ 2.6-git/drivers/scsi/sr.c
@@ -601,6 +601,8 @@ static int sr_probe(struct device *dev)
 
dev_set_drvdata(dev, cd);
disk->flags |= GENHD_FL_REMOVABLE;
+   if (sdev->media_change_notify)
+   disk->flags |= GENHD_FL_MEDIA_CHANGE_NOTIFY;
add_disk(disk);
 
sdev_printk(KERN_DEBUG, sdev,
Index: 2.6-git/include/scsi/scsi_device.h
===
--- 2.6-git.orig/include/scsi/scsi_device.h
+++ 2.6-git/include/scsi/scsi_device.h
@@ -124,7 +124,7 @@ struct scsi_device {
unsigned fix_capacity:1;/* READ_CAPACITY is too high by 1 */
unsigned guess_capacity:1;  /* READ_CAPACITY might be too high by 1 
*/
unsigned retry_hwerror:1;   /* Retry HARDWARE_ERROR */
-
+   unsigned media_change_notify:1; /* dev supports async media notify */
unsigned int device_blocked;/* Device returned QUEUE_FULL. */
 
unsigned int max_device_blocked; /* what device_blocked counts down 
from  */
Index: 2.6-git/drivers/scsi/sd.c
===
--- 2.6-git.orig/drivers/scsi/sd.c
+++ 2.6-git/drivers/scsi/sd.c
@@ -1706,6 +1706,9 @@ static int sd_probe(struct device *dev)
if (sdp->removable)
gd->flags |= GENHD_FL_REMOVABLE;
 
+   if (sdp->media_change_notify)
+   gd->flags |= GENHD_FL_MEDIA_CHANGE_NOTIFY;
+
dev_set_drvdata(dev, sdkp);
add_disk(gd);
 

-- 
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[patch 5/7] genhd: send async notification on media change

Send an uevent to user space to indicate that a media change event has occurred.

Signed-off-by: Kristen Carlson Accardi <[EMAIL PROTECTED]>

Index: 2.6-git/block/genhd.c
===
--- 2.6-git.orig/block/genhd.c
+++ 2.6-git/block/genhd.c
@@ -643,6 +643,25 @@ struct seq_operations diskstats_op = {
.show   = diskstats_show
 };
 
+static void media_change_notify_thread(struct work_struct *work)
+{
+   struct gendisk *gd = container_of(work, struct gendisk, async_notify);
+   char event[] = "MEDIA_CHANGE=1";
+   char *envp[] = { event, NULL };
+
+   /*
+* set environment vars to indicate which event this is for
+* so that user space will know to go check the media status.
+*/
+   kobject_uevent_env(&gd->kobj, KOBJ_CHANGE, envp);
+}
+
+void genhd_media_change_notify(struct gendisk *disk)
+{
+   schedule_work(&disk->async_notify);
+}
+EXPORT_SYMBOL_GPL(genhd_media_change_notify);
+
 struct gendisk *alloc_disk(int minors)
 {
return alloc_disk_node(minors, -1);
@@ -672,6 +691,8 @@ struct gendisk *alloc_disk_node(int mino
kobj_set_kset_s(disk,block_subsys);
kobject_init(&disk->kobj);
rand_initialize_disk(disk);
+   INIT_WORK(&disk->async_notify,
+   media_change_notify_thread);
}
return disk;
 }
Index: 2.6-git/include/linux/genhd.h
===
--- 2.6-git.orig/include/linux/genhd.h
+++ 2.6-git/include/linux/genhd.h
@@ -66,6 +66,7 @@ struct partition {
 #include 
 #include 
 #include 
+#include <linux/workqueue.h>
 
 struct partition {
unsigned char boot_ind; /* 0x80 - active */
@@ -139,6 +140,7 @@ struct gendisk {
 #else
struct disk_stats dkstats;
 #endif
+   struct work_struct async_notify;
 };
 
 /* Structure for sysfs attributes on block devices */
@@ -419,7 +421,7 @@ extern struct gendisk *alloc_disk_node(i
 extern struct gendisk *alloc_disk(int minors);
 extern struct kobject *get_disk(struct gendisk *disk);
 extern void put_disk(struct gendisk *disk);
-
+extern void genhd_media_change_notify(struct gendisk *disk);
 extern void blk_register_region(dev_t dev, unsigned long range,
struct module *module,
struct kobject *(*probe)(dev_t, int *, void *),

-- 
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[patch 1/7] libata: check for AN support