[PATCH 13/15] ptwalk: move p?d_none_or_clear_bad

2005-03-09 Thread Hugh Dickins
To handle large sparse areas a little more efficiently, follow Nick and
move the p?d_none_or_clear_bad tests up from the start of each function
to its callsite.

Signed-off-by: Hugh Dickins <[EMAIL PROTECTED]>
---

 mm/memory.c   |   24 
 mm/mprotect.c |   12 ++--
 mm/msync.c|   12 ++--
 mm/swapfile.c |   12 ++--
 mm/vmalloc.c  |   15 ++-
 5 files changed, 36 insertions(+), 39 deletions(-)

--- ptwalk12/mm/memory.c2005-03-09 01:39:06.0 +
+++ ptwalk13/mm/memory.c2005-03-09 01:39:18.0 +
@@ -113,8 +113,6 @@ void pmd_clear_bad(pmd_t *pmd)
 static inline void clear_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
unsigned long addr, unsigned long end)
 {
-   if (pmd_none_or_clear_bad(pmd))
-   return;
if (!((addr | end) & ~PMD_MASK)) {
/* Only free fully aligned ranges */
struct page *page = pmd_page(*pmd);
@@ -132,8 +130,6 @@ static inline void clear_pmd_range(struc
unsigned long next;
pmd_t *empty_pmd = NULL;
 
-   if (pud_none_or_clear_bad(pud))
-   return;
pmd = pmd_offset(pud, addr);
 
/* Only free fully aligned ranges */
@@ -141,6 +137,8 @@ static inline void clear_pmd_range(struc
empty_pmd = pmd;
do {
next = pmd_addr_end(addr, end);
+   if (pmd_none_or_clear_bad(pmd))
+   continue;
clear_pte_range(tlb, pmd, addr, next);
} while (pmd++, addr = next, addr != end);
 
@@ -157,8 +155,6 @@ static inline void clear_pud_range(struc
unsigned long next;
pud_t *empty_pud = NULL;
 
-   if (pgd_none_or_clear_bad(pgd))
-   return;
pud = pud_offset(pgd, addr);
 
/* Only free fully aligned ranges */
@@ -166,6 +162,8 @@ static inline void clear_pud_range(struc
empty_pud = pud;
do {
next = pud_addr_end(addr, end);
+   if (pud_none_or_clear_bad(pud))
+   continue;
clear_pmd_range(tlb, pud, addr, next);
} while (pud++, addr = next, addr != end);
 
@@ -189,6 +187,8 @@ void clear_page_range(struct mmu_gather 
pgd = pgd_offset(tlb->mm, addr);
do {
next = pgd_addr_end(addr, end);
+   if (pgd_none_or_clear_bad(pgd))
+   continue;
clear_pud_range(tlb, pgd, addr, next);
} while (pgd++, addr = next, addr != end);
 }
@@ -432,8 +432,6 @@ static void zap_pte_range(struct mmu_gat
 {
pte_t *pte;
 
-   if (pmd_none_or_clear_bad(pmd))
-   return;
pte = pte_offset_map(pmd, addr);
do {
pte_t ptent = *pte;
@@ -505,11 +503,11 @@ static void zap_pmd_range(struct mmu_gat
pmd_t *pmd;
unsigned long next;
 
-   if (pud_none_or_clear_bad(pud))
-   return;
pmd = pmd_offset(pud, addr);
do {
next = pmd_addr_end(addr, end);
+   if (pmd_none_or_clear_bad(pmd))
+   continue;
zap_pte_range(tlb, pmd, addr, next, details);
} while (pmd++, addr = next, addr != end);
 }
@@ -521,11 +519,11 @@ static void zap_pud_range(struct mmu_gat
pud_t *pud;
unsigned long next;
 
-   if (pgd_none_or_clear_bad(pgd))
-   return;
pud = pud_offset(pgd, addr);
do {
next = pud_addr_end(addr, end);
+   if (pud_none_or_clear_bad(pud))
+   continue;
zap_pmd_range(tlb, pud, addr, next, details);
} while (pud++, addr = next, addr != end);
 }
@@ -545,6 +543,8 @@ static void unmap_page_range(struct mmu_
pgd = pgd_offset(vma->vm_mm, addr);
do {
next = pgd_addr_end(addr, end);
+   if (pgd_none_or_clear_bad(pgd))
+   continue;
zap_pud_range(tlb, pgd, addr, next, details);
} while (pgd++, addr = next, addr != end);
tlb_end_vma(tlb, vma);
--- ptwalk12/mm/mprotect.c  2005-03-09 01:36:01.0 +
+++ ptwalk13/mm/mprotect.c  2005-03-09 01:39:18.0 +
@@ -30,8 +30,6 @@ static inline void change_pte_range(stru
 {
pte_t *pte;
 
-   if (pmd_none_or_clear_bad(pmd))
-   return;
pte = pte_offset_map(pmd, addr);
do {
if (pte_present(*pte)) {
@@ -54,11 +52,11 @@ static inline void change_pmd_range(stru
pmd_t *pmd;
unsigned long next;
 
-   if (pud_none_or_clear_bad(pud))
-   return;
pmd = pmd_offset(pud, addr);
do {
next = pmd_addr_end(addr, end);
+   if (pmd_none_or_clear_bad(pmd))
+   continue;
change_pte_range(mm, pmd, addr, next, newprot);
} while (pmd++, addr = 

Re: Linux 2.6.11-ac1

2005-03-09 Thread CaT
On Wed, Mar 09, 2005 at 05:43:02PM +0100, Bartlomiej Zolnierkiewicz wrote:
> On Wed, 09 Mar 2005 16:38:43 +, Alan Cox <[EMAIL PROTECTED]> wrote:
> > On Mer, 2005-03-09 at 16:26, Bartlomiej Zolnierkiewicz wrote:
> > > It can be merged if somebody fixes it to always force the controller into
> > > non-RAID mode and remove RAID mode support (which currently
> > > does nothing more besides complicating the driver and making special
> > > commands unusable).
> > 
> > Incorrect
> 
> Very helpful

Argh! Ok. I guess I shouldn't've just bought the card based on this
driver then so that I could better debug my problems with my promise
cards. 8(

-- 
Red herrings strewn hither and yon.
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


2/3 swsusp: use non-contiguous memory on ppc

2005-03-09 Thread Pavel Machek
Hi!

This patch contains the necessary changes to the assembly routines
etc. for ppc. It depends on the main resume part. It is Hu Gang's
patch.

Please apply,
Pavel

From: Hu Gang <[EMAIL PROTECTED]>
Signed-off-by: Rafael J. Wysocki <[EMAIL PROTECTED]>
Signed-off-by: Pavel Machek <[EMAIL PROTECTED]>

diff -Nru linux-2.6.11-a/arch/ppc/Kconfig linux-2.6.11-b/arch/ppc/Kconfig
--- linux-2.6.11-a/arch/ppc/Kconfig 2005-03-02 08:38:33.0 +0100
+++ linux-2.6.11-b/arch/ppc/Kconfig 2005-03-04 18:42:16.0 +0100
@@ -1046,6 +1046,8 @@
 
 source "drivers/zorro/Kconfig"
 
+source kernel/power/Kconfig
+
 endmenu
 
 menu "Bus options"
diff -Nru linux-2.6.11-a/arch/ppc/kernel/asm-offsets.c 
linux-2.6.11-b/arch/ppc/kernel/asm-offsets.c
--- linux-2.6.11-a/arch/ppc/kernel/asm-offsets.c2005-03-02 
08:38:09.0 +0100
+++ linux-2.6.11-b/arch/ppc/kernel/asm-offsets.c2005-03-04 
18:42:16.0 +0100
@@ -16,6 +16,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -136,6 +137,10 @@
DEFINE(TI_CPU, offsetof(struct thread_info, cpu));
DEFINE(TI_PREEMPT, offsetof(struct thread_info, preempt_count));
 
+   DEFINE(pbe_address, offsetof(struct pbe, address));
+   DEFINE(pbe_orig_address, offsetof(struct pbe, orig_address));
+   DEFINE(pbe_next, offsetof(struct pbe, next));
+
DEFINE(NUM_USER_SEGMENTS, TASK_SIZE>>28);
return 0;
 }
diff -Nru linux-2.6.11-a/arch/ppc/kernel/Makefile 
linux-2.6.11-b/arch/ppc/kernel/Makefile
--- linux-2.6.11-a/arch/ppc/kernel/Makefile 2005-03-02 08:38:25.0 
+0100
+++ linux-2.6.11-b/arch/ppc/kernel/Makefile 2005-03-04 18:42:16.0 
+0100
@@ -16,6 +16,7 @@
semaphore.o syscalls.o setup.o \
cputable.o ppc_htab.o perfmon.o
 obj-$(CONFIG_6xx)  += l2cr.o cpu_setup_6xx.o
+obj-$(CONFIG_SOFTWARE_SUSPEND) += swsusp.o
 obj-$(CONFIG_POWER4)   += cpu_setup_power4.o
 obj-$(CONFIG_MODULES)  += module.o ppc_ksyms.o
 obj-$(CONFIG_NOT_COHERENT_CACHE)   += dma-mapping.o
diff -Nru linux-2.6.11-a/arch/ppc/kernel/signal.c 
linux-2.6.11-b/arch/ppc/kernel/signal.c
--- linux-2.6.11-a/arch/ppc/kernel/signal.c 2005-03-02 08:38:33.0 
+0100
+++ linux-2.6.11-b/arch/ppc/kernel/signal.c 2005-03-04 18:42:16.0 
+0100
@@ -28,6 +28,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -704,6 +705,14 @@
unsigned long frame, newsp;
int signr, ret;
 
+   if (current->flags & PF_FREEZE) {
+   refrigerator(PF_FREEZE);
+   signr = 0;
+   ret = regs->gpr[3];
+   if (!signal_pending(current))
+   goto no_signal;
+   }
+
if (!oldset)
oldset = >blocked;
 
@@ -726,6 +735,7 @@
regs->gpr[3] = EINTR;
/* note that the cr0.SO bit is already set */
} else {
+no_signal:
regs->nip -= 4; /* Back up & retry system call */
regs->result = 0;
regs->trap = 0;
diff -Nru linux-2.6.11-a/arch/ppc/kernel/swsusp.S 
linux-2.6.11-b/arch/ppc/kernel/swsusp.S
--- linux-2.6.11-a/arch/ppc/kernel/swsusp.S 1970-01-01 01:00:00.0 
+0100
+++ linux-2.6.11-b/arch/ppc/kernel/swsusp.S 2005-03-04 18:42:16.0 
+0100
@@ -0,0 +1,349 @@
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+
+/*
+ * Structure for storing CPU registers on the save area.
+ */
+#define SL_SP  0
+#define SL_PC  4
+#define SL_MSR 8
+#define SL_SDR10xc
+#define SL_SPRG0   0x10/* 4 sprg's */
+#define SL_DBAT0   0x20
+#define SL_IBAT0   0x28
+#define SL_DBAT1   0x30
+#define SL_IBAT1   0x38
+#define SL_DBAT2   0x40
+#define SL_IBAT2   0x48
+#define SL_DBAT3   0x50
+#define SL_IBAT3   0x58
+#define SL_TB  0x60
+#define SL_R2  0x68
+#define SL_CR  0x6c
+#define SL_LR  0x70
+#define SL_R12 0x74/* r12 to r31 */
+#define SL_SIZE(SL_R12 + 80)
+
+   .section .data
+   .align  5
+
+_GLOBAL(swsusp_save_area)
+   .space  SL_SIZE
+
+
+   .section .text
+   .align  5
+
+_GLOBAL(swsusp_arch_suspend)
+
+   lis r11,[EMAIL PROTECTED]
+   ori r11,r11,[EMAIL PROTECTED]
+
+   mflrr0
+   stw r0,SL_LR(r11)
+   mfcrr0
+   stw r0,SL_CR(r11)
+   stw r1,SL_SP(r11)
+   stw r2,SL_R2(r11)
+   stmwr12,SL_R12(r11)
+
+   /* Save MSR & SDR1 */
+   mfmsr   r4
+   stw r4,SL_MSR(r11)
+   mfsdr1  r4
+   stw r4,SL_SDR1(r11)
+
+   /* Get a stable timebase and save it */
+1: mftbu   r4
+   stw r4,SL_TB(r11)
+  

[PATCH 11/15] ptwalk: copy_pte_range hang

2005-03-09 Thread Hugh Dickins
This patch is the odd-one-out of the sequence.  The one before adjusted
copy_pte_range from a for loop to a do while loop, and it was therefore
simplest to check for lockbreak before copying pte: but that leaves the
possibility that it might keep getting preempted without making progress
under some loads.

Some loads such as startup: 2*HT*P4 with preemption cannot even reach
multiuser login.  Suspect need_lockbreak is broken, can get in a state
when it remains forever true.  Investigate that later: for now, and for
all time, it makes sense to aim for a little progress before breaking
out; and we can manage more pte_nones than copies.

Signed-off-by: Hugh Dickins <[EMAIL PROTECTED]>
---

 mm/memory.c |   11 ---
 1 files changed, 8 insertions(+), 3 deletions(-)

--- ptwalk10/mm/memory.c2005-03-09 01:38:12.0 +
+++ ptwalk11/mm/memory.c2005-03-09 01:38:54.0 +
@@ -328,6 +328,7 @@ static int copy_pte_range(struct mm_stru
 {
pte_t *src_pte, *dst_pte;
unsigned long vm_flags = vma->vm_flags;
+   int progress;
 
 again:
dst_pte = pte_alloc_map(dst_mm, dst_pmd, addr);
@@ -335,19 +336,23 @@ again:
return -ENOMEM;
src_pte = pte_offset_map_nested(src_pmd, addr);
 
+   progress = 0;
spin_lock(_mm->page_table_lock);
do {
/*
 * We are holding two locks at this point - either of them
 * could generate latencies in another task on another CPU.
 */
-   if (need_resched() ||
+   if (progress >= 32 && (need_resched() ||
need_lockbreak(_mm->page_table_lock) ||
-   need_lockbreak(_mm->page_table_lock))
+   need_lockbreak(_mm->page_table_lock)))
break;
-   if (pte_none(*src_pte))
+   if (pte_none(*src_pte)) {
+   progress++;
continue;
+   }
copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vm_flags, addr);
+   progress += 8;
} while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
spin_unlock(_mm->page_table_lock);
 
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 12/15] ptwalk: clear_page_range

2005-03-09 Thread Hugh Dickins
Convert clear_page_range pagetable walkers to loops using p?d_addr_end.
These are exceptional in that some out-of-tree memory layouts might pass
end 0, so the macros need to handle that (though previous code did not).

The naming here was out of step: now we usually pass pmd_t *pmd down to
action_on_pte_range, not action_on_pmd_range, etc: made like the others.

Signed-off-by: Hugh Dickins <[EMAIL PROTECTED]>
---

 mm/memory.c |   98 
 1 files changed, 46 insertions(+), 52 deletions(-)

--- ptwalk11/mm/memory.c2005-03-09 01:38:54.0 +
+++ ptwalk12/mm/memory.c2005-03-09 01:39:06.0 +
@@ -110,15 +110,14 @@ void pmd_clear_bad(pmd_t *pmd)
  * Note: this doesn't free the actual pages themselves. That
  * has been handled earlier when unmapping all the memory regions.
  */
-static inline void clear_pmd_range(struct mmu_gather *tlb, pmd_t *pmd, 
unsigned long start, unsigned long end)
+static inline void clear_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
+   unsigned long addr, unsigned long end)
 {
-   struct page *page;
-
if (pmd_none_or_clear_bad(pmd))
return;
-   if (!((start | end) & ~PMD_MASK)) {
-   /* Only clear full, aligned ranges */
-   page = pmd_page(*pmd);
+   if (!((addr | end) & ~PMD_MASK)) {
+   /* Only free fully aligned ranges */
+   struct page *page = pmd_page(*pmd);
pmd_clear(pmd);
dec_page_state(nr_page_table_pages);
tlb->mm->nr_ptes--;
@@ -126,77 +125,72 @@ static inline void clear_pmd_range(struc
}
 }
 
-static inline void clear_pud_range(struct mmu_gather *tlb, pud_t *pud, 
unsigned long start, unsigned long end)
+static inline void clear_pmd_range(struct mmu_gather *tlb, pud_t *pud,
+   unsigned long addr, unsigned long end)
 {
-   unsigned long addr = start, next;
-   pmd_t *pmd, *__pmd;
+   pmd_t *pmd;
+   unsigned long next;
+   pmd_t *empty_pmd = NULL;
 
if (pud_none_or_clear_bad(pud))
return;
-   pmd = __pmd = pmd_offset(pud, start);
+   pmd = pmd_offset(pud, addr);
+
+   /* Only free fully aligned ranges */
+   if (!((addr | end) & ~PUD_MASK))
+   empty_pmd = pmd;
do {
-   next = (addr + PMD_SIZE) & PMD_MASK;
-   if (next > end || next <= addr)
-   next = end;
-   
-   clear_pmd_range(tlb, pmd, addr, next);
-   pmd++;
-   addr = next;
-   } while (addr && (addr < end));
+   next = pmd_addr_end(addr, end);
+   clear_pte_range(tlb, pmd, addr, next);
+   } while (pmd++, addr = next, addr != end);
 
-   if (!((start | end) & ~PUD_MASK)) {
-   /* Only clear full, aligned ranges */
+   if (empty_pmd) {
pud_clear(pud);
-   pmd_free_tlb(tlb, __pmd);
+   pmd_free_tlb(tlb, empty_pmd);
}
 }
 
-
-static inline void clear_pgd_range(struct mmu_gather *tlb, pgd_t *pgd, 
unsigned long start, unsigned long end)
+static inline void clear_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
+   unsigned long addr, unsigned long end)
 {
-   unsigned long addr = start, next;
-   pud_t *pud, *__pud;
+   pud_t *pud;
+   unsigned long next;
+   pud_t *empty_pud = NULL;
 
if (pgd_none_or_clear_bad(pgd))
return;
-   pud = __pud = pud_offset(pgd, start);
+   pud = pud_offset(pgd, addr);
+
+   /* Only free fully aligned ranges */
+   if (!((addr | end) & ~PGDIR_MASK))
+   empty_pud = pud;
do {
-   next = (addr + PUD_SIZE) & PUD_MASK;
-   if (next > end || next <= addr)
-   next = end;
-   
-   clear_pud_range(tlb, pud, addr, next);
-   pud++;
-   addr = next;
-   } while (addr && (addr < end));
+   next = pud_addr_end(addr, end);
+   clear_pmd_range(tlb, pud, addr, next);
+   } while (pud++, addr = next, addr != end);
 
-   if (!((start | end) & ~PGDIR_MASK)) {
-   /* Only clear full, aligned ranges */
+   if (empty_pud) {
pgd_clear(pgd);
-   pud_free_tlb(tlb, __pud);
+   pud_free_tlb(tlb, empty_pud);
}
 }
 
 /*
  * This function clears user-level page tables of a process.
- *
+ * Unlike other pagetable walks, some memory layouts might give end 0.
  * Must be called with pagetable lock held.
  */
-void clear_page_range(struct mmu_gather *tlb, unsigned long start, unsigned 
long end)
+void clear_page_range(struct mmu_gather *tlb,
+   unsigned long addr, unsigned long end)
 {
-   unsigned long addr = start, next;
-   pgd_t * pgd = 

Re: aio stress panic on 2.6.11-mm1

2005-03-09 Thread David Howells
Arjan van de Ven <[EMAIL PROTECTED]> wrote:

> On Wed, 2005-03-09 at 16:34 +0530, Suparna Bhattacharya wrote:
> > Any sense of how costly it is to use spin_lock_irq's vs spin_lock
> > (across different architectures) ? Isn't rwsem used very widely ?
> 
> oh also rwsems aren't used all that much simply because they are quite
> a bit more expensive than regular semaphores, so that you need a HUGE bias
> in reader/writer ratio to make it even worth it...

I can put some numbers to that. I've attached the module that I use for
destruct testing rwsems. I've extended it to splat ordinary semaphores too.

You run it by insmod'ing it. It will hammer the box for a few seconds, print a
summary to the console log and then return -ENOANO to cause insmod to abort
module addition.

You can give it a number of parameters:

mx=N        Number of mutex splatting threads to run
rd=N        Number of rwsem read-splatting threads to run
wr=N        Number of rwsem write-splatting threads to run
dg=N        Number of rwsem downgrader threads to run
elapse=N    Number of seconds to run for (default 5)
do_sched=1  Schedule occasionally
load=N  Number of microseconds of load
verbose=0   Print only a summary (as in the table below)
verbose=1   Print more information

So, some statistics for a dual 200MHz Pentium Pro box, running the test for
the default time:

MODULE PARAM  RESULTS
= ===
mx  rd  wr  dg  S ld  mutexes   reads writesdowngrade
=== === === === = === = = = =

With no load and without scheduling:

  1   0   0   0 -   0   7331475 0 0 0
  1   0   0   0 -   0   7465404 0 0 0
  1   0   0   0 -   0   7319429 0 0 0
  0   1   0   0 -   0 0   7743129 0 0
  0   1   0   0 -   0 0   7698473 0 0
  0   1   0   0 -   0 0   7614090 0 0
  0   0   1   0 -   0 0 0   7051591 0
  0   0   1   0 -   0 0 0   7027214 0
  0   0   1   0 -   0 0 0   7054375 0
  0   1   1   0 -   0 0119838106730 0
  0   1   1   0 -   0 0637862 96867 0
  0   1   1   0 -   0 0520168 89630 0

 10   0   0   0 -   0   1068401 0 0 0
 10   0   0   0 -   0   1035501 0 0 0
 10   0   0   0 -   0   1170587 0 0 0
  0  10   0   0 -   0 0   2865253 0 0
  0  10   0   0 -   0 0   298 0 0
  0  10   0   0 -   0 0   2969689 0 0
  0   0  10   0 -   0 0 0503357 0
  0   0  10   0 -   0 0 0657964 0
  0   0  10   0 -   0 0 0758048 0
  0  10  10   0 -   0 0382710117488 0
  0  10  10   0 -   0 0519159121845 0
  0  10  10   0 -   0 0639660103995 0
  0  10   1   0 -   0 0   2876112 0 0
  0  10   1   0 -   0 0   2954678 0 0
  0  10   1   0 -   0 0   1438340 37437 0

With no load and with occasional scheduling:

  0   1   1   0 s   0 0130326110929 0
  0   1   1   0 s   0 0135551 99816 0
  0   1   1   0 s   0 0136236117179 0

 10   0   0   0 s   0283945 0 0 0
 10   0   0   0 s   0253822 0 0 0
 10   0   0   0 s   0275887 0 0 0
  0  10   0   0 s   0 0   2398587 0 0
  0  10   0   0 s   0 0   2329326 0 0
  0  10   0   0 s   0 0   2326537 0 0
  0   0  10   0 s   0 0 0305368 0
  0   0  10   0 s   0 0 0277164 0
  0   0  10   0 s   0 0 0310533 0
  0  10  10   0 s   0 0155986156444 0
  0  10  10   0 s   0 0141763172333 0
  0  10  10   0 s   0 0157835130404 0
  0  10   1   0 s   0 0   2120867  4928 0
  0  10   1   0 s   0 0   2076650 11902 0
  0  10   1   0 s   0 0   2243057 16289 0

With a load of 2uS:

  1   0   0   0 -  

RE: Direct io on block device has performance regression on 2.6.x kernel

2005-03-09 Thread Chen, Kenneth W
Chen, Kenneth W wrote on Wednesday, March 09, 2005 1:59 PM
> > Did you generate a kernel profile?
>
> Top 40 kernel hot functions, percentage is normalized to kernel utilization.
>
> _spin_unlock_irqrestore   23.54%
> _spin_unlock_irq  19.27%
> 
>
> Profile with spin lock inlined, so that it is easier to see functions
> that have the lock contention, again top 40 hot functions:

Just to clarify here, these data need to be taken with a grain of salt. A
high count in _spin_unlock_* functions does not automatically point to
lock contention.  It's one of the blind-spot syndromes of timer-based
profiling on ia64.  There are some lock contentions in the 2.6 kernel that
we are staring at.  Please do not misinterpret the numbers here.

- Ken


-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 15/15] ptwalk: pud and pmd folded

2005-03-09 Thread Hugh Dickins
[PATCH 15/15] ptwalk: pud and pmd folded

Nick Piggin's patch to fold away most of the pud and pmd levels when not
required.  Adjusted to define minimal pud_addr_end (in the 4LEVEL_HACK
case too) and pmd_addr_end.  Responsible for half of the savings.

Signed-off-by: Hugh Dickins <[EMAIL PROTECTED]>
---

 include/asm-generic/4level-fixup.h  |4 
 include/asm-generic/pgtable-nopmd.h |5 +
 include/asm-generic/pgtable-nopud.h |5 +
 include/asm-generic/pgtable.h   |4 
 mm/memory.c |   34 --
 5 files changed, 26 insertions(+), 26 deletions(-)

--- ptwalk14/include/asm-generic/4level-fixup.h 2005-03-02 07:39:19.0 
+
+++ ptwalk15/include/asm-generic/4level-fixup.h 2005-03-09 01:39:43.0 
+
@@ -2,6 +2,7 @@
 #define _4LEVEL_FIXUP_H
 
 #define __ARCH_HAS_4LEVEL_HACK
+#define __PAGETABLE_PUD_FOLDED
 
 #define PUD_SIZE   PGDIR_SIZE
 #define PUD_MASK   PGDIR_MASK
@@ -31,4 +32,7 @@
 #define pud_free(x)do { } while (0)
 #define __pud_free_tlb(tlb, x) do { } while (0)
 
+#undef  pud_addr_end
+#define pud_addr_end(addr, end)(end)
+
 #endif
--- ptwalk14/include/asm-generic/pgtable-nopmd.h2005-03-02 
07:39:19.0 +
+++ ptwalk15/include/asm-generic/pgtable-nopmd.h2005-03-09 
01:39:43.0 +
@@ -5,6 +5,8 @@
 
 #include 
 
+#define __PAGETABLE_PMD_FOLDED
+
 /*
  * Having the pmd type consist of a pud gets the size right, and allows
  * us to conceptually access the pud entry that this pmd is folded into
@@ -55,6 +57,9 @@ static inline pmd_t * pmd_offset(pud_t *
 #define pmd_free(x)do { } while (0)
 #define __pmd_free_tlb(tlb, x) do { } while (0)
 
+#undef  pmd_addr_end
+#define pmd_addr_end(addr, end)(end)
+
 #endif /* __ASSEMBLY__ */
 
 #endif /* _PGTABLE_NOPMD_H */
--- ptwalk14/include/asm-generic/pgtable-nopud.h2005-03-02 
07:39:27.0 +
+++ ptwalk15/include/asm-generic/pgtable-nopud.h2005-03-09 
01:39:43.0 +
@@ -3,6 +3,8 @@
 
 #ifndef __ASSEMBLY__
 
+#define __PAGETABLE_PUD_FOLDED
+
 /*
  * Having the pud type consist of a pgd gets the size right, and allows
  * us to conceptually access the pgd entry that this pud is folded into
@@ -52,5 +54,8 @@ static inline pud_t * pud_offset(pgd_t *
 #define pud_free(x)do { } while (0)
 #define __pud_free_tlb(tlb, x) do { } while (0)
 
+#undef  pud_addr_end
+#define pud_addr_end(addr, end)(end)
+
 #endif /* __ASSEMBLY__ */
 #endif /* _PGTABLE_NOPUD_H */
--- ptwalk14/include/asm-generic/pgtable.h  2005-03-09 01:36:01.0 
+
+++ ptwalk15/include/asm-generic/pgtable.h  2005-03-09 01:39:43.0 
+
@@ -146,15 +146,19 @@ static inline void ptep_set_wrprotect(st
(__boundary - 1 < (end) - 1)? __boundary: (end);\
 })
 
+#ifndef pud_addr_end
 #define pud_addr_end(addr, end)
\
 ({ unsigned long __boundary = ((addr) + PUD_SIZE) & PUD_MASK;  \
(__boundary - 1 < (end) - 1)? __boundary: (end);\
 })
+#endif
 
+#ifndef pmd_addr_end
 #define pmd_addr_end(addr, end)
\
 ({ unsigned long __boundary = ((addr) + PMD_SIZE) & PMD_MASK;  \
(__boundary - 1 < (end) - 1)? __boundary: (end);\
 })
+#endif
 
 #ifndef __ASSEMBLY__
 /*
--- ptwalk14/mm/memory.c2005-03-09 01:39:31.0 +
+++ ptwalk15/mm/memory.c2005-03-09 01:39:43.0 +
@@ -1973,15 +1973,12 @@ int handle_mm_fault(struct mm_struct *mm
return VM_FAULT_OOM;
 }
 
-#ifndef __ARCH_HAS_4LEVEL_HACK
+#ifndef __PAGETABLE_PUD_FOLDED
 /*
  * Allocate page upper directory.
  *
  * We've already handled the fast-path in-line, and we own the
  * page table lock.
- *
- * On a two-level or three-level page table, this ends up actually being
- * entirely optimized away.
  */
 pud_t fastcall *__pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long 
address)
 {
@@ -2005,15 +2002,14 @@ pud_t fastcall *__pud_alloc(struct mm_st
  out:
return pud_offset(pgd, address);
 }
+#endif /* __PAGETABLE_PUD_FOLDED */
 
+#ifndef __PAGETABLE_PMD_FOLDED
 /*
  * Allocate page middle directory.
  *
  * We've already handled the fast-path in-line, and we own the
  * page table lock.
- *
- * On a two-level page table, this ends up actually being entirely
- * optimized away.
  */
 pmd_t fastcall *__pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long 
address)
 {
@@ -2029,38 +2025,24 @@ pmd_t fastcall *__pmd_alloc(struct mm_st
 * Because we dropped the lock, we should re-check the
 * entry, as somebody else could have populated it..
 */
+#ifndef __ARCH_HAS_4LEVEL_HACK
if (pud_present(*pud)) {
  

[PATCH 14/15] ptwalk: inline pmd_range and pud_range

2005-03-09 Thread Hugh Dickins
As a general rule, ask the compiler to inline action_on_pmd_range and
action_on_pud_range: none of them is very interesting, and it has a better
chance of eliding them that way.  But conversely, it helps debug traces
if action_on_pte_range and top action_on_page_range remain uninlined.

Signed-off-by: Hugh Dickins <[EMAIL PROTECTED]>
---

 mm/memory.c   |   10 +-
 mm/mprotect.c |2 +-
 mm/msync.c|4 ++--
 mm/swapfile.c |4 ++--
 mm/vmalloc.c  |   18 ++
 5 files changed, 20 insertions(+), 18 deletions(-)

--- ptwalk13/mm/memory.c2005-03-09 01:39:18.0 +
+++ ptwalk14/mm/memory.c2005-03-09 01:39:31.0 +
@@ -358,7 +358,7 @@ again:
return 0;
 }
 
-static int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
+static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct 
*src_mm,
pud_t *dst_pud, pud_t *src_pud, struct vm_area_struct *vma,
unsigned long addr, unsigned long end)
 {
@@ -380,7 +380,7 @@ static int copy_pmd_range(struct mm_stru
return 0;
 }
 
-static int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
+static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct 
*src_mm,
pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma,
unsigned long addr, unsigned long end)
 {
@@ -496,7 +496,7 @@ static void zap_pte_range(struct mmu_gat
pte_unmap(pte - 1);
 }
 
-static void zap_pmd_range(struct mmu_gather *tlb, pud_t *pud,
+static inline void zap_pmd_range(struct mmu_gather *tlb, pud_t *pud,
unsigned long addr, unsigned long end,
struct zap_details *details)
 {
@@ -512,7 +512,7 @@ static void zap_pmd_range(struct mmu_gat
} while (pmd++, addr = next, addr != end);
 }
 
-static void zap_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
+static inline void zap_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
unsigned long addr, unsigned long end,
struct zap_details *details)
 {
@@ -1013,7 +1013,7 @@ int zeromap_page_range(struct vm_area_st
  * mappings are removed. any references to nonexistent pages results
  * in null mappings (currently treated as "copy-on-access")
  */
-static inline int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
+static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
unsigned long addr, unsigned long end,
unsigned long pfn, pgprot_t prot)
 {
--- ptwalk13/mm/mprotect.c  2005-03-09 01:39:18.0 +
+++ ptwalk14/mm/mprotect.c  2005-03-09 01:39:31.0 +
@@ -25,7 +25,7 @@
 #include 
 #include 
 
-static inline void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
+static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
unsigned long addr, unsigned long end, pgprot_t newprot)
 {
pte_t *pte;
--- ptwalk13/mm/msync.c 2005-03-09 01:39:18.0 +
+++ ptwalk14/mm/msync.c 2005-03-09 01:39:31.0 +
@@ -105,7 +105,7 @@ static void sync_page_range(struct vm_ar
 }
 
 #ifdef CONFIG_PREEMPT
-static void filemap_sync(struct vm_area_struct *vma,
+static inline void filemap_sync(struct vm_area_struct *vma,
unsigned long addr, unsigned long end)
 {
const size_t chunk = 64 * 1024; /* bytes */
@@ -120,7 +120,7 @@ static void filemap_sync(struct vm_area_
} while (addr = next, addr != end);
 }
 #else
-static void filemap_sync(struct vm_area_struct *vma,
+static inline void filemap_sync(struct vm_area_struct *vma,
unsigned long addr, unsigned long end)
 {
sync_page_range(vma, addr, end);
--- ptwalk13/mm/swapfile.c  2005-03-09 01:39:18.0 +
+++ ptwalk14/mm/swapfile.c  2005-03-09 01:39:31.0 +
@@ -458,7 +458,7 @@ static int unuse_pte_range(struct vm_are
return 0;
 }
 
-static int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
+static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
unsigned long addr, unsigned long end,
swp_entry_t entry, struct page *page)
 {
@@ -476,7 +476,7 @@ static int unuse_pmd_range(struct vm_are
return 0;
 }
 
-static int unuse_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
+static inline int unuse_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
unsigned long addr, unsigned long end,
swp_entry_t entry, struct page *page)
 {
--- ptwalk13/mm/vmalloc.c   2005-03-09 01:39:18.0 +
+++ ptwalk14/mm/vmalloc.c   2005-03-09 01:39:31.0 +
@@ -34,7 +34,8 @@ static void vunmap_pte_range(pmd_t *pmd,
} while (pte++, addr += PAGE_SIZE, addr != end);
 }
 
-static void vunmap_pmd_range(pud_t *pud, 

Re: [PATCH] drm missing memset can crash X server...

2005-03-09 Thread Dave Airlie
>
> Could you please add Signed-off-by?  Do I read this patch correctly that
> it effectively disables the DRM_COPY in ->version callbacks?

I'll resend the patch now .. no it just zeros out the structure on the
stack so that the version callback doesn't get a garbage structure to copy
into...

Dave.


-- 
David Airlie, Software Engineer
http://www.skynet.ie/~airlied / airlied at skynet.ie
Linux kernel - DRI, VAX / pam_smb / ILUG

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 8/15] ptwalk: zeromap_page_range

2005-03-09 Thread Hugh Dickins
Convert zeromap_page_range pagetable walkers to loops using p?d_addr_end.
Remove the redundant flush_tlb_range from afterwards: as its comment
noted, there's already a BUG_ON(!pte_none).

Signed-off-by: Hugh Dickins <[EMAIL PROTECTED]>
---

 mm/memory.c |  143 ++--
 1 files changed, 54 insertions(+), 89 deletions(-)

--- ptwalk7/mm/memory.c 2005-03-09 01:37:02.0 +
+++ ptwalk8/mm/memory.c 2005-03-09 01:37:15.0 +
@@ -975,113 +975,78 @@ out:
 
 EXPORT_SYMBOL(get_user_pages);
 
-static void zeromap_pte_range(struct mm_struct *mm, pte_t * pte,
- unsigned long address,
- unsigned long size, pgprot_t prot)
-{
-   unsigned long base, end;
-
-   base = address & PMD_MASK;
-   address &= ~PMD_MASK;
-   end = address + size;
-   if (end > PMD_SIZE)
-   end = PMD_SIZE;
+static int zeromap_pte_range(struct mm_struct *mm, pmd_t *pmd,
+   unsigned long addr, unsigned long end, pgprot_t prot)
+{
+   pte_t *pte;
+
+   pte = pte_alloc_map(mm, pmd, addr);
+   if (!pte)
+   return -ENOMEM;
do {
-   pte_t zero_pte = pte_wrprotect(mk_pte(ZERO_PAGE(base+address), 
prot));
+   pte_t zero_pte = pte_wrprotect(mk_pte(ZERO_PAGE(addr), prot));
BUG_ON(!pte_none(*pte));
-   set_pte_at(mm, base+address, pte, zero_pte);
-   address += PAGE_SIZE;
-   pte++;
-   } while (address && (address < end));
-}
-
-static inline int zeromap_pmd_range(struct mm_struct *mm, pmd_t * pmd,
-   unsigned long address, unsigned long size, pgprot_t prot)
-{
-   unsigned long base, end;
-
-   base = address & PUD_MASK;
-   address &= ~PUD_MASK;
-   end = address + size;
-   if (end > PUD_SIZE)
-   end = PUD_SIZE;
+   set_pte_at(mm, addr, pte, zero_pte);
+   } while (pte++, addr += PAGE_SIZE, addr != end);
+   pte_unmap(pte - 1);
+   return 0;
+}
+
+static inline int zeromap_pmd_range(struct mm_struct *mm, pud_t *pud,
+   unsigned long addr, unsigned long end, pgprot_t prot)
+{
+   pmd_t *pmd;
+   unsigned long next;
+
+   pmd = pmd_alloc(mm, pud, addr);
+   if (!pmd)
+   return -ENOMEM;
do {
-   pte_t * pte = pte_alloc_map(mm, pmd, base + address);
-   if (!pte)
+   next = pmd_addr_end(addr, end);
+   if (zeromap_pte_range(mm, pmd, addr, next, prot))
return -ENOMEM;
-   zeromap_pte_range(mm, pte, base + address, end - address, prot);
-   pte_unmap(pte);
-   address = (address + PMD_SIZE) & PMD_MASK;
-   pmd++;
-   } while (address && (address < end));
+   } while (pmd++, addr = next, addr != end);
return 0;
 }
 
-static inline int zeromap_pud_range(struct mm_struct *mm, pud_t * pud,
-   unsigned long address,
-unsigned long size, pgprot_t prot)
-{
-   unsigned long base, end;
-   int error = 0;
-
-   base = address & PGDIR_MASK;
-   address &= ~PGDIR_MASK;
-   end = address + size;
-   if (end > PGDIR_SIZE)
-   end = PGDIR_SIZE;
+static inline int zeromap_pud_range(struct mm_struct *mm, pgd_t *pgd,
+   unsigned long addr, unsigned long end, pgprot_t prot)
+{
+   pud_t *pud;
+   unsigned long next;
+
+   pud = pud_alloc(mm, pgd, addr);
+   if (!pud)
+   return -ENOMEM;
do {
-   pmd_t * pmd = pmd_alloc(mm, pud, base + address);
-   error = -ENOMEM;
-   if (!pmd)
-   break;
-   error = zeromap_pmd_range(mm, pmd, base + address,
- end - address, prot);
-   if (error)
-   break;
-   address = (address + PUD_SIZE) & PUD_MASK;
-   pud++;
-   } while (address && (address < end));
+   next = pud_addr_end(addr, end);
+   if (zeromap_pmd_range(mm, pud, addr, next, prot))
+   return -ENOMEM;
+   } while (pud++, addr = next, addr != end);
return 0;
 }
 
-int zeromap_page_range(struct vm_area_struct *vma, unsigned long address,
-   unsigned long size, pgprot_t prot)
+int zeromap_page_range(struct vm_area_struct *vma,
+   unsigned long addr, unsigned long size, pgprot_t prot)
 {
-   int i;
-   int error = 0;
-   pgd_t * pgd;
-   unsigned long beg = address;
-   unsigned long end = address + size;
+   pgd_t *pgd;
unsigned long next;
+   unsigned long end = addr + size;
struct mm_struct *mm = vma->vm_mm;
+   int err;
 
-   pgd = pgd_offset(mm, 

[PATCH 9/15] ptwalk: unmap_page_range

2005-03-09 Thread Hugh Dickins
Convert unmap_page_range pagetable walkers to loops using p?d_addr_end.
Move blanking of irrelevant details up to unmap_page_range as Nick did.

Signed-off-by: Hugh Dickins <[EMAIL PROTECTED]>
---

 mm/memory.c |  119 ++--
 1 files changed, 53 insertions(+), 66 deletions(-)

--- ptwalk8/mm/memory.c 2005-03-09 01:37:15.0 +
+++ ptwalk9/mm/memory.c 2005-03-09 01:38:00.0 +
@@ -454,29 +454,22 @@ next_pgd:
return err;
 }
 
-static void zap_pte_range(struct mmu_gather *tlb,
-   pmd_t *pmd, unsigned long address,
-   unsigned long size, struct zap_details *details)
+static void zap_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
+   unsigned long addr, unsigned long end,
+   struct zap_details *details)
 {
-   unsigned long offset;
-   pte_t *ptep;
+   pte_t *pte;
 
if (pmd_none_or_clear_bad(pmd))
return;
-   ptep = pte_offset_map(pmd, address);
-   offset = address & ~PMD_MASK;
-   if (offset + size > PMD_SIZE)
-   size = PMD_SIZE - offset;
-   size &= PAGE_MASK;
-   if (details && !details->check_mapping && !details->nonlinear_vma)
-   details = NULL;
-   for (offset=0; offset < size; ptep++, offset += PAGE_SIZE) {
-   pte_t pte = *ptep;
-   if (pte_none(pte))
+   pte = pte_offset_map(pmd, addr);
+   do {
+   pte_t ptent = *pte;
+   if (pte_none(ptent))
continue;
-   if (pte_present(pte)) {
+   if (pte_present(ptent)) {
struct page *page = NULL;
-   unsigned long pfn = pte_pfn(pte);
+   unsigned long pfn = pte_pfn(ptent);
if (pfn_valid(pfn)) {
page = pfn_to_page(pfn);
if (PageReserved(page))
@@ -500,20 +493,20 @@ static void zap_pte_range(struct mmu_gat
 page->index > details->last_index))
continue;
}
-   pte = ptep_get_and_clear(tlb->mm, address+offset, ptep);
-   tlb_remove_tlb_entry(tlb, ptep, address+offset);
+   ptent = ptep_get_and_clear(tlb->mm, addr, pte);
+   tlb_remove_tlb_entry(tlb, pte, addr);
if (unlikely(!page))
continue;
if (unlikely(details) && details->nonlinear_vma
&& linear_page_index(details->nonlinear_vma,
-   address+offset) != page->index)
-   set_pte_at(tlb->mm, address+offset,
-  ptep, pgoff_to_pte(page->index));
-   if (pte_dirty(pte))
+   addr) != page->index)
+   set_pte_at(tlb->mm, addr, pte,
+  pgoff_to_pte(page->index));
+   if (pte_dirty(ptent))
set_page_dirty(page);
if (PageAnon(page))
tlb->mm->anon_rss--;
-   else if (pte_young(pte))
+   else if (pte_young(ptent))
mark_page_accessed(page);
tlb->freed++;
page_remove_rmap(page);
@@ -526,68 +519,62 @@ static void zap_pte_range(struct mmu_gat
 */
if (unlikely(details))
continue;
-   if (!pte_file(pte))
-   free_swap_and_cache(pte_to_swp_entry(pte));
-   pte_clear(tlb->mm, address+offset, ptep);
-   }
-   pte_unmap(ptep-1);
+   if (!pte_file(ptent))
+   free_swap_and_cache(pte_to_swp_entry(ptent));
+   pte_clear(tlb->mm, addr, pte);
+   } while (pte++, addr += PAGE_SIZE, addr != end);
+   pte_unmap(pte - 1);
 }
 
-static void zap_pmd_range(struct mmu_gather *tlb,
-   pud_t *pud, unsigned long address,
-   unsigned long size, struct zap_details *details)
+static void zap_pmd_range(struct mmu_gather *tlb, pud_t *pud,
+   unsigned long addr, unsigned long end,
+   struct zap_details *details)
 {
-   pmd_t * pmd;
-   unsigned long end;
+   pmd_t *pmd;
+   unsigned long next;
 
if (pud_none_or_clear_bad(pud))
return;
-   pmd = pmd_offset(pud, address);
-   end = address + size;
-   if (end > ((address + PUD_SIZE) & PUD_MASK))
-   end = ((address + PUD_SIZE) & PUD_MASK);
+   pmd = pmd_offset(pud, addr);

Re: [Ext2-devel] Re: inode cache, dentry cache, buffer heads usage

2005-03-09 Thread Sonny Rao
On Thu, Mar 10, 2005 at 03:23:49AM +0530, Dipankar Sarma wrote:
> On Wed, Mar 09, 2005 at 01:29:23PM -0800, Badari Pulavarty wrote:
> > On Wed, 2005-03-09 at 13:27, Dipankar Sarma wrote:
> > > On Wed, Mar 09, 2005 at 10:55:58AM -0800, Badari Pulavarty wrote:
> > > > Hi,
> > > > 
> > > > We have a 8-way P-III, 16GB RAM running 2.6.8-1. We use this as
> > > > our server to keep source code, cscopes and do the builds.
> > > > This machine seems to slow down over the time. One thing we
> > > > keep noticing is it keeps running out of lowmem. Most of 
> > > > the lowmem is used for ext3 inode cache + dentry cache +
> > > > bufferheads + Buffers. So we did a 2:2 split - it improved
> > > > things, but we again ran into the same issues.
> > > > 
> > > > So, why are these slab caches not getting purged/shrunk even
> > > > under memory pressure? (I have seen lowmem as low as 6MB.) What
> > > > can I do to keep the machine healthy?
> > > 
> > > How does /proc/sys/fs/dentry-state look when you run low on lowmem ?
> > 
> > 
> > 
> > [EMAIL PROTECTED]:~$ cat /proc/sys/fs/dentry-state
> > 1434093 1348947 45  0   0   0
> > [EMAIL PROTECTED]:~$ grep dentry /proc/slabinfo
> > dentry_cache  1434094 1857519144   271 : tunables  120  
> > 608 : slabdata  68797  68797  0
> 
> Hmm.. so we are not shrinking dcache despite a large number of
> unused dentries. That is where we need to look. Will dig a bit
> tomorrow.

Here's my really old patch where I saw some improvement for this scenario...

I haven't tried this in a really long time, so I have no idea if it'll
work :-) 


Sonny
--- fs/dcache.c.original2004-08-02 15:43:42.629539312 -0500
+++ fs/dcache.c 2004-08-03 18:16:45.007809144 -0500
@@ -31,6 +31,7 @@
 #include 
 #include 
 #include 
+#include 
 
 /* #define DCACHE_DEBUG 1 */
 
@@ -60,12 +61,61 @@ static unsigned int d_hash_mask;
 static unsigned int d_hash_shift;
 static struct hlist_head *dentry_hashtable;
 static LIST_HEAD(dentry_unused);
+static struct rb_root dentry_tree = RB_ROOT;
+
+#define RB_NONE (2)
+#define ON_RB(node)((node)->rb_color != RB_NONE)
+#define RB_CLEAR(node) ((node)->rb_color = RB_NONE )
 
 /* Statistics gathering. */
 struct dentry_stat_t dentry_stat = {
.age_limit = 45,
 };
 
+
+/* take a dentry safely off the rbtree */
+static void drb_delete(struct dentry* dentry)
+{
+  //  printk("drb_delete: 0x%p (%s) proc %d\n",dentry,dentry->d_iname,smp_processor_id());
+   if (ON_RB(&dentry->d_rb)) {
+   rb_erase(&dentry->d_rb, &dentry_tree);
+   RB_CLEAR(&dentry->d_rb);
+   } else {
+   /* All allocated dentry objs should be in the tree */
+   BUG_ON(1);
+   }
+}
+
+static  
+struct dentry * drb_insert(struct dentry * dentry)
+{
+   struct rb_node ** p = &dentry_tree.rb_node;
+   struct rb_node * parent = NULL;
+   struct rb_node * node= &dentry->d_rb;
+   struct dentry  * cur= NULL;
+
+   //  printk("drb_insert: 0x%p (%s)\n",dentry,dentry->d_iname);
+
+   while (*p)
+   {
+   parent = *p;
+   cur = rb_entry(parent, struct dentry, d_rb);
+
+   if (dentry < cur)
+   p = &(*p)->rb_left;
+   else if (dentry > cur)
+   p = &(*p)->rb_right;
+   else {
+   return cur;
+   }
+   }
+
+   rb_link_node(node, parent, p);
+   rb_insert_color(node,&dentry_tree);
+   return NULL;
+}
+
+
 static void d_callback(struct rcu_head *head)
 {
struct dentry * dentry = container_of(head, struct dentry, d_rcu);
@@ -189,6 +239,7 @@ kill_it: {
list_del(&dentry->d_child);
dentry_stat.nr_dentry--;/* For d_free, below */
/*drops the locks, at that point nobody can reach this dentry */
+   drb_delete(dentry);
dentry_iput(dentry);
parent = dentry->d_parent;
d_free(dentry);
@@ -351,6 +402,7 @@ static inline void prune_one_dentry(stru
__d_drop(dentry);
list_del(&dentry->d_child);
dentry_stat.nr_dentry--;/* For d_free, below */
+   drb_delete(dentry);
dentry_iput(dentry);
parent = dentry->d_parent;
d_free(dentry);
@@ -360,7 +412,7 @@ static inline void prune_one_dentry(stru
 }
 
 /**
- * prune_dcache - shrink the dcache
+ * prune_lru - shrink the lru list
  * @count: number of entries to try and free
  *
  * Shrink the dcache. This is done when we need
@@ -372,7 +424,7 @@ static inline void prune_one_dentry(stru
  * all the dentries are in use.
  */
  
-static void prune_dcache(int count)
+static void prune_lru(int count)
 {
spin_lock(&dcache_lock);
for (; count ; count--) {
@@ -410,6 +462,93 @@ static void prune_dcache(int count)
spin_unlock(&dcache_lock);
 }
 
+/**
+ * prune_dcache - try and "intelligently" shrink the dcache
+ * @requested - num of dentrys to try and free
+ *
+ * The basic strategy here is to scan 

[PATCH 7/15] ptwalk: remap_pfn_range

2005-03-09 Thread Hugh Dickins
Convert remap_pfn_range pagetable walkers to loops using p?d_addr_end.
Remove the redundant flush_tlb_range from afterwards: as its comment
noted, there's already a BUG_ON(!pte_none).

Signed-off-by: Hugh Dickins <[EMAIL PROTECTED]>
---

 mm/memory.c |  151 +++-
 1 files changed, 59 insertions(+), 92 deletions(-)

--- ptwalk6/mm/memory.c 2005-03-09 01:35:49.0 +
+++ ptwalk7/mm/memory.c 2005-03-09 01:37:02.0 +
@@ -1089,97 +1089,74 @@ int zeromap_page_range(struct vm_area_st
  * mappings are removed. any references to nonexistent pages results
  * in null mappings (currently treated as "copy-on-access")
  */
-static inline void
-remap_pte_range(struct mm_struct *mm, pte_t * pte,
-   unsigned long address, unsigned long size,
-   unsigned long pfn, pgprot_t prot)
-{
-   unsigned long base, end;
-
-   base = address & PMD_MASK;
-   address &= ~PMD_MASK;
-   end = address + size;
-   if (end > PMD_SIZE)
-   end = PMD_SIZE;
+static inline int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
+   unsigned long addr, unsigned long end,
+   unsigned long pfn, pgprot_t prot)
+{
+   pte_t *pte;
+
+   pte = pte_alloc_map(mm, pmd, addr);
+   if (!pte)
+   return -ENOMEM;
do {
BUG_ON(!pte_none(*pte));
if (!pfn_valid(pfn) || PageReserved(pfn_to_page(pfn)))
-   set_pte_at(mm, base+address, pte, pfn_pte(pfn, prot));
-   address += PAGE_SIZE;
+   set_pte_at(mm, addr, pte, pfn_pte(pfn, prot));
pfn++;
-   pte++;
-   } while (address && (address < end));
+   } while (pte++, addr += PAGE_SIZE, addr != end);
+   pte_unmap(pte - 1);
+   return 0;
 }
 
-static inline int
-remap_pmd_range(struct mm_struct *mm, pmd_t * pmd, unsigned long address,
-   unsigned long size, unsigned long pfn, pgprot_t prot)
-{
-   unsigned long base, end;
-
-   base = address & PUD_MASK;
-   address &= ~PUD_MASK;
-   end = address + size;
-   if (end > PUD_SIZE)
-   end = PUD_SIZE;
-   pfn -= (address >> PAGE_SHIFT);
+static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
+   unsigned long addr, unsigned long end,
+   unsigned long pfn, pgprot_t prot)
+{
+   pmd_t *pmd;
+   unsigned long next;
+
+   pfn -= addr >> PAGE_SHIFT;
+   pmd = pmd_alloc(mm, pud, addr);
+   if (!pmd)
+   return -ENOMEM;
do {
-   pte_t * pte = pte_alloc_map(mm, pmd, base + address);
-   if (!pte)
+   next = pmd_addr_end(addr, end);
+   if (remap_pte_range(mm, pmd, addr, next,
+   pfn + (addr >> PAGE_SHIFT), prot))
return -ENOMEM;
-   remap_pte_range(mm, pte, base + address, end - address,
-   (address >> PAGE_SHIFT) + pfn, prot);
-   pte_unmap(pte);
-   address = (address + PMD_SIZE) & PMD_MASK;
-   pmd++;
-   } while (address && (address < end));
+   } while (pmd++, addr = next, addr != end);
return 0;
 }
 
-static inline int remap_pud_range(struct mm_struct *mm, pud_t * pud,
- unsigned long address, unsigned long size,
- unsigned long pfn, pgprot_t prot)
-{
-   unsigned long base, end;
-   int error;
-
-   base = address & PGDIR_MASK;
-   address &= ~PGDIR_MASK;
-   end = address + size;
-   if (end > PGDIR_SIZE)
-   end = PGDIR_SIZE;
-   pfn -= address >> PAGE_SHIFT;
+static inline int remap_pud_range(struct mm_struct *mm, pgd_t *pgd,
+   unsigned long addr, unsigned long end,
+   unsigned long pfn, pgprot_t prot)
+{
+   pud_t *pud;
+   unsigned long next;
+
+   pfn -= addr >> PAGE_SHIFT;
+   pud = pud_alloc(mm, pgd, addr);
+   if (!pud)
+   return -ENOMEM;
do {
-   pmd_t *pmd = pmd_alloc(mm, pud, base+address);
-   error = -ENOMEM;
-   if (!pmd)
-   break;
-   error = remap_pmd_range(mm, pmd, base + address, end - address,
-   (address >> PAGE_SHIFT) + pfn, prot);
-   if (error)
-   break;
-   address = (address + PUD_SIZE) & PUD_MASK;
-   pud++;
-   } while (address && (address < end));
-   return error;
+   next = pud_addr_end(addr, end);
+   if (remap_pmd_range(mm, pud, addr, next,
+   pfn + (addr >> PAGE_SHIFT), prot))
+   return -ENOMEM;
+   } while (pud++, addr = next, addr != end);
+   return 0;
 

[PATCH 10/15] ptwalk: copy_page_range

2005-03-09 Thread Hugh Dickins
Convert copy_page_range pagetable walkers to loops using p?d_addr_end.
Merge copy_swap_pte into copy_one_pte, make a few minor tidyups.

Signed-off-by: Hugh Dickins <[EMAIL PROTECTED]>
---

 mm/memory.c |  141 
 1 files changed, 57 insertions(+), 84 deletions(-)

--- ptwalk9/mm/memory.c 2005-03-09 01:38:00.0 +
+++ ptwalk10/mm/memory.c2005-03-09 01:38:12.0 +
@@ -260,20 +260,7 @@ out:
  */
 
 static inline void
-copy_swap_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, pte_t pte)
-{
-   if (pte_file(pte))
-   return;
-   swap_duplicate(pte_to_swp_entry(pte));
-   if (list_empty(&dst_mm->mmlist)) {
-   spin_lock(&mmlist_lock);
-   list_add(&dst_mm->mmlist, &src_mm->mmlist);
-   spin_unlock(&mmlist_lock);
-   }
-}
-
-static inline void
-copy_one_pte(struct mm_struct *dst_mm,  struct mm_struct *src_mm,
+copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pte_t *dst_pte, pte_t *src_pte, unsigned long vm_flags,
unsigned long addr)
 {
@@ -281,12 +268,21 @@ copy_one_pte(struct mm_struct *dst_mm,  
struct page *page;
unsigned long pfn;
 
-   /* pte contains position in swap, so copy. */
-   if (!pte_present(pte)) {
-   copy_swap_pte(dst_mm, src_mm, pte);
+   /* pte contains position in swap or file, so copy. */
+   if (unlikely(!pte_present(pte))) {
+   if (!pte_file(pte)) {
+   swap_duplicate(pte_to_swp_entry(pte));
+   /* make sure dst_mm is on swapoff's mmlist. */
+   if (unlikely(list_empty(&dst_mm->mmlist))) {
+   spin_lock(&mmlist_lock);
+   list_add(&dst_mm->mmlist, &src_mm->mmlist);
+   spin_unlock(&mmlist_lock);
+   }
+   }
set_pte_at(dst_mm, addr, dst_pte, pte);
return;
}
+
pfn = pte_pfn(pte);
/* the pte points outside of valid memory, the
 * mapping is assumed to be good, meaningful
@@ -326,25 +322,21 @@ copy_one_pte(struct mm_struct *dst_mm,  
page_dup_rmap(page);
 }
 
-static int copy_pte_range(struct mm_struct *dst_mm,  struct mm_struct *src_mm,
+static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma,
unsigned long addr, unsigned long end)
 {
pte_t *src_pte, *dst_pte;
-   pte_t *s, *d;
unsigned long vm_flags = vma->vm_flags;
 
 again:
-   d = dst_pte = pte_alloc_map(dst_mm, dst_pmd, addr);
+   dst_pte = pte_alloc_map(dst_mm, dst_pmd, addr);
if (!dst_pte)
return -ENOMEM;
+   src_pte = pte_offset_map_nested(src_pmd, addr);
 
spin_lock(&src_mm->page_table_lock);
-   s = src_pte = pte_offset_map_nested(src_pmd, addr);
-   for (; addr < end; s++, d++) {
-   if (!pte_none(*s))
-   copy_one_pte(dst_mm, src_mm, d, s, vm_flags, addr);
-   addr += PAGE_SIZE;
+   do {
/*
 * We are holding two locks at this point - either of them
 * could generate latencies in another task on another CPU.
@@ -353,105 +345,86 @@ again:
need_lockbreak(&src_mm->page_table_lock) ||
need_lockbreak(&dst_mm->page_table_lock))
break;
-   }
-   pte_unmap_nested(src_pte);
-   pte_unmap(dst_pte);
+   if (pte_none(*src_pte))
+   continue;
+   copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vm_flags, addr);
+   } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
spin_unlock(&src_mm->page_table_lock);
 
+   pte_unmap_nested(src_pte - 1);
+   pte_unmap(dst_pte - 1);
cond_resched_lock(&dst_mm->page_table_lock);
-   if (addr < end)
+   if (addr != end)
goto again;
return 0;
 }
 
-static int copy_pmd_range(struct mm_struct *dst_mm,  struct mm_struct *src_mm,
+static int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pud_t *dst_pud, pud_t *src_pud, struct vm_area_struct *vma,
unsigned long addr, unsigned long end)
 {
pmd_t *src_pmd, *dst_pmd;
-   int err = 0;
unsigned long next;
 
-   src_pmd = pmd_offset(src_pud, addr);
dst_pmd = pmd_alloc(dst_mm, dst_pud, addr);
if (!dst_pmd)
return -ENOMEM;
-
-   for (; addr < end; addr = next, src_pmd++, dst_pmd++) {
-   next = (addr + PMD_SIZE) & PMD_MASK;
-   if (next > end || next <= addr)
-   next = end;
+   src_pmd = pmd_offset(src_pud, addr);
+   do {
+   next = pmd_addr_end(addr, end);
if 

[PATCH 1/15] ptwalk: p?d_none_or_clear_bad

2005-03-09 Thread Hugh Dickins
Replace the repetitive p?d_none, p?d_bad, p?d_ERROR, p?d_clear clauses
by pgd_none_or_clear_bad, pud_none_or_clear_bad, pmd_none_or_clear_bad
inlines throughout common and i386 - avoids a sprinkling of "unlikely"s.

Tests inline, but unlikely error handling in mm/memory.c - so the ERROR
file and line won't tell much; but it comes too late anyway, and hardly
ever seen outside development.

Let mremap use them in get_one_pte_map, as it already did in _nested;
but leave follow_page and untouched_anonymous_page just skipping _bad
as before - they don't have quite the same ownership of the mm.

Signed-off-by: Hugh Dickins <[EMAIL PROTECTED]>
---

 arch/i386/kernel/vm86.c   |   21 +
 include/asm-generic/pgtable.h |   44 
 mm/memory.c   |   89 +++---
 mm/mprotect.c |   21 +
 mm/mremap.c   |   24 +++
 mm/msync.c|   21 +
 mm/swapfile.c |   21 +
 mm/vmalloc.c  |   21 +
 8 files changed, 100 insertions(+), 162 deletions(-)

--- 2.6.11-bk5/arch/i386/kernel/vm86.c  2005-03-02 07:38:52.0 +
+++ ptwalk1/arch/i386/kernel/vm86.c 2005-03-09 01:35:49.0 +
@@ -145,29 +145,14 @@ static void mark_screen_rdonly(struct ta
preempt_disable();
spin_lock(&tsk->mm->page_table_lock);
pgd = pgd_offset(tsk->mm, 0xA0000);
-   if (pgd_none(*pgd))
+   if (pgd_none_or_clear_bad(pgd))
goto out;
-   if (pgd_bad(*pgd)) {
-   pgd_ERROR(*pgd);
-   pgd_clear(pgd);
-   goto out;
-   }
pud = pud_offset(pgd, 0xA0000);
-   if (pud_none(*pud))
-   goto out;
-   if (pud_bad(*pud)) {
-   pud_ERROR(*pud);
-   pud_clear(pud);
+   if (pud_none_or_clear_bad(pud))
goto out;
-   }
pmd = pmd_offset(pud, 0xA0000);
-   if (pmd_none(*pmd))
-   goto out;
-   if (pmd_bad(*pmd)) {
-   pmd_ERROR(*pmd);
-   pmd_clear(pmd);
+   if (pmd_none_or_clear_bad(pmd))
goto out;
-   }
pte = mapped = pte_offset_map(pmd, 0xA0000);
for (i = 0; i < 32; i++) {
if (pte_present(*pte))
--- 2.6.11-bk5/include/asm-generic/pgtable.h 2005-03-09 01:12:48.0 +
+++ ptwalk1/include/asm-generic/pgtable.h   2005-03-09 01:35:49.0 +
@@ -135,4 +135,48 @@ static inline void ptep_set_wrprotect(st
 #define pgd_offset_gate(mm, addr)  pgd_offset(mm, addr)
 #endif
 
+#ifndef __ASSEMBLY__
+/*
+ * When walking page tables, we usually want to skip any p?d_none entries;
+ * and any p?d_bad entries - reporting the error before resetting to none.
+ * Do the tests inline, but report and clear the bad entry in mm/memory.c.
+ */
+void pgd_clear_bad(pgd_t *);
+void pud_clear_bad(pud_t *);
+void pmd_clear_bad(pmd_t *);
+
+static inline int pgd_none_or_clear_bad(pgd_t *pgd)
+{
+   if (pgd_none(*pgd))
+   return 1;
+   if (unlikely(pgd_bad(*pgd))) {
+   pgd_clear_bad(pgd);
+   return 1;
+   }
+   return 0;
+}
+
+static inline int pud_none_or_clear_bad(pud_t *pud)
+{
+   if (pud_none(*pud))
+   return 1;
+   if (unlikely(pud_bad(*pud))) {
+   pud_clear_bad(pud);
+   return 1;
+   }
+   return 0;
+}
+
+static inline int pmd_none_or_clear_bad(pmd_t *pmd)
+{
+   if (pmd_none(*pmd))
+   return 1;
+   if (unlikely(pmd_bad(*pmd))) {
+   pmd_clear_bad(pmd);
+   return 1;
+   }
+   return 0;
+}
+#endif /* !__ASSEMBLY__ */
+
 #endif /* _ASM_GENERIC_PGTABLE_H */
--- 2.6.11-bk5/mm/memory.c  2005-03-09 01:12:53.0 +
+++ ptwalk1/mm/memory.c 2005-03-09 01:35:49.0 +
@@ -83,6 +83,30 @@ EXPORT_SYMBOL(high_memory);
 EXPORT_SYMBOL(vmalloc_earlyreserve);
 
 /*
+ * If a p?d_bad entry is found while walking page tables, report
+ * the error, before resetting entry to p?d_none.  Usually (but
+ * very seldom) called out from the p?d_none_or_clear_bad macros.
+ */
+
+void pgd_clear_bad(pgd_t *pgd)
+{
+   pgd_ERROR(*pgd);
+   pgd_clear(pgd);
+}
+
+void pud_clear_bad(pud_t *pud)
+{
+   pud_ERROR(*pud);
+   pud_clear(pud);
+}
+
+void pmd_clear_bad(pmd_t *pmd)
+{
+   pmd_ERROR(*pmd);
+   pmd_clear(pmd);
+}
+
+/*
  * Note: this doesn't free the actual pages themselves. That
  * has been handled earlier when unmapping all the memory regions.
  */
@@ -90,13 +114,8 @@ static inline void clear_pmd_range(struc
 {
struct page *page;
 
-   if (pmd_none(*pmd))
+   if (pmd_none_or_clear_bad(pmd))
return;
-   if (unlikely(pmd_bad(*pmd))) {
-   pmd_ERROR(*pmd);
-   pmd_clear(pmd);
-   return;
-   }
if (!((start | end) & ~PMD_MASK)) {
  

[PATCH 4/15] ptwalk: unuse_mm

2005-03-09 Thread Hugh Dickins
Convert unuse_process pagetable walkers to loops using p?d_addr_end; but
correct its name to unuse_mm, rename its levels to _range as elsewhere.

Leave unuse_pte out-of-line since it's so rarely called; but move the
funny activate_page inside it.  foundaddr was a leftover from before: we
still want to break out once page is found, but no need to pass addr up.
And we need not comment on the page_table_lock at every level.

Whereas most objects shrink ~200 bytes text, swapfile.o grows slightly:
it had earlier been converted to the addr,end style to fix a 4level bug.

Signed-off-by: Hugh Dickins <[EMAIL PROTECTED]>
---

 mm/swapfile.c |  148 +-
 1 files changed, 56 insertions(+), 92 deletions(-)

--- ptwalk3/mm/swapfile.c   2005-03-09 01:35:49.0 +
+++ ptwalk4/mm/swapfile.c   2005-03-09 01:36:25.0 +
@@ -412,154 +412,121 @@ void free_swap_and_cache(swp_entry_t ent
 }
 
 /*
- * The swap entry has been read in advance, and we return 1 to indicate
- * that the page has been used or is no longer needed.
- *
  * Always set the resulting pte to be nowrite (the same as COW pages
  * after one process has exited).  We don't know just how many PTEs will
  * share this swap entry, so be cautious and let do_wp_page work out
  * what to do if a write is requested later.
+ *
+ * vma->vm_mm->page_table_lock is held.
  */
-/* vma->vm_mm->page_table_lock is held */
-static void
-unuse_pte(struct vm_area_struct *vma, unsigned long address, pte_t *dir,
-   swp_entry_t entry, struct page *page)
+static void unuse_pte(struct vm_area_struct *vma, pte_t *pte,
+   unsigned long addr, swp_entry_t entry, struct page *page)
 {
vma->vm_mm->rss++;
get_page(page);
-   set_pte_at(vma->vm_mm, address, dir,
+   set_pte_at(vma->vm_mm, addr, pte,
   pte_mkold(mk_pte(page, vma->vm_page_prot)));
-   page_add_anon_rmap(page, vma, address);
+   page_add_anon_rmap(page, vma, addr);
swap_free(entry);
+   /*
+* Move the page to the active list so it is not
+* immediately swapped out again after swapon.
+*/
+   activate_page(page);
 }
 
-/* vma->vm_mm->page_table_lock is held */
-static unsigned long unuse_pmd(struct vm_area_struct *vma, pmd_t *dir,
-   unsigned long address, unsigned long end,
-   swp_entry_t entry, struct page *page)
+static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
+   unsigned long addr, unsigned long end,
+   swp_entry_t entry, struct page *page)
 {
pte_t *pte;
pte_t swp_pte = swp_entry_to_pte(entry);
 
-   if (pmd_none_or_clear_bad(dir))
+   if (pmd_none_or_clear_bad(pmd))
return 0;
-   pte = pte_offset_map(dir, address);
+   pte = pte_offset_map(pmd, addr);
do {
/*
 * swapoff spends a _lot_ of time in this loop!
 * Test inline before going to call unuse_pte.
 */
if (unlikely(pte_same(*pte, swp_pte))) {
-   unuse_pte(vma, address, pte, entry, page);
+   unuse_pte(vma, pte, addr, entry, page);
pte_unmap(pte);
-
-   /*
-* Move the page to the active list so it is not
-* immediately swapped out again after swapon.
-*/
-   activate_page(page);
-
-   /* add 1 since address may be 0 */
-   return 1 + address;
+   return 1;
}
-   address += PAGE_SIZE;
-   pte++;
-   } while (address < end);
+   } while (pte++, addr += PAGE_SIZE, addr != end);
pte_unmap(pte - 1);
return 0;
 }
 
-/* vma->vm_mm->page_table_lock is held */
-static unsigned long unuse_pud(struct vm_area_struct *vma, pud_t *pud,
-unsigned long address, unsigned long end,
-   swp_entry_t entry, struct page *page)
+static int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
+   unsigned long addr, unsigned long end,
+   swp_entry_t entry, struct page *page)
 {
pmd_t *pmd;
unsigned long next;
-   unsigned long foundaddr;
 
if (pud_none_or_clear_bad(pud))
return 0;
-   pmd = pmd_offset(pud, address);
+   pmd = pmd_offset(pud, addr);
do {
-   next = (address + PMD_SIZE) & PMD_MASK;
-   if (next > end || !next)
-   next = end;
-   foundaddr = unuse_pmd(vma, pmd, address, next, entry, page);
-   if (foundaddr)
-   return foundaddr;
-   address = next;
-   pmd++;
-   } while (address < end);
+   next = pmd_addr_end(addr, end);
+ 

[PATCH 3/15] ptwalk: sync_page_range

2005-03-09 Thread Hugh Dickins
Convert filemap_sync pagetable walkers to loops using p?d_addr_end; use
similar loop to split filemap_sync into chunks.  Merge filemap_sync_pte
into sync_pte_range, cut filemap_ off the longer names, vma arg first.

There is no error from filemap_sync, nor is any use made of the flags:
if it should do something else for MS_INVALIDATE, reinstate it when that
is implemented.  Remove the redundant flush_tlb_range from afterwards:
as its comment noted, each dirty pte has already been flushed.

Signed-off-by: Hugh Dickins <[EMAIL PROTECTED]>
---

 mm/msync.c |  180 ++---
 1 files changed, 67 insertions(+), 113 deletions(-)

--- ptwalk2/mm/msync.c  2005-03-09 01:35:49.0 +
+++ ptwalk3/mm/msync.c  2005-03-09 01:36:14.0 +
@@ -21,155 +21,109 @@
  * Called with mm->page_table_lock held to protect against other
  * threads/the swapper from ripping pte's out from under us.
  */
-static int filemap_sync_pte(pte_t *ptep, struct vm_area_struct *vma,
-   unsigned long address, unsigned int flags)
-{
-   pte_t pte = *ptep;
-   unsigned long pfn = pte_pfn(pte);
-   struct page *page;
 
-   if (pte_present(pte) && pfn_valid(pfn)) {
-   page = pfn_to_page(pfn);
-   if (!PageReserved(page) &&
-   (ptep_clear_flush_dirty(vma, address, ptep) ||
-page_test_and_clear_dirty(page)))
-   set_page_dirty(page);
-   }
-   return 0;
-}
-
-static int filemap_sync_pte_range(pmd_t * pmd,
-   unsigned long address, unsigned long end, 
-   struct vm_area_struct *vma, unsigned int flags)
+static void sync_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
+   unsigned long addr, unsigned long end)
 {
pte_t *pte;
-   int error;
 
if (pmd_none_or_clear_bad(pmd))
-   return 0;
-   pte = pte_offset_map(pmd, address);
-   if ((address & PMD_MASK) != (end & PMD_MASK))
-   end = (address & PMD_MASK) + PMD_SIZE;
-   error = 0;
+   return;
+   pte = pte_offset_map(pmd, addr);
do {
-   error |= filemap_sync_pte(pte, vma, address, flags);
-   address += PAGE_SIZE;
-   pte++;
-   } while (address && (address < end));
+   unsigned long pfn;
+   struct page *page;
 
-   pte_unmap(pte - 1);
+   if (!pte_present(*pte))
+   continue;
+   pfn = pte_pfn(*pte);
+   if (!pfn_valid(pfn))
+   continue;
+   page = pfn_to_page(pfn);
+   if (PageReserved(page))
+   continue;
 
-   return error;
+   if (ptep_clear_flush_dirty(vma, addr, pte) ||
+   page_test_and_clear_dirty(page))
+   set_page_dirty(page);
+   } while (pte++, addr += PAGE_SIZE, addr != end);
+   pte_unmap(pte - 1);
 }
 
-static inline int filemap_sync_pmd_range(pud_t * pud,
-   unsigned long address, unsigned long end, 
-   struct vm_area_struct *vma, unsigned int flags)
+static inline void sync_pmd_range(struct vm_area_struct *vma, pud_t *pud,
+   unsigned long addr, unsigned long end)
 {
-   pmd_t * pmd;
-   int error;
+   pmd_t *pmd;
+   unsigned long next;
 
if (pud_none_or_clear_bad(pud))
-   return 0;
-   pmd = pmd_offset(pud, address);
-   if ((address & PUD_MASK) != (end & PUD_MASK))
-   end = (address & PUD_MASK) + PUD_SIZE;
-   error = 0;
+   return;
+   pmd = pmd_offset(pud, addr);
do {
-   error |= filemap_sync_pte_range(pmd, address, end, vma, flags);
-   address = (address + PMD_SIZE) & PMD_MASK;
-   pmd++;
-   } while (address && (address < end));
-   return error;
+   next = pmd_addr_end(addr, end);
+   sync_pte_range(vma, pmd, addr, next);
+   } while (pmd++, addr = next, addr != end);
 }
 
-static inline int filemap_sync_pud_range(pgd_t *pgd,
-   unsigned long address, unsigned long end,
-   struct vm_area_struct *vma, unsigned int flags)
+static inline void sync_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
+   unsigned long addr, unsigned long end)
 {
pud_t *pud;
-   int error;
+   unsigned long next;
 
if (pgd_none_or_clear_bad(pgd))
-   return 0;
-   pud = pud_offset(pgd, address);
-   if ((address & PGDIR_MASK) != (end & PGDIR_MASK))
-   end = (address & PGDIR_MASK) + PGDIR_SIZE;
-   error = 0;
+   return;
+   pud = pud_offset(pgd, addr);
do {
-   error |= filemap_sync_pmd_range(pud, address, end, vma, flags);
-   address = (address + PUD_SIZE) & PUD_MASK;
-   pud++;
-   } while (address && 

RE: Direct io on block device has performance regression on 2.6.x kernel

2005-03-09 Thread Chen, Kenneth W
Andrew Morton wrote on Wednesday, March 09, 2005 12:05 PM
> "Chen, Kenneth W" <[EMAIL PROTECTED]> wrote:
> > Let me answer the questions in reverse order.  We started with running
> > industry standard transaction processing database benchmark on 2.6 kernel,
> > on real hardware (4P smp, 64 GB memory, 450 disks) running industry
> > standard db application.  What we measured is that with best tuning done
> > to the system, 2.6 kernel has a huge performance regression relative to
> > its predecessor 2.4 kernel (a kernel from RHEL3, 2.4.21 based).
>
> That's news to me.  I thought we were doing OK with big database stuff.
> Surely lots of people have been testing such things.

There are different levels of "big" stuff.  We used to work on a 32-way numa
box, but other show stopper issues popped up before we got to the I/O stack.
The good thing that came out of that work is the removal of the global unplug lock.


> > And yes, it is all worth pursuing, the two patches on raw device recuperate
> > 1/3 of the total benchmark performance regression.
>
> On a real disk driver?  hm, I'm wrong then.
>

Yes, on real disk driver (qlogic fiber channel) and with real 15K rpm disks.


> Did you generate a kernel profile?

Top 40 kernel hot functions, percentage is normalized to kernel utilization.

_spin_unlock_irqrestore 23.54%
_spin_unlock_irq19.27%
__blockdev_direct_IO3.57%
follow_hugetlb_page 1.84%
e1000_clean 1.38%
kmem_cache_alloc1.31%
put_page1.29%
__generic_file_aio_read 1.18%
e1000_intr  1.07%
schedule1.01%
dio_bio_complete0.97%
mempool_alloc   0.96%
kmem_cache_free 0.90%
__end_that_request_first0.88%
__copy_user 0.82%
kfree   0.77%
generic_make_request0.73%
_spin_lock  0.73%
kref_put0.73%
vfs_read0.68%
update_atime0.68%
scsi_dispatch_cmd   0.67%
fget_light  0.66%
put_io_context  0.60%
_spin_lock_irqsave  0.58%
scsi_finish_command 0.58%
generic_file_aio_write_nolock   0.57%
inode_times_differ  0.55%
break_fault 0.53%
__do_softirq0.48%
aio_read_evt0.48%
try_atomic_semop0.44%
sys_pread64 0.43%
__bio_add_page  0.43%
__mod_timer 0.42%
bio_alloc   0.41%
scsi_decide_disposition 0.40%
e1000_clean_rx_irq  0.39%
find_vma0.38%
dnotify_parent  0.38%


Profile with spin lock inlined, so that it is easier to see functions
that have the lock contention, again top 40 hot functions:

scsi_request_fn 7.54%
finish_task_switch  6.25%
__blockdev_direct_IO4.97%
__make_request  3.87%
scsi_end_request3.54%
dio_bio_end_io  2.70%
follow_hugetlb_page 2.39%
__wake_up   2.37%
aio_complete1.82%
kmem_cache_alloc1.68%
__mod_timer 1.63%
e1000_clean 1.57%
__generic_file_aio_read 1.42%
mempool_alloc   1.37%
put_page1.35%
e1000_intr  1.31%
schedule1.25%
dio_bio_complete1.20%
scsi_device_unbusy  1.07%
kmem_cache_free 1.06%
__copy_user 1.04%
scsi_dispatch_cmd   1.04%
__end_that_request_first1.04%
generic_make_request1.02%
kfree   0.94%
__aio_get_req   0.93%
sys_pread64 0.83%
get_request 0.79%
put_io_context  0.76%
dnotify_parent  0.73%
vfs_read0.73%
update_atime0.73%
finished_one_bio0.63%
generic_file_aio_write_nolock   0.63%
scsi_put_command0.62%
break_fault 0.62%
e1000_xmit_frame0.62%
aio_read_evt0.59%
scsi_io_completion  0.59%
inode_times_differ  0.58%



-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [Linux-fbdev-devel] [announce 0/7] fbsplash - The Framebuffer Splash

2005-03-09 Thread Geert Uytterhoeven
On Wed, 9 Mar 2005, Alan Cox wrote:
> On Mer, 2005-03-09 at 09:34, Geert Uytterhoeven wrote:
> > On Wed, 9 Mar 2005, Jon Smirl wrote:
> > > Another idea would be to build a console in user space. Think of it as
> > > a full screen xterm. A user space console has access to full hardware
> > > acceleration using the DRM interface.
> > 
> > Yep. And that's what Alan Cox wanted to do. Console in userspace, eye candy
> > (using Porter-Duff blending) as much as you want, full UTF-8 support, ...
> 
> Jon is the origin of those ideas not me, I'm merely supporting them
> providing there is still a basic kernel side console.

Thanks for correcting me!

Gr{oetje,eeting}s,

Geert

--
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- [EMAIL PROTECTED]

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
-- Linus Torvalds
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 5/15] ptwalk: map and unmap_vm_area

2005-03-09 Thread Hugh Dickins
Convert unmap_vm_area and map_vm_area pagetable walkers to loops using
p?d_addr_end; rename internal levels vunmap_p??_range, vmap_p??_range.
map_vm_area shows the style when allocating: allocs moved down a level.
Replace KERN_CRIT Whee message by boring WARN_ON.

Signed-off-by: Hugh Dickins <[EMAIL PROTECTED]>
---

 mm/vmalloc.c |  216 +--
 1 files changed, 77 insertions(+), 139 deletions(-)

--- ptwalk4/mm/vmalloc.c2005-03-09 01:35:49.0 +
+++ ptwalk5/mm/vmalloc.c2005-03-09 01:36:38.0 +
@@ -23,199 +23,137 @@
 DEFINE_RWLOCK(vmlist_lock);
 struct vm_struct *vmlist;
 
-static void unmap_area_pte(pmd_t *pmd, unsigned long address,
- unsigned long size)
+static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end)
 {
-   unsigned long base, end;
pte_t *pte;
 
if (pmd_none_or_clear_bad(pmd))
return;
 
-   pte = pte_offset_kernel(pmd, address);
-   base = address & PMD_MASK;
-   address &= ~PMD_MASK;
-   end = address + size;
-   if (end > PMD_SIZE)
-   end = PMD_SIZE;
-
+   pte = pte_offset_kernel(pmd, addr);
do {
-   pte_t page;
-   page = ptep_get_and_clear(_mm, base + address, pte);
-   address += PAGE_SIZE;
-   pte++;
-   if (pte_none(page))
-   continue;
-   if (pte_present(page))
-   continue;
-   printk(KERN_CRIT "Whee.. Swapped out page in kernel page 
table\n");
-   } while (address < end);
+   pte_t ptent = ptep_get_and_clear(_mm, addr, pte);
+   WARN_ON(!pte_none(ptent) && !pte_present(ptent));
+   } while (pte++, addr += PAGE_SIZE, addr != end);
 }
 
-static void unmap_area_pmd(pud_t *pud, unsigned long address,
- unsigned long size)
+static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end)
 {
-   unsigned long base, end;
pmd_t *pmd;
+   unsigned long next;
 
if (pud_none_or_clear_bad(pud))
return;
 
-   pmd = pmd_offset(pud, address);
-   base = address & PUD_MASK;
-   address &= ~PUD_MASK;
-   end = address + size;
-   if (end > PUD_SIZE)
-   end = PUD_SIZE;
-
+   pmd = pmd_offset(pud, addr);
do {
-   unmap_area_pte(pmd, base + address, end - address);
-   address = (address + PMD_SIZE) & PMD_MASK;
-   pmd++;
-   } while (address < end);
+   next = pmd_addr_end(addr, end);
+   vunmap_pte_range(pmd, addr, next);
+   } while (pmd++, addr = next, addr != end);
 }
 
-static void unmap_area_pud(pgd_t *pgd, unsigned long address,
-  unsigned long size)
+static void vunmap_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end)
 {
pud_t *pud;
-   unsigned long base, end;
+   unsigned long next;
 
if (pgd_none_or_clear_bad(pgd))
return;
 
-   pud = pud_offset(pgd, address);
-   base = address & PGDIR_MASK;
-   address &= ~PGDIR_MASK;
-   end = address + size;
-   if (end > PGDIR_SIZE)
-   end = PGDIR_SIZE;
+   pud = pud_offset(pgd, addr);
+   do {
+   next = pud_addr_end(addr, end);
+   vunmap_pmd_range(pud, addr, next);
+   } while (pud++, addr = next, addr != end);
+}
+
+void unmap_vm_area(struct vm_struct *area)
+{
+   pgd_t *pgd;
+   unsigned long next;
+   unsigned long addr = (unsigned long) area->addr;
+   unsigned long end = addr + area->size;
 
+   BUG_ON(addr >= end);
+   pgd = pgd_offset_k(addr);
+   flush_cache_vunmap(addr, end);
do {
-   unmap_area_pmd(pud, base + address, end - address);
-   address = (address + PUD_SIZE) & PUD_MASK;
-   pud++;
-   } while (address && (address < end));
-}
-
-static int map_area_pte(pte_t *pte, unsigned long address,
-  unsigned long size, pgprot_t prot,
-  struct page ***pages)
-{
-   unsigned long base, end;
-
-   base = address & PMD_MASK;
-   address &= ~PMD_MASK;
-   end = address + size;
-   if (end > PMD_SIZE)
-   end = PMD_SIZE;
+   next = pgd_addr_end(addr, end);
+   vunmap_pud_range(pgd, addr, next);
+   } while (pgd++, addr = next, addr != end);
+   flush_tlb_kernel_range((unsigned long) area->addr, end);
+}
+
+static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
+   pgprot_t prot, struct page ***pages)
+{
+   pte_t *pte;
 
+   pte = pte_alloc_kernel(_mm, pmd, addr);
+   if (!pte)
+   return -ENOMEM;
do {
struct page *page = **pages;
   

Re: [PATCH] resync ATI PCI idents into base kernel

2005-03-09 Thread Christoph Hellwig
On Wed, Mar 09, 2005 at 03:45:43PM +, Alan Cox wrote:
> On Maw, 2005-03-08 at 22:33, Christoph Hellwig wrote:
> > > Really - so does it go to the PCI maintainer, the IDE maintainer or the
> > > DRI maintainer or someone else, or all of them, or in bits to different
> > > ones remembering there are dependencies and I don't use bitcreeper ?
> > 
> > If you don't know send it to Andrew.  
> 
> You are completely missing the point.

Which is?  That you're so special you don't need to care about the
workflow the ordinary humans have created?

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: huge filesystems

2005-03-09 Thread Miklos Szeredi
> The group I work in has been experimenting with GFS and Lustre, and I did
> some NBD/ENBD experimentation on my own, described at
> http://dcs.nac.uci.edu/~strombrg/nbd.html
> 
> My question is, what is the current status of huge filesystems - IE,
> filesystems that exceed 2 terabytes, and hopefully also exceeding 16
> terabytes?
> 
> Am I correct in assuming that the usual linux buffer cache only goes to 16
> terabytes?

The page cache limit is PAGE_CACHE_BITS + BITS_PER_LONG - 1.  On i386
that's 12 + 32 - 1 = 43 bits or 8Tbytes.  On 64 bit architectures the
size of off_t is the only limit.

> Does the FUSE API (or similar) happen to allow surpassing either the 2T or
> 16T limits?

The API certainly doesn't have any limits.  The page cache limit holds
for FUSE too, though with the direct-io mount option the page cache is
not used, so the limit could be removed as well.  I'll fix that.

Thanks,
Miklos
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] Support for GEODE CPUs

2005-03-09 Thread Alan Cox
On Mer, 2005-03-09 at 17:33, Lennart Sorensen wrote:
> Now if the Geode GX1 in fact runs faster optimized for 486 rather than
> 586 (I have been running one as 586tsc since it had mmx and tsc in its
> feature list), then I think I will be recompiling my kernel to see if I
> can't make this 266MHz GX1 run almost as fast as a 400MHz PXA255 (arm).
> Right now it has somewhat lower ethernet bandwidth than the arm.

If you build 486 it will still use the TSC because it is available (The
PIT is buggy but the kernel knows about that anyway and handles it). 

There are a few Geode tricks to know for performance

- Turn off the video
- If you can't turn it off use solid areas of colour to speed the system
up (The hardware uses RLE encoding to reduce ram fetch bandwidth)
- Remember the cache is only 16K (12K when running X11 as 4K is borrowed
for the blitter)
- The onboard audio is a software SB emulation on older GX. It burns
CPU.

Also avoid touching various legacy registers as much as possible, many
cause BIOS traps in SMM emulation code. The list I have is NDA but you
can use rdtsc/inb or outb/rdtsc to work out which 8)

Alan

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 0/15] ptwalk: pagetable walker cleanup

2005-03-09 Thread Hugh Dickins
Here's a cleanup of the pagetable walkers, in common and i386 code,
based on 2.6.11-bk5.  Mainly to make them all go the same simpler way,
so they're easier to follow with less room for error; but also to reduce
the code size and speed it up a little.  These are janitorial changes,
other arches may follow whenever it suits them.

A few patches slice across all the files, most dice them into nests of
functions to focus on: slicing looks confusing where the originals are
following different conventions.  11/15 works around hang from 10/15.

329 fewer source lines; ~3.5KB less kernel text; lmbench shows good
improvement with 2level pagetables (though not yet back to 2.6.10),
and a much less impressive improvement with 3level pagetables:

1*PIII 512MB    2*HT*P4 4GB     2*HT*P4 5GB
fork exec sh    fork exec sh    fork exec sh
proc proc proc  proc proc proc  proc proc proc
          
152. 541. 3687  249. 989. 4337  353. 1307 5646 2.6.11-bk5
152. 552. 3706  251. 973. 4344  348. 1310 5546
152. 537. 3689  250. 974. 4332  351. 1307 5556
--  --  --
86.4 438. 3471  199. 865. 4167  334. 1279 5499 2.6.11-bk5 + ptwalk
87.4 413. 3478  198. 910. 4176  333. 1261 5462
87.8 415. 3484  199. 870. 4183  331. 1251 5471
--  --  --
79.6 389. 3418  174. 800. 3981  226. 1095 5170 2.6.10
81.5 381. 3442  166. 807. 3986  226. 1093 5074
81.1 385. 3471  165. 800. 3978  227. 1101 5154

 arch/i386/kernel/vm86.c |   21 
 arch/i386/mm/ioremap.c  |  112 ++---
 include/asm-generic/4level-fixup.h  |4 
 include/asm-generic/pgtable-nopmd.h |5 
 include/asm-generic/pgtable-nopud.h |5 
 include/asm-generic/pgtable.h   |   69 +++
 mm/memory.c |  788 ++--
 mm/mprotect.c   |  131 ++---
 mm/mremap.c |   24 -
 mm/msync.c  |  201 +++--
 mm/swapfile.c   |  173 ++-
 mm/vmalloc.c|  246 +++
 12 files changed, 725 insertions(+), 1054 deletions(-)

Hugh
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [ANNOUNCE][PATCH 2.6.11 2/3] megaraid_sas: Announcing new module for LSI Logic's SAS based MegaRAID controllers

2005-03-09 Thread 'Christoph Hellwig'
On Wed, Mar 09, 2005 at 09:43:47AM -0500, Bagalkote, Sreenivas wrote:
> During the module load time, I allocate 32 bit or 64 bit SGLs based on
> whether I can receive 64 bit DMA addresses or not. If size of dma_addr_t
> is 4, then I allocate only 32 bit SGLs. During the run time, I prepare 
> 32/64 bit SGLs based on this variable. And since this is compile time
> system-wide property, I kept it as driver global.

Even for kernels with a 64bit dma_addr_t you can get 32bit dma addresses
only.  As a start check whether the pci_set_dma_mask for the 64bit mask
failed - in that case you can always use 32bit SGLs.
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] make st seekable again

2005-03-09 Thread Kai Makisara
On Wed, 9 Mar 2005, Alan Cox wrote:

> On Maw, 2005-03-08 at 17:25, Linux Kernel Mailing List wrote:
> > ChangeSet 1.2030, 2005/03/08 09:25:05-08:00, [EMAIL PROTECTED]
> > 
> > [PATCH] make st seekable again
> > 
> > Apparently `tar' errors out if it cannot perform lseek() against a 
> > tape.  Work
> > around that in-kernel.
> 
> Unfortunately this isn't a good idea. Allowing tar to read the tape
> position makes sense, allowing it to zero the position might but you
> have to do major surgery on the driver first because
> 
> 1.It doesn't use ppos
> 2.It doesn't do locking on the ppos at all
> 
> Also allowing apps to randomly seek and report "ok" when they are
> backing up to tape and might really need to see the error is not what
> I'd call stable, professional or quality code.
> 
The proper fix is to fix tar. I have sent an analysis of the problem and a 
suggestion how to fix this to the bug-tar list on March 5 but it is still 
waiting for moderator approval.

While waiting for the application to be fixed, it was decided to restore 
the old behaviour of the tape drivers.

lseek on a tape is not a good fit (addressed by block, blocks on tape can 
have any size, etc.). I don't know any Unix that would really implement 
lseek on tapes but they usually don't return error. This is probably why 
the tar bug has not been found earlier.

There has been one useful way of using lseek() with tapes in some systems. 
Those refuse reads and writes if the file pointer reaches 2 GB. Resetting 
it with lseek(fd,0,0) now and then has allowed writing/reading more than 2 
GB.

I don't think implementing proper read-only lseek for tapes is worth the 
trouble (reliable tracking of the current location is tricky). Purist 
kernels can refuse lseeks. Pragmatic kernels can allow lseeks until 
refusing those won't break common applications.

-- 
Kai
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: Linux 2.6.11.2

2005-03-09 Thread Wakko Warner
Marcos D. Marado Torres wrote:
> -BEGIN PGP SIGNED MESSAGE-
> Hash: SHA1
> 
> On Wed, 9 Mar 2005, Geert Uytterhoeven wrote:
> 
> >>>which is a patch against the 2.6.11.1 release.  If consensus arrives
> >>>that this patch should be against the 2.6.11 tree, it will be done that
> >>>way in the future.
> >>
> >>IMHO it should be against 2.6.11 and not 2.6.11.1, like -rc's that aren't
> >>against
> >>the last -rc but against 2.6.x.
> >
> >It's a stable release, not a pre/rc, so against 2.6.11.1 sounds most 
> >logical to
> >me.
> 
> Well, yes, _if_ 2.6.12 patch is going to be to apply against 2.6.11.last 
> instead
> of 2.6.11. And, well, either one will cause great panic for those who aren't 
> on
> the mailing lists and just visit kernel.org to download the latest stuff.


IMHO, as long as 2.6.12 patches against 2.6.11, I'm cool with 2.6.11.2
patching against 2.6.11.1, but I think it should patch against 2.6.11
instead


-- 
 Lab tests show that use of micro$oft causes cancer in lab animals
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: Large Page Support for ARM (Intel Xscale)

2005-03-09 Thread Ben Dooks
On Tue, Mar 08, 2005 at 05:59:48PM -0800, Venkat Ramakrishnan wrote:
> Hello,
> 
> I am looking for Large Page Implementation (similar to the hugetlb
> effort) for ARM processors, specifically Xscale. Is there an ongoing
> project any one can point me to?  I tried searching before posting
> this message but couldn't find anything relevant. If this question
> had already been asked, please bear with me and point me to the right
> URL.

Try the linux-arm-kernel mailing list, they should be able to
answer these sort of questions.

-- 
Ben ([EMAIL PROTECTED], http://www.fluff.org/)

  'a smiley only costs 4 bytes'
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 2.6] Fix i2c message flags in video drivers

2005-03-09 Thread Jean Delvare
Hi Chris,

> > While working on the saa7110 driver I found a problem with the way
> > various video drivers (found on Zoran-based boards) prepare i2c
> > messages to be used by i2c_transfer. The drivers improperly copy the
> > i2c client flags as the message flags, while both sets are mostly
> > unrelated. The net effect in this case is to trigger an I2C block
> > read instead of the expected I2C block write. The fix is simply not
> > to pass any flag, because none are needed.
> > 
> > I think this patch qualifies hands down as a "critical bug fix" to
> > be included in whatever bug-fix-only trees exist these days. As far
> > as I can see, all Zoran-based boards are broken in 2.6.11 without
> > this patch.
> 
> Are people reporting this as a problem?

Not that I know. For adv7175 it couldn't be reported so far anyway
because people would hit the oops in saa7110 before (same board: DC10+,
oops fixed in a different patch).

It is possible that people are able to get their board to still work
without my patch, if the chips were properly configured in the first
place and they don't attempt to reconfigure them (like norm change). I
don't know the chips well enough to tell how probable this is.

Thanks,
-- 
Jean Delvare
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: inode cache, dentry cache, buffer heads usage

2005-03-09 Thread Dipankar Sarma
On Wed, Mar 09, 2005 at 01:29:23PM -0800, Badari Pulavarty wrote:
> On Wed, 2005-03-09 at 13:27, Dipankar Sarma wrote:
> > On Wed, Mar 09, 2005 at 10:55:58AM -0800, Badari Pulavarty wrote:
> > > Hi,
> > > 
> > > We have a 8-way P-III, 16GB RAM running 2.6.8-1. We use this as
> > > our server to keep source code, cscopes and do the builds.
> > > This machine seems to slow down over the time. One thing we
> > > keep noticing is it keeps running out of lowmem. Most of 
> > > the lowmem is used for ext3 inode cache + dentry cache +
> > > bufferheads + Buffers. So we did 2:2 split - but it improved
> > > thing, but again run into same issues.
> > > 
> > > So, why is these slab cache are not getting purged/shrinked even
> > > under memory pressure ? (I have seen lowmem as low as 6MB). What
> > > can I do to keep the machine healthy ?
> > 
> > How does /proc/sys/fs/dentry-state look when you run low on lowmem ?
> 
> 
> 
> [EMAIL PROTECTED]:~$ cat /proc/sys/fs/dentry-state
> 1434093 1348947 45  0   0   0
> [EMAIL PROTECTED]:~$ grep dentry /proc/slabinfo
> dentry_cache  1434094 1857519144   271 : tunables  120  
> 608 : slabdata  68797  68797  0

Hmm.. so we are not shrinking dcache despite a large number of
unused dentries. That is where we need to look. Will dig a bit
tomorrow.

Thanks
Dipankar
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] add timing information to printk messages

2005-03-09 Thread Tony Luck
> Here's a little patch which is useful for showing timing information for
> kernel bootup activities.
> 
> This patch adds a new Kconfig option under "Kernel Hacking" and a new
> option for the kernel command line.  It also provides a script for
> showing delta information.

I'm seeing some odd output with CONFIG_PRINTK_TIME=y during boot.  When
it is set to "no", I see this from "dmesg":

Total of 4 processors activated (7168.96 BogoMIPS).
CPU0 attaching sched-domain:
 domain 0: span f
  groups: 1 2 4 8
CPU1 attaching sched-domain:
 domain 0: span f
  groups: 2 4 8 1
CPU2 attaching sched-domain:
 domain 0: span f
  groups: 4 8 1 2
CPU3 attaching sched-domain:
 domain 0: span f
  groups: 8 1 2 4

Setting CONFIG_PRINTK_TIME=y I see (the "" pieces are actually
each a single ASCII '\0' character):

[0.240887] Total of 4 processors activated (7168.96 BogoMIPS).
[0.240926] CPU0 attaching sched-domain:
[0.240930] PU0 attaching sched-domain:
[0.240933]  domain 0: span f
[0.240967]  f
[0.240969]   groups: 1 2 4 8
[0.241024] CPU1 attaching sched-domain:
[0.241027] PU1 attaching sched-domain:
[0.241030]  domain 0: span f
[0.241063]  f
[0.241065]   groups: 2 4 8 1
[0.241146] CPU2 attaching sched-domain:
[0.241149] PU2 attaching sched-domain:
[0.241151]  domain 0: span f
[0.241186]  f
[0.241188]   groups: 4 8 1 2
[0.241267] CPU3 attaching sched-domain:
[0.241270] PU3 attaching sched-domain:
[0.241273]  domain 0: span f
[0.241307]  f
[0.241309]   groups: 8 1 2 4

At first I thought that the lines that begin with whitespace were causing
the confusion, but there are other lines during boot that are ok.

[This is on an ia64 system ... but these messages come from generic 
kern/sched.c]

-Tony
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [ patch 6/7] drivers/serial/jsm: new serial device driver

2005-03-09 Thread Wen Xiong
Greg KH wrote:
On Wed, Mar 09, 2005 at 01:35:41PM -0600, Kilau, Scott wrote:
 

As it stands today, your requirement appears to be that she needs
to yank all diags ioctls and sysfs files before the driver can make
it into the kernel sources.
   

Not all sysfs files, sysfs files are fine, as long as they are
implemented properly, and are there for things that "make sense".
But yes, it sure would be easier to accept the driver if the ioctls
were not there :)
thanks,
greg k-h
 

Hi All,
I think Digi's DPA management tool has very good user interfaces. I am 
going to change and fix the problem.
Then Greg can decide if he wants to pick it up or not.

I will attach the DPA graphic interface for you next time.
Thanks,
wendy
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: 2.6.x.y gatekeeper discipline

2005-03-09 Thread Randy.Dunlap
DHollenbeck wrote:
I had hoped that the proper discipline in rejecting non-critical patches 
would have pertained.  I remain unconvinced that the .y releases are 
anything but noise that should have been kept elsewhere.  After reading 
through a patch summary, I see this as typical:

--
 ChangeSet 2005/02/22 20:56:28-05:00, bunk @ stusta.de
 
 

 [diffview]
 
 

[PATCH] drivers/net/via-rhine.c: make a variable static const
This patch makes a needlessly global variable static const.
Signed-off-by: Adrian Bunk <[EMAIL PROTECTED]>
Signed-off-by: Jeff Garzik <[EMAIL PROTECTED]>
--
It's possible I simply don't get it, but the above description of a 
patch hardly seems like it would qualify for the intentions of the 
2.6.x.y series.

Is this typical, and is this in line with the intent of the x.y series?
If this is going to achieve the objective, the gatekeeper has to be a 
real stubborn, unpopular horse's ass it seems, with a sign on his 
forehead that reads:  GO AWAY AND COME ANOTHER DAY!

Somewhat disappointedly,
Are you looking at 2.6.x.y patches?  I don't think so..
--
~Randy
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: inode cache, dentry cache, buffer heads usage

2005-03-09 Thread Badari Pulavarty
On Wed, 2005-03-09 at 13:27, Dipankar Sarma wrote:
> On Wed, Mar 09, 2005 at 10:55:58AM -0800, Badari Pulavarty wrote:
> > Hi,
> > 
> > We have a 8-way P-III, 16GB RAM running 2.6.8-1. We use this as
> > our server to keep source code, cscopes and do the builds.
> > This machine seems to slow down over the time. One thing we
> > keep noticing is it keeps running out of lowmem. Most of 
> > the lowmem is used for ext3 inode cache + dentry cache +
> > bufferheads + Buffers. So we did 2:2 split - but it improved
> > thing, but again run into same issues.
> > 
> > So, why is these slab cache are not getting purged/shrinked even
> > under memory pressure ? (I have seen lowmem as low as 6MB). What
> > can I do to keep the machine healthy ?
> 
> How does /proc/sys/fs/dentry-state look when you run low on lowmem ?



[EMAIL PROTECTED]:~$ cat /proc/sys/fs/dentry-state
1434093 1348947 45  0   0   0
[EMAIL PROTECTED]:~$ grep dentry /proc/slabinfo
dentry_cache  1434094 1857519144   271 : tunables  120  
608 : slabdata  68797  68797  0
[EMAIL PROTECTED]:~$ cat /proc/meminfo
MemTotal: 16377076 kB
MemFree:   8343724 kB
Buffers:579232 kB
Cached:5051848 kB
SwapCached:  0 kB
Active:2911084 kB
Inactive:  3878044 kB
HighTotal:14548952 kB
HighFree:  8330944 kB
LowTotal:  1828124 kB
LowFree: 12780 kB
SwapTotal:   0 kB
SwapFree:0 kB
Dirty: 216 kB
Writeback:   0 kB
Mapped: 301940 kB
Slab:  1225772 kB
Committed_AS:   771340 kB
PageTables:   5768 kB
VmallocTotal:   114680 kB
VmallocUsed:   312 kB
VmallocChunk:   114368 kB
HugePages_Total: 0
HugePages_Free:  0
Hugepagesize: 2048 kB



-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: Random number generator in Linux kernel

2005-03-09 Thread Bill Davidsen
Vineet Joglekar wrote:
Hi all,
Can someone please tell me where can I find and which
random/pseudo-random number generator can I use inside the linux
kernel? (2.4.28)
I found out 1 function get_random_bytes() in
linux/drivers/char/random.c but thats not what I want.
I want a function where I will be supplying a seed to that function
as an input, and will get a random number back. If same seed is used,
same number should be generated again.
Without knowing what you're doing I can't say if it justifies having all 
that extra code around, but the stuff from the library, like srand48, 
will do this. You can add the code to your module.

--
   -bill davidsen ([EMAIL PROTECTED])
"The secret to procrastination is to put things off until the
 last possible moment - but no longer"  -me
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] 2.6.10 - direct-io async short read bug

2005-03-09 Thread Badari Pulavarty
On Wed, 2005-03-09 at 11:53, Andrew Morton wrote:
> Suparna Bhattacharya <[EMAIL PROTECTED]> wrote:
> >
> >  >  Solaris, which does forcedirectio as a mount option, actually
> >  > will do buffered I/O on the trailing part.  Consider it like a bounce
> >  > buffer.  That way they don't DMA the trailing data and succeed the I/O.
> >  > The I/O returns actual bytes till EOF, just like read(2) is supposed to.
> >  >  Either this or a fully DMA'd number 4 is really what we should
> >  > do.  If security can only be solved via a bounce buffer, who cares?  If
> >  > the user created themselves a non-aligned file to open O_DIRECT, that's
> >  > their problem if the last part-sector is negligibly slower.
> > 
> >  If writes/truncates take care of zeroing out the rest of the sector
> >  on disk, might we still be OK without having to do the bounce buffer
> >  thing ?
> 
> We can probably rely on the rest of the sector outside i_size being zeroed
> anyway.  Because if it contains non-zero gunk then the fs already has a
> problem, and the user can get at that gunk with an expanding truncate and
> mmap() anyway.
> 

Rest of the sector or rest of the block ? Are you implying that, we
already do this, so there is no problem reading beyond EOF to user
buffer ? Or we need to zero out the userbuffer beyond EOF ?


Thanks,
Badari


-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: inode cache, dentry cache, buffer heads usage

2005-03-09 Thread Dipankar Sarma
On Wed, Mar 09, 2005 at 10:55:58AM -0800, Badari Pulavarty wrote:
> Hi,
> 
> We have a 8-way P-III, 16GB RAM running 2.6.8-1. We use this as
> our server to keep source code, cscopes and do the builds.
> This machine seems to slow down over the time. One thing we
> keep noticing is it keeps running out of lowmem. Most of 
> the lowmem is used for ext3 inode cache + dentry cache +
> bufferheads + Buffers. So we did 2:2 split - but it improved
> thing, but again run into same issues.
> 
> So, why is these slab cache are not getting purged/shrinked even
> under memory pressure ? (I have seen lowmem as low as 6MB). What
> can I do to keep the machine healthy ?

How does /proc/sys/fs/dentry-state look when you run low on lowmem ?

Thanks
Dipankar
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: 2.6.x.y gatekeeper discipline

2005-03-09 Thread Chris Wright
* DHollenbeck ([EMAIL PROTECTED]) wrote:
> [PATCH] drivers/net/via-rhine.c: make a variable static const
> 
> This patch makes a needlessly global variable static const.
> 
> Signed-off-by: Adrian Bunk <[EMAIL PROTECTED]>
> Signed-off-by: Jeff Garzik <[EMAIL PROTECTED]>
> 
> --
> 
> It's possible I simply don't get it, but the above description of a 
> patch hardly seems like it would qualify for the intentions of the 
> 2.6.x.y series.

I think you've confused something.  This patch is not in -stable.
Here's current listing:

http://linux-release.bkbits.net:8080/linux-2.6.11/[EMAIL PROTECTED]

thanks,
-chris
-- 
Linux Security Modules http://lsm.immunix.org http://lsm.bkbits.net
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: 2.6.x.y gatekeeper discipline

2005-03-09 Thread Chris Friesen
DHollenbeck wrote:
It's possible I simply don't get it, but the above description of a 
patch hardly seems like it would qualify for the intentions of the 
2.6.x.y series.
Where do you see that patch as being applied in the new .y stable series?
Chris
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] rwsem: Make rwsems use interrupt disabling spinlocks

2005-03-09 Thread Arjan van de Ven

> Ingo, we already have a touch_nmi_watchdog() in the sysrq code.  It might be
> worth adding a touch_softlockup_watchdog() wherever we have a
> touch_nmi_watchdog().

or add touch_softlockup_watchdog to touch_nmi_watchdog() instead
and rename it tickle_watchdog() over time.

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Memory Ready Queue Management Algorithm

2005-03-09 Thread maria belliti
Hello,
I wish to be personally CC'ed the answers/comments posted to the list in  
response to this post

I can't trace any reference addressing how the memory associated queue is 
managed?
Please check slide 13 from this link:
http://engr.smu.edu/~kocan/7343spring05/slides/chapter07.ppt

I mean: according to which algorithm will the queued requests be served? 
Which process will be given priority first? Is it first come, first 
served?

thanks
_
Express yourself with cool new emoticons http://www.msn.co.uk/specials/myemo
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] remove dead cyrix/centaur mtrr init code

2005-03-09 Thread Alan Cox
On Mer, 2005-03-09 at 19:09, Andries Brouwer wrote:
> The moment you report that the follow-up patch is fine, we can
> remove the #if 0 and insert the initcalls instead.
> 
> So, all is well today, and we are waiting for your report.

Ok works for me. I'll let you know ASAP.

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


linux dvb alps_tdlb7 removed

2005-03-09 Thread Peter Waechtler
With version 2.6.10 the driver for the tuner frontend from ALPS TDLB7 
was removed.

Why do you think that this is a dead file?
While I'm happy with the work you do for dvb on Linux, and I want to 
thank you for this anyway, my TV does not work anymore! :(

I use a TechnoTrend Premium card with that frontend on it. It worked 
fine until 2.6.10.
Can you put it back into mainline? Is there some work to do for 
reinsertion?

regards
Peter
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: Linux 2.6.11.2

2005-03-09 Thread Randy.Dunlap
Matt Mackall wrote:
On Wed, Mar 09, 2005 at 12:39:23AM -0800, Greg KH wrote:
And to further test this whole -stable system, I've released 2.6.11.2.
It contains one patch, which is already in the -bk tree, and came from
the security team (hence the lack of the longer review cycle).
It's available now in the normal kernel.org places:
kernel.org/pub/linux/kernel/v2.6/patch-2.6.11.2.gz
which is a patch against the 2.6.11.1 release.

Argh! @*#$&!!&! 
I have to Argh this also (with Matt).

If consensus arrives
that this patch should be against the 2.6.11 tree, it will be done that
way in the future.
It would be much easier on users/testers to have to apply
only one patch to base (2.6.11 e.g.) to get to 2.6.x.y
(2.6.11.3 e.g.).  One Patch File.  Not three.
Consensus arrived back when 2.6.8.1 came out.
Please, folks, there are automated tools that "know" about kernel
release numbering and so on. Said tools broke with 2.6.11.1 because it
wasn't in the same place that 2.6.8.1 was and now this breaks with all
precedent by being an interdiff along a branch.
Fixing it in the future is too #*$%* late because you've now turned it
into a special case.

--
~Randy
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC] -stable, how it's going to work.

2005-03-09 Thread Chris Wright
* Andi Kleen ([EMAIL PROTECTED]) wrote:
> On Wed, Mar 09, 2005 at 10:28:22AM -0800, Chris Wright wrote:
> > * Andi Kleen ([EMAIL PROTECTED]) wrote:
> > > Greg KH <[EMAIL PROTECTED]> writes:

> > > One rule I'm missing:
> > > 
> > > - It must be accepted to mainline. 
> > 
> > This can violate the principle of keeping fixes simple for -stable tree.
> > And Linus/Andrew don't want to litter mainline with patch series that
> > do simple fix followed by complete fix meant for development branch.
> 
> But it risks code drift like we had in 2.4 with older kernels 
> having more fixes than the newer kernel. And that way lies madness.
> 
> I think it is very very important to avoid this.
> 
> If you prefer you can rewrite the rule like
> 
> "Fix must be in mainline first. In exceptional cases when the fix 
> in mainline is too intrusive or risky a simpler version of the patch
> can be applied to stable. In this case the mainline fix must be already
> accepted. For most cases the full fix should be applied to avoid code drift"

I think we've all agreed that's the intention.

> > I agree, it's a good rule, but these should be small, temporal diffs
> > from mainline.  For example, -ac tree will sometimes do the simpler fix,
> > whereas mainline does proper complete fix.
> 
> You make it sound like all patches are super complicated and 
> not suitable for backporting.

I didn't think I did, that's why I said 'sometimes'.  Just acknowledging
what does really happen.

> > They don't, the security patches should still be reviewed by subsystem
> > maintainer.  Point here is, sometimes there's disclosure coordination
> > happening as well.
> 
> Ok, how does it coordinate with the vendor-sec process? 
> And at what point is the subsystem maintainer notified.

That's part of the vendor coordination mentioned in the policy.  And
subsystem maintainer is notified as part of vetting the issue/solution,
as stated in the policy.

> The security thing seems to be still quite half baked to me...

Take a look at the policy I posted last night and give me suggestions
for improvements.

thanks,
-chris
-- 
Linux Security Modules http://lsm.immunix.org http://lsm.bkbits.net
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


2.6.x.y gatekeeper discipline

2005-03-09 Thread DHollenbeck
I had hoped that the proper discipline in rejecting non-critical patches 
would have pertained.  I remain unconvinced that the .y releases are 
anything but noise that should have been kept elsewhere.  After reading 
through a patch summary, I see this as typical:

--
 ChangeSet 2005/02/22 20:56:28-05:00, bunk @ stusta.de
 
 [diffview]
 

[PATCH] drivers/net/via-rhine.c: make a variable static const
This patch makes a needlessly global variable static const.
Signed-off-by: Adrian Bunk <[EMAIL PROTECTED]>
Signed-off-by: Jeff Garzik <[EMAIL PROTECTED]>
--
It's possible I simply don't get it, but the above description of a 
patch hardly seems like it would qualify for the intentions of the 
2.6.x.y series.

Is this typical, and is this in line with the intent of the x.y series?
If this is going to achieve the objective, the gatekeeper has to be a 
real stubborn, unpopular horse's ass it seems, with a sign on his 
forehead that reads:  GO AWAY AND COME ANOTHER DAY!

Somewhat disappointedly,
Dick
--
Please help fix the U.S. software industry before it is too late.
Contact your U.S. representatives with this information:
http://lpf.ai.mit.edu/Patents/industry-at-risk.html
http://www.groklaw.net/article.php?story=20041003041632172
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] 2.6.10 - direct-io async short read bug

2005-03-09 Thread Joel Becker
On Wed, Mar 09, 2005 at 11:53:48AM -0800, Andrew Morton wrote:
> Suparna Bhattacharya <[EMAIL PROTECTED]> wrote:
> >  If writes/truncates take care of zeroing out the rest of the sector
> >  on disk, might we still be OK without having to do the bounce buffer
> >  thing ?
> 
> We can probably rely on the rest of the sector outside i_size being zeroed
> anyway.  Because if it contains non-zero gunk then the fs already has a
> problem, and the user can get at that gunk with an expanding truncate and
> mmap() anyway.

Actually, yeah, even today we rely on block_prepare_write and
friends to handle that trail zeroing.  That all happens after the sector
has been read from disk.  So this should be analogous.

Joel

-- 

Life's Little Instruction Book #396

"Never give anyone a fruitcake."

http://www.jlbec.org/
[EMAIL PROTECTED]
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: inconsistent kallsyms data [2.6.11-mm2]

2005-03-09 Thread Paulo Marques
Paulo Marques wrote:
[...]
Can you send me privately a tar.bz2 containing your .config, 
.tmp_kallsyms1.S and .tmp_kallsyms2.S so I can try to figure out what's 
going on?
Ok, after some investigation into the files I was able to find out the 
problem.

scripts/kallsyms.c uses a subset of the symbol table to optimize the 
tokens to use to compress the symbols. It does this because using the 
complete set of symbols would be much slower without a significant gain 
in compression.

For some reason, in the files sent by Dominik, two aliased symbols 
change places from the first to the second step of the kallsyms build 
process (__sched_text_start, __down).

Because of this, the subset used for optimization is different and so 
are the tokens selected, producing a 2 byte difference in the total size 
of the compressed symbol names :P

So I must change the sampling algorithm in a way that is robust to 
symbol position changes.

A simple and robust way is to do the sampling on a list of symbols 
sorted by symbol name. This way, even if the symbol positions that are 
given to scripts/kallsyms change, the symbols sampled will be the same.

I'll do the patch to do this and send it ASAP.
--
Paulo Marques - www.grupopie.com
All that is necessary for the triumph of evil is that good men do nothing.
Edmund Burke (1729 - 1797)
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: current linus bk, error mounting root

2005-03-09 Thread Jens Axboe
On Wed, Mar 09 2005, Jon Smirl wrote:
> On Wed, 09 Mar 2005 15:31:10 -0500, Jeff Garzik <[EMAIL PROTECTED]> wrote:
> > Well, there are no changes in libata from bk4 to present.  The only
> > thing I see in the -bk4-bk5 increment diff that's immediately noticeable
> > is the barrier stuff.
> 
> bk4 works
> bk5 is broken
> 
> Where are these *.key files? Maybe I can do some more divide and
> conquer in bitkeeper.

probably not worth the bother, looks like barrier problems. get the
serial console running instead and send the full output, I'll take a
look in the morning.

-- 
Jens Axboe

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: Linux 2.6.11.2

2005-03-09 Thread Matt Mackall
On Wed, Mar 09, 2005 at 12:39:23AM -0800, Greg KH wrote:
> And to further test this whole -stable system, I've released 2.6.11.2.
> It contains one patch, which is already in the -bk tree, and came from
> the security team (hence the lack of the longer review cycle).
> 
> It's available now in the normal kernel.org places:
>   kernel.org/pub/linux/kernel/v2.6/patch-2.6.11.2.gz
> which is a patch against the 2.6.11.1 release.

Argh! @*#$&!!&! 

> If consensus arrives
> that this patch should be against the 2.6.11 tree, it will be done that
> way in the future.

Consensus arrived back when 2.6.8.1 came out.

Please, folks, there are automated tools that "know" about kernel
release numbering and so on. Said tools broke with 2.6.11.1 because it
wasn't in the same place that 2.6.8.1 was and now this breaks with all
precedent by being an interdiff along a branch.

Fixing it in the future is too #*$%* late because you've now turned it
into a special case.

-- 
Mathematics is the supreme nostalgia of our time.
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: Linux 2.6.11.2

2005-03-09 Thread Bill Davidsen
Greg KH wrote:
And to further test this whole -stable system, I've released 2.6.11.2.
It contains one patch, which is already in the -bk tree, and came from
the security team (hence the lack of the longer review cycle).
It's available now in the normal kernel.org places:
kernel.org/pub/linux/kernel/v2.6/patch-2.6.11.2.gz
which is a patch against the 2.6.11.1 release.  If consensus arrives
that this patch should be against the 2.6.11 tree, it will be done that
way in the future.
I think you need both x.y.z=>x.y.z.N and x.y.z.N-1=>x.y.z.N patches. My 
systems which are following the -stable will just need the most recent, 
but doing x.y.z-1=>x.y.z.N gets really ugly for higher values of N.

It can be automated, it's just two (presumably tiny) patchsets per release.
--
   -bill davidsen ([EMAIL PROTECTED])
"The secret to procrastination is to put things off until the
 last possible moment - but no longer"  -me
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] make st seekable again

2005-03-09 Thread Alan Cox
On Maw, 2005-03-08 at 17:25, Linux Kernel Mailing List wrote:
> ChangeSet 1.2030, 2005/03/08 09:25:05-08:00, [EMAIL PROTECTED]
> 
>   [PATCH] make st seekable again
>   
>   Apparently `tar' errors out if it cannot perform lseek() against a 
> tape.  Work
>   around that in-kernel.

Unfortunately this isn't a good idea. Allowing tar to read the tape
position makes sense, allowing it to zero the position might but you
have to do major surgery on the driver first because

1.  It doesn't use ppos
2.  It doesn't do locking on the ppos at all

Also allowing apps to randomly seek and report "ok" when they are
backing up to tape and might really need to see the error is not what
I'd call stable, professional or quality code.

I oppose this change for 2.6.11.3, I think 2.6.12 needs to address the
rest of the mess in that code to make it work (or implement a 'read
only' llseek and
use ppos right)

And -ac won't carry this change.

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: Linux 2.4.30-pre3

2005-03-09 Thread Marcelo Tosatti
On Wed, Mar 09, 2005 at 08:40:00PM +0000, Alan Cox wrote:
> >   o Cset exclude: [EMAIL PROTECTED]|ChangeSet|20041125155150|65356
> >   o Allow lseek on SCSI tapes
> >   o Allow lseek on osst to keep tar --verify happy
> 
> This seems odd since the scsi tape drives don't support lseek and the
> driver changes to correctly block it were part of the security fixes for
> lseek mishandling in 2.6 ? 

Thing is we can't cope with "tar --verify" failing on tapes. Better approach 
to fix this is welcome.

> Does anyone have the straces of tar breaking and the versions ?

Here is the relevant information.


Date: Wed, 2 Mar 2005 23:17:19 +0200 (EET)
From: Kai Makisara <[EMAIL PROTECTED]>
X-X-Sender: [EMAIL PROTECTED]
To: Marcelo Tosatti <[EMAIL PROTECTED]>
Cc: Mark Yeatman <[EMAIL PROTECTED]>, linux-kernel@vger.kernel.org,
Andrew Morton <[EMAIL PROTECTED]>, Gene Heskett <[EMAIL PROTECTED]>
Subject: Re: Problems with SCSI tape rewind / verify on 2.4.29

> On Wed, Mar 02, 2005 at 11:15:42AM -0000, Mark Yeatman wrote:
> > Hi
> > 
> > Never had to log a bug before, hope this is correctly done.
> > 
> > Thanks
> > 
> > Mark
> > 
> > Detail
> > 
> > [1.] One line summary of the problem:
> > SCSI tape drive is refusing to rewind after backup to allow verify and
> > causing illegal seek error
> > 
> > [2.] Full description of the problem/report:
> > On backup the tape drive is reporting the following error and failing
> > it's backups.
> > 
> > tar: /dev/st0: Warning: Cannot seek: Illegal seek
> > 
> > I have traced this back to failing at an upgrade of the kernel to 2.4.29
> > on Feb 8th. The backups have not worked since. Replacement Drives have
> > been tried and cables to no avail. I noticed in the the changelog that a
> > patch by Solar Designer to the Scsi tape return code had been made. 

BTW, this "fix" by Solar Designer introduces a bug to 2.4.29: a tape 
driver is supposed to return ENOMEM in the case that was changed to return 
EIO ;-(

> 
> v2.6 also contains the same problem BTW.
> 
> Try this:
> 
> --- a/drivers/scsi/st.c.orig  2005-03-02 09:02:13.637158144 -0300
> +++ b/drivers/scsi/st.c   2005-03-02 09:02:20.208159200 -0300
> @@ -3778,7 +3778,6 @@
>   read:   st_read,
>   write:  st_write,
>   ioctl:  st_ioctl,
> - llseek: no_llseek,
>   open:   st_open,
>   flush:  st_flush,
>   release:st_release,

This change covers up the problem. The real bug is in tar. The following 
code from tar is supposed to reposition the tape to the beginning of 
the file just written:

#ifdef MTIOCTOP
  {
struct mtop operation;
int status;

operation.mt_op = MTBSF;
operation.mt_count = 1;
if (status = rmtioctl (archive, MTIOCTOP, (char *) &operation), status 
< 0)
  {
if (errno != EIO
|| (status = rmtioctl (archive, MTIOCTOP, (char *) 
&operation),
status < 0))
  {
#endif
if (rmtlseek (archive, (off_t) 0, SEEK_SET) != 0)
  {
/* Lseek failed.  Try a different method.  */
seek_warn (archive_name_array[0]);
return;
  }
#ifdef MTIOCTOP
  }
  }
  }
#endif


Here is output from strace showing what happens with 'tar -c -W' applied 
at the beginning of the tape (this is using kernel 2.6.11-rc4 but the same 
probably happens with 2.4.29):
...
ioctl(3, MGSL_IOCGPARAMS or MTIOCTOP or SNDCTL_MIDI_MPUMODE, 
0x7fffecd0) = -1 EIO (Input/output error)
ioctl(3, MGSL_IOCGPARAMS or MTIOCTOP or SNDCTL_MIDI_MPUMODE, 
0x7fffecd0) = -1 EIO (Input/output error)
lseek(3, 0, SEEK_SET)   = -1 ESPIPE (Illegal seek)

So, both tape positioning commands fail and the code falls back to lseek. 
Earlier it has returned success even though it has not done anything (this 
was on purpose because it is the way some other Unices behave and with 
reason). In that case this tar succeeded but it was pure luck. The first 
BSF did position the tape correctly although it did fail.

The 2.6 st driver does contain this near the beginning of st_open():

nonseekable_open(inode, filp);

This probably makes lseek fail. This code has been in st.c since 2.6.8.




-- 
Kai
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: PATCH: 2.6.11-ac2

2005-03-09 Thread Rafael J. Wysocki
On Wednesday, 9 of March 2005 20:31, Jason Lunz wrote:
> [EMAIL PROTECTED] said:
> > You know what would be really useful... if www.kernel.org listed the
> > "latest -ac" patch as something current instead of 2.6.10-ac12, which was
> > a great patch in its day, but hasn't been current for a while.
> >
> > In fairness, the -mm is out of date, too. Perhaps a bit of automation
> > would be appropriate here, so that no one would have to update this
> > manually.
> 
> I could have sworn it showed 2.6.11-ac1 for a while. Maybe the 2.6.11.2
> stuff broke it somehow?

Surely it did.

Greets,
Rafael


-- 
- Would you tell me, please, which way I ought to go from here?
- That depends a good deal on where you want to get to.
-- Lewis Carroll "Alice's Adventures in Wonderland"
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: current linus bk, error mounting root

2005-03-09 Thread Jeff Garzik
Jon Smirl wrote:
On Wed, 09 Mar 2005 12:16:44 -0500, Jeff Garzik <[EMAIL PROTECTED]> wrote:
Jon Smirl wrote:
Something in the last 24hrs in linus bk broke my ability to mount root:
Creating root device
Mounting root filesystem
mount: error 6 mounting ext3
mount: error 2 mounting none
Switching to new root
Switchroot: mount failed 22
umount /initrd/dev failed: 2
If I back off a day everything works again.
Root is on Intel ICH5 SATA drive.
dmesg output?
Can you verify that -bk4 works, and -bk5 breaks?

bk4 works. I don't have a serial port hooked up so there is no way to
get dmesg, but I don't see anything obvious on the screen scrolling
by.
I'll check bk5 next.
It would be much more convenient if the bkN releases were tagged in Linus bk.
Well, there are no changes in libata from bk4 to present.  The only 
thing I see in the -bk4-bk5 increment diff that's immediately noticeable 
is the barrier stuff.

Jeff
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: current linus bk, error mounting root

2005-03-09 Thread Jon Smirl
On Wed, 09 Mar 2005 15:31:10 -0500, Jeff Garzik <[EMAIL PROTECTED]> wrote:
> Well, there are no changes in libata from bk4 to present.  The only
> thing I see in the -bk4-bk5 increment diff that's immediately noticeable
> is the barrier stuff.

bk4 works
bk5 is broken

Where are these *.key files? Maybe I can do some more divide and
conquer in bitkeeper.

-- 
Jon Smirl
[EMAIL PROTECTED]
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH libata-2.6] AHCI: fix fatal error int handling

2005-03-09 Thread Brett Russ
I noticed that the AHCI CI (cmd issue) reg wasn't getting cleared
after error ints resulting in no further commands being successfully
issued to the port.  This patch fixes.  All that's really needed is
the 1's complement but I also removed the disabling/enabling of the
FIS_RX b/c this isn't spec'd as necessary when handling error ints.

Signed-off-by: Brett Russ <[EMAIL PROTECTED]>

= drivers/scsi/ahci.c 1.17 vs edited =
--- 1.17/drivers/scsi/ahci.c2005-02-24 14:52:41 -05:00
+++ edited/drivers/scsi/ahci.c  2005-03-09 15:30:06 -05:00
@@ -538,7 +538,7 @@
 
/* stop DMA */
tmp = readl(port_mmio + PORT_CMD);
-   tmp &= PORT_CMD_START | PORT_CMD_FIS_RX;
+   tmp &= ~PORT_CMD_START;
writel(tmp, port_mmio + PORT_CMD);
 
/* wait for engine to stop.  TODO: this could be
@@ -570,7 +570,7 @@
 
/* re-start DMA */
tmp = readl(port_mmio + PORT_CMD);
-   tmp |= PORT_CMD_START | PORT_CMD_FIS_RX;
+   tmp |= PORT_CMD_START;
writel(tmp, port_mmio + PORT_CMD);
readl(port_mmio + PORT_CMD); /* flush */
 
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: current linus bk, error mounting root

2005-03-09 Thread Jeff Garzik
Jon Smirl wrote:
On Wed, 09 Mar 2005 12:16:44 -0500, Jeff Garzik <[EMAIL PROTECTED]> wrote:
Jon Smirl wrote:
Something in the last 24hrs in linus bk broke my ability to mount root:
Creating root device
Mounting root filesystem
mount: error 6 mounting ext3
mount: error 2 mounting none
Switching to new root
Switchroot: mount failed 22
umount /initrd/dev failed: 2
If I back off a day everything works again.
Root is on Intel ICH5 SATA drive.
dmesg output?
Can you verify that -bk4 works, and -bk5 breaks?

bk4 works. I don't have a serial port hooked up so there is no way to
get dmesg, but I don't see anything obvious on the screen scrolling
by.
I'll check bk5 next.
It would be much more convenient if the bkN releases were tagged in Linus bk.
No need for tags, that's what the *.key file is for.
Jeff

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: VIA Rhine ethernet driver bug

2005-03-09 Thread jerome lacoste
On Sat, 15 Jan 2005 12:43:33 +0100, Udo van den Heuvel <[EMAIL PROTECTED]> 
wrote:
> Hello,
> 
> On my firewall (VIA EPIA CL-6000 with VIA Rhine network chips running FC3
> and custom kernels) I see messages like:
> 
> Jan 13 19:35:46 epia kernel: eth1: Oversized Ethernet frame spanned multiple
> buffers, entry 0x4 length 0 status 0600!

That might be interesting to someone:

My VIA EPIA based machine  was working well until some minutes ago. I
accidently removed the power supply and the machine rebooted. From
then on I didn't have network anymore. The ethernet card (VIA Rhine
II) (static, not dhcp) was not working and "Oversized "  messages
were printed on the console.

I've rebooted twice and the network didn't still come up. Pinging a
machine on my LAN and pinging the box back resulted in > 98% of the
packets lost.

So I stopped the machine, let it rest for a minute or so and booted
again. That solved the problem.

So if you see the same message as Udo, try to let your box rest.

Jerome
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: current linus bk, error mounting root

2005-03-09 Thread Steven Cole
Jon Smirl wrote:
On Wed, 09 Mar 2005 12:16:44 -0500, Jeff Garzik <[EMAIL PROTECTED]> wrote:
Jon Smirl wrote:
Something in the last 24hrs in linus bk broke my ability to mount root:
Creating root device
Mounting root filesystem
mount: error 6 mounting ext3
mount: error 2 mounting none
Switching to new root
Switchroot: mount failed 22
umount /initrd/dev failed: 2
If I back off a day everything works again.
Root is on Intel ICH5 SATA drive.
dmesg output?
Can you verify that -bk4 works, and -bk5 breaks?

bk4 works. I don't have a serial port hooked up so there is no way to
get dmesg, but I don't see anything obvious on the screen scrolling
by.
I'll check bk5 next.
It would be much more convenient if the bkN releases were tagged in Linus bk.
Yes, and name them -preN instead. ;)
I had a slightly different problem mounting root with an earlier -mm, which
was fixed by setting CONFIG_BASE_FULL=y.  I saw that option enter the
Linus tree recently, so that might be something you could try.
Steven
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [Linux-fbdev-devel] [announce 0/7] fbsplash - The Framebuffer Splash

2005-03-09 Thread James Simmons

> On Mer, 2005-03-09 at 09:34, Geert Uytterhoeven wrote:
> > On Wed, 9 Mar 2005, Jon Smirl wrote:
> > > Another idea would be to build a console is user space. Think of it as
> > > a full screen xterm. A user space console has access to full hardware
> > > acceleration using the DRM interface.
> > 
> > Yep. And that's what Alan Cox wanted to do. Console in userspace, eye candy
> > (using Porter-Duff blending) as much as you want, full UTF-8 support, ...
> 
> Jon is the origin of those ideas not me, I'm merely supporting them
> providing there is still a basic kernel side console.

Thank you. We need some kind of basic console in the kernel. I'm not the 
biggest fan of eye candy. So moving the console to userspace for eye candy 
is a dumb idea.

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: 2.6.11 on AMD64 traps

2005-03-09 Thread Patrick McHardy
Michal Vanco wrote:
On Wednesday 09 March 2005 20:45, Patrick McHardy wrote:
This patch should fix it. The crash is caused by stale pointers,
the pointers in fib_iter_state are not reloaded after seq->stop()
followed by seq->start(pos > 0).
Well. Trap vanished after applying this patch, but another weird thing 
occurs:
# ip route show | wc -l
156033
# date; time ip route show > /dev/null; date; time netstat -rn > /dev/null
Wed Mar  9 22:15:21 CET 2005
real    0m0.656s
user    0m0.415s
sys     0m0.242s
Wed Mar  9 22:15:22 CET 2005
real    6m41.472s
user    0m1.261s
sys     6m40.143s
Yes, I know it is totally inefficient. Just use ip route, which doesn't
suffer from this problem.
Regards
Patrick
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: Linux 2.4.30-pre3

2005-03-09 Thread Alan Cox
>   o Cset exclude: [EMAIL PROTECTED]|ChangeSet|20041125155150|65356
>   o Allow lseek on SCSI tapes
>   o Allow lseek on osst to keep tar --verify happy

This seems odd since the scsi tape drives don't support lseek and the
driver changes to correctly block it were part of the security fixes for
lseek mishandling in 2.6 ?

Does anyone have the straces of tar breaking and the versions ?

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[patch] consolidate interrupt-related constants

2005-03-09 Thread Stas Sergeev
Hello.
Attached patch moves the three
interrupt-related constants, namely
SA_PROBE, SA_SAMPLE_RANDOM and
SA_SHIRQ, from the arch-specific
headers to the generic header
linux/signal.h. Now, as the interrupt
handling code was recently consolidated,
it seems the related flags should
be consolidated as well.
This is mainly a cleanup, but
it also provides a common place for
adding new arch-independent flags
for interrupt handling.
For my drivers I used to add such
flags to an i386-specific header
since the drivers were intended to work
only on that hardware. But now, as the
interrupt handling code in the kernel was
consolidated, adding such flags to the
arch-specific header breaks compilation
for other archs, so I thought having a
common place for those arch-independent
flags would be good.
The problem with that patch is that
I can't test it on something other
than x86. But it may work as expected.
Signed-off-by: Stas Sergeev <[EMAIL PROTECTED]>
diff -ur linux-2.6.11/include/asm-alpha/signal.h linux-2.6.11-hdr/include/asm-alpha/signal.h
--- linux-2.6.11/include/asm-alpha/signal.h	2005-01-17 09:36:46.0 +0300
+++ linux-2.6.11-hdr/include/asm-alpha/signal.h	2005-03-09 11:17:45.0 +0300
@@ -109,20 +109,6 @@
 #define MINSIGSTKSZ	4096
 #define SIGSTKSZ	16384
 
-
-#ifdef __KERNEL__
-/*
- * These values of sa_flags are used only by the kernel as part of the
- * irq handling routines.
- *
- * SA_INTERRUPT is also used by the irq handling routines.
- * SA_SHIRQ is for shared interrupt support on PCI and EISA.
- */
-#define SA_PROBE		SA_ONESHOT
-#define SA_SAMPLE_RANDOM	SA_RESTART
-#define SA_SHIRQ		0x4000
-#endif
-
 #define SIG_BLOCK  1	/* for blocking signals */
 #define SIG_UNBLOCK  2	/* for unblocking signals */
 #define SIG_SETMASK  3	/* for setting the signal mask */
diff -ur linux-2.6.11/include/asm-arm/signal.h linux-2.6.11-hdr/include/asm-arm/signal.h
--- linux-2.6.11/include/asm-arm/signal.h	2005-01-17 09:36:41.0 +0300
+++ linux-2.6.11-hdr/include/asm-arm/signal.h	2005-03-09 11:33:19.0 +0300
@@ -114,18 +114,7 @@
 #define SIGSTKSZ	8192
 
 #ifdef __KERNEL__
-
-/*
- * These values of sa_flags are used only by the kernel as part of the
- * irq handling routines.
- *
- * SA_INTERRUPT is also used by the irq handling routines.
- * SA_SHIRQ is for shared interrupt support on PCI and EISA.
- */
-#define SA_PROBE		0x8000
-#define SA_SAMPLE_RANDOM	0x1000
 #define SA_IRQNOMASK		0x0800
-#define SA_SHIRQ		0x0400
 #endif
 
 #define SIG_BLOCK  0	/* for blocking signals */
diff -ur linux-2.6.11/include/asm-arm26/signal.h linux-2.6.11-hdr/include/asm-arm26/signal.h
--- linux-2.6.11/include/asm-arm26/signal.h	2005-01-17 09:36:47.0 +0300
+++ linux-2.6.11-hdr/include/asm-arm26/signal.h	2005-03-09 11:39:28.0 +0300
@@ -114,18 +114,7 @@
 #define SIGSTKSZ	8192
 
 #ifdef __KERNEL__
-
-/*
- * These values of sa_flags are used only by the kernel as part of the
- * irq handling routines.
- *
- * SA_INTERRUPT is also used by the irq handling routines.
- * SA_SHIRQ is for shared interrupt support on PCI and EISA.
- */
-#define SA_PROBE		0x8000
-#define SA_SAMPLE_RANDOM	0x1000
 #define SA_IRQNOMASK		0x0800
-#define SA_SHIRQ		0x0400
 #endif
 
 #define SIG_BLOCK  0	/* for blocking signals */
diff -ur linux-2.6.11/include/asm-cris/signal.h linux-2.6.11-hdr/include/asm-cris/signal.h
--- linux-2.6.11/include/asm-cris/signal.h	2005-01-17 09:36:38.0 +0300
+++ linux-2.6.11-hdr/include/asm-cris/signal.h	2005-03-09 11:34:52.0 +0300
@@ -108,20 +108,6 @@
 #define MINSIGSTKSZ	2048
 #define SIGSTKSZ	8192
 
-#ifdef __KERNEL__
-
-/*
- * These values of sa_flags are used only by the kernel as part of the
- * irq handling routines.
- *
- * SA_INTERRUPT is also used by the irq handling routines.
- * SA_SHIRQ is for shared interrupt support
- */
-#define SA_PROBE		SA_ONESHOT
-#define SA_SAMPLE_RANDOM	SA_RESTART
-#define SA_SHIRQ		0x0400
-#endif
-
 #define SIG_BLOCK  0	/* for blocking signals */
 #define SIG_UNBLOCK1	/* for unblocking signals */
 #define SIG_SETMASK2	/* for setting the signal mask */
diff -ur linux-2.6.11/include/asm-frv/signal.h linux-2.6.11-hdr/include/asm-frv/signal.h
--- linux-2.6.11/include/asm-frv/signal.h	2005-01-17 09:42:36.0 +0300
+++ linux-2.6.11-hdr/include/asm-frv/signal.h	2005-03-09 11:16:45.0 +0300
@@ -107,20 +107,6 @@
 #define MINSIGSTKSZ	2048
 #define SIGSTKSZ	8192
 
-#ifdef __KERNEL__
-
-/*
- * These values of sa_flags are used only by the kernel as part of the
- * irq handling routines.
- *
- * SA_INTERRUPT is also used by the irq handling routines.
- * SA_SHIRQ is for shared interrupt support on PCI and EISA.
- */
-#define SA_PROBE		SA_ONESHOT
-#define SA_SAMPLE_RANDOM	SA_RESTART
-#define SA_SHIRQ		0x0400
-#endif
-
 #define SIG_BLOCK  0	/* for blocking signals */
 #define SIG_UNBLOCK1	/* for unblocking signals */
 

Re: 2.6.11 on AMD64 traps

2005-03-09 Thread Michal Vanco
On Wednesday 09 March 2005 20:45, Patrick McHardy wrote:
> > Michal Vanco wrote:
> >> I see this problem running 2.6.11 on dual AMD64:
> >>
> >> Running quagga routing daemon (ospf+bgp) and issuing "netstat -rn |wc
> >> -l" command
> >> while quagga tries to load more than 154000 routes from its bgp
> >> neighbours causes this trap:
>
> This patch should fix it. The crash is caused by stale pointers,
> the pointers in fib_iter_state are not reloaded after seq->stop()
> followed by seq->start(pos > 0).

Well. Trap vanished after applying this patch, but another weird thing occurs:

# ip route show | wc -l
156033
# date; time ip route show > /dev/null; date; time netstat -rn > /dev/null
Wed Mar  9 22:15:21 CET 2005

real    0m0.656s
user    0m0.415s
sys     0m0.242s
Wed Mar  9 22:15:22 CET 2005

real    6m41.472s
user    0m1.261s
sys     6m40.143s

regards,
-- 
Ing. Michal Vančo
Network Engineer
SATRO s.r.o.
e-mail: [EMAIL PROTECTED]


pgpv1hSasHahf.pgp
Description: PGP signature


[BK PATCHES] 2.6.x net driver oops fixes

2005-03-09 Thread Jeff Garzik
 Please do a

bk pull bk://gkernel.bkbits.net/net-drivers-2.6

This will update the following files:

 drivers/net/sis900.c|   41 +
 drivers/net/via-rhine.c |3 +++
 2 files changed, 24 insertions(+), 20 deletions(-)

through these ChangeSets:

Herbert Xu:
  o sis900 kernel oops fix

Olof Johansson:
  o [VIA RHINE] older chips oops on shutdown

diff -Nru a/drivers/net/sis900.c b/drivers/net/sis900.c
--- a/drivers/net/sis900.c  2005-03-09 15:16:53 -05:00
+++ b/drivers/net/sis900.c  2005-03-09 15:16:53 -05:00
@@ -245,7 +245,7 @@
signature = (u16) read_eeprom(ioaddr, EEPROMSignature);
if (signature == 0x || signature == 0x) {
printk (KERN_WARNING "%s: Error EERPOM read %x\n", 
-   net_dev->name, signature);
+   pci_name(pci_dev), signature);
return 0;
}
 
@@ -277,7 +277,8 @@
if (!isa_bridge)
isa_bridge = pci_get_device(PCI_VENDOR_ID_SI, 0x0018, 
isa_bridge);
if (!isa_bridge) {
-   printk(KERN_WARNING "%s: Can not find ISA bridge\n", 
net_dev->name);
+   printk(KERN_WARNING "%s: Can not find ISA bridge\n",
+  pci_name(pci_dev));
return 0;
}
pci_read_config_byte(isa_bridge, 0x48, );
@@ -396,6 +397,7 @@
long ioaddr;
int i, ret;
char *card_name = card_names[pci_id->driver_data];
+   const char *dev_name = pci_name(pci_dev);
 
 /* when built into the kernel, we only print version if device is found */
 #ifndef MODULE
@@ -473,17 +475,13 @@
sis_priv->msg_enable = sis900_debug;
else
sis_priv->msg_enable = SIS900_DEF_MSG;
-
-   ret = register_netdev(net_dev);
-   if (ret)
-   goto err_unmap_rx;

/* Get Mac address according to the chip revision */
pci_read_config_byte(pci_dev, PCI_CLASS_REVISION, 
&(sis_priv->chipset_rev));
if(netif_msg_probe(sis_priv))
printk(KERN_DEBUG "%s: detected revision %2.2x, "
"trying to get MAC address...\n",
-   net_dev->name, sis_priv->chipset_rev);
+   dev_name, sis_priv->chipset_rev);

ret = 0;
if (sis_priv->chipset_rev == SIS630E_900_REV)
@@ -496,9 +494,9 @@
ret = sis900_get_mac_addr(pci_dev, net_dev);
 
if (ret == 0) {
-   printk(KERN_WARNING "%s: Cannot read MAC address.\n", 
net_dev->name);
+   printk(KERN_WARNING "%s: Cannot read MAC address.\n", dev_name);
ret = -ENODEV;
-   goto err_out_unregister;
+   goto err_unmap_rx;
}

/* 630ET : set the mii access mode as software-mode */
@@ -507,9 +505,10 @@
 
/* probe for mii transceiver */
if (sis900_mii_probe(net_dev) == 0) {
-   printk(KERN_WARNING "%s: Error probing MII device.\n", 
net_dev->name);
+   printk(KERN_WARNING "%s: Error probing MII device.\n",
+  dev_name);
ret = -ENODEV;
-   goto err_out_unregister;
+   goto err_unmap_rx;
}
 
/* save our host bridge revision */
@@ -519,6 +518,10 @@
pci_dev_put(dev);
}
 
+   ret = register_netdev(net_dev);
+   if (ret)
+   goto err_unmap_rx;
+
/* print some information about our NIC */
printk(KERN_INFO "%s: %s at %#lx, IRQ %d, ", net_dev->name,
   card_name, ioaddr, net_dev->irq);
@@ -528,8 +531,6 @@
 
return 0;
 
- err_out_unregister:
-   unregister_netdev(net_dev);
  err_unmap_rx:
pci_free_consistent(pci_dev, RX_TOTAL_SIZE, sis_priv->rx_ring,
sis_priv->rx_ring_dma);
@@ -556,6 +557,7 @@
 static int __init sis900_mii_probe(struct net_device * net_dev)
 {
struct sis900_private * sis_priv = net_dev->priv;
+   const char *dev_name = pci_name(sis_priv->pci_dev);
u16 poll_bit = MII_STAT_LINK, status = 0;
unsigned long timeout = jiffies + 5 * HZ;
int phy_addr;
@@ -576,7 +578,7 @@
if (netif_msg_probe(sis_priv))
printk(KERN_DEBUG "%s: MII at address %d"
" not accessible\n",
-   net_dev->name, phy_addr);
+   dev_name, phy_addr);
continue;
}

@@ -609,7 +611,7 @@
(mii_status & (MII_STAT_CAN_TX_FDX 
| MII_STAT_CAN_TX)) ? LAN : HOME;
printk(KERN_INFO "%s: %s transceiver found "
"at address %d.\n",
-  

Page Fault Scalability patch V19 [2/4]: Abstract mm_struct counter operations

2005-03-09 Thread Christoph Lameter
This patch extracts all the operations on rss into definitions in 
include/linux/sched.h. All rss operations are performed through
the following three macros:

get_mm_counter(mm, member)  -> Obtain the value of a counter
set_mm_counter(mm, member, value)   -> Set the value of a counter
update_mm_counter(mm, member, value)-> Add a value to a counter

The simple definitions provided in this patch result in no change to
the generated code. 

With this patch it becomes easier to add new counters and it is possible
to redefine the method of counter handling (f.e. the page fault scalability
patches may want to use atomic operations or split rss).

Signed-off-by: Christoph Lameter <[EMAIL PROTECTED]>

Index: linux-2.6.10/include/linux/sched.h
===
--- linux-2.6.10.orig/include/linux/sched.h 2005-02-24 19:41:49.0 
-0800
+++ linux-2.6.10/include/linux/sched.h  2005-02-24 19:42:17.0 -0800
@@ -203,6 +203,10 @@ arch_get_unmapped_area_topdown(struct fi
 extern void arch_unmap_area(struct vm_area_struct *area);
 extern void arch_unmap_area_topdown(struct vm_area_struct *area);
 
+#define set_mm_counter(mm, member, value) (mm)->member = (value)
+#define get_mm_counter(mm, member) ((mm)->member)
+#define update_mm_counter(mm, member, value) (mm)->member += (value)
+#define MM_COUNTER_T unsigned long
 
 struct mm_struct {
struct vm_area_struct * mmap;   /* list of VMAs */
@@ -219,7 +223,7 @@ struct mm_struct {
atomic_t mm_count;  /* How many references to 
"struct mm_struct" (users count as 1) */
int map_count;  /* number of VMAs */
struct rw_semaphore mmap_sem;
-   spinlock_t page_table_lock; /* Protects page tables, 
mm->rss, mm->anon_rss */
+   spinlock_t page_table_lock; /* Protects page tables and 
some counters */
 
struct list_head mmlist;/* List of maybe swapped mm's.  
These are globally strung
 * together off init_mm.mmlist, 
and are protected
@@ -229,9 +233,13 @@ struct mm_struct {
unsigned long start_code, end_code, start_data, end_data;
unsigned long start_brk, brk, start_stack;
unsigned long arg_start, arg_end, env_start, env_end;
-   unsigned long rss, anon_rss, total_vm, locked_vm, shared_vm;
+   unsigned long total_vm, locked_vm, shared_vm;
unsigned long exec_vm, stack_vm, reserved_vm, def_flags, nr_ptes;
 
+   /* Special counters protected by the page_table_lock */
+   MM_COUNTER_T rss;
+   MM_COUNTER_T anon_rss;
+
unsigned long saved_auxv[42]; /* for /proc/PID/auxv */
 
unsigned dumpable:1;
Index: linux-2.6.10/mm/memory.c
===
--- linux-2.6.10.orig/mm/memory.c   2005-02-24 19:42:12.0 -0800
+++ linux-2.6.10/mm/memory.c2005-02-24 19:42:17.0 -0800
@@ -313,9 +313,9 @@ copy_one_pte(struct mm_struct *dst_mm,  
pte = pte_mkclean(pte);
pte = pte_mkold(pte);
get_page(page);
-   dst_mm->rss++;
+   update_mm_counter(dst_mm, rss, 1);
if (PageAnon(page))
-   dst_mm->anon_rss++;
+   update_mm_counter(dst_mm, anon_rss, 1);
set_pte(dst_pte, pte);
page_dup_rmap(page);
 }
@@ -517,7 +517,7 @@ static void zap_pte_range(struct mmu_gat
if (pte_dirty(pte))
set_page_dirty(page);
if (PageAnon(page))
-   tlb->mm->anon_rss--;
+   update_mm_counter(tlb->mm, anon_rss, -1);
else if (pte_young(pte))
mark_page_accessed(page);
tlb->freed++;
@@ -1340,13 +1340,14 @@ static int do_wp_page(struct mm_struct *
spin_lock(>page_table_lock);
page_table = pte_offset_map(pmd, address);
if (likely(pte_same(*page_table, pte))) {
-   if (PageAnon(old_page))
-   mm->anon_rss--;
+   if (PageAnon(old_page)) 
+   update_mm_counter(mm, anon_rss, -1);
if (PageReserved(old_page)) {
-   ++mm->rss;
+   update_mm_counter(mm, rss, 1);
acct_update_integrals();
update_mem_hiwater();
} else
+
page_remove_rmap(old_page);
break_cow(vma, new_page, address, page_table);
lru_cache_add_active(new_page);
@@ -1750,7 +1751,7 @@ static int do_swap_page(struct mm_struct
if (vm_swap_full())
remove_exclusive_swap_page(page);
 
-   mm->rss++;
+   update_mm_counter(mm, rss, 1);
acct_update_integrals();
update_mem_hiwater();
 
@@ 

Re: RFD: Kernel release numbering

2005-03-09 Thread szonyi calin

--- Lee Revell <[EMAIL PROTECTED]> wrote:
> On Wed, 2005-03-09 at 00:25 +0100, szonyi calin wrote:
> >  --- Dave Jones <[EMAIL PROTECTED]> a écrit : 
> > Taking into account that nobody responded on lkml nor
> > on alsa (the message was awaiting moderator approval 
> > on alsa-devel) i don't think i will send more bug reports 
> > to alsa. 
> 
> How long ago was this?  alsa-devel has been accepting messages
> from
> non-subscribers for at least 6 months.
> 

It's from 10th of september 2004
I managed to find it on the internet:
http://www.ussg.iu.edu/hypermail/linux/kernel/0409.1/0996.html

The same problem appears in 2.6.11
I'll repost the bug report asap

> Lee
> 
> 
> 

--
A mouse is a device used to point at 
the xterm you want to type in.
Kim Alm on a.s.r.






Découvrez le nouveau Yahoo! Mail : 250 Mo d'espace de stockage pour vos mails ! 
Créez votre Yahoo! Mail sur http://fr.mail.yahoo.com/
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Page Fault Scalability patch V19 [3/4]: Drop use of page_table_lock in handle_mm_fault

2005-03-09 Thread Christoph Lameter
The page fault handler attempts to use the page_table_lock only for short
time periods. It repeatedly drops and reacquires the lock. When the lock
is reacquired, checks are made if the underlying pte has changed before
replacing the pte value. These locations are a good fit for the use of
ptep_cmpxchg.

The following patch allows to remove the first time the page_table_lock is
acquired and uses atomic operations on the page table instead. A section
using atomic pte operations is begun with

page_table_atomic_start(struct mm_struct *)

and ends with

page_table_atomic_stop(struct mm_struct *)

Both of these become spin_lock(page_table_lock) and
spin_unlock(page_table_lock) if atomic page table operations are not 
configured (CONFIG_ATOMIC_TABLE_OPS undefined).

The atomic operations with pte_xchg and pte_cmpxchg only work for the lowest
layer of the page table. Higher layers may also be populated in an atomic
way by defining pmd_test_and_populate() etc. The generic versions of these
functions fall back to the page_table_lock (populating higher level page
table entries is rare and therefore this is not likely to be performance
critical). For ia64 the definition of higher level atomic operations is
included.

This patch depends on the pte_cmpxchg patch to be applied first and will
only remove the first use of the page_table_lock in the page fault handler.
This will allow the following page table operations without acquiring
the page_table_lock:

1. Updating of access bits (handle_mm_faults)
2. Anonymous read faults (do_anonymous_page)

The page_table_lock is still acquired for creating a new pte for an anonymous
write fault and therefore the problems with rss that were addressed by splitting
rss into the task structure do not yet occur.

The patch also adds some diagnostic features by counting the number of cmpxchg
failures (useful for verification if this patch works right) and the number of 
patches
received that led to no change in the page table. Statistics may be viewed via
/proc/meminfo

Signed-off-by: Christoph Lameter <[EMAIL PROTECTED]>

Index: linux-2.6.11/mm/memory.c
===
--- linux-2.6.11.orig/mm/memory.c   2005-03-04 08:25:22.0 -0800
+++ linux-2.6.11/mm/memory.c2005-03-04 12:10:18.0 -0800
@@ -36,6 +36,8 @@
  * ([EMAIL PROTECTED])
  *
  * Aug/Sep 2004 Changed to four level page tables (Andi Kleen)
+ * Jan 2005Scalability improvement by reducing the use and the length of 
time
+ * the page table lock is held (Christoph Lameter)
  */
 
 #include 
@@ -1687,8 +1689,7 @@ void swapin_readahead(swp_entry_t entry,
 }
 
 /*
- * We hold the mm semaphore and the page_table_lock on entry and
- * should release the pagetable lock on exit..
+ * We hold the mm semaphore and have started atomic pte operations
  */
 static int do_swap_page(struct mm_struct * mm,
struct vm_area_struct * vma, unsigned long address,
@@ -1700,15 +1701,14 @@ static int do_swap_page(struct mm_struct
int ret = VM_FAULT_MINOR;
 
pte_unmap(page_table);
-   spin_unlock(>page_table_lock);
+   page_table_atomic_stop(mm);
page = lookup_swap_cache(entry);
if (!page) {
swapin_readahead(entry, address, vma);
page = read_swap_cache_async(entry, vma, address);
if (!page) {
/*
-* Back out if somebody else faulted in this pte while
-* we released the page table lock.
+* Back out if somebody else faulted in this pte
 */
spin_lock(>page_table_lock);
page_table = pte_offset_map(pmd, address);
@@ -1731,8 +1731,7 @@ static int do_swap_page(struct mm_struct
lock_page(page);
 
/*
-* Back out if somebody else faulted in this pte while we
-* released the page table lock.
+* Back out if somebody else faulted in this pte
 */
spin_lock(>page_table_lock);
page_table = pte_offset_map(pmd, address);
@@ -1782,63 +1781,76 @@ out:
 }
 
 /*
- * We are called with the MM semaphore and page_table_lock
- * spinlock held to protect against concurrent faults in
- * multithreaded programs. 
+ * We are called with the MM semaphore held and atomic pte operations started.
  */
 static int
 do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
pte_t *page_table, pmd_t *pmd, int write_access,
-   unsigned long addr)
+   unsigned long addr, pte_t orig_entry)
 {
pte_t entry;
-   struct page * page = ZERO_PAGE(addr);
+   struct page * page;
 
-   /* Read-only mapping of ZERO_PAGE. */
-   entry = pte_wrprotect(mk_pte(ZERO_PAGE(addr), vma->vm_page_prot));
+   if (unlikely(!write_access)) {
 
-   /* ..except if it's a write access */
-   if 

Page Fault Scalability patch V19 [1/4]: pte_cmpxchg and CONFIG_ATOMIC_TABLE_OPS

2005-03-09 Thread Christoph Lameter
The current way of updating ptes in the Linux vm includes first clearing
a pte before setting it to another value. The clearing is performed while
holding the page_table_lock to ensure that the entry will not be modified
by the CPU directly (clearing the pte clears the present bit),
by an arch specific interrupt handler or another page fault handler
running on another CPU. This approach is necessary for some
architectures that cannot perform atomic updates of page table entries.

If a page table entry is cleared then a second CPU may generate a page fault
for that entry. The fault handler on the second CPU will then attempt to
acquire the page_table_lock and wait until the first CPU has completed
updating the page table entry. The fault handler on the second CPU will then
discover that everything is ok and simply do nothing (apart from incrementing
the counters for a minor fault and marking the page again as accessed).

However, most architectures actually support atomic operations on page
table entries. The use of atomic operations on page table entries would
allow the update of a page table entry in a single atomic operation instead
of writing to the page table entry twice. There would also be no danger of
generating a spurious page fault on other CPUs.

The following patch introduces two new atomic operations ptep_xchg and
ptep_cmpxchg that may be provided by an architecture. The fallback in
include/asm-generic/pgtable.h is to simulate both operations through the
existing ptep_get_and_clear function. So there is essentially no change if
atomic operations on ptes have not been defined. Architectures that do
not support atomic operations on ptes may continue to use the clearing of
a pte for locking type purposes.

Atomic operations may be enabled in the kernel configuration on
i386, ia64 and x86_64 if a suitable CPU is configured in SMP mode.
Generic atomic definitions for ptep_xchg and ptep_cmpxchg
have been provided based on the existing xchg() and cmpxchg() functions
that already work atomically on many platforms. It is very
easy to implement this for any architecture by adding the appropriate
definitions to arch/xx/Kconfig.

The provided generic atomic functions may be overridden as usual by defining
the appropriate __HAVE_ARCH_xxx constant and providing an implementation.

My aim to reduce the use of the page_table_lock in the page fault handler
relies on a pte never being clear if the pte is in use even when the
page_table_lock is not held. Clearing a pte before setting it to another
value could result in a situation in which a fault generated by
another cpu could install a pte which is then immediately overwritten by
the first CPU setting the pte to a valid value again. This patch is
important for future work on reducing the use of spinlocks in the vm. 

Signed-off-by: Christoph Lameter <[EMAIL PROTECTED]>

Index: linux-2.6.10/mm/rmap.c
===
--- linux-2.6.10.orig/mm/rmap.c 2005-02-24 19:41:50.0 -0800
+++ linux-2.6.10/mm/rmap.c  2005-02-24 19:42:12.0 -0800
@@ -575,11 +575,6 @@ static int try_to_unmap_one(struct page 
 
/* Nuke the page table entry. */
flush_cache_page(vma, address);
-   pteval = ptep_clear_flush(vma, address, pte);
-
-   /* Move the dirty bit to the physical page now the pte is gone. */
-   if (pte_dirty(pteval))
-   set_page_dirty(page);
 
if (PageAnon(page)) {
swp_entry_t entry = { .val = page->private };
@@ -594,11 +589,15 @@ static int try_to_unmap_one(struct page 
list_add(>mmlist, _mm.mmlist);
spin_unlock(_lock);
}
-   set_pte(pte, swp_entry_to_pte(entry));
+   pteval = ptep_xchg_flush(vma, address, pte, 
swp_entry_to_pte(entry));
BUG_ON(pte_file(*pte));
mm->anon_rss--;
-   }
+   } else
+   pteval = ptep_clear_flush(vma, address, pte);
 
+   /* Move the dirty bit to the physical page now that the pte is gone. */
+   if (pte_dirty(pteval))
+   set_page_dirty(page);
mm->rss--;
acct_update_integrals();
page_remove_rmap(page);
@@ -691,15 +690,15 @@ static void try_to_unmap_cluster(unsigne
if (ptep_clear_flush_young(vma, address, pte))
continue;
 
-   /* Nuke the page table entry. */
flush_cache_page(vma, address);
-   pteval = ptep_clear_flush(vma, address, pte);
 
/* If nonlinear, store the file page offset in the pte. */
if (page->index != linear_page_index(vma, address))
-   set_pte(pte, pgoff_to_pte(page->index));
+   pteval = ptep_xchg_flush(vma, address, pte, 
pgoff_to_pte(page->index));
+   else
+   pteval = ptep_clear_flush(vma, address, pte);
 
-   /* Move 

Page Fault Scalability patch V19 [4/4]: Drop use of page_table_lock in do_anonymous_page

2005-03-09 Thread Christoph Lameter
Do not use the page_table_lock in do_anonymous_page. This will significantly
increase the parallelism in the page fault handler in SMP systems. The patch
also modifies the definitions of _mm_counter functions so that rss and anon_rss
become atomic.

Signed-off-by: Christoph Lameter <[EMAIL PROTECTED]>

Index: linux-2.6.11/mm/memory.c
===
--- linux-2.6.11.orig/mm/memory.c   2005-03-09 10:43:28.0 -0800
+++ linux-2.6.11/mm/memory.c2005-03-09 10:43:29.0 -0800
@@ -1825,12 +1825,12 @@ do_anonymous_page(struct mm_struct *mm, 
 vma->vm_page_prot)),
  vma);
 
-   spin_lock(>page_table_lock);
+   page_table_atomic_start(mm);

if (!ptep_cmpxchg(page_table, orig_entry, entry)) {
pte_unmap(page_table);
page_cache_release(page);
-   spin_unlock(>page_table_lock);
+   page_table_atomic_stop(mm);
inc_page_state(cmpxchg_fail_anon_write);
return VM_FAULT_MINOR;
}
@@ -1848,7 +1848,7 @@ do_anonymous_page(struct mm_struct *mm, 
SetPageReferenced(page);
update_mmu_cache(vma, addr, entry); 
pte_unmap(page_table);
-   spin_unlock(>page_table_lock);
+   page_table_atomic_stop(mm);
 
return VM_FAULT_MINOR;
 }
Index: linux-2.6.11/include/linux/sched.h
===
--- linux-2.6.11.orig/include/linux/sched.h 2005-03-09 10:43:26.0 
-0800
+++ linux-2.6.11/include/linux/sched.h  2005-03-09 10:43:29.0 -0800
@@ -203,10 +203,26 @@ arch_get_unmapped_area_topdown(struct fi
 extern void arch_unmap_area(struct vm_area_struct *area);
 extern void arch_unmap_area_topdown(struct vm_area_struct *area);
 
+#ifdef CONFIG_ATOMIC_TABLE_OPS
+/*
+ * Atomic page table operations require that the counters are also
+ * incremented atomically
+*/
+#define set_mm_counter(mm, member, value) atomic_set(&(mm)->member, value)
+#define get_mm_counter(mm, member) ((unsigned long)atomic_read(&(mm)->member))
+#define update_mm_counter(mm, member, value) atomic_add(value, &(mm)->member)
+#define MM_COUNTER_T atomic_t
+
+#else
+/*
+ * No atomic page table operations. Counters are protected by
+ * the page table lock 
+ */
 #define set_mm_counter(mm, member, value) (mm)->member = (value)
 #define get_mm_counter(mm, member) ((mm)->member)
 #define update_mm_counter(mm, member, value) (mm)->member += (value)
 #define MM_COUNTER_T unsigned long
+#endif
 
 struct mm_struct {
struct vm_area_struct * mmap;   /* list of VMAs */
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] 2.6.10 - direct-io async short read bug

2005-03-09 Thread Andrew Morton
Suparna Bhattacharya <[EMAIL PROTECTED]> wrote:
>
>  >Solaris, which does forcedirectio as a mount option, actually
>  > will do buffered I/O on the trailing part.  Consider it like a bounce
>  > buffer.  That way they don't DMA the trailing data and succeed the I/O.
>  > The I/O returns actual bytes till EOF, just like read(2) is supposed to.
>  >Either this or a fully DMA'd number 4 is really what we should
>  > do.  If security can only be solved via a bounce buffer, who cares?  If
>  > the user created themselves a non-aligned file to open O_DIRECT, that's
>  > their problem if the last part-sector is negligibly slower.
> 
>  If writes/truncates take care of zeroing out the rest of the sector
>  on disk, might we still be OK without having to do the bounce buffer
>  thing ?

We can probably rely on the rest of the sector outside i_size being zeroed
anyway.  Because if it contains non-zero gunk then the fs already has a
problem, and the user can get at that gunk with an expanding truncate and
mmap() anyway.


-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC] -stable, how it's going to work.

2005-03-09 Thread Greg KH
On Wed, Mar 09, 2005 at 08:39:36PM +0100, Andi Kleen wrote:
> On Wed, Mar 09, 2005 at 10:34:08AM -0800, Greg KH wrote:
> > On Wed, Mar 09, 2005 at 10:56:33AM +0100, Andi Kleen wrote:
> > > Greg KH <[EMAIL PROTECTED]> writes:
> > > >
> > > > Rules on what kind of patches are accepted, and what ones are not, into
> > > > the "-stable" tree:
> > > >  - It must be obviously correct and tested.
> > > > >  - It can not be bigger than 100 lines, with context.
> > > 
> > > This rule seems silly. What happens when a security fix needs 150 lines? 
> > 
> > Then we bend the rules and accept it :)
> > 
> > We'll take these as a case-by-case basis...
> > 
> > > >  - Security patches will be accepted into the -stable tree directly from
> > > >the security kernel team, and not go through the normal review cycle.
> > > >Contact the kernel security team for more details on this procedure.
> > > 
> > > This also sounds like a bad rule. How come the security team has more
> > > competence to review patches than the subsystem maintainers?  I can
> > > see the point of overruling maintainers on security issues when they
> > > > are not responsive, but if they are I think they should be still the
> > > main point of contact.
> > 
> > Security fixes go from the security team to Linus's tree directly, and
> > usually the subsystem maintainer has already been notified and has
> > > reviewed it.  At that point in time, they are public and accepted into
> 
> What guarantees that?

The kernel security team's procedures.

> Basically what I would like to avoid is that the security team
> merges something through the backdoor that the maintainer considers crap.
> 
> If anything you should have a rule like
> 
> "Send to maintainer. If he doesn't ACK in 24h send it directly"
> 
> 
> > mainline, and need to be made available to the -stable users as soon as
> > possible.
> > 
> > That is why the "fast track" is going to happen, the patch really was
> > reviewed properly, just not in public :)
> 
> Well, when you really want to have such formal rules (which is a novelty in 
> Linux space BTW, for many years we did fine with unwritten rules)  then you
> should spell it out completely. Or alternatively drop all the formal
> rules and do it informally like it was always done.

I'd love to do it informally, but the rules are going to be used to make
our lives easier, by having something to point to when we want to reject
something, and having something that everyone can refer to when trying
to understand what it is we are attempting to do here.

If they get too complex, or large, we will have to revisit them.

So, let's stop arguing about the semantics of the rules, and see if what
we have proposed actually works in real-life.  If that doesn't work out,
we can revisit it then.

thanks,

greg k-h
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] VGA arbitration: draft of kernel side

2005-03-09 Thread Kronos
Il Wed, Mar 09, 2005 at 09:46:20AM +1100, Benjamin Herrenschmidt ha scritto: 
> One thing is: I
> don't have x86 hardware, or at least, nothing where I can have 2 VGA
> cards in (I may have access to an old laptop). So I'll need help &
> testers at one point.

It's your lucky day ;) I've just assembled a PC with 2 PCI video cards (S3
something and a Matrox Mystique) and I think that I've an old ISA video
card somewhere (if it can be useful).
Feel free to put me on CC when you have something to test.

Luca
-- 
Home: http://kronoz.cjb.net
Colui che sorride quando le cose vanno male ha pensato a qualcuno a cui
dare la colpa.
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Log full of "ing_filter: fixed ippp2 out ippp2"

2005-03-09 Thread Kimmo Sundqvist
Hello

Please cc all replies to me.

After upgrading my little NATting firewall/router from 2.6.7-ck4 to 
2.6.10-gentoo-r6 my /var/log/messages is 15MB in size and most of it looks 
like the text below.  All traffic to the Internet seems to cause this.  

"cat /var/log/messages | uniq | uniqmessages" results in a 3MB file.  I use 
syslog-ng.

Iptables setup script further down.

Mar  9 21:58:15 shadowgate ing_filter:  fixed  ippp2 out ippp2
Mar  9 21:58:15 shadowgate ing_filter:  fixed  ippp2 out ippp2
Mar  9 21:58:15 shadowgate ing_filter:  fixed  ippp2 out ippp2
Mar  9 21:58:15 shadowgate ing_filter:  fixed  ippp2 out ippp2
Mar  9 21:58:15 shadowgate ing_filter:  fixed  ippp2 out ippp2
Mar  9 21:58:16 shadowgate ing_filter:  fixed  ippp2 out ippp2
Mar  9 21:58:16 shadowgate ing_filter:  fixed  ippp2 out ippp2
Mar  9 21:58:16 shadowgate ing_filter:  fixed  ippp2 out ippp2
Mar  9 21:58:16 shadowgate ing_filter:  fixed  ippp2 out ippp2
Mar  9 21:58:16 shadowgate ing_filter:  fixed  ippp2 out ippp2
Mar  9 21:58:16 shadowgate ing_filter:  fixed  ippp2 out ippp2
Mar  9 21:58:16 shadowgate ing_filter:  fixed  ippp2 out ippp2
Mar  9 21:58:17 shadowgate ing_filter:  fixed  ippp2 out ippp2
Mar  9 21:58:17 shadowgate ing_filter:  fixed  ippp2 out ippp2
Mar  9 21:58:17 shadowgate ing_filter:  fixed  ippp2 out ippp2
Mar  9 21:58:17 shadowgate ing_filter:  fixed  ippp2 out ippp2
Mar  9 21:58:17 shadowgate ing_filter:  fixed  ippp2 out ippp2
Mar  9 21:58:17 shadowgate ing_filter:  fixed  ippp2 out ippp2
Mar  9 21:58:17 shadowgate ing_filter:  fixed  ippp2 out ippp2
Mar  9 21:58:17 shadowgate ing_filter:  fixed  ippp2 out ippp2
Mar  9 21:58:17 shadowgate ing_filter:  fixed  ippp2 out ippp2
Mar  9 21:58:17 shadowgate ing_filter:  fixed  ippp2 out ippp2
Mar  9 21:58:17 shadowgate ing_filter:  fixed  ippp2 out ippp2
Mar  9 21:58:17 shadowgate ing_filter:  fixed  ippp2 out ippp2
Mar  9 21:58:17 shadowgate ing_filter:  fixed  ippp2 out ippp2
Mar  9 21:58:17 shadowgate ing_filter:  fixed  ippp2 out ippp2

My firewall setup looks like:

IPTABLES=/sbin/iptables
MODPROBE=/sbin/modprobe
DEPMOD=/sbin/depmod

EXTIF="ippp2"
INTIF="eth0"
KAKKOSIF="eth1"

$DEPMOD -a

$MODPROBE ip_tables

#$MODPROBE ip_conntrack
#In the kernel

$MODPROBE ip_conntrack_ftp
$MODPROBE iptable_nat
$MODPROBE ip_nat_ftp

echo "1" > /proc/sys/net/ipv4/ip_forward
echo "1" > /proc/sys/net/ipv4/ip_dynaddr
#echo "1" > /proc/sys/net/ipv4/conf/all/proxy_arp

$IPTABLES -P INPUT ACCEPT
$IPTABLES -F INPUT
$IPTABLES -P OUTPUT ACCEPT
$IPTABLES -F OUTPUT
$IPTABLES -P FORWARD DROP
$IPTABLES -F FORWARD

$IPTABLES -t nat -F

/sbin/iptables -A FORWARD -i eth1 -o eth0 -j ACCEPT
/sbin/iptables -A FORWARD -i eth0 -o eth1 -j ACCEPT

$IPTABLES -A FORWARD -i $EXTIF -o $KAKKOSIF -m state --state \
ESTABLISHED,RELATED -j ACCEPT   # RJ-45

$IPTABLES -A FORWARD -i $EXTIF -o $INTIF -m state --state \
ESTABLISHED,RELATED  -j ACCEPT # BNC segment

$IPTABLES -A FORWARD -i $KAKKOSIF -o $EXTIF -j ACCEPT   # RJ-45
$IPTABLES -A FORWARD -i $INTIF -o $EXTIF -j ACCEPT# BNC segment
$IPTABLES -t nat -A POSTROUTING -o $EXTIF -j MASQUERADE
$IPTABLES -A FORWARD -j LOG
exit 0

-Kimmo S.
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: Direct io on block device has performance regression on 2.6.x kernel

2005-03-09 Thread Andrew Morton
"Chen, Kenneth W" <[EMAIL PROTECTED]> wrote:
>
> Andrew Morton wrote on Tuesday, March 08, 2005 10:28 PM
> > But before doing anything else, please bench this on real hardware,
> > see if it is worth pursuing.
> 
> Let me answer the questions in reverse order.  We started with running
> industry standard transaction processing database benchmark on 2.6 kernel,
> on real hardware (4P smp, 64 GB memory, 450 disks) running industry
> standard db application.  What we measured is that with best tuning done
> to the system, 2.6 kernel has a huge performance regression relative to
> its predecessor 2.4 kernel (a kernel from RHEL3, 2.4.21 based).

That's news to me.  I thought we were doing OK with big database stuff. 
Surely lots of people have been testing such things.

> And yes, it is all worth pursuing, the two patches on raw device recuperate
> 1/3 of the total benchmark performance regression.

On a real disk driver?  hm, I'm wrong then.

Did you generate a kernel profile?

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [ patch 6/7] drivers/serial/jsm: new serial device driver

2005-03-09 Thread Greg KH
On Wed, Mar 09, 2005 at 01:35:41PM -0600, Kilau, Scott wrote:
> As it stands today, your requirement appears to be that she needs
> to yank all diags ioctls and sysfs files before the driver can make
> it into the kernel sources.

Not all sysfs files, sysfs files are fine, as long as they are
implemented properly, and are there for things that "make sense".

But yes, it sure would be easier to accept the driver if the ioctls
were not there :)

thanks,

greg k-h
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: p5ad2 motherboard sensors work

2005-03-09 Thread Jean Delvare
Hi Christopher,

> Your patch allows "finally" for me to be able to monitor all my temps
> in  my computer with an offset of 6. http://lkml.org/lkml/2005/2/26/65

Thanks for the report! I'm glad to learn that someone actually used my
w83627ehf driver :)

Can you elaborate on the "offset of 6"?

> Thanks very much let me know if you need any other information.

Did you only test temperatures or do you also have fan speed readings?

Thanks,
-- 
Jean Delvare
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: PATCH: 2.6.11-ac2

2005-03-09 Thread Jason Lunz
[EMAIL PROTECTED] said:
> You know what would be really useful... if www.kernel.org listed the
> "latest -ac" patch as something current instead of 2.6.10-ac12, which was
> a great patch in its day, but hasn't been current for a while.
>
> In fairness, the -mm is out of date, too. Perhaps a bit of automation
> would be appropriate here, so that no one would have to update this
> manually.

I could have sworn it showed 2.6.11-ac1 for a while. Maybe the 2.6.11.2
stuff broke it somehow?

Jason

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: 2.6.11-mm2 vs audio for kino and tvtime

2005-03-09 Thread Jean Delvare
Hi Gene, Andrew, all,

(Gene, note that I cannot write to you directly because Verizon are
idiots. Let's just hope you'll read that.)

[Gene Heskett]
> /usr/pcHDTV3000/linux/pcHDTV-1.6/kernel-2.6.x/driver/bttv-i2c.c:362:
> error: unknown field `id' specified in initializer

I've dropped the "id" member of struct i2c_client, as it was useless.
Third-party driver authors now need to do the same.

Patches to pcHDTV 1.6 and 2.0 attached (untested). Feel free to push the
latter to the author of pcHDTV. Note that the removed struct member was
really not used before, so the driver will still work with earlier
kernels.

[Andrew Morton]
> What's pcHDTV-1.6.tar.gz?  If it was merged up then these things
> wouldn't happen.

I second that, especially since the pcHDTV package is made up of
modified bttv and cx88 drivers, not an original driver. Merging the
changes into the kernel would obviously make everyone's life easier.

As a side note, I have (many) other changes to the i2c subsystem in my
plans, some of them are rather intrusive, so expect pcHDTV to break
again soon, unless it gets merged before then.

[Gene Heskett]
> Third, somewhere between 2.6.11-rc5-RT-V0.39-02 and 2.6.11, I've
> lost my sensors except for one on the motherboard called THRM by
> gkrellm-2.28.  Nothing seems to be able to bring the w83627hf back
> to life.

THRM is most likely a temperature you get from /proc/acpi/thermal_zone,
and isn't related with the w83627hf driver.

I think that you are affected by recent changes made by the ACPI folks
to the way resources are reserved. See bug #4014:
  http://bugzilla.kernel.org/show_bug.cgi?id=4014

You can check /proc/ioports on the working system after loading
w83627hf, and compare with /proc/ioports on the non-working system. I'd
expect you to find that the non-working system has reserved a subrange
of what the w83627hf driver attempts to grab, making it fail.

-- 
Jean Delvare
diff -u -rN pcHDTV-1.6/kernel-2.6.x/driver.orig/bttv-i2c.c 
pcHDTV-1.6/kernel-2.6.x/driver/bttv-i2c.c
--- pcHDTV-1.6/kernel-2.6.x/driver.orig/bttv-i2c.c  Fri Dec 10 19:42:38 2004
+++ pcHDTV-1.6/kernel-2.6.x/driver/bttv-i2c.c   Wed Mar  9 13:52:24 2005
@@ -359,7 +359,6 @@
 
 static struct i2c_client bttv_i2c_client_template = {
I2C_DEVNAME("bttv internal"),
-.id   = -1,
 };
 
 
diff -u -rN pcHDTV-1.6/kernel-2.6.x/driver.orig/cx88-i2c.c 
pcHDTV-1.6/kernel-2.6.x/driver/cx88-i2c.c
--- pcHDTV-1.6/kernel-2.6.x/driver.orig/cx88-i2c.c  Fri Dec 10 19:42:39 2004
+++ pcHDTV-1.6/kernel-2.6.x/driver/cx88-i2c.c   Wed Mar  9 13:51:19 2005
@@ -136,7 +136,6 @@
 
 static struct i2c_client cx8800_i2c_client_template = {
 I2C_DEVNAME("cx88xx internal"),
-.id   = -1,
 };
 
 /* init + register i2c algo-bit adapter */
diff -u -rN pcHDTV-2.0.orig/bttv-i2c.c pcHDTV-2.0/bttv-i2c.c
--- pcHDTV-2.0.orig/bttv-i2c.c  Fri Feb 18 21:54:35 2005
+++ pcHDTV-2.0/bttv-i2c.c   Wed Mar  9 13:56:34 2005
@@ -317,7 +317,6 @@
 
 static struct i2c_client bttv_i2c_client_template = {
I2C_DEVNAME("bttv internal"),
-.id   = -1,
 };
 
 
diff -u -rN pcHDTV-2.0.orig/cx88-i2c.c pcHDTV-2.0/cx88-i2c.c
--- pcHDTV-2.0.orig/cx88-i2c.c  Fri Feb 18 21:54:38 2005
+++ pcHDTV-2.0/cx88-i2c.c   Wed Mar  9 13:56:58 2005
@@ -142,7 +142,6 @@
 
 static struct i2c_client cx8800_i2c_client_template = {
 I2C_DEVNAME("cx88xx internal"),
-.id   = -1,
 };
 
 static char *i2c_devs[128] = {
diff -u -rN pcHDTV-2.0.orig/dpl3518.c pcHDTV-2.0/dpl3518.c
--- pcHDTV-2.0.orig/dpl3518.c   Fri Feb 18 21:54:36 2005
+++ pcHDTV-2.0/dpl3518.cWed Mar  9 13:57:11 2005
@@ -374,7 +374,6 @@
 static struct i2c_client client_template =
 {
 I2C_DEVNAME("dpl3518"),
-.id = -1,
 .driver = 
 };
 
diff -u -rN pcHDTV-2.0.orig/saa7134-i2c.c pcHDTV-2.0/saa7134-i2c.c
--- pcHDTV-2.0.orig/saa7134-i2c.c   Fri Feb 18 21:54:36 2005
+++ pcHDTV-2.0/saa7134-i2c.cWed Mar  9 13:57:22 2005
@@ -361,7 +361,6 @@
 
 static struct i2c_client saa7134_client_template = {
I2C_DEVNAME("saa7134 internal"),
-.id= -1,
 };
 
 /* --- */
diff -u -rN pcHDTV-2.0.orig/tda9875.c pcHDTV-2.0/tda9875.c
--- pcHDTV-2.0.orig/tda9875.c   Fri Feb 18 21:54:38 2005
+++ pcHDTV-2.0/tda9875.cWed Mar  9 13:57:42 2005
@@ -418,7 +418,6 @@
 static struct i2c_client client_template =
 {
 I2C_DEVNAME("tda9875"),
-.id= -1,
 .driver= ,
 };
 


Linux 2.4.30-pre3

2005-03-09 Thread Marcelo Tosatti
Hi,

Here goes the third pre of v2.4.30.


It contains a small number of scattered fixes, most notably e1000 update, 
a backport of v2.6's nForce override fix, and SATA update. 

The changes which broke "tar --verify" on tapes have been reverted.

Please read the changelog for more details.


Summary of changes from v2.4.30-pre2 to v2.4.30-pre3


:
  o [SPARC64]: Tomatillo PCI controller bug fixes
  o [TIGON3]: Do not touch NIC_SRAM_FIRMWARE_MBOX when TG3_FLG2_SUN_570X
  o [TIGON3]: Update driver version and reldate

:
  o BUG on error handlings in Ext3 under I/O

:
  o [SPARC]: DBRI fixes and improvements

:
  o sata_qstor: eh_timeout fix

:
  o e1000: 1 Robert Olsson's fix and
  o e1000: 2 use netif_poll_{enable|disable}
  o e1000: Avoid race between e1000_watchdog
  o e1000: Delay clean-up of last Tx buffer
  o e1000: Fix WOL settings in 82544 based
  o e1000: Patch from Peter Kjellstroem --
  o e1000: Checks for desc ring/rx data
  o e1000: Report failure code when loopback
  o e1000: Fixes related to Cable length
  o e1000: Driver version white space,

:
  o [libata] add ->bmdma_{stop,status} hooks

:
  o sk98lin workaround Asus K8V SE Deluxe buggy firmware

:
  o Fix units/partition count in sd.c

Adrian Bunk:
  o drivers/scsi/sata_*: make code static

David S. Miller:
  o [SPARC64]: Fix 32bit compat layer bugs in sys_ipc() and 
sys_rt_sigtimedwait()
  o [AF_UNIX]: Fix SIOCINQ for STREAM
  o [SPARC64]: Accept 'm5823' clock chip as seen on SB1500

Jeff Garzik:
  o [libata sata_via] minor cleanups
  o [libata sata_via] add support for VT6421 SATA
  o [libata] resync with 2.6 msleep() updates
  o [libata] trivial: whitespace sync with 2.6
  o [libata] do not call pci_disable_device() for certain errors
  o [libata] Add missing hooks, to avoid oops in advanced SATA drivers
  o [libata] Use DMA_{32,64}BIT_MASK in ahci, sata_vsc drivers
  o [libata ahci] Print out port id on error messages
  o [libata] remove_one helper cleanup

John W. Linville:
  o libata: fix command queue leak when xlat_func fails
  o tulip: make tulip_stop_rxtx() wait for DMA to fully stop

Marcelo Tosatti:
  o Cset exclude: [EMAIL PROTECTED]|ChangeSet|20041125155150|65356
  o Allow lseek on SCSI tapes
  o Allow lseek on osst to keep tar --verify happy
  o Change VERSION to 2.4.30-pre3
  o Early ACPI PCI quirk depends on CONFIG_X86_IO_APIC

Mark Lord:
  o sata_qstor: new basic driver for Pacific Digital
  o [libata qstor] minor update per LKML comments

Matt Domsch:
  o aic7xxx: don't reset chip on pause

Mikael Pettersson:
  o fix undefined behaviour in cistpl.c

Paul Fulghum:
  o fix synclinkmp register access typo

Solar Designer:
  o Fix for swapoff after re-creating device files
  o Fix proc_tty.c comment typos

Zwane Mwaikambo:
  o Fix timer override on nforce

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] rwsem: Make rwsems use interrupt disabling spinlocks

2005-03-09 Thread Andrew Morton
Badari Pulavarty <[EMAIL PROTECTED]> wrote:
>
>  I am not sure if this is related to your patch. But I ran into
>  BUG() with sysrq-t with your patch.
> 
>  Thanks,
>  Badari
> 
>  BUG: soft lockup detected on CPU#1!
>  
>  Modules linked in: joydev sg st floppy usbserial parport_pc lp parport
>  ipv6 ohci_hcd i2c_amd756 i2c_core evdev usbcore raid0 dm_mod nls_utf8
>  Pid: 15433, comm: bash Not tainted 2.6.11-mm1n
>  RIP: 0010:[] {__do_softirq+84}
>  RSP: 0018:8101dff83f68  EFLAGS: 0206
>  RAX: 80651880 RBX: 0002 RCX: 0004
>  RDX: 0002 RSI: 0103 RDI: 8101d7c77680
>  RBP: 810177ffbe48 R08: 0002 R09: 0100
>  R10: 0001 R11:  R12: 0001
>  R13: 2aafb000 R14: 000a R15: 0001
>  FS:  2b2890a0() GS:80651880()
>  knlGS:
>  CS:  0010 DS:  ES:  CR0: 8005003b
>  CR2: 2aafb000 CR3: 0001bb2a CR4: 06e0
>
>  Call Trace: {do_softirq+53}
>  {apic_timer_interrupt+133}
>   {_spin_unlock_irqrestore+5}
> {write_sysrq_trigger+55}
>  {vfs_write+233}
> {sys_write+83}
>  {system_call+126}

That's probably just a false positive in Ingo's soft-lockup detector.  Long
streams of irq-context serial console output will do that.

Ingo, we already have a touch_nmi_watchdog() in the sysrq code.  It might be
worth adding a touch_softlockup_watchdog() wherever we have a
touch_nmi_watchdog().
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH] keys: Discard key spinlock and use RCU for key payload [try #2]

2005-03-09 Thread David Howells

The attached patch changes the key implementation in a number of ways:

 (1) It removes the spinlock from the key structure.

 (2) The key flags are now accessed using atomic bitops instead of
 write-locking the key spinlock and using C bitwise operators.

 The three instantiation flags are dealt with with the construction
 semaphore held during the request_key/instantiate/negate sequence, thus
 rendering the spinlock superfluous.

 The key flags are also now bit numbers not bit masks.

 (3) The key payload is now accessed using RCU. This permits the recursive
 keyring search algorithm to be simplified greatly since no locks need be
 taken other than the usual RCU preemption disablement. Searching now does
 not require any locks or semaphores to be held; merely that the starting
 keyring be pinned.

 (4) The keyring payload now includes an RCU head so that it can be disposed
 of by call_rcu(). This requires that the payload be copied on unlink to
 prevent introducing races in copy-down vs search-up.

 (5) The user key payload is now a structure with the data following it. It
 includes an RCU head like the keyring payload and for the same reason. It
 also contains a data length because the data length in the key may be
 changed on another CPU whilst an RCU protected read is in progress on the
 payload. This would then see the supposed RCU payload and the on-key data
 length getting out of sync.

 I'm tempted to drop the key's datalen entirely, except that it's used in
 conjunction with quota management and so is a little tricky to get rid
 of.

I've updated the patch to use |= 1 << N instead of _set_bit() during
initialisation as the latter is generally slower.

Signed-Off-By: David Howells <[EMAIL PROTECTED]>
---
warthog>diffstat -p1 keys-rcu-2611-2.diff 
 Documentation/keys.txt   |5 
 include/linux/key-ui.h   |6 -
 include/linux/key.h  |   25 ++--
 security/keys/key.c  |   95 +++-
 security/keys/keyctl.c   |   23 +--
 security/keys/keyring.c  |  250 ++-
 security/keys/proc.c |   21 ++-
 security/keys/process_keys.c |   16 --
 security/keys/request_key.c  |   32 ++---
 security/keys/user_defined.c |   87 ++
 10 files changed, 316 insertions(+), 244 deletions(-)

diff -uNr linux-2.6.11/Documentation/keys.txt 
linux-2.6.11-keys-rcu/Documentation/keys.txt
--- linux-2.6.11/Documentation/keys.txt 2005-01-04 11:12:42.0 +
+++ linux-2.6.11-keys-rcu/Documentation/keys.txt2005-03-09 
16:28:50.0 +
@@ -606,8 +606,9 @@
 due to two different users opening the same file is left to the filesystem
 author to solve.
 
-When accessing a key's payload data, the key->lock should be at least read
-locked, or else the data may be changed by update during the access.
+When accessing a key's payload data, RCU reading precautions on the payload
+pointer should be taken, or else the data may be changed by update during the
+access.
 
 (*) To search for a key, call:
 
diff -uNr linux-2.6.11/include/linux/key.h 
linux-2.6.11-keys-rcu/include/linux/key.h
--- linux-2.6.11/include/linux/key.h2005-01-04 11:13:54.0 +
+++ linux-2.6.11-keys-rcu/include/linux/key.h   2005-03-09 16:48:34.0 
+
@@ -18,7 +18,7 @@
 #include 
 #include 
 #include 
-#include 
+#include 
 #include 
 
 #ifdef __KERNEL__
@@ -77,7 +77,6 @@
key_serial_tserial; /* key serial number */
struct rb_node  serial_node;
struct key_type *type;  /* type of key */
-   rwlock_tlock;   /* examination vs change lock */
struct rw_semaphore sem;/* change vs change sem */
struct key_user *user;  /* owner of this key */
time_t  expiry; /* time at which key expires 
(or 0) */
@@ -85,14 +84,10 @@
gid_t   gid;
key_perm_t  perm;   /* access permissions */
unsigned short  quotalen;   /* length added to quota */
-   unsigned short  datalen;/* payload data length */
-   unsigned short  flags;  /* status flags (change with 
lock writelocked) */
-#define KEY_FLAG_INSTANTIATED  0x0001  /* set if key has been 
instantiated */
-#define KEY_FLAG_DEAD  0x0002  /* set if key type has been 
deleted */
-#define KEY_FLAG_REVOKED   0x0004  /* set if key had been revoked 
*/
-#define KEY_FLAG_IN_QUOTA  0x0008  /* set if key consumes quota */
-#define KEY_FLAG_USER_CONSTRUCT0x0010  /* set if key is being 
constructed in userspace */
-#define KEY_FLAG_NEGATIVE  0x0020  /* set if key is negative */
+   unsigned short  datalen;/* payload data length
+ 

Re: [RFC] -stable, how it's going to work.

2005-03-09 Thread Andi Kleen
On Wed, Mar 09, 2005 at 10:28:22AM -0800, Chris Wright wrote:
> * Andi Kleen ([EMAIL PROTECTED]) wrote:
> > Greg KH <[EMAIL PROTECTED]> writes:
> > >
> > > Rules on what kind of patches are accepted, and what ones are not, into
> > > the "-stable" tree:
> > >  - It must be obviously correct and tested.
> > >  - It can not bigger than 100 lines, with context.
> > 
> > This rule seems silly. What happens when a security fix needs 150 lines? 
> > 
> > Better maybe a rule like "The patch should be the minimal and safest 
> > change to fix an issue". But see below for an exception.
> 
> It's just a guideline to scope the work.  But a fixed size is probably
> less meaningful than your wording.
> 
> > >  - It must fix only one thing.
> > >  - It must fix a real bug that bothers people (not a, "This could be a
> > >problem..." type thing.)
> > >  - It must fix a problem that causes a build error (but not for things
> > >marked CONFIG_BROKEN), an oops, a hang, data corruption, a real
> > >security issue, or some "oh, that's not good" issue.  In short,
> > >something critical.
> > >  - No "theoretical race condition" issues, unless an explanation of how
> > >the race can be exploited.
> > >  - It can not contain any "trivial" fixes in it (spelling changes,
> > >whitespace cleanups, etc.)
> > >  - It must be accepted by the relevant subsystem maintainer.
> > 
> > >  - It must follow Documentation/SubmittingPatches rules.
> > 
> > One rule I'm missing:
> > 
> > - It must be accepted to mainline. 
> 
> This can violate the principle of keeping fixes simple for -stable tree.
> And Linus/Andrew don't want to litter mainline with patch series that
> do simple fix followed by complete fix meant for developement branch.

But it risks code drift like we had in 2.4 with older kernels 
having more fixes than the newer kernel. And that way lies madness.

I think it is very very important to avoid this.

If you prefer you can rewrite the rule like

"Fix must be in mainline first. In exceptional cases when the fix 
in mainline is too intrusive or risky a simpler version of the patch
can be applied to stable. In this case the mainline fix must be already
accepted. For most cases the full fix should be applied to avoid code drift"


> I agree, it's a good rule, but these should be small, temporal diffs
> from mainline.  For example, -ac tree will sometimes do the simpler fix,
> whereas mainline does proper complete fix.

You make it sound like all patches are super complicated and 
not suitable for backporting.

From my experience maintaining distribution kernels, most mainline changes
can be just completely backported. 

> 
> > >the security kernel team, and not go through the normal review cycle.
> > >Contact the kernel security team for more details on this procedure.
> > 
> > This also sounds like a bad rule. How come the security team has more
> > competence to review patches than the subsystem maintainers?  I can
> > see the point of overruling maintainers on security issues when they
> > are not responsive, but if they are I think the should be still the
> > main point of contact.
> 
> They don't, the security patches should still be reviewed by subsystem
> maintainer.  Point here is, sometimes there's disclosure coordination
> happening as well.

Ok, how does it coordinate with the vendor-sec process? 
And at what point is the subsystem maintainer notified?

The security thing seems to be still quite half baked to me...

-Andi

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: 2.6.11 on AMD64 traps

2005-03-09 Thread Patrick McHardy
Michal Vanco wrote:
I see this problem running 2.6.11 on dual AMD64:
Running quagga routing daemon (ospf+bgp) and issuing "netstat -rn |wc 
-l" command
while quagga tries to load more than 154000 routes from its bgp 
neighbours causes this trap:
This patch should fix it. The crash is caused by stale pointers,
the pointers in fib_iter_state are not reloaded after seq->stop()
followed by seq->start(pos > 0).
# This is a BitKeeper generated diff -Nru style patch.
#
# ChangeSet
#   2005/03/09 20:41:46+01:00 [EMAIL PROTECTED] 
#   [IPV4]: Fix crash while reading /proc/net/route caused by stale pointers
#   
#   Signed-off-by: Patrick McHardy <[EMAIL PROTECTED]>
# 
# net/ipv4/fib_hash.c
#   2005/03/09 20:41:37+01:00 [EMAIL PROTECTED] +11 -1
#   [IPV4]: Fix crash while reading /proc/net/route caused by stale pointers
#   
#   Signed-off-by: Patrick McHardy <[EMAIL PROTECTED]>
# 
diff -Nru a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c
--- a/net/ipv4/fib_hash.c   2005-03-09 20:43:55 +01:00
+++ b/net/ipv4/fib_hash.c   2005-03-09 20:43:55 +01:00
@@ -919,13 +919,23 @@
return fa;
 }
 
+static struct fib_alias *fib_get_idx(struct seq_file *seq, loff_t pos)
+{
+   struct fib_alias *fa = fib_get_first(seq);
+
+   if (fa)
+   while (pos && (fa = fib_get_next(seq)))
+   --pos;
+   return pos ? NULL : fa;
+}
+
 static void *fib_seq_start(struct seq_file *seq, loff_t *pos)
 {
void *v = NULL;
 
read_lock(_hash_lock);
if (ip_fib_main_table)
-   v = *pos ? fib_get_next(seq) : SEQ_START_TOKEN;
+   v = *pos ? fib_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
return v;
 }
 


Re: [patch 1/1] x86-64: forgot asmlinkage on sys_mmap

2005-03-09 Thread Blaisorblade
On Wednesday 09 March 2005 20:34, Andi Kleen wrote:
> On Wed, Mar 09, 2005 at 07:24:00PM +0100, Blaisorblade wrote:
> > On Wednesday 09 March 2005 18:24, Andi Kleen wrote:
> > > [EMAIL PROTECTED] writes:
> > > > CC: Andi Kleen <[EMAIL PROTECTED]>
> > > >
> > > > I think it should be there, please check better.
> > >
> > > It doesn't matter. asmlinkage is a nop on x86-64.
> >
> > Yes, otherwise nothing would work on x86-64 with mmap broken, but for
> > cleanness and for the case this change it should be there (otherwise why
> > asmlinkage is used in the rest of the file).
>
> Only because it was cut'n'pasted from i386 originally.
>
> > And for i386 asmlinkage acquired significance only recently.
>
> Actually it doesn't neither on i386. That's because entry.S happens to put
> the arguments both into registers and the stack in the right order, so both
> register and stack argument calling conventions work.
>
> But it is slightly safer to have it. When you use the stack arguments
> the C code is allowed to modify it, and when the system call is restarted
> later you could see garbage. In practice that's not a big issue because
> only very few system calls are restartable.
>
> ptrace also could see corrupted state, but that's in general a non issue.
Ok, thanks for the info, I hope it's applied anyway.
-- 
Paolo Giarrusso, aka Blaisorblade
Linux registered user n. 292729
http://www.user-mode-linux.org/~blaisorblade


-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: 2.6.Stable and EXTRAVERSION

2005-03-09 Thread Chris Friesen
Justin M. Forbes wrote:
With the new stable series kernels, the .x versioning is being added to
EXTRAVERSION.  This has traditionally been a space for local modification.
I know several distributions are using EXTRAVERSION for build numbers,
platform and assorted other information to differentiate their kernel
releases.
I would propose that the new stable series kernels move the .x version
information somewhere more official.  I certainly do not mind throwing
together a patch to support DOTVERSION or what ever people want to call it.
Is anyone opposed to such a change?
Distros could conceivably use CONFIG_LOCALVERSION, although it might be 
cleaner to add another version level.

Chris
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [patch 1/1] x86-64: forgot asmlinkage on sys_mmap

2005-03-09 Thread Andi Kleen
On Wed, Mar 09, 2005 at 07:24:00PM +0100, Blaisorblade wrote:
> On Wednesday 09 March 2005 18:24, Andi Kleen wrote:
> > [EMAIL PROTECTED] writes:
> > > CC: Andi Kleen <[EMAIL PROTECTED]>
> > >
> > > I think it should be there, please check better.
> >
> > It doesn't matter. asmlinkage is a nop on x86-64.
> 
> Yes, otherwise nothing would work on x86-64 with mmap broken, but for 
> cleanness and for the case this change it should be there (otherwise why 
> asmlinkage is used in the rest of the file).

Only because it was cut'n'pasted from i386 originally.

> 
> And for i386 asmlinkage acquired significance only recently.

Actually it doesn't on i386 either. That's because entry.S happens to put the 
arguments both into registers and the stack in the right order, so both 
register and stack argument calling conventions work.

But it is slightly safer to have it. When you use the stack arguments
the C code is allowed to modify it, and when the system call is restarted
later you could see garbage. In practice that's not a big issue because
only very few system calls are restartable.

ptrace also could see corrupted state, but that's in general a non issue.

-Andi

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


RE: [ patch 6/7] drivers/serial/jsm: new serial device driver

2005-03-09 Thread Kilau, Scott

> > DPA support is a requirement for all Digi drivers, so it would
> > not be possible for me to remove them from my "dgnc" version
> > of the driver.

> "requirement" from whom and to who?  The Linux kernel community?

From our customers who are moving from other OS's to Linux,
and expect DPA support to be under Linux as well.

> It's not a reservation issue, it's the fact that we don't want to
allow
> new ioctls, and if we do, they had better work properly (your
> implementation does not.)
> 
> thanks,
>
> greg k-h

Which is fine and I accept the blame for.

This is something Wendy can change and fix.
I am explaining why they exist today and my
argument of why we need them to stay.

As it stands today, your requirement appears to be that she needs
to yank all diags ioctls and sysfs files before the driver can make
it into the kernel sources.

This is also fine, but Wendy and IBM will need to decide whether
all our diags utilities are needed for the JSM driver or not.

Scott
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] rwsem: Make rwsems use interrupt disabling spinlocks

2005-03-09 Thread Badari Pulavarty
Well, aio-stress seems to run better with your patch (no Oops) but
I think we still have a problem in AIO. It looks like aio-stress
is stuck (unable to kill it).

Here is the sysrq-t output:

aio-stressD 8101be224970 0 15430  1   15429
(NOTLB)
8101be21bd58 0082 8101be21be18 0001
   8101be2248b0 81010074 007a
810198868e90
   8101d6daaf50 8101d6dab160
Call Trace:{__down+152}
{default_wake_function+0}
   {__down_failed+53}
{.text.lock.filemap+65}
   {aio_pwrite+0}
{aio_pwrite+33}
   {__aio_run_iocbs+384}
{io_submit_one+494}
   {sys_io_submit+217}
{system_call+126}


Top shows:

top - 12:22:33 up  2:57,  2 users,  load average: 5.08, 5.08, 5.01
Tasks:  79 total,   1 running,  77 sleeping,   0 stopped,   1 zombie
Cpu(s):  0.0% us, 25.0% sy,  0.0% ni, 75.0% id,  0.0% wa,  0.0% hi, 
0.0% si
Mem:   7148100k total,   176708k used,  6971392k free,18600k buffers
Swap:  1048784k total,0k used,  1048784k free,44708k cached


  PID USER  PR  NI  VIRT  RES  SHR S %CPU %MEMTIME+  COMMAND
15425 root  16   0 000 Z 99.8  0.0 172:17.98 aio-stress

15803 root  16   0  4048 1116  820 R  0.3  0.0   0:00.51 top



Thanks,
Badari


On Wed, 2005-03-09 at 09:35, Badari Pulavarty wrote:
> Your patch seems to have helped. I don't see the Oops anymore - my
> tests are still running (past 1 hour - it used to panic in 10 min).
> 
> Thanks,
> Badari
> 
> On Wed, 2005-03-09 at 04:12, David Howells wrote:
> > The attached patch makes read/write semaphores use interrupt disabling
> > spinlocks, thus rendering the up functions and trylock functions available 
> > for
> > use in interrupt context.
> > 
> > I've assumed that the normal down functions must be called with interrupts
> > enabled (since they might schedule), and used the irq-disabling spinlock
> > variants that don't save the flags.
> > 
> > Signed-Off-By: David Howells <[EMAIL PROTECTED]>
> > ---
> > warthog>diffstat -p1 rwsem-irqspin-2611mm2.diff
> 
> 
> --
> To unsubscribe, send a message with 'unsubscribe linux-aio' in
> the body to [EMAIL PROTECTED]  For more info on Linux AIO,
> see: http://www.kvack.org/aio/
> Don't email: mailto:"[EMAIL PROTECTED]">[EMAIL PROTECTED]
> 

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC] -stable, how it's going to work.

2005-03-09 Thread Andi Kleen
On Wed, Mar 09, 2005 at 06:00:45PM +, Alan Cox wrote:
> On Mer, 2005-03-09 at 09:56, Andi Kleen wrote:
> > - It must be accepted to mainline. 
> 
> Strongly disagree. What if the mainline fix is a rewrite of the core API
> involved. Some times you need to put in the short term fix. What must
> never happen is people accepting that fix as long term.
> 
> How about
> 
>  - It must be accepted to mainline, or the accepted mainline patch be
> deemed too complex or risky to backport and thus a simple obvious
> alternative fix applied to stable ONLY.

That is what I wrote later in my mail anyways (did you really read it 
completely?:)  See also the followup discussion with Russell and Arjan.

In general stable specific fixes should be the exception, not the rule though.

-Andi
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] rwsem: Make rwsems use interrupt disabling spinlocks

2005-03-09 Thread Badari Pulavarty
Hi,

I am not sure if this is related to your patch. But I ran into
BUG() with sysrq-t with your patch.

Thanks,
Badari

BUG: soft lockup detected on CPU#1!

Modules linked in: joydev sg st floppy usbserial parport_pc lp parport
ipv6 ohci_hcd i2c_amd756 i2c_core evdev usbcore raid0 dm_mod nls_utf8
Pid: 15433, comm: bash Not tainted 2.6.11-mm1n
RIP: 0010:[] {__do_softirq+84}
RSP: 0018:8101dff83f68  EFLAGS: 0206
RAX: 80651880 RBX: 0002 RCX: 0004
RDX: 0002 RSI: 0103 RDI: 8101d7c77680
RBP: 810177ffbe48 R08: 0002 R09: 0100
R10: 0001 R11:  R12: 0001
R13: 2aafb000 R14: 000a R15: 0001
FS:  2b2890a0() GS:80651880()
knlGS:
CS:  0010 DS:  ES:  CR0: 8005003b
CR2: 2aafb000 CR3: 0001bb2a CR4: 06e0
  
Call Trace: {do_softirq+53}
{apic_timer_interrupt+133}
 {_spin_unlock_irqrestore+5}
   {write_sysrq_trigger+55}
{vfs_write+233}
   {sys_write+83}
{system_call+126}


On Wed, 2005-03-09 at 04:12, David Howells wrote:
> The attached patch makes read/write semaphores use interrupt disabling
> spinlocks, thus rendering the up functions and trylock functions available for
> use in interrupt context.
> 
> I've assumed that the normal down functions must be called with interrupts
> enabled (since they might schedule), and used the irq-disabling spinlock
> variants that don't save the flags.
> 
> Signed-Off-By: David Howells <[EMAIL PROTECTED]>


-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: Reading large /proc entry from kernel module

2005-03-09 Thread Bob Bennett
Kristian Sørensen  cs.aau.dk> writes:

> 
> Hi all!
> 
> I have some trouble reading a 2346 byte /proc entry from our Umbrella kernel 
> module.

>   if (count != UMB_POLICY_SIZE) {
>   printk("Umbrella: Error - /proc/umbrella is of invalid size\n");
>   return -EFAULT;

>   if (copy_from_user(lbuf, buffer, count)) {
>   kfree(lbuf);
>   kfree(policy);
>   return -EFAULT;
>   }
> 
>   strcpy(policy, lbuf);
>   umb_parse_proc(policy);
> 
> }
> 

> Now that everything works, I want to write a string of exactly 2346 
> characters to the /proc/umbrella file. However when I make the 
> copy_from_user, I only get the first 1003 characters (
> - Do you have a pointer to where I do this thing wrong?
> 
> What is the limit regarding the size of writing a /proc entry? (we consider 
> importing binary public keys to the kernel this way in the future).
> 
> Best regards,
> Kristian.
> 

What makes you think you only have 1003 bytes?  If UMB_POLICY_SIZE is defined as
2346, then user space must have written that amount.  Probably the problem is
that you used strcpy() to copy the data from lbuf to policy, and there is a null
character after 1003 bytes.  It is an unnecessary extra step to allocate two
buffers (lbuf & policy) and copy data from one to the other.  Why not just pass
lbuf to umb_parse_proc()?

Regards,
   Bob Bennett

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [ patch 6/7] drivers/serial/jsm: new serial device driver

2005-03-09 Thread Greg KH
On Wed, Mar 09, 2005 at 11:42:31AM -0600, Kilau, Scott wrote:
> Hi Wendy, Greg, all,
> 
> If IBM intends on our DPA management program to work for the JSM
> products, the ioctls are needed.

Wendy, what is IBM's stance on this?

> DPA support is a requirement for all Digi drivers, so it would
> not be possible for me to remove them from my "dgnc" version
> of the driver.

"requirement" from whom and to who?  The Linux kernel community?

> For the JSM driver, it's up to you whether you feel it's needed or not.
> 
> However, I would like to mention that the DIGI drivers that currently
> reside in the kernel sources *do* reserve that ioctl space,
> and is acknowledged by "Documentation/ioctl-number.txt":
> > 'd' F0-FF   linux/digi1.h
> 
> I understand that the list is not a reservation list,
> but a current list of potential ioctl conflicts...

It's not a reservation issue, it's the fact that we don't want to allow
new ioctls, and if we do, they had better work properly (your
implementation does not.)

thanks,

greg k-h
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [Linux-fbdev-devel] [announce 0/7] fbsplash - The Framebuffer Splash

2005-03-09 Thread Alan Cox
On Mer, 2005-03-09 at 09:34, Geert Uytterhoeven wrote:
> On Wed, 9 Mar 2005, Jon Smirl wrote:
> > Another idea would be to build a console in user space. Think of it as
> > a full screen xterm. A user space console has access to full hardware
> > acceleration using the DRM interface.
> 
> Yep. And that's what Alan Cox wanted to do. Console in userspace, eye candy
> (using Porter-Duff blending) as much as you want, full UTF-8 support, ...

Jon is the origin of those ideas not me, I'm merely supporting them
providing there is still a basic kernel side console.

Alan

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [patch 4/5] audit mips fix

2005-03-09 Thread Greg KH
On Fri, Mar 04, 2005 at 01:16:57PM -0800, [EMAIL PROTECTED] wrote:
> 
> From: Yoichi Yuasa <[EMAIL PROTECTED]>
> 
>   CC  arch/mips/kernel/ptrace.o
> arch/mips/kernel/ptrace.c: In function 'do_syscall_trace':
> arch/mips/kernel/ptrace.c:310: warning: implicit declaration of function 
> 'audit_syscall_entry'
> arch/mips/kernel/ptrace.c:310: error: 'struct pt_regs' has no member named 
> 'orig_eax'
> arch/mips/kernel/ptrace.c:314: warning: implicit declaration of function 
> 'audit_syscall_exit'
> 
> Signed-off-by: Yoichi Yuasa <[EMAIL PROTECTED]>
> Signed-off-by: Andrew Morton <[EMAIL PROTECTED]>

Added to the -stable queue, thanks.

greg k-h

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] ppc32: trivial fix for e500 oprofile build

2005-03-09 Thread Greg KH
On Fri, Mar 04, 2005 at 01:09:18PM -0600, Kumar Gala wrote:
> Andrew, Greg
> 
> Here is a patch for the new 2.6.11 release tree and for Linus.
> 
> Trivial fix for 2.6.11 oprofile compilation on e500 based ppc.
> 
> Signed-off-by: Andy Fleming <[EMAIL PROTECTED]>
> Signed-off-by: Kumar Gala <[EMAIL PROTECTED]>

Added to the -stable queue, thanks.

greg k-h

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


huge filesystems

2005-03-09 Thread Dan Stromberg

The group I work in has been experimenting with GFS and Lustre, and I did
some NBD/ENBD experimentation on my own, described at
http://dcs.nac.uci.edu/~strombrg/nbd.html

My question is, what is the current status of huge filesystems - i.e.,
filesystems that exceed 2 terabytes, and hopefully also exceeding 16
terabytes?

Am I correct in assuming that the usual linux buffer cache only goes to 16
terabytes?

Does the FUSE API (or similar) happen to allow surpassing either the 2T or
16T limits?

What about the "LBD" patches - what limits are involved there, and have
they been rolled into a Linus kernel, or one or more vendor kernels?

Thanks!


-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] remove dead cyrix/centaur mtrr init code

2005-03-09 Thread Andries Brouwer
On Wed, Mar 09, 2005 at 04:55:27PM +, Alan Cox wrote:

> > [PATCH] remove dead cyrix/centaur mtrr init code
> 
> This patch was discussed previously and declared incorrect. The ->init
> method call is missing in the base mtrr code.
> 
> Should be reverted and/or fixed properly.

Hi Alan - a surprising reaction.

The patch is an improvement - it #ifdef's out some dead code.
I sent you a follow-up patch that activates the dead code,
since you said

  I have one here running 2.4 still. I can test a 2.6 fix
  for the mtrr init happily enough.

But so far you have not replied.
The moment you report that the follow-up patch is fine, we can
remove the #if 0 and insert the initcalls instead.

So, all is well today, and we are waiting for your report.

Andries
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


<    1   2   3   4   5   6   7   8   9   10   >