[patch 03/50] read_zero_pagealigned() locking fix

2007-01-05 Thread Chris Wright
-stable review patch.  If anyone has any objections, please let us know.
--

From: Hugh Dickins <[EMAIL PROTECTED]>

Ramiro Voicu hits the BUG_ON(!pte_none(*pte)) in zeromap_pte_range: kernel
bugzilla 7645.  Right: read_zero_pagealigned uses down_read of mmap_sem,
but another thread's racing read of /dev/zero, or a normal fault, can
easily set that pte again, in between zap_page_range and zeromap_page_range
getting there.  It's been wrong ever since 2.4.3.

The simple fix is to use down_write instead, but that would serialize reads
of /dev/zero more than at present: perhaps some app would be badly
affected.  So instead let zeromap_page_range return the error instead of
BUG_ON, and read_zero_pagealigned break to the slower clear_user loop in
that case - there's no need to optimize for it.

Use -EEXIST for when a pte is found: BUG_ON in mmap_zero (the other user of
zeromap_page_range), though it really isn't interesting there.  And since
mmap_zero wants -EAGAIN for out-of-memory, the zeromaps better return that
than -ENOMEM.

Signed-off-by: Hugh Dickins <[EMAIL PROTECTED]>
Cc: Ramiro Voicu: <[EMAIL PROTECTED]>
Cc: <[EMAIL PROTECTED]>
Signed-off-by: Andrew Morton <[EMAIL PROTECTED]>
Signed-off-by: Chris Wright <[EMAIL PROTECTED]>
---

 drivers/char/mem.c |   12 
 mm/memory.c|   32 +---
 2 files changed, 29 insertions(+), 15 deletions(-)

--- linux-2.6.19.1.orig/drivers/char/mem.c
+++ linux-2.6.19.1/drivers/char/mem.c
@@ -646,7 +646,8 @@ static inline size_t read_zero_pagealign
count = size;
 
zap_page_range(vma, addr, count, NULL);
-   zeromap_page_range(vma, addr, count, PAGE_COPY);
+   if (zeromap_page_range(vma, addr, count, PAGE_COPY))
+   break;
 
size -= count;
buf += count;
@@ -713,11 +714,14 @@ out:
 
 static int mmap_zero(struct file * file, struct vm_area_struct * vma)
 {
+   int err;
+
if (vma->vm_flags & VM_SHARED)
return shmem_zero_setup(vma);
-   if (zeromap_page_range(vma, vma->vm_start, vma->vm_end - vma->vm_start, 
vma->vm_page_prot))
-   return -EAGAIN;
-   return 0;
+   err = zeromap_page_range(vma, vma->vm_start,
+   vma->vm_end - vma->vm_start, vma->vm_page_prot);
+   BUG_ON(err == -EEXIST);
+   return err;
 }
 #else /* CONFIG_MMU */
 static ssize_t read_zero(struct file * file, char * buf, 
--- linux-2.6.19.1.orig/mm/memory.c
+++ linux-2.6.19.1/mm/memory.c
@@ -1110,23 +1110,29 @@ static int zeromap_pte_range(struct mm_s
 {
pte_t *pte;
spinlock_t *ptl;
+   int err = 0;
 
pte = pte_alloc_map_lock(mm, pmd, addr, );
if (!pte)
-   return -ENOMEM;
+   return -EAGAIN;
arch_enter_lazy_mmu_mode();
do {
struct page *page = ZERO_PAGE(addr);
pte_t zero_pte = pte_wrprotect(mk_pte(page, prot));
+
+   if (unlikely(!pte_none(*pte))) {
+   err = -EEXIST;
+   pte++;
+   break;
+   }
page_cache_get(page);
page_add_file_rmap(page);
inc_mm_counter(mm, file_rss);
-   BUG_ON(!pte_none(*pte));
set_pte_at(mm, addr, pte, zero_pte);
} while (pte++, addr += PAGE_SIZE, addr != end);
arch_leave_lazy_mmu_mode();
pte_unmap_unlock(pte - 1, ptl);
-   return 0;
+   return err;
 }
 
 static inline int zeromap_pmd_range(struct mm_struct *mm, pud_t *pud,
@@ -1134,16 +1140,18 @@ static inline int zeromap_pmd_range(stru
 {
pmd_t *pmd;
unsigned long next;
+   int err;
 
pmd = pmd_alloc(mm, pud, addr);
if (!pmd)
-   return -ENOMEM;
+   return -EAGAIN;
do {
next = pmd_addr_end(addr, end);
-   if (zeromap_pte_range(mm, pmd, addr, next, prot))
-   return -ENOMEM;
+   err = zeromap_pte_range(mm, pmd, addr, next, prot);
+   if (err)
+   break;
} while (pmd++, addr = next, addr != end);
-   return 0;
+   return err;
 }
 
 static inline int zeromap_pud_range(struct mm_struct *mm, pgd_t *pgd,
@@ -1151,16 +1159,18 @@ static inline int zeromap_pud_range(stru
 {
pud_t *pud;
unsigned long next;
+   int err;
 
pud = pud_alloc(mm, pgd, addr);
if (!pud)
-   return -ENOMEM;
+   return -EAGAIN;
do {
next = pud_addr_end(addr, end);
-   if (zeromap_pmd_range(mm, pud, addr, next, prot))
-   return -ENOMEM;
+   err = zeromap_pmd_range(mm, pud, addr, next, prot);
+   if (err)
+   break;
} while (pud++, addr = next, addr != end);
-   return 0;
+   return 

[patch 03/50] read_zero_pagealigned() locking fix

2007-01-05 Thread Chris Wright
-stable review patch.  If anyone has any objections, please let us know.
--

From: Hugh Dickins [EMAIL PROTECTED]

Ramiro Voicu hits the BUG_ON(!pte_none(*pte)) in zeromap_pte_range: kernel
bugzilla 7645.  Right: read_zero_pagealigned uses down_read of mmap_sem,
but another thread's racing read of /dev/zero, or a normal fault, can
easily set that pte again, in between zap_page_range and zeromap_page_range
getting there.  It's been wrong ever since 2.4.3.

The simple fix is to use down_write instead, but that would serialize reads
of /dev/zero more than at present: perhaps some app would be badly
affected.  So instead let zeromap_page_range return the error instead of
BUG_ON, and read_zero_pagealigned break to the slower clear_user loop in
that case - there's no need to optimize for it.

Use -EEXIST for when a pte is found: BUG_ON in mmap_zero (the other user of
zeromap_page_range), though it really isn't interesting there.  And since
mmap_zero wants -EAGAIN for out-of-memory, the zeromaps better return that
than -ENOMEM.

Signed-off-by: Hugh Dickins [EMAIL PROTECTED]
Cc: Ramiro Voicu: [EMAIL PROTECTED]
Cc: [EMAIL PROTECTED]
Signed-off-by: Andrew Morton [EMAIL PROTECTED]
Signed-off-by: Chris Wright [EMAIL PROTECTED]
---

 drivers/char/mem.c |   12 
 mm/memory.c|   32 +---
 2 files changed, 29 insertions(+), 15 deletions(-)

--- linux-2.6.19.1.orig/drivers/char/mem.c
+++ linux-2.6.19.1/drivers/char/mem.c
@@ -646,7 +646,8 @@ static inline size_t read_zero_pagealign
count = size;
 
zap_page_range(vma, addr, count, NULL);
-   zeromap_page_range(vma, addr, count, PAGE_COPY);
+   if (zeromap_page_range(vma, addr, count, PAGE_COPY))
+   break;
 
size -= count;
buf += count;
@@ -713,11 +714,14 @@ out:
 
 static int mmap_zero(struct file * file, struct vm_area_struct * vma)
 {
+   int err;
+
if (vma-vm_flags  VM_SHARED)
return shmem_zero_setup(vma);
-   if (zeromap_page_range(vma, vma-vm_start, vma-vm_end - vma-vm_start, 
vma-vm_page_prot))
-   return -EAGAIN;
-   return 0;
+   err = zeromap_page_range(vma, vma-vm_start,
+   vma-vm_end - vma-vm_start, vma-vm_page_prot);
+   BUG_ON(err == -EEXIST);
+   return err;
 }
 #else /* CONFIG_MMU */
 static ssize_t read_zero(struct file * file, char * buf, 
--- linux-2.6.19.1.orig/mm/memory.c
+++ linux-2.6.19.1/mm/memory.c
@@ -1110,23 +1110,29 @@ static int zeromap_pte_range(struct mm_s
 {
pte_t *pte;
spinlock_t *ptl;
+   int err = 0;
 
pte = pte_alloc_map_lock(mm, pmd, addr, ptl);
if (!pte)
-   return -ENOMEM;
+   return -EAGAIN;
arch_enter_lazy_mmu_mode();
do {
struct page *page = ZERO_PAGE(addr);
pte_t zero_pte = pte_wrprotect(mk_pte(page, prot));
+
+   if (unlikely(!pte_none(*pte))) {
+   err = -EEXIST;
+   pte++;
+   break;
+   }
page_cache_get(page);
page_add_file_rmap(page);
inc_mm_counter(mm, file_rss);
-   BUG_ON(!pte_none(*pte));
set_pte_at(mm, addr, pte, zero_pte);
} while (pte++, addr += PAGE_SIZE, addr != end);
arch_leave_lazy_mmu_mode();
pte_unmap_unlock(pte - 1, ptl);
-   return 0;
+   return err;
 }
 
 static inline int zeromap_pmd_range(struct mm_struct *mm, pud_t *pud,
@@ -1134,16 +1140,18 @@ static inline int zeromap_pmd_range(stru
 {
pmd_t *pmd;
unsigned long next;
+   int err;
 
pmd = pmd_alloc(mm, pud, addr);
if (!pmd)
-   return -ENOMEM;
+   return -EAGAIN;
do {
next = pmd_addr_end(addr, end);
-   if (zeromap_pte_range(mm, pmd, addr, next, prot))
-   return -ENOMEM;
+   err = zeromap_pte_range(mm, pmd, addr, next, prot);
+   if (err)
+   break;
} while (pmd++, addr = next, addr != end);
-   return 0;
+   return err;
 }
 
 static inline int zeromap_pud_range(struct mm_struct *mm, pgd_t *pgd,
@@ -1151,16 +1159,18 @@ static inline int zeromap_pud_range(stru
 {
pud_t *pud;
unsigned long next;
+   int err;
 
pud = pud_alloc(mm, pgd, addr);
if (!pud)
-   return -ENOMEM;
+   return -EAGAIN;
do {
next = pud_addr_end(addr, end);
-   if (zeromap_pmd_range(mm, pud, addr, next, prot))
-   return -ENOMEM;
+   err = zeromap_pmd_range(mm, pud, addr, next, prot);
+   if (err)
+   break;
} while (pud++, addr = next, addr != end);
-   return 0;
+   return err;
 }
 
 int