[tip:x86/uv] x86/uv: Update the UV3 TLB shootdown logic

2014-06-05 Thread tip-bot for Cliff Wickman
Commit-ID:  a26fd71953711acb4884df84e393d52de57e4f17
Gitweb: http://git.kernel.org/tip/a26fd71953711acb4884df84e393d52de57e4f17
Author: Cliff Wickman <c...@sgi.com>
AuthorDate: Wed, 14 May 2014 16:15:47 -0500
Committer:  Ingo Molnar <mi...@kernel.org>
CommitDate: Thu, 5 Jun 2014 14:17:20 +0200

x86/uv: Update the UV3 TLB shootdown logic

Update of TLB shootdown code for UV3.

Kernel function native_flush_tlb_others() calls
uv_flush_tlb_others() on UV to invalidate tlb page definitions
on remote cpus. The UV systems have a hardware 'broadcast assist
unit' which can be used to broadcast shootdown messages to all
cpu's of selected nodes.
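
A rough sketch of that dispatch (abridged and illustrative only; the
flush_tlb_info setup and the IPI fallback in arch/x86/mm/tlb.c are
elided):

        /*
         * Sketch: on a UV system the generic flush path hands the cpumask
         * to uv_flush_tlb_others(), which drives the broadcast assist unit
         * and returns the cpus it could not cover so they can be flushed
         * with ordinary IPIs.
         */
        if (is_uv_system()) {
                unsigned int cpu = smp_processor_id();

                cpumask = uv_flush_tlb_others(cpumask, mm, start, end, cpu);
                if (cpumask)
                        smp_call_function_many(cpumask, flush_tlb_func,
                                               &info, 1);
                return;
        }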

The behavior of the BAU has changed only slightly with UV3:

  - UV3 is recognized with is_uv3_hub().
  - UV2 functions and structures (uv2_xxx) are in most cases
simply renamed to uv2_3_xxx.
  - Some UV2 error workarounds are not needed for UV3.
(see uv_bau_message_interrupt and enable_timeouts)

Signed-off-by: Cliff Wickman <c...@sgi.com>
Link: http://lkml.kernel.org/r/e1wkgwh-0001yj...@eag09.americas.sgi.com
[ Removed a few linebreak uglies. ]
Signed-off-by: Ingo Molnar <mi...@kernel.org>
---
 arch/x86/include/asm/uv/uv_bau.h | 19 ++-
 arch/x86/platform/uv/tlb_uv.c| 69 ++--
 2 files changed, 49 insertions(+), 39 deletions(-)

diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h
index 0b46ef2..2d60a78 100644
--- a/arch/x86/include/asm/uv/uv_bau.h
+++ b/arch/x86/include/asm/uv/uv_bau.h
@@ -73,6 +73,7 @@
 #define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD (is_uv1_hub() ?		\
 		UV1_INTD_SOFT_ACK_TIMEOUT_PERIOD :	\
 		UV2_INTD_SOFT_ACK_TIMEOUT_PERIOD)
+/* assuming UV3 is the same */
 
 #define BAU_MISC_CONTROL_MULT_MASK 3
 
@@ -93,6 +94,8 @@
 #define SOFTACK_MSHIFT UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT
 #define SOFTACK_PSHIFT UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT
 #define SOFTACK_TIMEOUT_PERIOD UV_INTD_SOFT_ACK_TIMEOUT_PERIOD
+#define PREFETCH_HINT_SHFT UV3H_LB_BAU_MISC_CONTROL_ENABLE_INTD_PREFETCH_HINT_SHFT
+#define SB_STATUS_SHFT UV3H_LB_BAU_MISC_CONTROL_ENABLE_EXTENDED_SB_STATUS_SHFT
 #define write_gmmr uv_write_global_mmr64
 #define write_lmmr uv_write_local_mmr
 #define read_lmmr  uv_read_local_mmr
@@ -322,8 +325,9 @@ struct uv1_bau_msg_header {
 /*
  * UV2 Message header:  16 bytes (128 bits) (bytes 0x30-0x3f of descriptor)
  * see figure 9-2 of harp_sys.pdf
+ * assuming UV3 is the same
  */
-struct uv2_bau_msg_header {
+struct uv2_3_bau_msg_header {
unsigned intbase_dest_nasid:15; /* nasid of the first bit */
/* bits 14:0 */ /* in uvhub map */
unsigned intdest_subnodeid:5;   /* must be 0x10, for the LB */
@@ -395,7 +399,7 @@ struct bau_desc {
 */
union bau_msg_header {
struct uv1_bau_msg_header   uv1_hdr;
-   struct uv2_bau_msg_header   uv2_hdr;
+   struct uv2_3_bau_msg_header uv2_3_hdr;
} header;
 
struct bau_msg_payload  payload;
@@ -631,11 +635,6 @@ struct bau_control {
struct hub_and_pnode*thp;
 };
 
-static inline unsigned long read_mmr_uv2_status(void)
-{
-   return read_lmmr(UV2H_LB_BAU_SB_ACTIVATION_STATUS_2);
-}
-
 static inline void write_mmr_data_broadcast(int pnode, unsigned long mmr_image)
 {
write_gmmr(pnode, UVH_BAU_DATA_BROADCAST, mmr_image);
@@ -760,7 +759,11 @@ static inline int atomic_read_short(const struct atomic_short *v)
  */
 static inline int atom_asr(short i, struct atomic_short *v)
 {
-   return i + xadd(&v->counter, i);
+   short __i = i;
+   asm volatile(LOCK_PREFIX "xaddw %0, %1"
+   : "+r" (i), "+m" (v->counter)
+   : : "memory");
+   return i + __i;
 }
 
 /*
diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c
index dfe605a..ed161c6 100644
--- a/arch/x86/platform/uv/tlb_uv.c
+++ b/arch/x86/platform/uv/tlb_uv.c
@@ -1,7 +1,7 @@
 /*
  * SGI UltraViolet TLB flush routines.
  *
- * (c) 2008-2012 Cliff Wickman , SGI.
+ * (c) 2008-2014 Cliff Wickman , SGI.
  *
  * This code is released under the GNU General Public License version 2 or
  * later.
@@ -451,7 +451,7 @@ static inline unsigned long long cycles_2_ns(unsigned long long cyc)
 
 /*
  * The reverse of the above; converts a duration in ns to a duration in cycles.
- */ 
+ */
 static inline unsigned long long ns_2_cycles(unsigned long long ns)
 {
struct cyc2ns_data *data = cyc2ns_read_begin();
@@ -563,7 +563,7 @@ static int uv1_wait_completion(struct bau_desc *bau_desc,
  * UV2 could have an extra bit of status in the ACTIVATION_STATUS_2 register.
  * But not currently used.
  */
-static unsigned long uv2_read_status(unsigned long offset, int rshft, int desc)
+static unsigned long uv2_3_read_status(unsigned long offset, int rshft, int desc)

Re: [PATCH v5 1/5] vmcore: Introduce ELF header in new memory feature

2013-06-27 Thread Cliff Wickman
> > notes_section = kmalloc(max_sz, GFP_KERNEL);
> > if (!notes_section)
> > return -ENOMEM;
> > -   rc = read_from_oldmem(notes_section, max_sz, &offset, 0);
> > +   rc = elfcorehdr_read_notes(notes_section, max_sz, &offset);
> > if (rc < 0) {
> > kfree(notes_section);
> > return rc;
> > @@ -409,7 +439,8 @@ static int __init copy_notes_elf64(const
> > if (phdr_ptr->p_type != PT_NOTE)
> > continue;
> > offset = phdr_ptr->p_offset;
> > -   rc = read_from_oldmem(notes_buf, phdr_ptr->p_memsz, &offset, 0);
> > +   rc = elfcorehdr_read_notes(notes_buf, phdr_ptr->p_memsz,
> > +  &offset);
> > if (rc < 0)
> > return rc;
> > notes_buf += phdr_ptr->p_memsz;
> > @@ -510,7 +541,7 @@ static int __init update_note_header_siz
> > notes_section = kmalloc(max_sz, GFP_KERNEL);
> > if (!notes_section)
> > return -ENOMEM;
> > -   rc = read_from_oldmem(notes_section, max_sz, &offset, 0);
> > +   rc = elfcorehdr_read_notes(notes_section, max_sz, &offset);
> > if (rc < 0) {
> > kfree(notes_section);
> > return rc;
> > @@ -597,7 +628,8 @@ static int __init copy_notes_elf32(const
> > if (phdr_ptr->p_type != PT_NOTE)
> > continue;
> > offset = phdr_ptr->p_offset;
> > -   rc = read_from_oldmem(notes_buf, phdr_ptr->p_memsz, &offset, 0);
> > +   rc = elfcorehdr_read_notes(notes_buf, phdr_ptr->p_memsz,
> > +  &offset);
> > if (rc < 0)
> > return rc;
> > notes_buf += phdr_ptr->p_memsz;
> > @@ -793,7 +825,7 @@ static int __init parse_crash_elf64_head
> > addr = elfcorehdr_addr;
> >  
> > /* Read Elf header */
> > -   rc = read_from_oldmem((char*)&ehdr, sizeof(Elf64_Ehdr), &addr, 0);
> > +   rc = elfcorehdr_read((char *)&ehdr, sizeof(Elf64_Ehdr), &addr);
> > if (rc < 0)
> > return rc;
> >  
> > @@ -820,7 +852,7 @@ static int __init parse_crash_elf64_head
> > if (!elfcorebuf)
> > return -ENOMEM;
> > addr = elfcorehdr_addr;
> > -   rc = read_from_oldmem(elfcorebuf, elfcorebuf_sz_orig, &addr, 0);
> > +   rc = elfcorehdr_read(elfcorebuf, elfcorebuf_sz_orig, &addr);
> > if (rc < 0)
> > goto fail;
> >  
> > @@ -849,7 +881,7 @@ static int __init parse_crash_elf32_head
> > addr = elfcorehdr_addr;
> >  
> > /* Read Elf header */
> > -   rc = read_from_oldmem((char*)&ehdr, sizeof(Elf32_Ehdr), &addr, 0);
> > +   rc = elfcorehdr_read((char *)&ehdr, sizeof(Elf32_Ehdr), &addr);
> > if (rc < 0)
> > return rc;
> >  
> > @@ -875,7 +907,7 @@ static int __init parse_crash_elf32_head
> > if (!elfcorebuf)
> > return -ENOMEM;
> > addr = elfcorehdr_addr;
> > -   rc = read_from_oldmem(elfcorebuf, elfcorebuf_sz_orig, &addr, 0);
> > +   rc = elfcorehdr_read(elfcorebuf, elfcorebuf_sz_orig, &addr);
> > if (rc < 0)
> > goto fail;
> >  
> > @@ -902,7 +934,7 @@ static int __init parse_crash_elf_header
> > int rc=0;
> >  
> > addr = elfcorehdr_addr;
> > -   rc = read_from_oldmem(e_ident, EI_NIDENT, &addr, 0);
> > +   rc = elfcorehdr_read(e_ident, EI_NIDENT, &addr);
> > if (rc < 0)
> > return rc;
> > if (memcmp(e_ident, ELFMAG, SELFMAG) != 0) {
> > @@ -935,7 +967,14 @@ static int __init vmcore_init(void)
> >  {
> > int rc = 0;
> >  
> > -   /* If elfcorehdr= has been passed in cmdline, then capture the dump.*/
> > +   /* Allow architectures to allocate ELF header in 2nd kernel */
> > +   rc = elfcorehdr_alloc(&elfcorehdr_addr, &elfcorehdr_size);
> > +   if (rc)
> > +   return rc;
> > +   /*
> > +* If elfcorehdr= has been passed in cmdline or created in 2nd kernel,
> > +* then capture the dump.
> > +*/
> > if (!(is_vmcore_usable()))
> > return rc;
> > rc = parse_crash_elf_headers();
> > @@ -943,7 +982,11 @@ static int __init vmcore_init(void)
> > pr_warn("Kdump: vmcore not initialized\n");
> > return rc;
> > }
> > -
> > +   elfcorehdr_free(elfcorehdr_addr);
> > +   /*
> > +* elfcorehdr_addr must not be set to NULL here to keep
> > +* is_kdump_kernel() working.
> > +*/
> > proc_vmcore = proc_create("vmcore", S_IRUSR, NULL, 
> > &proc_vmcore_operations);
> > if (proc_vmcore)
> > proc_vmcore->size = vmcore_size;
> > --- a/include/linux/crash_dump.h
> > +++ b/include/linux/crash_dump.h
> > @@ -12,6 +12,12 @@
> >  extern unsigned long long elfcorehdr_addr;
> >  extern unsigned long long elfcorehdr_size;
> >  
> > +extern int __weak elfcorehdr_alloc(unsigned long long *addr,
> > +  unsigned long long *size);
> > +extern void __weak elfcorehdr_free(unsigned long long addr);
> > +extern ssize_t __weak elfcorehdr_read(char *buf, size_t count, u64 *ppos);
> > +extern ssize_t __weak elfcorehdr_read_notes(char *buf, size_t count, u64 
> > *ppos);
> > +
> >  extern ssize_t copy_oldmem_page(unsigned long, char *, size_t,
> > unsigned long, int);
> >  
> 
> ___
> kexec mailing list
> ke...@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/kexec

-- 
Cliff Wickman
SGI
c...@sgi.com
(651) 683-3824


Re: [PATCH v3] mm/pagewalk.c: walk_page_range should avoid VM_PFNMAP areas

2013-05-24 Thread Cliff Wickman



> On Thu, 23 May 2013 Andrew Morton <a...@linux-foundation.org> wrote:

> > On Wed, 15 May 2013 07:46:36 -0500 Cliff Wickman <c...@sgi.com> wrote:
> > Certain tests in walk_page_range() (specifically split_huge_page_pmd())
> > assume that all the mapped PFN's are backed with page structures. And this 
> > is
> > not usually true for VM_PFNMAP areas. This can result in panics on kernel
> > page faults when attempting to address those page structures.
> > 
> > There are a half dozen callers of walk_page_range() that walk through
> > a task's entire page table (as N. Horiguchi pointed out). So rather than
> > change all of them, this patch changes just walk_page_range() to ignore 
> > VM_PFNMAP areas.
> > 
> > The logic of hugetlb_vma() is moved back into walk_page_range(), as we
> > want to test any vma in the range.
> > 
> > VM_PFNMAP areas are used by:
> > - graphics memory manager   gpu/drm/drm_gem.c
> > - global reference unit sgi-gru/grufile.c
> > - sgi special memory    char/mspec.c
> > - and probably several out-of-tree modules
> 
> What are your thoughts on the urgency/scheduling of this fix?

The panic can be caused by simply cat'ing /proc/<pid>/smaps while an
application has a VM_PFNMAP range.  It happened in-house when a benchmarker
was trying to decipher the memory layout of his program.
So that makes it rather urgent from our point of view.
We would like to see the fix included in upcoming distro releases, and having
it upstream makes that much easier to accomplish.

> (Just to be irritating: "When writing a changelog, please describe the
> end-user-visible effects of the bug, so that others can more easily
> decide which kernel version(s) should be fixed, and so that downstream
> kernel maintainers can more easily work out whether this patch will fix
> a problem which they or their customers are observing.")



[PATCH v3] mm/pagewalk.c: walk_page_range should avoid VM_PFNMAP areas

2013-05-15 Thread Cliff Wickman

/proc/<pid>/smaps and similar walks through a user page table should not
be looking at VM_PFNMAP areas.

v2: 
- moves the VM_BUG_ON out of the loop
- adds the needed test for  vma->vm_start <= addr

v3 adds comments to make this clearer, as N. Horiguchi recommends:
  > I recommend that you check VM_PFNMAP in the possible callers' side.
  > But this patch seems to solve your problem, so with properly commenting
  > this somewhere, I do not oppose it.

Certain tests in walk_page_range() (specifically split_huge_page_pmd())
assume that all the mapped PFN's are backed with page structures. And this is
not usually true for VM_PFNMAP areas. This can result in panics on kernel
page faults when attempting to address those page structures.
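
For illustration, a VM_PFNMAP range is typically what a driver's mmap
handler creates with remap_pfn_range(), which installs raw PFNs straight
into the page table; nothing allocates page structures for them.  A
minimal hypothetical handler (not taken from any of the drivers listed
below) looks like:

        /*
         * Hypothetical example only.  remap_pfn_range() marks the vma
         * VM_PFNMAP (and VM_IO) and maps raw PFNs; no struct pages back
         * them, so pfn_to_page() on such a PFN yields garbage.
         */
        static int example_mmap(struct file *file, struct vm_area_struct *vma)
        {
                unsigned long size = vma->vm_end - vma->vm_start;

                /* example_base_pfn: made-up PFN of device/special memory */
                return remap_pfn_range(vma, vma->vm_start, example_base_pfn,
                                       size, vma->vm_page_prot);
        }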

There are a half dozen callers of walk_page_range() that walk through
a task's entire page table (as N. Horiguchi pointed out). So rather than
change all of them, this patch changes just walk_page_range() to ignore 
VM_PFNMAP areas.

The logic of hugetlb_vma() is moved back into walk_page_range(), as we
want to test any vma in the range.

VM_PFNMAP areas are used by:
- graphics memory manager   gpu/drm/drm_gem.c
- global reference unit sgi-gru/grufile.c
- sgi special memory    char/mspec.c
- and probably several out-of-tree modules

I'm copying everyone who has changed this file recently, in case
there is some reason that I am not aware of to provide
/proc/<pid>/smaps|clear_refs|maps|numa_maps for these VM_PFNMAP areas.

Signed-off-by: Cliff Wickman <c...@sgi.com>
---
 mm/pagewalk.c |   65 --
 1 file changed, 36 insertions(+), 29 deletions(-)

Index: linux/mm/pagewalk.c
===
--- linux.orig/mm/pagewalk.c
+++ linux/mm/pagewalk.c
@@ -127,22 +127,6 @@ static int walk_hugetlb_range(struct vm_
return 0;
 }
 
-static struct vm_area_struct* hugetlb_vma(unsigned long addr, struct mm_walk *walk)
-{
-   struct vm_area_struct *vma;
-
-   /* We don't need vma lookup at all. */
-   if (!walk->hugetlb_entry)
-   return NULL;
-
-   VM_BUG_ON(!rwsem_is_locked(&walk->mm->mmap_sem));
-   vma = find_vma(walk->mm, addr);
-   if (vma && vma->vm_start <= addr && is_vm_hugetlb_page(vma))
-   return vma;
-
-   return NULL;
-}
-
 #else /* CONFIG_HUGETLB_PAGE */
 static struct vm_area_struct* hugetlb_vma(unsigned long addr, struct mm_walk *walk)
 {
@@ -198,30 +182,53 @@ int walk_page_range(unsigned long addr,
if (!walk->mm)
return -EINVAL;
 
+   VM_BUG_ON(!rwsem_is_locked(&walk->mm->mmap_sem));
+
pgd = pgd_offset(walk->mm, addr);
do {
-   struct vm_area_struct *vma;
+   struct vm_area_struct *vma = NULL;
 
next = pgd_addr_end(addr, end);
 
/*
-* handle hugetlb vma individually because pagetable walk for
-* the hugetlb page is dependent on the architecture and
-* we can't handled it in the same manner as non-huge pages.
+* This function was not intended to be vma based.
+* But there are vma special cases to be handled:
+* - hugetlb vma's
+* - VM_PFNMAP vma's
 */
-   vma = hugetlb_vma(addr, walk);
+   vma = find_vma(walk->mm, addr);
if (vma) {
-   if (vma->vm_end < next)
+   /*
+* There are no page structures backing a VM_PFNMAP
+* range, so do not allow split_huge_page_pmd().
+*/
+   if ((vma->vm_start <= addr) &&
+   (vma->vm_flags & VM_PFNMAP)) {
next = vma->vm_end;
+   pgd = pgd_offset(walk->mm, next);
+   continue;
+   }
/*
-* Hugepage is very tightly coupled with vma, so
-* walk through hugetlb entries within a given vma.
+* Handle hugetlb vma individually because pagetable
+* walk for the hugetlb page is dependent on the
+* architecture and we can't handled it in the same
+* manner as non-huge pages.
 */
-   err = walk_hugetlb_range(vma, addr, next, walk);
-   if (err)
-   break;
-   pgd = pgd_offset(walk->mm, next);
-   continue;
+   if (walk->hugetlb_entry && (vma->vm_start <= addr) &&
+   is_vm_hugetlb_page(vma)) {

makedumpfile mmap() benchmark

2013-05-03 Thread Cliff Wickman

> Jingbai Ma wrote on 27 Mar 2013:
> I have tested the makedumpfile mmap patch on a machine with 2TB memory, 
> here is testing results:
> Test environment:
> Machine: HP ProLiant DL980 G7 with 2TB RAM.
> CPU: Intel(R) Xeon(R) CPU E7- 2860  @ 2.27GHz (8 sockets, 10 cores)
> (Only 1 cpu was enabled the 2nd kernel)
> Kernel: 3.9.0-rc3+ with mmap kernel patch v3
> vmcore size: 2.0TB
> Dump file size: 3.6GB
> makedumpfile mmap branch with parameters: -c --message-level 23 -d 31 --map-size <map-size>
> All measured time from debug message of makedumpfile.
> 
> As a comparison, I also have tested with original kernel and original 
> makedumpfile 1.5.1 and 1.5.3.
> I added all [Excluding unnecessary pages] and [Excluding free pages] 
> time together as "Filter Pages", and [Copyying Data] as "Copy data" here.
> 
> makedumjpfile Kernel  map-size (KB)   Filter pages (s)   Copy data (s)   Total (s)
> 1.5.1  3.7.0-0.36.el7.x86_64  N/A 940.28  1269.25 2209.53
> 1.5.3  3.7.0-0.36.el7.x86_64  N/A 380.09  992.77  1372.86
> 1.5.3 v3.9-rc3N/A 197.77  892.27  1090.04
> 1.5.3+mmapv3.9-rc3+mmap   0   164.87  606.06  770.93
> 1.5.3+mmapv3.9-rc3+mmap   4   88.62   576.07  664.69
> 1.5.3+mmapv3.9-rc3+mmap   102483.66   477.23  560.89
> 1.5.3+mmapv3.9-rc3+mmap   204883.44   477.21  560.65
> 1.5.3+mmapv3.9-rc3+mmap   10240   83.84   476.56  560.4

I have also tested the makedumpfile mmap patch on a machine with 2TB memory, 
here are the results:
Test environment:
Machine: SGI UV1000 with 2TB RAM.
CPU: Intel(R) Xeon(R) CPU E7- 8837  @ 2.67GHz
(only 1 cpu was enabled in the 2nd kernel)
Kernel: 3.0.13 with mmap kernel patch v3 (I had to tweak the patch a bit)
vmcore size: 2.0TB
Dump file size: 3.6GB
makedumpfile mmap branch with parameters: -c --message-level 23 -d 31 
   --map-size <map-size>
All measured times are actual clock times.
All tests are noncyclic.   Crash kernel memory: crashkernel=512M

As did Jingbai Ma, I also tested with an unpatched kernel and
makedumpfile 1.5.1 and 1.5.3.  But they do 2 filtering scans: unnecessary
pages and free pages; here added together as filter pages time.

                                  Filter    Copy
makedumpfile Kernel  map-size(KB) pages(s)  data(s) Total(s)
1.5.13.0.13N/A671   5111182
1.5.33.0.13N/A294   535 829
1.5.3+mmap   3.0.13+mmap 0 54   506 560
1.5.3+mmap   3.0.13+mmap  4096 40   416 456
1.5.3+mmap   3.0.13+mmap 10240 37   424 461

Using mmap for the copy data as well as for filtering pages did little:
1.5.3+mmap   3.0.13+mmap  4096 37   414 451

My results are quite similar to Jingbai Ma's.
The mmap patch to the kernel greatly speeds the filtering of pages, so
we at SGI would very much like to see this patch in the 3.10 kernel.
  http://marc.info/?l=linux-kernel&m=136627770125345&w=2

What puzzles me is that the patch greatly speeds the read's of /proc/vmcore
(where map-size is 0) as well as providing the mmap ability.  I can now
seek/read page structures almost as fast as mmap'ing and copying them.
(versus Jingbai Ma's results where mmap almost doubled the speed of reads)
I have put counters in to verify, and we are doing several million
seek/read's vs. a few thousand mmap's.  Yet the performance is similar
(54sec vs. 37sec, above). I can't rationalize that much improvement.
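
For reference, the two access patterns being compared reduce to roughly
the following user-space sketch (simplified; makedumpfile's real buffer
management and cache logic differ):

        #include <fcntl.h>
        #include <string.h>
        #include <sys/mman.h>
        #include <unistd.h>

        #define WINDOW (4096UL * 1024)          /* e.g. --map-size 4096 (KB) */

        int main(void)
        {
                char buf[4096];
                off_t offset = 4096;            /* arbitrary example offset */
                int fd = open("/proc/vmcore", O_RDONLY);

                if (fd < 0)
                        return 1;

                /* map-size 0: many small seek/read (pread) operations */
                pread(fd, buf, sizeof(buf), offset);

                /* map-size > 0: map a window once, copy pieces out of it */
                void *win = mmap(NULL, WINDOW, PROT_READ, MAP_PRIVATE, fd, 0);
                if (win != MAP_FAILED) {
                        memcpy(buf, (char *)win + offset, sizeof(buf));
                        munmap(win, WINDOW);
                }

                close(fd);
                return 0;
        }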

Thanks,
Cliff Wickman


Re: [PATCH v2] mm/pagewalk.c: walk_page_range should avoid VM_PFNMAP areas

2013-05-02 Thread Cliff Wickman
On Thu, May 02, 2013 at 12:44:04PM -0400, Naoya Horiguchi wrote:
> On Thu, May 02, 2013 at 07:10:48AM -0500, Cliff Wickman wrote:
> > 
> > /proc/<pid>/smaps and similar walks through a user page table should not
> > be looking at VM_PFNMAP areas.
> > 
> > This is v2: 
> > - moves the VM_BUG_ON out of the loop
> > - adds the needed test for  vma->vm_start <= addr
> > 
> > Certain tests in walk_page_range() (specifically split_huge_page_pmd())
> > assume that all the mapped PFN's are backed with page structures. And this 
> > is
> > not usually true for VM_PFNMAP areas. This can result in panics on kernel
> > page faults when attempting to address those page structures.
> > 
> > There are a half dozen callers of walk_page_range() that walk through
> > a task's entire page table (as N. Horiguchi pointed out). So rather than
> > change all of them, this patch changes just walk_page_range() to ignore 
> > VM_PFNMAP areas.
> > 
> > The logic of hugetlb_vma() is moved back into walk_page_range(), as we
> > want to test any vma in the range.
> > 
> > VM_PFNMAP areas are used by:
> > - graphics memory manager   gpu/drm/drm_gem.c
> > - global reference unit sgi-gru/grufile.c
> > - sgi special memory    char/mspec.c
> > - and probably several out-of-tree modules
> > 
> > I'm copying everyone who has changed this file recently, in case
> > there is some reason that I am not aware of to provide
> > /proc/<pid>/smaps|clear_refs|maps|numa_maps for these VM_PFNMAP areas.
> > 
> > Signed-off-by: Cliff Wickman <c...@sgi.com>
> 
> walk_page_range() does vma-based walk only for address ranges backed by
> hugetlbfs, and it doesn't see vma for address ranges backed by normal pages
> and thps (in those case we just walk over page table hierarchy).

Agreed, walk_page_range() only checks for a hugetlbfs-type vma as it
scans an address range.

The problem I'm seeing comes in when it calls walk_pud_range() for any address
range that is not within a hugetlbfs vma:
   walk_pmd_range()
 split_huge_page_pmd_mm()
   split_huge_page_pmd()
 __split_huge_page_pmd()
   page = pmd_page(*pmd)
And such a page structure does not exist for a VM_PFNMAP area.
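
Roughly speaking, pmd_page() resolves the pmd's PFN to a struct page
pointer along these lines (sketch, not exact source; details vary by
architecture and memory model):

        /* something like: */
        #define pmd_page(pmd)   pfn_to_page(pmd_pfn(pmd))

        /*
         * For a VM_PFNMAP mapping the PFN usually refers to device or other
         * special memory with no memmap entry, so the struct page computed
         * here is bogus and dereferencing it faults.
         */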
 
> I think that vma-based walk was introduced as a kind of dirty hack to
> handle hugetlbfs, and it can be cleaned up in the future. So I'm afraid
> it's not a good idea to extend or adding code heavily depending on this hack.

walk_page_range() looks like generic infrastructure to scan any range
of a user's address space - as in /proc/<pid>/smaps and similar. And the
hugetlbfs check seems to have been added as an exception.  
Huge page exceptional cases occur further down the chain.  And
when a corresponding page structure is needed for those cases we
run into the problem.

I'm not depending on walk_page_range(). I'm just trying to survive the
case where it is scanning a VM_PFNMAP range.

> I recommend that you check VM_PFNMAP in the possible callers' side.
> But this patch seems to solve your problem, so with properly commenting
> this somewhere, I do not oppose it.

Agreed, it could be handled by checking at several points higher up. But
checking at this common point seems more straightforward to me.

-Cliff
> 
> Thanks,
> Naoya Horiguchi

-- 
Cliff Wickman
SGI
c...@sgi.com
(651) 683-3824


[PATCH v2] mm/pagewalk.c: walk_page_range should avoid VM_PFNMAP areas

2013-05-02 Thread Cliff Wickman

/proc/<pid>/smaps and similar walks through a user page table should not
be looking at VM_PFNMAP areas.

This is v2: 
- moves the VM_BUG_ON out of the loop
- adds the needed test for  vma->vm_start <= addr

Certain tests in walk_page_range() (specifically split_huge_page_pmd())
assume that all the mapped PFN's are backed with page structures. And this is
not usually true for VM_PFNMAP areas. This can result in panics on kernel
page faults when attempting to address those page structures.

There are a half dozen callers of walk_page_range() that walk through
a task's entire page table (as N. Horiguchi pointed out). So rather than
change all of them, this patch changes just walk_page_range() to ignore 
VM_PFNMAP areas.

The logic of hugetlb_vma() is moved back into walk_page_range(), as we
want to test any vma in the range.

VM_PFNMAP areas are used by:
- graphics memory manager   gpu/drm/drm_gem.c
- global reference unit sgi-gru/grufile.c
- sgi special memory    char/mspec.c
- and probably several out-of-tree modules

I'm copying everyone who has changed this file recently, in case
there is some reason that I am not aware of to provide
/proc/<pid>/smaps|clear_refs|maps|numa_maps for these VM_PFNMAP areas.

Signed-off-by: Cliff Wickman <c...@sgi.com>
---
 mm/pagewalk.c |   62 ++
 1 file changed, 33 insertions(+), 29 deletions(-)

Index: linux/mm/pagewalk.c
===
--- linux.orig/mm/pagewalk.c
+++ linux/mm/pagewalk.c
@@ -127,22 +127,6 @@ static int walk_hugetlb_range(struct vm_
return 0;
 }
 
-static struct vm_area_struct* hugetlb_vma(unsigned long addr, struct mm_walk *walk)
-{
-   struct vm_area_struct *vma;
-
-   /* We don't need vma lookup at all. */
-   if (!walk->hugetlb_entry)
-   return NULL;
-
-   VM_BUG_ON(!rwsem_is_locked(&walk->mm->mmap_sem));
-   vma = find_vma(walk->mm, addr);
-   if (vma && vma->vm_start <= addr && is_vm_hugetlb_page(vma))
-   return vma;
-
-   return NULL;
-}
-
 #else /* CONFIG_HUGETLB_PAGE */
 static struct vm_area_struct* hugetlb_vma(unsigned long addr, struct mm_walk *walk)
 {
@@ -198,30 +182,50 @@ int walk_page_range(unsigned long addr,
if (!walk->mm)
return -EINVAL;
 
+   VM_BUG_ON(!rwsem_is_locked(&walk->mm->mmap_sem));
+
pgd = pgd_offset(walk->mm, addr);
do {
-   struct vm_area_struct *vma;
+   struct vm_area_struct *vma = NULL;
 
next = pgd_addr_end(addr, end);
 
/*
-* handle hugetlb vma individually because pagetable walk for
-* the hugetlb page is dependent on the architecture and
-* we can't handled it in the same manner as non-huge pages.
+* Check any special vma's within this range.
 */
-   vma = hugetlb_vma(addr, walk);
+   vma = find_vma(walk->mm, addr);
if (vma) {
-   if (vma->vm_end < next)
+   /*
+* There are no page structures backing a VM_PFNMAP
+* range, so do not allow split_huge_page_pmd().
+*/
+   if ((vma->vm_start <= addr) &&
+   (vma->vm_flags & VM_PFNMAP)) {
next = vma->vm_end;
+   pgd = pgd_offset(walk->mm, next);
+   continue;
+   }
/*
-* Hugepage is very tightly coupled with vma, so
-* walk through hugetlb entries within a given vma.
+* Handle hugetlb vma individually because pagetable
+* walk for the hugetlb page is dependent on the
+* architecture and we can't handled it in the same
+* manner as non-huge pages.
 */
-   err = walk_hugetlb_range(vma, addr, next, walk);
-   if (err)
-   break;
-   pgd = pgd_offset(walk->mm, next);
-   continue;
+   if (walk->hugetlb_entry && (vma->vm_start <= addr) &&
+   is_vm_hugetlb_page(vma)) {
+   if (vma->vm_end < next)
+   next = vma->vm_end;
+   /*
+* Hugepage is very tightly coupled with vma,
+* so walk through hugetlb entries within a
+* given vma.
+*/
+   err = walk_hugetlb_range(vma, addr, next, walk);


Re: [PATCH] mm/pagewalk.c: walk_page_range should avoid VM_PFNMAP areas

2013-05-01 Thread Cliff Wickman
On Wed, May 01, 2013 at 08:47:02AM -0700, David Rientjes wrote:
> On Wed, 1 May 2013, Cliff Wickman wrote:
> 
> > Index: linux/mm/pagewalk.c
> > ===
> > --- linux.orig/mm/pagewalk.c
> > +++ linux/mm/pagewalk.c
> > @@ -127,22 +127,6 @@ static int walk_hugetlb_range(struct vm_
> > return 0;
> >  }
> >  
> > -static struct vm_area_struct* hugetlb_vma(unsigned long addr, struct mm_walk *walk)
> > -{
> > -   struct vm_area_struct *vma;
> > -
> > -   /* We don't need vma lookup at all. */
> > -   if (!walk->hugetlb_entry)
> > -   return NULL;
> > -
> > -   VM_BUG_ON(!rwsem_is_locked(&walk->mm->mmap_sem));
> > -   vma = find_vma(walk->mm, addr);
> > -   if (vma && vma->vm_start <= addr && is_vm_hugetlb_page(vma))
> > -   return vma;
> > -
> > -   return NULL;
> > -}
> > -
> >  #else /* CONFIG_HUGETLB_PAGE */
> >  static struct vm_area_struct* hugetlb_vma(unsigned long addr, struct mm_walk *walk)
> >  {
> > @@ -200,28 +184,46 @@ int walk_page_range(unsigned long addr,
> >  
> > pgd = pgd_offset(walk->mm, addr);
> > do {
> > -   struct vm_area_struct *vma;
> > +   struct vm_area_struct *vma = NULL;
> >  
> > next = pgd_addr_end(addr, end);
> >  
> > /*
> > -* handle hugetlb vma individually because pagetable walk for
> > -* the hugetlb page is dependent on the architecture and
> > -* we can't handled it in the same manner as non-huge pages.
> > +* Check any special vma's within this range.
> >  */
> > -   vma = hugetlb_vma(addr, walk);
> > +   VM_BUG_ON(!rwsem_is_locked(&walk->mm->mmap_sem));
> 
> I think this should be moved out of the iteration.  It's currently inside 
> it even before your patch, but I think it's pointless.

I don't follow.  We are iterating through a range of addresses.  When
we come to a range that is VM_PFNMAP we skip it.  How can we take that
out of the iteration?
 
> > +   vma = find_vma(walk->mm, addr);
> > if (vma) {
> > -   if (vma->vm_end < next)
> > +   /*
> > +* There are no page structures backing a VM_PFNMAP
> > +* range, so allow no split_huge_page_pmd().
> > +*/
> > +   if (vma->vm_flags & VM_PFNMAP) {
> > next = vma->vm_end;
> > +   pgd = pgd_offset(walk->mm, next);
> > +   continue;
> > +   }
> 
> What if end < vma->vm_end?

Yes, a bad omission.  Thanks for pointing that out.
It should be if ((vma->vm_start <= addr) && (vma->vm_flags & VM_PFNMAP))
as find_vma can return a vma above the addr.
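
In other words (illustrative snippet only, mirroring the v2 change):

        /*
         * find_vma() returns the first vma with vm_end > addr; that vma may
         * begin above addr (addr can sit in a hole below it), so the flag
         * test alone is not enough:
         */
        vma = find_vma(walk->mm, addr);
        if (vma && (vma->vm_start <= addr) && (vma->vm_flags & VM_PFNMAP)) {
                /* addr really lies inside a VM_PFNMAP vma: skip the vma */
                next = vma->vm_end;
                pgd = pgd_offset(walk->mm, next);
        }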

-Cliff
> > /*
> > -* Hugepage is very tightly coupled with vma, so
> > -* walk through hugetlb entries within a given vma.
> > +* Handle hugetlb vma individually because pagetable
> > +* walk for the hugetlb page is dependent on the
> > +* architecture and we can't handled it in the same
> > +* manner as non-huge pages.
> >  */
> > -   err = walk_hugetlb_range(vma, addr, next, walk);
> > -   if (err)
> > -   break;
> > -   pgd = pgd_offset(walk->mm, next);
> > -   continue;
> > +   if (walk->hugetlb_entry && (vma->vm_start <= addr) &&
> > +   is_vm_hugetlb_page(vma)) {
> > +   if (vma->vm_end < next)
> > +   next = vma->vm_end;
> > +   /*
> > +        * Hugepage is very tightly coupled with vma,
> > +* so walk through hugetlb entries within a
> > +* given vma.
> > +*/
> > +   err = walk_hugetlb_range(vma, addr, next, walk);
> > +   if (err)
> > +   break;
> > +   pgd = pgd_offset(walk->mm, next);
> > +   continue;
> > +   }
> > }
> >  
> > if (pgd_none_or_clear_bad(pgd)) {

-- 
Cliff Wickman
SGI
c...@sgi.com
(651) 683-3824


[PATCH] mm/pagewalk.c: walk_page_range should avoid VM_PFNMAP areas

2013-05-01 Thread Cliff Wickman

This patch replaces "[PATCH] fs/proc: smaps should avoid VM_PFNMAP areas".
/proc/<pid>/smaps and similar walks through a user page table should not
be looking at VM_PFNMAP areas.

Certain tests in walk_page_range() (specifically split_huge_page_pmd())
assume that all the mapped PFN's are backed with page structures. And this is
not usually true for VM_PFNMAP areas. This can result in panics on kernel
page faults when attempting to address those page structures.

There are a half dozen callers of walk_page_range() that walk through
a task's entire page table (as N. Horiguchi pointed out). So rather than
change all of them, this patch changes just walk_page_range() to ignore 
VM_PFNMAP areas.

The logic of hugetlb_vma() is moved back into walk_page_range(), as we
want to test any vma in the range.

VM_PFNMAP areas are used by:
- graphics memory manager   gpu/drm/drm_gem.c
- global reference unit sgi-gru/grufile.c
- sgi special memory    char/mspec.c
- and probably several out-of-tree modules

I'm copying everyone who has changed this file recently, in case
there is some reason that I am not aware of to provide
/proc/<pid>/smaps|clear_refs|maps|numa_maps for these VM_PFNMAP areas.

Signed-off-by: Cliff Wickman 
---
 mm/pagewalk.c |   60 +-
 1 file changed, 31 insertions(+), 29 deletions(-)

Index: linux/mm/pagewalk.c
===
--- linux.orig/mm/pagewalk.c
+++ linux/mm/pagewalk.c
@@ -127,22 +127,6 @@ static int walk_hugetlb_range(struct vm_
return 0;
 }
 
-static struct vm_area_struct* hugetlb_vma(unsigned long addr, struct mm_walk 
*walk)
-{
-   struct vm_area_struct *vma;
-
-   /* We don't need vma lookup at all. */
-   if (!walk->hugetlb_entry)
-   return NULL;
-
-   VM_BUG_ON(!rwsem_is_locked(&walk->mm->mmap_sem));
-   vma = find_vma(walk->mm, addr);
-   if (vma && vma->vm_start <= addr && is_vm_hugetlb_page(vma))
-   return vma;
-
-   return NULL;
-}
-
 #else /* CONFIG_HUGETLB_PAGE */
 static struct vm_area_struct* hugetlb_vma(unsigned long addr, struct mm_walk 
*walk)
 {
@@ -200,28 +184,46 @@ int walk_page_range(unsigned long addr,
 
pgd = pgd_offset(walk->mm, addr);
do {
-   struct vm_area_struct *vma;
+   struct vm_area_struct *vma = NULL;
 
next = pgd_addr_end(addr, end);
 
/*
-* handle hugetlb vma individually because pagetable walk for
-* the hugetlb page is dependent on the architecture and
-* we can't handled it in the same manner as non-huge pages.
+* Check any special vma's within this range.
 */
-   vma = hugetlb_vma(addr, walk);
+   VM_BUG_ON(!rwsem_is_locked(&walk->mm->mmap_sem));
+   vma = find_vma(walk->mm, addr);
if (vma) {
-   if (vma->vm_end < next)
+   /*
+* There are no page structures backing a VM_PFNMAP
+* range, so allow no split_huge_page_pmd().
+*/
+   if (vma->vm_flags & VM_PFNMAP) {
next = vma->vm_end;
+   pgd = pgd_offset(walk->mm, next);
+   continue;
+   }
/*
-* Hugepage is very tightly coupled with vma, so
-* walk through hugetlb entries within a given vma.
+* Handle hugetlb vma individually because pagetable
+* walk for the hugetlb page is dependent on the
+* architecture and we can't handled it in the same
+* manner as non-huge pages.
 */
-   err = walk_hugetlb_range(vma, addr, next, walk);
-   if (err)
-   break;
-   pgd = pgd_offset(walk->mm, next);
-   continue;
+   if (walk->hugetlb_entry && (vma->vm_start <= addr) &&
+   is_vm_hugetlb_page(vma)) {
+   if (vma->vm_end < next)
+   next = vma->vm_end;
+   /*
+* Hugepage is very tightly coupled with vma,
+* so walk through hugetlb entries within a
+* given vma.
+*/
+   err = walk_hugetlb_range(vma, addr, next, walk);
+   if (err)
+   b

Re: [PATCH] fs/proc: smaps should avoid VM_PFNMAP areas

2013-04-30 Thread Cliff Wickman
> On Tue, Apr 30, 2013 at 01:11:45PM -0500, Naoya Horiguchi wrote:
> > 
> > /proc/<pid>/smaps should not be looking at VM_PFNMAP areas.
> > 
> > Certain tests in show_smap() (especially for huge pages) assume that the
> > mapped PFN's are backed with page structures.  And this is not usually true
> > for VM_PFNMAP areas.  This can result in panics on kernel page faults when
> > attempting to address those page structures.
> 
> I think it's strange that you mention to hugepages, because in my 
> understanding
> VM_PFNMAP and hugepage related vma (VM_HUGEPAGE or VM_HUGETLB) should not set
> at the same time. In what testcase are these flags both set?

I don't think VM_PFNMAP and VM_HUGE... are set at the same time.
The problem is that a VM_PFNMAP'd area might have 2MB mappings in its
page table, but they may point to pfn's that are not backed by page
structures.

Then a sequence like:
show_smap
  show_map_vma
walk_page_range
  walk_pud_range
walk_pmd_range
  split_huge_page_pmd(walk->mm, pmd)
__split_huge_page_pmd
  page = pmd_page(*pmd)
can address   (vmemmap + (pfn)) and panic

Or a sequence like this:
walk_pmd_range
  walk->pmd_entry(pmd, addr, next, walk)
  smaps_pte_range
smaps_pte_entry(*pte, addr, PAGE_SIZE, walk)
  page = vm_normal_page(vma, addr, ptent)
return pfn_to_page(pfn)
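
To illustrate what goes wrong (a hypothetical fragment, not code from the
patch): with sparsemem-vmemmap, pfn_to_page() is plain pointer arithmetic,
so nothing stops it from handing back a pointer into an unpopulated part of
the vmemmap area:

        struct page *page = vmemmap + pfn;  /* what pfn_to_page(pfn) expands to */
        /*
         * For a VM_PFNMAP pfn there is often no memmap populated behind
         * that address, so even a simple read of a *page field can fault:
         */
        int count = page_count(page);       /* this load itself can oops */
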
> 
> And I guess this race can also happen on reading pagemap or numa_maps because
> walk_page_range() is called in those code paths. Are you sure the race doesn't
> happen on these paths? If not, please add a few more flag checks for them.

Okay.  I'll check and submit a version 2 of this patch.

-Cliff Wickman
 
> Thanks,
> Naoya Horiguchi
> 
>  
> > VM_PFNMAP areas are used by
> > - graphics memory manager   gpu/drm/drm_gem.c
> > - global reference unit sgi-gru/grufile.c
> > - sgi special memorychar/mspec.c
> > - probably several out-of-tree modules
> > 
> > I'm copying everyone who has changed fs/proc/task_mmu.c recently, in case
> > of some reason to provide /proc/<pid>/smaps for these areas that I am not
> > aware of.
> > 
> > Signed-off-by: Cliff Wickman 
> > ---
> >  fs/proc/task_mmu.c |3 +++
> >  1 file changed, 3 insertions(+)
> > 
> > Index: linux/fs/proc/task_mmu.c
> > ===
> > --- linux.orig/fs/proc/task_mmu.c
> > +++ linux/fs/proc/task_mmu.c
> > @@ -589,6 +589,9 @@ static int show_smap(struct seq_file *m,
> >     .private = &mss,
> > };
> >  
> > +   if (vma->vm_flags & VM_PFNMAP)
> > +   return 0;
> > +
> > memset(&mss, 0, sizeof mss);
> > mss.vma = vma;
> > /* mmap_sem is held in m_start */

-- 
Cliff Wickman
SGI
c...@sgi.com
(651) 683-3824


[PATCH] fs/proc: smaps should avoid VM_PFNMAP areas

2013-04-29 Thread Cliff Wickman

/proc/<pid>/smaps should not be looking at VM_PFNMAP areas.

Certain tests in show_smap() (especially for huge pages) assume that the
mapped PFN's are backed with page structures.  And this is not usually true
for VM_PFNMAP areas.  This can result in panics on kernel page faults when
attempting to address those page structures.

VM_PFNMAP areas are used by
- graphics memory manager   gpu/drm/drm_gem.c
- global reference unit sgi-gru/grufile.c
- sgi special memorychar/mspec.c
- probably several out-of-tree modules

I'm copying everyone who has changed fs/proc/task_mmu.c recently, in case
of some reason to provide /proc/<pid>/smaps for these areas that I am not
aware of.

Signed-off-by: Cliff Wickman 
---
 fs/proc/task_mmu.c |3 +++
 1 file changed, 3 insertions(+)

Index: linux/fs/proc/task_mmu.c
===
--- linux.orig/fs/proc/task_mmu.c
+++ linux/fs/proc/task_mmu.c
@@ -589,6 +589,9 @@ static int show_smap(struct seq_file *m,
.private = &mss,
};
 
+   if (vma->vm_flags & VM_PFNMAP)
+   return 0;
+
memset(&mss, 0, sizeof mss);
mss.vma = vma;
/* mmap_sem is held in m_start */


Re: [PATCH v4 0/8] kdump, vmcore: support mmap() on /proc/vmcore

2013-04-25 Thread Cliff Wickman
On Fri, Apr 05, 2013 at 12:04:02AM +, HATAYAMA Daisuke wrote:
> Currently, read to /proc/vmcore is done by read_oldmem() that uses
> ioremap/iounmap per a single page. For example, if memory is 1GB,
> ioremap/iounmap is called (1GB / 4KB)-times, that is, 262144
> times. This causes big performance degradation.
> 
> In particular, the current main user of this mmap() is makedumpfile,
> which not only reads memory from /proc/vmcore but also does other
> processing like filtering, compression and IO work.
> 
> To address the issue, this patch implements mmap() on /proc/vmcore to
> improve read performance.
> 
> Benchmark
> =
> 
> You can see two benchmarks on terabyte memory system. Both show about
> 40 seconds on 2TB system. This is almost equal to performance by
> experimtanal kernel-side memory filtering.
> 
> - makedumpfile mmap() benchmark, by Jingbai Ma
>   https://lkml.org/lkml/2013/3/27/19
> 
> - makedumpfile: benchmark on mmap() with /proc/vmcore on 2TB memory system
>   https://lkml.org/lkml/2013/3/26/914
> 
> ChangeLog
> =
> 
> v3 => v4)
> 
> - Rebase 3.9-rc7.
> - Drop clean-up patches orthogonal to the main topic of this patch set.
> - Copy ELF note segments in the 1st kernel just as in v1. Allocate
>   vmcore objects per pages. => See [PATCH 5/8]
> - Map memory referenced by PT_LOAD entry directly even if the start or
>   end of the region doesn't fit inside page boundary, no longer copy
>   them as the previous v3. Then, holes, outside OS memory, are visible
>   from /proc/vmcore. => See [PATCH 7/8]
> 
> v2 => v3)
> 
> - Rebase 3.9-rc3.
> - Copy program headers seprately from e_phoff in ELF note segment
>   buffer. Now there's no risk to allocate huge memory if program
>   header table positions after memory segment.
> - Add cleanup patch that removes unnecessary variable.
> - Fix wrongly using the variable that is buffer size configurable at
>   runtime. Instead, use the varibale that has original buffer size.
> 
> v1 => v2)
> 
> - Clean up the existing codes: use e_phoff, and remove the assumption
>   on PT_NOTE entries.
> - Fix potencial bug that ELF haeader size is not included in exported
>   vmcoreinfo size.
> - Divide patch modifying read_vmcore() into two: clean-up and primary
>   code change.
> - Put ELF note segments in page-size boundary on the 1st kernel
>   instead of copying them into the buffer on the 2nd kernel.
> 
> Test
> 
> 
> This patch set is composed based on v3.9-rc7.
> 
> Done on x86-64, x86-32 both with 1GB and over 4GB memory environments.
> 
> ---
> 
> HATAYAMA Daisuke (8):
>   vmcore: support mmap() on /proc/vmcore
>   vmcore: treat memory chunks referenced by PT_LOAD program header 
> entries in \
> page-size boundary in vmcore_list
>   vmcore: count holes generated by round-up operation for page boudary 
> for size \
> of /proc/vmcore
>   vmcore: copy ELF note segments in the 2nd kernel per page vmcore objects
>   vmcore: Add helper function vmcore_add()
>   vmcore, procfs: introduce MEM_TYPE_CURRENT_KERNEL flag to distinguish 
> objects \
> > copied in 2nd kernel
> >   vmcore: clean up read_vmcore()
>   vmcore: allocate buffer for ELF headers on page-size alignment
> 
> 
>  fs/proc/vmcore.c|  349 
> ---
>  include/linux/proc_fs.h |8 +
>  2 files changed, 245 insertions(+), 112 deletions(-)
> 
> -- 
> 
> Thanks.
> HATAYAMA, Daisuke

This is a very important patch set for speeding the kdump process.
(patches 1 - 8)

We have found the mmap interface to /proc/vmcore to be about 80x faster than
the read interface.
That is, doing mmap's and copying data (in pieces the size of page
structures) transfers all of /proc/vmcore about 80 times faster than
reading it.

This greatly speeds up the capture of a kdump, as the scan of page
structures takes the bulk of the time in dumping the OS on a machine
with terabytes of memory.
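
For reference, the pattern on the dumping side is roughly the following (a
simplified sketch only; the chunk offsets and lengths come from the ELF
headers of /proc/vmcore, offsets are assumed page aligned, and error paths
are trimmed):

        #include <sys/types.h>
        #include <sys/mman.h>
        #include <fcntl.h>
        #include <string.h>
        #include <unistd.h>

        /* copy one chunk of old memory via mmap instead of many small reads */
        static int copy_chunk(int fd, off_t off, size_t len, void *dst)
        {
                void *src = mmap(NULL, len, PROT_READ, MAP_PRIVATE, fd, off);
                if (src == MAP_FAILED)
                        return -1;
                memcpy(dst, src, len);  /* e.g. a run of page structures */
                munmap(src, len);
                return 0;
        }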

We would very much like to see this set make it into the 3.10 release.

Acked-by: Cliff Wickman 

-Cliff
-- 
Cliff Wickman
SGI
c...@sgi.com
(651) 683-3824


Re: [PATCH] mm, x86: no zeroing of hugetlbfs pages at boot

2013-04-04 Thread Cliff Wickman
On Thu, Apr 04, 2013 at 08:17:08AM +0800, Simon Jeons wrote:
> On 03/07/2013 05:50 AM, Cliff Wickman wrote:
>> From: Cliff Wickman 
>>
>> Allocating a large number of 1GB hugetlbfs pages at boot takes a
>> very long time.
>>
>> Large system sites would at times like to allocate a very large amount of
>> memory as 1GB pages.  They would put this on the kernel boot line:
>> default_hugepagesz=1G hugepagesz=1G hugepages=4096
>> [Dynamic allocation of 1G pages is not an option, as zone pages only go
>>   up to MAX_ORDER, and MAX_ORDER cannot exceed the section size.]
>>
>> Each page is zeroed as it is allocated, and all allocation is done by
>> cpu 0, as this path is early in boot:
>
> How you confirm they are done by cpu 0? just cpu 0 works during boot?

Yes, in kernel_init() you see the call to do_pre_smp_initcalls() just
before the call to smp_init().  It is smp_init() that starts the other
cpus.  They don't come out of reset until then.

>>start_kernel
>>  kernel_init
>>do_pre_smp_initcalls
>>  hugetlb_init
>>hugetlb_init_hstates
>>  hugetlb_hstate_alloc_pages
>>
>> Zeroing remote (offnode) memory takes ~1GB/sec (and most memory is offnode
>> on large numa systems).
>> This estimate is approximate (it depends on core frequency & number of hops
>> to remote memory) but should be within a factor of 2 on most systems.
>> A benchmark attempting to reserve a TB for 1GB pages would thus require
>> ~1000 seconds of boot time just for this allocating.  32TB would take 8 
>> hours.
>>
>> I propose passing a flag to the early allocator to indicate that no zeroing
>> of a page should be done.  The 'no zeroing' flag would have to be passed
>> down this code path:
>>
>>hugetlb_hstate_alloc_pages
>>  alloc_bootmem_huge_page
>>__alloc_bootmem_node_nopanic NO_ZERO  (nobootmem.c)
>>  __alloc_memory_core_early  NO_ZERO
>>if (!(flags & NO_ZERO))
>>  memset(ptr, 0, size);
>>
>> Or this path if CONFIG_NO_BOOTMEM is not set:
>>
>>hugetlb_hstate_alloc_pages
>>  alloc_bootmem_huge_page
>>__alloc_bootmem_node_nopanic  NO_ZERO  (bootmem.c)
>>  alloc_bootmem_core  NO_ZERO
>>if (!(flags & NO_ZERO))
>>  memset(region, 0, size);
>>  __alloc_bootmem_nopanic NO_ZERO
>>___alloc_bootmem_nopanic  NO_ZERO
>>  alloc_bootmem_core  NO_ZERO
>>if (!(flags & NO_ZERO))
>>  memset(region, 0, size);
>>
>> Signed-off-by: Cliff Wickman 
>>
>> ---
>>   arch/x86/kernel/setup_percpu.c |4 ++--
>>   include/linux/bootmem.h|   23 ---
>>   mm/bootmem.c   |   12 +++-
>>   mm/hugetlb.c   |3 ++-
>>   mm/nobootmem.c |   41 
>> +++--
>>   mm/page_cgroup.c   |2 +-
>>   mm/sparse.c|2 +-
>>   7 files changed, 52 insertions(+), 35 deletions(-)
>>
>> Index: linux/include/linux/bootmem.h
>> ===
>> --- linux.orig/include/linux/bootmem.h
>> +++ linux/include/linux/bootmem.h
>> @@ -8,6 +8,11 @@
>>   #include 
>> /*
>> + * allocation flags
>> + */
>> +#define NO_ZERO 0x0001
>> +
>> +/*
>>*  simple boot-time physical memory area allocator.
>>*/
>>   @@ -79,7 +84,8 @@ extern void *__alloc_bootmem(unsigned lo
>>   unsigned long goal);
>>   extern void *__alloc_bootmem_nopanic(unsigned long size,
>>   unsigned long align,
>> - unsigned long goal);
>> + unsigned long goal,
>> + u32 flags);
>>   extern void *__alloc_bootmem_node(pg_data_t *pgdat,
>>unsigned long size,
>>unsigned long align,
>> @@ -91,12 +97,14 @@ void *__alloc_bootmem_node_high(pg_data_
>>   extern void *__alloc_bootmem_node_nopanic(pg_data_t *pgdat,
>>unsigned long size,
>>unsigned long align,
>> -  unsigned long goal);
>> +  unsigned long goal,
>> +  u32 flags);
>

Re: [PATCH] mm, x86: no zeroing of hugetlbfs pages at boot

2013-03-11 Thread Cliff Wickman
On Sun, Mar 10, 2013 at 01:55:10PM +0800, Hillf Danton wrote:
> On Thu, Mar 7, 2013 at 5:50 AM, Cliff Wickman  wrote:
> > From: Cliff Wickman 
> >
> > Allocating a large number of 1GB hugetlbfs pages at boot takes a
> > very long time.
> >
> > Large system sites would at times like to allocate a very large amount of
> > memory as 1GB pages.  They would put this on the kernel boot line:
> >default_hugepagesz=1G hugepagesz=1G hugepages=4096
> > [Dynamic allocation of 1G pages is not an option, as zone pages only go
> >  up to MAX_ORDER, and MAX_ORDER cannot exceed the section size.]
> >
> > Each page is zeroed as it is allocated, and all allocation is done by
> > cpu 0, as this path is early in boot:
> >   start_kernel
> > kernel_init
> >   do_pre_smp_initcalls
> > hugetlb_init
> >   hugetlb_init_hstates
> > hugetlb_hstate_alloc_pages
> >
> > Zeroing remote (offnode) memory takes ~1GB/sec (and most memory is offnode
> > on large numa systems).
> > This estimate is approximate (it depends on core frequency & number of hops
> > to remote memory) but should be within a factor of 2 on most systems.
> > A benchmark attempting to reserve a TB for 1GB pages would thus require
> > ~1000 seconds of boot time just for this allocating.  32TB would take 8 
> > hours.
> >
> > I propose passing a flag to the early allocator to indicate that no zeroing
> > of a page should be done.  The 'no zeroing' flag would have to be passed
> > down this code path:
> >
> 
> FYI: huge pages are cleared just after allocated, for instance,
> clear_huge_page() in hugetlb_no_page()
> 
> Hillf

Yes, I should have added that comment to the changelog.  And because
this is true there is no need to clear a huge page at boot time.
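
For reference, the zeroing that does cover these pages happens later in the
fault path, roughly like this (paraphrasing hugetlb_no_page(), not an exact
quote of the source; error handling omitted):

        page = alloc_huge_page(vma, address, 0);
        clear_huge_page(page, address, pages_per_huge_page(h));
        __SetPageUptodate(page);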

-Cliff
> >   hugetlb_hstate_alloc_pages
> > alloc_bootmem_huge_page
> >   __alloc_bootmem_node_nopanic NO_ZERO  (nobootmem.c)
> > __alloc_memory_core_early  NO_ZERO
> >   if (!(flags & NO_ZERO))
> > memset(ptr, 0, size);
> >
> > Or this path if CONFIG_NO_BOOTMEM is not set:
> >
> >   hugetlb_hstate_alloc_pages
> > alloc_bootmem_huge_page
> >   __alloc_bootmem_node_nopanic  NO_ZERO  (bootmem.c)
> > alloc_bootmem_core  NO_ZERO
> >   if (!(flags & NO_ZERO))
> > memset(region, 0, size);
> > __alloc_bootmem_nopanic NO_ZERO
> >   ___alloc_bootmem_nopanic  NO_ZERO
> > alloc_bootmem_core  NO_ZERO
> >   if (!(flags & NO_ZERO))
> > memset(region, 0, size);
> >
> > Signed-off-by: Cliff Wickman 
> >
> > ---
> >  arch/x86/kernel/setup_percpu.c |4 ++--
> >  include/linux/bootmem.h|   23 ---
> >  mm/bootmem.c   |   12 +++-
> >  mm/hugetlb.c   |3 ++-
> >  mm/nobootmem.c |   41 
> > +++--
> >  mm/page_cgroup.c   |2 +-
> >  mm/sparse.c|2 +-
> >  7 files changed, 52 insertions(+), 35 deletions(-)
> >
> > Index: linux/include/linux/bootmem.h
> > ===
> > --- linux.orig/include/linux/bootmem.h
> > +++ linux/include/linux/bootmem.h
> > @@ -8,6 +8,11 @@
> >  #include 
> >
> >  /*
> > + * allocation flags
> > + */
> > +#define NO_ZERO0x0001
> > +
> > +/*
> >   *  simple boot-time physical memory area allocator.
> >   */
> >
> > @@ -79,7 +84,8 @@ extern void *__alloc_bootmem(unsigned lo
> >  unsigned long goal);
> >  extern void *__alloc_bootmem_nopanic(unsigned long size,
> >  unsigned long align,
> > -unsigned long goal);
> > +unsigned long goal,
> > +u32 flags);
> >  extern void *__alloc_bootmem_node(pg_data_t *pgdat,
> >   unsigned long size,
> >   unsigned long align,
> > @@ -91,12 +97,14 @@ void *__alloc_bootmem_node_high(pg_data_
> >  extern void *__alloc_bootmem_node_nopanic(pg_data_t *pgdat,
> >   unsigned long size,
> >   unsigned long align,
> > - 

[PATCH] mm, x86: no zeroing of hugetlbfs pages at boot

2013-03-06 Thread Cliff Wickman
From: Cliff Wickman 

Allocating a large number of 1GB hugetlbfs pages at boot takes a
very long time. 

Large system sites would at times like to allocate a very large amount of
memory as 1GB pages.  They would put this on the kernel boot line:
   default_hugepagesz=1G hugepagesz=1G hugepages=4096
[Dynamic allocation of 1G pages is not an option, as zone pages only go
 up to MAX_ORDER, and MAX_ORDER cannot exceed the section size.]

Each page is zeroed as it is allocated, and all allocation is done by
cpu 0, as this path is early in boot:
  start_kernel
kernel_init
  do_pre_smp_initcalls
hugetlb_init
  hugetlb_init_hstates
hugetlb_hstate_alloc_pages

Zeroing remote (offnode) memory takes ~1GB/sec (and most memory is offnode
on large numa systems).
This estimate is approximate (it depends on core frequency & number of hops
to remote memory) but should be within a factor of 2 on most systems.
A benchmark attempting to reserve a TB for 1GB pages would thus require
~1000 seconds of boot time just for this allocating.  32TB would take 8 hours.

I propose passing a flag to the early allocator to indicate that no zeroing
of a page should be done.  The 'no zeroing' flag would have to be passed
down this code path:

  hugetlb_hstate_alloc_pages
alloc_bootmem_huge_page
  __alloc_bootmem_node_nopanic NO_ZERO  (nobootmem.c)
__alloc_memory_core_early  NO_ZERO
  if (!(flags & NO_ZERO))
memset(ptr, 0, size);

Or this path if CONFIG_NO_BOOTMEM is not set:

  hugetlb_hstate_alloc_pages
alloc_bootmem_huge_page
  __alloc_bootmem_node_nopanic  NO_ZERO  (bootmem.c)
alloc_bootmem_core  NO_ZERO
  if (!(flags & NO_ZERO))
memset(region, 0, size);
__alloc_bootmem_nopanic NO_ZERO
  ___alloc_bootmem_nopanic  NO_ZERO
alloc_bootmem_core  NO_ZERO
  if (!(flags & NO_ZERO))
memset(region, 0, size);
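
For example, the hugetlb boot-time caller would then request an unzeroed
page roughly like this (a sketch of the intended call only, matching the
prototypes in the diff below; the page is still cleared later by
clear_huge_page() when it is first faulted in):

        void *addr = __alloc_bootmem_node_nopanic(NODE_DATA(nid),
                                huge_page_size(h),      /* size  */
                                huge_page_size(h),      /* align */
                                0,                      /* goal  */
                                NO_ZERO);               /* skip the memset */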

Signed-off-by: Cliff Wickman 

---
 arch/x86/kernel/setup_percpu.c |4 ++--
 include/linux/bootmem.h|   23 ---
 mm/bootmem.c   |   12 +++-
 mm/hugetlb.c   |3 ++-
 mm/nobootmem.c |   41 +++--
 mm/page_cgroup.c   |2 +-
 mm/sparse.c|2 +-
 7 files changed, 52 insertions(+), 35 deletions(-)

Index: linux/include/linux/bootmem.h
===
--- linux.orig/include/linux/bootmem.h
+++ linux/include/linux/bootmem.h
@@ -8,6 +8,11 @@
 #include 
 
 /*
+ * allocation flags
+ */
+#define NO_ZERO0x0001
+
+/*
  *  simple boot-time physical memory area allocator.
  */
 
@@ -79,7 +84,8 @@ extern void *__alloc_bootmem(unsigned lo
 unsigned long goal);
 extern void *__alloc_bootmem_nopanic(unsigned long size,
 unsigned long align,
-unsigned long goal);
+unsigned long goal,
+u32 flags);
 extern void *__alloc_bootmem_node(pg_data_t *pgdat,
  unsigned long size,
  unsigned long align,
@@ -91,12 +97,14 @@ void *__alloc_bootmem_node_high(pg_data_
 extern void *__alloc_bootmem_node_nopanic(pg_data_t *pgdat,
  unsigned long size,
  unsigned long align,
- unsigned long goal);
+ unsigned long goal,
+ u32 flags);
 void *___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
  unsigned long size,
  unsigned long align,
  unsigned long goal,
- unsigned long limit);
+ unsigned long limit,
+ u32 flags);
 extern void *__alloc_bootmem_low(unsigned long size,
 unsigned long align,
 unsigned long goal);
@@ -120,19 +128,20 @@ extern void *__alloc_bootmem_low_node(pg
 #define alloc_bootmem_align(x, align) \
__alloc_bootmem(x, align, BOOTMEM_LOW_LIMIT)
 #define alloc_bootmem_nopanic(x) \
-   __alloc_bootmem_nopanic(x, SMP_CACHE_BYTES, BOOTMEM_LOW_LIMIT)
+   __alloc_bootmem_nopanic(x, SMP_CACHE_BYTES, BOOTMEM_LOW_LIMIT, 0)
 #define alloc_bootmem_pages(x) \
__alloc_bootmem(x, PAGE_SIZE, BOOTMEM_LOW_LIMIT)
 #define alloc_bootmem_pages_nopanic(x) \
-   __alloc_bootmem_nopanic(x, PAGE_SIZE, BOOTMEM_LOW_LIMIT)
+   __alloc_bootmem_nopanic(x, PAGE_SIZE, BOOTMEM_LOW_LIMIT, 0)
 #define alloc_bootmem_node(pgdat, x) \
__alloc_bootmem_node(pgdat, x, SMP_CACHE_BYTES, B

[PATCH] kdump: do not drop entire e820 in crash kernel

2013-02-03 Thread Cliff Wickman
From: Cliff Wickman 

The crash kernel is not able to find its root device if that device is not
on PCI 0.

This is because it is booted with the command line option memmap=exactmap
which currently clears the e820 table.  So ACPI processing does not
find reserved i/o spaces. 

This works for a device on PCI 0 because ACPI falls back to a legacy mode.
But the error message " [Firmware Bug]: PCI: MMCONFIG at
 [mem 0x8000-0x80cf] not reserved in ACPI motherboard resources"
is written to the log even in this functioning case.

It fails for some devices on UV2, and only for UV2, because SGI seems to
be the only manufacturer currently using the extended PCI(>0).

The fix is to not drop the entire e820 table on a memmap=exactmap, but
to preserve all the non-E820_RAM reservations that the BIOS has made.
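
In effect the exactmap handling becomes (an equivalent standalone sketch of
the compaction done in the diff below):

        /* keep only the BIOS-reserved (non-RAM) entries, compacting in place */
        static void e820_keep_reserved(void)
        {
                int i, kept = 0;

                for (i = 0; i < e820.nr_map; i++)
                        if (e820.map[i].type != E820_RAM)
                                e820.map[kept++] = e820.map[i];
                e820.nr_map = kept;
        }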

Signed-off-by: Cliff Wickman 
---
 arch/x86/kernel/e820.c |   14 +-
 1 file changed, 13 insertions(+), 1 deletion(-)

Index: linus.current/arch/x86/kernel/e820.c
===
--- linus.current.orig/arch/x86/kernel/e820.c
+++ linus.current/arch/x86/kernel/e820.c
@@ -839,6 +839,8 @@ static int __init parse_memmap_opt(char
 {
char *oldp;
u64 start_at, mem_size;
+   int i;
+   struct e820entry *curp, *availp;
 
if (!p)
return -EINVAL;
@@ -852,7 +854,17 @@ static int __init parse_memmap_opt(char
 */
saved_max_pfn = e820_end_of_ram_pfn();
 #endif
-   e820.nr_map = 0;
+   /* keep everything that was reserved by the BIOS */
+   for (i = 0, curp = &e820.map[0], availp = &e820.map[0];
+   i < e820.nr_map; i++, curp++) {
+   if (curp->type != E820_RAM) {
+   if (curp != availp) {
+   *availp = *curp;
+   availp++;
+   }
+   }
+   }
+   e820.nr_map = availp - &e820.map[0];
userdef = 1;
return 0;
}


Re: [PATCH] makedumpfile: request the kernel do page scans

2012-12-20 Thread Cliff Wickman
On Thu, Dec 20, 2012 at 12:22:14PM +0900, HATAYAMA Daisuke wrote:
> From: Cliff Wickman 
> Subject: Re: [PATCH] makedumpfile: request the kernel do page scans
> Date: Mon, 10 Dec 2012 09:36:14 -0600
> > On Mon, Dec 10, 2012 at 09:59:29AM +0900, HATAYAMA Daisuke wrote:
> >> From: Cliff Wickman 
> >> Subject: Re: [PATCH] makedumpfile: request the kernel do page scans
> >> Date: Mon, 19 Nov 2012 12:07:10 -0600
> >> 
> >> > On Fri, Nov 16, 2012 at 03:39:44PM -0500, Vivek Goyal wrote:
> >> >> On Thu, Nov 15, 2012 at 04:52:40PM -0600, Cliff Wickman wrote:
> > 
> > Hi Hatayama,
> > 
> > If ioremap/iounmap is the bottleneck then perhaps you could do what
> > my patch does: it consolidates all the ranges of physical addresses
> > where the boot kernel's page structures reside (see make_kernel_mmap())
> > and passes them to the kernel, which then does a handful of ioremaps to
> > cover all of them.  Then /proc/vmcore could look up the already-mapped
> > virtual address.
> > (also note a kludge in get_mm_sparsemem() that verifies that each section
> > of the mem_map spans contiguous ranges of page structures.  I had
> > trouble with some sections when I made that assumption)
> > 
> > I'm attaching 3 patches that might be useful in your testing:
> > - 121210.proc_vmcore2  my current patch that applies to the released
> >   makedumpfile 1.5.1
> > - 121207.vmcore_pagescans.sles applies to a 3.0.13 kernel
> > - 121207.vmcore_pagescans.rhel applies to a 2.6.32 kernel
> > 
> 
> I used the same patch set on the benchmark.
> 
> BTW, I have continuously reservation issue, so I think I cannot use
> terabyte memory machine at least in this year.
> 
> Also, your patch set is doing ioremap per a chunk of memory map,
> i.e. a number of consequtive pages at the same time. On your terabyte
> machines, how large they are? We have memory consumption issue on the
> 2nd kernel so we must decrease amount of memory used. But looking into
> ioremap code quickly, it looks not using 2MB or 1GB pages to
> remap. This means more than tera bytes page table is generated. Or
> have you probably already investigated this?
> 
> BTW, my idea to solve this issue are two:
> 
> 1) make linear direct mapping for old memory, and acess the old memory
> via the linear direct mapping, not by ioremap.
> 
>   - adding remap code in vmcore, or passing the regions that need to
> be remapped using memmap= kernel option to tell the 2nd kenrel to
> map them in addition.

Good point.  It would take over 30G of memory to map 16TB with 4k pages.
I recently tried to dump a machine with that much memory and ran out of
kernel memory --
no wonder!
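
(Back-of-the-envelope: 16 TB / 4 KB is 4G mappings, and at 8 bytes of
page-table entry per 4 KB page that is roughly 32 GB of page tables alone,
so the 30G figure is about what one would expect.)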

Do you have a patch for doing a linear direct mapping?  Or can you name
existing kernel infrastructure to do such mapping?  I'm just looking for
a jumpstart to enhance the patch.

-Cliff
> 
> Or,
> 
> 2) Support 2MB or 1GB pages in ioremap.
> 
> Thanks.
> HATAYAMA, Daisuke

-- 
Cliff Wickman
SGI
c...@sgi.com
(651) 683-3824


Re: [PATCH] makedumpfile: request the kernel do page scans

2012-12-20 Thread Cliff Wickman
On Thu, Dec 20, 2012 at 12:22:14PM +0900, HATAYAMA Daisuke wrote:
 From: Cliff Wickman c...@sgi.com
 Subject: Re: [PATCH] makedumpfile: request the kernel do page scans
 Date: Mon, 10 Dec 2012 09:36:14 -0600
  On Mon, Dec 10, 2012 at 09:59:29AM +0900, HATAYAMA Daisuke wrote:
  From: Cliff Wickman c...@sgi.com
  Subject: Re: [PATCH] makedumpfile: request the kernel do page scans
  Date: Mon, 19 Nov 2012 12:07:10 -0600
  
   On Fri, Nov 16, 2012 at 03:39:44PM -0500, Vivek Goyal wrote:
   On Thu, Nov 15, 2012 at 04:52:40PM -0600, Cliff Wickman wrote:
  
  Hi Hatayama,
  
  If ioremap/iounmap is the bottleneck then perhaps you could do what
  my patch does: it consolidates all the ranges of physical addresses
  where the boot kernel's page structures reside (see make_kernel_mmap())
  and passes them to the kernel, which then does a handfull of ioremaps's to
  cover all of them.  Then /proc/vmcore could look up the already-mapped
  virtual address.
  (also note a kludge in get_mm_sparsemem() that verifies that each section
  of the mem_map spans contiguous ranges of page structures.  I had
  trouble with some sections when I made that assumption)
  
  I'm attaching 3 patches that might be useful in your testing:
  - 121210.proc_vmcore2  my current patch that applies to the released
makedumpfile 1.5.1
  - 121207.vmcore_pagescans.sles applies to a 3.0.13 kernel
  - 121207.vmcore_pagescans.rhel applies to a 2.6.32 kernel
  
 
 I used the same patch set on the benchmark.
 
 BTW, I have continuously reservation issue, so I think I cannot use
 terabyte memory machine at least in this year.
 
 Also, your patch set is doing ioremap per a chunk of memory map,
 i.e. a number of consequtive pages at the same time. On your terabyte
 machines, how large they are? We have memory consumption issue on the
 2nd kernel so we must decrease amount of memory used. But looking into
 ioremap code quickly, it looks not using 2MB or 1GB pages to
 remap. This means more than tera bytes page table is generated. Or
 have you probably already investigated this?
 
 BTW, my idea to solve this issue are two:
 
 1) make linear direct mapping for old memory, and acess the old memory
 via the linear direct mapping, not by ioremap.
 
   - adding remap code in vmcore, or passing the regions that need to
 be remapped using memmap= kernel option to tell the 2nd kenrel to
 map them in addition.

Good point.  It would take over 30G of memory to map 16TB with 4k pages.
I recently tried to dump such a memory and ran out of kernel memory --
no wonder!

Do you have a patch for doing a linear direct mapping?  Or can you name
existing kernel infrastructure to do such mapping?  I'm just looking for
a jumpstart to enhance the patch.

-Cliff
 
 Or,
 
 2) Support 2MB or 1GB pages in ioremap.
 
 Thanks.
 HATAYAMA, Daisuke

-- 
Cliff Wickman
SGI
c...@sgi.com
(651) 683-3824
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH] UV: fix incorrect tlb flush all issue

2012-09-25 Thread Cliff Wickman
From: Cliff Wickman 

(this was sent as an ack on 9/13, but with incorrect title and sign-off)

Ack.
But with the adjustment below.  The 'end' argument was not declared long.

I tested the patch on a UV system.
It has the effect of invalidating either a single TLB entry or the whole
TLB on a cpu.
I added some debugging to check for cases where flushing the whole TLB
is overkill, and in practice that happens very seldom.
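
For clarity, the decision the patch adds boils down to a single-page vs.
flush-all choice.  A minimal stand-alone sketch of that logic (the two
constants are stand-ins for the kernel's PAGE_SIZE and TLB_FLUSH_ALL, and
the example ranges are made up):

	#include <stdio.h>

	#define PAGE_SIZE	4096UL		/* stand-in for the kernel constant */
	#define TLB_FLUSH_ALL	(~0UL)		/* stand-in for the kernel constant */

	/* Mirrors the payload.address selection in the hunk below (sketch only). */
	static unsigned long pick_payload(unsigned long start, unsigned long end)
	{
		if (!end || (end - start) <= PAGE_SIZE)
			return start;		/* flush just this one page       */
		return TLB_FLUSH_ALL;		/* larger range: flush everything */
	}

	int main(void)
	{
		printf("%#lx\n", pick_payload(0x400000, 0x401000)); /* one page -> 0x400000 */
		printf("%#lx\n", pick_payload(0x400000, 0x410000)); /* 16 pages -> flush all */
		return 0;
	}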

Reported-by: Jan Beulich 
Signed-off-by: Alex Shi 
Signed-off-by: Cliff Wickman 
Cc: Ingo Molnar 
Cc: Thomas Gleixner 
Cc: "H. Peter Anvin" 
---
 arch/x86/include/asm/uv/uv.h  |2 +-
 arch/x86/platform/uv/tlb_uv.c |   10 +++---
 2 files changed, 8 insertions(+), 4 deletions(-)

Index: linux/arch/x86/platform/uv/tlb_uv.c
===
--- linux.orig/arch/x86/platform/uv/tlb_uv.c
+++ linux/arch/x86/platform/uv/tlb_uv.c
@@ -1034,7 +1034,8 @@ static int set_distrib_bits(struct cpuma
  * globally purge translation cache of a virtual address or all TLB's
  * @cpumask: mask of all cpu's in which the address is to be removed
  * @mm: mm_struct containing virtual address range
- * @va: virtual address to be removed (or TLB_FLUSH_ALL for all TLB's on cpu)
+ * @start: start virtual address to be removed from TLB
+ * @end: end virtual address to be remove from TLB
  * @cpu: the current cpu
  *
  * This is the entry point for initiating any UV global TLB shootdown.
@@ -1056,7 +1057,7 @@ static int set_distrib_bits(struct cpuma
  */
 const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
struct mm_struct *mm, unsigned long start,
-   unsigned end, unsigned int cpu)
+   unsigned long end, unsigned int cpu)
 {
int locals = 0;
int remotes = 0;
@@ -1113,7 +1114,10 @@ const struct cpumask *uv_flush_tlb_other
 
record_send_statistics(stat, locals, hubs, remotes, bau_desc);
 
-   bau_desc->payload.address = start;
+   if (!end || (end - start) <= PAGE_SIZE)
+   bau_desc->payload.address = start;
+   else
+   bau_desc->payload.address = TLB_FLUSH_ALL;
bau_desc->payload.sending_cpu = cpu;
/*
 * uv_flush_send_and_wait returns 0 if all cpu's were messaged,
Index: linux/arch/x86/include/asm/uv/uv.h
===
--- linux.orig/arch/x86/include/asm/uv/uv.h
+++ linux/arch/x86/include/asm/uv/uv.h
@@ -16,7 +16,7 @@ extern void uv_system_init(void);
 extern const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
 struct mm_struct *mm,
 unsigned long start,
-unsigned end,
+unsigned long end,
 unsigned int cpu);
 
 #else  /* X86_UV */


Re: [alex....@intel.com: Re: [PATCH] UV: fix incorrect tlb flush all issue]

2012-09-13 Thread Cliff Wickman
On Thu, Sep 13, 2012 at 05:53:10PM +0200, Ingo Molnar wrote:
> 
> Ack?
> 
> Thanks,
> 
>   Ingo

Ack.
But with the adjustment below.  The 'end' argument was not declared long.

I tested the patch on a UV system.
It has the effect of invalidating either a single TLB entry or the whole
TLB on a cpu.
I added some debugging to check for cases where flushing the whole TLB
is overkill, and in practice that happens very seldom.

Sorry I didn't participate in this patch earlier.  Jack Steiner was
copied, I believe.  But stei...@sgi.com is no longer active. Jack has
retired -- congratulations to him, but a very big loss to us, both 
professionally and personally.

-Cliff

Reported-by: Jan Beulich 
Signed-off-by: Alex Shi 
Acked-by: Cliff Wickman 
Cc: Ingo Molnar 
Cc: Thomas Gleixner 
Cc: "H. Peter Anvin" 
---
 arch/x86/include/asm/uv/uv.h  |2 +-
 arch/x86/platform/uv/tlb_uv.c |   10 +++---
 2 files changed, 8 insertions(+), 4 deletions(-)

Index: linux/arch/x86/platform/uv/tlb_uv.c
===
--- linux.orig/arch/x86/platform/uv/tlb_uv.c
+++ linux/arch/x86/platform/uv/tlb_uv.c
@@ -1034,7 +1034,8 @@ static int set_distrib_bits(struct cpuma
  * globally purge translation cache of a virtual address or all TLB's
  * @cpumask: mask of all cpu's in which the address is to be removed
  * @mm: mm_struct containing virtual address range
- * @va: virtual address to be removed (or TLB_FLUSH_ALL for all TLB's on cpu)
+ * @start: start virtual address to be removed from TLB
+ * @end: end virtual address to be remove from TLB
  * @cpu: the current cpu
  *
  * This is the entry point for initiating any UV global TLB shootdown.
@@ -1056,7 +1057,7 @@ static int set_distrib_bits(struct cpuma
  */
 const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
struct mm_struct *mm, unsigned long start,
-   unsigned end, unsigned int cpu)
+   unsigned long end, unsigned int cpu)
 {
int locals = 0;
int remotes = 0;
@@ -1113,7 +1114,10 @@ const struct cpumask *uv_flush_tlb_other
 
record_send_statistics(stat, locals, hubs, remotes, bau_desc);
 
-   bau_desc->payload.address = start;
+   if (!end || (end - start) <= PAGE_SIZE)
+   bau_desc->payload.address = start;
+   else
+   bau_desc->payload.address = TLB_FLUSH_ALL;
bau_desc->payload.sending_cpu = cpu;
/*
 * uv_flush_send_and_wait returns 0 if all cpu's were messaged,
Index: linux/arch/x86/include/asm/uv/uv.h
===
--- linux.orig/arch/x86/include/asm/uv/uv.h
+++ linux/arch/x86/include/asm/uv/uv.h
@@ -16,7 +16,7 @@ extern void uv_system_init(void);
 extern const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
 struct mm_struct *mm,
 unsigned long start,
-unsigned end,
+unsigned long end,
 unsigned int cpu);
 
 #else  /* X86_UV */


[PATCH 3/4 v2] cpusets: update_cpumask documentation fix

2008-02-04 Thread Cliff Wickman

Update cpuset documentation to match the October 2007
"Fix cpusets update_cpumask" changes that now apply
changes to a cpusets 'cpus' allowed mask immediately
to the cpus_allowed of the tasks in that cpuset.

Signed-off-by: Paul Jackson <[EMAIL PROTECTED]>
Acked-by: Cliff Wickman <[EMAIL PROTECTED]>

---

 Documentation/cpusets.txt |   23 ---
 1 file changed, 8 insertions(+), 15 deletions(-)

Index: linux-2.6/Documentation/cpusets.txt
===
--- linux-2.6.orig/Documentation/cpusets.txt
+++ linux-2.6/Documentation/cpusets.txt
@@ -523,21 +523,14 @@ from one cpuset to another, then the ker
 memory placement, as above, the next time that the kernel attempts
 to allocate a page of memory for that task.
 
-If a cpuset has its CPUs modified, then each task using that
-cpuset does _not_ change its behavior automatically.  In order to
-minimize the impact on the critical scheduling code in the kernel,
-tasks will continue to use their prior CPU placement until they
-are rebound to their cpuset, by rewriting their pid to the 'tasks'
-file of their cpuset.  If a task had been bound to some subset of its
-cpuset using the sched_setaffinity() call, and if any of that subset
-is still allowed in its new cpuset settings, then the task will be
-restricted to the intersection of the CPUs it was allowed on before,
-and its new cpuset CPU placement.  If, on the other hand, there is
-no overlap between a tasks prior placement and its new cpuset CPU
-placement, then the task will be allowed to run on any CPU allowed
-in its new cpuset.  If a task is moved from one cpuset to another,
-its CPU placement is updated in the same way as if the tasks pid is
-rewritten to the 'tasks' file of its current cpuset.
+If a cpuset has its 'cpus' modified, then each task in that cpuset
+will have its allowed CPU placement changed immediately.  Similarly,
+if a tasks pid is written to a cpusets 'tasks' file, in either its
+current cpuset or another cpuset, then its allowed CPU placement is
+changed immediately.  If such a task had been bound to some subset
+of its cpuset using the sched_setaffinity() call, the task will be
+allowed to run on any CPU allowed in its new cpuset, negating the
+affect of the prior sched_setaffinity() call.
 
 In summary, the memory placement of a task whose cpuset is changed is
 updated by the kernel, on the next allocation of a page for that task,
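
To make the documented behavior concrete, here is a small userspace
sketch (not part of this patch; the cpuset name and its mount point,
e.g. a hypothetical /dev/cpuset/<name>/cpus file, are assumptions) that
lets you watch a task's allowed CPUs change the moment its cpuset's
'cpus' file is rewritten:

	#define _GNU_SOURCE
	#include <sched.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		cpu_set_t mask;

		for (;;) {
			/* Re-read this task's allowed CPUs once per second. */
			if (sched_getaffinity(0, sizeof(mask), &mask) == 0)
				printf("allowed cpus: %d\n", CPU_COUNT(&mask));
			sleep(1);
		}
		return 0;
	}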


[PATCH 4/4 v2] hotplug cpu move tasks in empty cpusets - refinements

2008-02-04 Thread Cliff Wickman

   Narrow the scope of callback_mutex in scan_for_empty_cpusets().
Avoid rewriting the cpus, mems of cpusets except when it is
 likely that we'll be changing them.
Have remove_tasks_in_empty_cpuset() also check for empty mems.

Signed-off-by: Paul Jackson <[EMAIL PROTECTED]>
Acked-by: Cliff Wickman <[EMAIL PROTECTED]>


---

 kernel/cpuset.c |   21 +
 1 file changed, 13 insertions(+), 8 deletions(-)

Index: linux-2.6/kernel/cpuset.c
===
--- linux-2.6.orig/kernel/cpuset.c
+++ linux-2.6/kernel/cpuset.c
@@ -1709,7 +1709,8 @@ static void remove_tasks_in_empty_cpuset
 * has online cpus, so can't be empty).
 */
parent = cs->parent;
-   while (cpus_empty(parent->cpus_allowed))
+   while (cpus_empty(parent->cpus_allowed) ||
+   nodes_empty(parent->mems_allowed))
parent = parent->parent;
 
move_member_tasks_to_cpuset(cs, parent);
@@ -1741,7 +1742,6 @@ static void scan_for_empty_cpusets(const
 
 	list_add_tail((struct list_head *)&root->stack_list, &queue);
 
-	mutex_lock(&callback_mutex);
 	while (!list_empty(&queue)) {
 		cp = container_of(queue.next, struct cpuset, stack_list);
 		list_del(queue.next);
@@ -1750,19 +1750,24 @@ static void scan_for_empty_cpusets(const
 			list_add_tail(&child->stack_list, &queue);
 		}
 		cont = cp->css.cgroup;
+
+		/* Continue past cpusets with all cpus, mems online */
+		if (cpus_subset(cp->cpus_allowed, cpu_online_map) &&
+		    nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY]))
+			continue;
+
 		/* Remove offline cpus and mems from this cpuset. */
+		mutex_lock(&callback_mutex);
 		cpus_and(cp->cpus_allowed, cp->cpus_allowed, cpu_online_map);
 		nodes_and(cp->mems_allowed, cp->mems_allowed,
						node_states[N_HIGH_MEMORY]);
+		mutex_unlock(&callback_mutex);
+
+		/* Move tasks from the empty cpuset to a parent */
 		if (cpus_empty(cp->cpus_allowed) ||
-		     nodes_empty(cp->mems_allowed)) {
-			/* Move tasks from the empty cpuset to a parent */
-			mutex_unlock(&callback_mutex);
+		     nodes_empty(cp->mems_allowed))
 			remove_tasks_in_empty_cpuset(cp);
-			mutex_lock(&callback_mutex);
-		}
 	}
-	mutex_unlock(&callback_mutex);
 }
 
 /*


[PATCH 2/4 v2] hotplug cpu move tasks in empty cpusets to parent various other fixes

2008-02-04 Thread Cliff Wickman

Various minor formatting and comment tweaks to Cliff Wickman's
[PATCH_3_of_3]_cpusets__update_cpumask_revision.patch

I had had "iff", meaning "if and only if" in a comment.
However, except for ancient mathematicians, the abbreviation
"iff" was a tad too cryptic.  Cliff changed it to "if",
presumably figuring that the "iff" was a typo.  However, it
was the "only if" half of the conjunction that was most
interesting.  Reword to emphasize the "only if" aspect.

The locking comment for remove_tasks_in_empty_cpuset() was wrong;
it said callback_mutex had to be held on entry.  The opposite
is true.

Several mentions of attach_task() in comments needed to be
changed to cgroup_attach_task().

A comment about notify_on_release was no longer relevant,
as the line of code it had commented, namely:
set_bit(CS_RELEASED_RESOURCE, &parent->flags);
is no longer present in that place in the cpuset.c code.

Similarly a comment about notify_on_release before the
scan_for_empty_cpusets() routine was no longer relevant.

Removed extra parentheses and unnecessary return statement.

Renamed attach_task() to cpuset_attach() in various comments.

Removed comment about not needing memory migration, as it
seems the migration is done anyway, via the cpuset_attach()
callback from cgroup_attach_task().

Signed-off-by: Paul Jackson <[EMAIL PROTECTED]>
Acked-by: Cliff Wickman <[EMAIL PROTECTED]>

---

 kernel/cpuset.c |   41 +++--
 1 file changed, 15 insertions(+), 26 deletions(-)

Index: linux-2.6/kernel/cpuset.c
===
--- linux-2.6.orig/kernel/cpuset.c
+++ linux-2.6/kernel/cpuset.c
@@ -752,7 +752,7 @@ static int update_cpumask(struct cpuset 
trialcs = *cs;
 
/*
-* An empty cpus_allowed is ok if there are no tasks in the cpuset.
+* An empty cpus_allowed is ok only if the cpuset has no tasks.
 * Since cpulist_parse() fails on an empty mask, we special case
 * that parsing.  The validate_change() call ensures that cpusets
 * with tasks have cpus.
@@ -809,7 +809,7 @@ static int update_cpumask(struct cpuset 
  *so that the migration code can allocate pages on these nodes.
  *
  *Call holding cgroup_mutex, so current's cpuset won't change
- *during this call, as cgroup_mutex holds off any attach_task()
+ *during this call, as manage_mutex holds off any cpuset_attach()
  *calls.  Therefore we don't need to take task_lock around the
  *call to guarantee_online_mems(), as we know no one is changing
  *our task's cpuset.
@@ -1661,8 +1661,8 @@ void cpuset_do_move_task(struct task_str
  * @from: cpuset in which the tasks currently reside
  * @to: cpuset to which the tasks will be moved
  *
- * Called with manage_sem held
- * callback_mutex must not be held, as attach_task() will take it.
+ * Called with cgroup_mutex held
+ * callback_mutex must not be held, as cpuset_attach() will take it.
  *
  * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
  * calling callback functions for each.
@@ -1689,18 +1689,18 @@ static void move_member_tasks_to_cpuset(
  * last CPU or node from a cpuset, then move the tasks in the empty
  * cpuset to its next-highest non-empty parent.
  *
- * The parent cpuset has some superset of the 'mems' nodes that the
- * newly empty cpuset held, so no migration of memory is necessary.
- *
- * Called with both manage_sem and callback_sem held
+ * Called with cgroup_mutex held
+ * callback_mutex must not be held, as cpuset_attach() will take it.
  */
 static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
 {
struct cpuset *parent;
 
-   /* the cgroup's css_sets list is in use if there are tasks
-  in the cpuset; the list is empty if there are none;
-  the cs->css.refcnt seems always 0 */
+   /*
+* The cgroup's css_sets list is in use if there are tasks
+* in the cpuset; the list is empty if there are none;
+* the cs->css.refcnt seems always 0.
+*/
	if (list_empty(&cs->css.cgroup->css_sets))
return;
 
@@ -1709,14 +1709,8 @@ static void remove_tasks_in_empty_cpuset
 * has online cpus, so can't be empty).
 */
parent = cs->parent;
-   while (cpus_empty(parent->cpus_allowed)) {
-   /*
-* this empty cpuset should now be considered to
-* have been used, and therefore eligible for
-* release when empty (if it is notify_on_release)
-*/
+   while (cpus_empty(parent->cpus_allowed))
parent = parent->parent;
-   }
 
move_member_tasks_to_cpuset(cs, parent);
 }
@@ -1725,10 +1719,6 @@ static void remove_tasks_in_empty_cpuset
  * Walk the specified cpuset subtree and look for empty cpusets.
 * The tasks of such cpuset must be moved to a parent cpuset.

[PATCH 1/4 v2] hotplug cpu move tasks in empty cpusets to parent node_online_map fix

2008-02-04 Thread Cliff Wickman

As of the October 2007 kernel/cpuset.c patch "Memoryless nodes:
Use N_HIGH_MEMORY for cpusets", cpuset nodes are relative to
the nodes with (HIGH) memory, not relative to all nodes in
node_online_map.

Signed-off-by: Paul Jackson <[EMAIL PROTECTED]>
Acked-by: Cliff Wickman <[EMAIL PROTECTED]>

---

 kernel/cpuset.c |7 ---
 1 file changed, 4 insertions(+), 3 deletions(-)

Index: linux-2.6/kernel/cpuset.c
===
--- linux-2.6.orig/kernel/cpuset.c
+++ linux-2.6/kernel/cpuset.c
@@ -1762,7 +1762,8 @@ static void scan_for_empty_cpusets(const
cont = cp->css.cgroup;
/* Remove offline cpus and mems from this cpuset. */
cpus_and(cp->cpus_allowed, cp->cpus_allowed, cpu_online_map);
-   nodes_and(cp->mems_allowed, cp->mems_allowed, node_online_map);
+   nodes_and(cp->mems_allowed, cp->mems_allowed,
+   node_states[N_HIGH_MEMORY]);
if ((cpus_empty(cp->cpus_allowed) ||
 nodes_empty(cp->mems_allowed))) {
/* Move tasks from the empty cpuset to a parent */
@@ -1777,8 +1778,8 @@ static void scan_for_empty_cpusets(const
 
 /*
  * The cpus_allowed and mems_allowed nodemasks in the top_cpuset track
- * cpu_online_map and node_online_map.  Force the top cpuset to track
- * whats online after any CPU or memory node hotplug or unplug event.
+ * cpu_online_map and node_states[N_HIGH_MEMORY].  Force the top cpuset to
+ * track what's online after any CPU or memory node hotplug or unplug event.
  *
  * Since there are two callers of this routine, one for CPU hotplug
  * events and one for memory node hotplug events, we could have coded


[PATCH 4/4] hotplug cpu move tasks in empty cpusets - refinements

2008-01-29 Thread Cliff Wickman

   Narrow the scope of callback_mutex in scan_for_empty_cpusets().
Avoid rewriting the cpus, mems of cpusets except when it is
 likely that we'll be changing them.
Have remove_tasks_in_empty_cpuset() also check for empty mems.

Signed-off-by: Paul Jackson <[EMAIL PROTECTED]>
Acked-by: Cliff Wickman <[EMAIL PROTECTED]>


---

 kernel/cpuset.c |   21 +
 1 file changed, 13 insertions(+), 8 deletions(-)

Index: linux-2.6/kernel/cpuset.c
===
--- linux-2.6.orig/kernel/cpuset.c
+++ linux-2.6/kernel/cpuset.c
@@ -1748,7 +1748,8 @@ static void remove_tasks_in_empty_cpuset
 * has online cpus, so can't be empty).
 */
parent = cs->parent;
-   while (cpus_empty(parent->cpus_allowed))
+   while (cpus_empty(parent->cpus_allowed) ||
+   nodes_empty(parent->mems_allowed))
parent = parent->parent;
 
move_member_tasks_to_cpuset(cs, parent);
@@ -1780,7 +1781,6 @@ static void scan_for_empty_cpusets(const
 
 	list_add_tail((struct list_head *)&root->stack_list, &queue);
 
-	mutex_lock(&callback_mutex);
 	while (!list_empty(&queue)) {
 		cp = container_of(queue.next, struct cpuset, stack_list);
 		list_del(queue.next);
@@ -1789,19 +1789,24 @@ static void scan_for_empty_cpusets(const
 			list_add_tail(&child->stack_list, &queue);
 		}
 		cont = cp->css.cgroup;
+
+		/* Continue past cpusets with all cpus, mems online */
+		if (cpus_subset(cp->cpus_allowed, cpu_online_map) &&
+		    nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY]))
+			continue;
+
 		/* Remove offline cpus and mems from this cpuset. */
+		mutex_lock(&callback_mutex);
 		cpus_and(cp->cpus_allowed, cp->cpus_allowed, cpu_online_map);
 		nodes_and(cp->mems_allowed, cp->mems_allowed,
						node_states[N_HIGH_MEMORY]);
+		mutex_unlock(&callback_mutex);
+
+		/* Move tasks from the empty cpuset to a parent */
 		if (cpus_empty(cp->cpus_allowed) ||
-		     nodes_empty(cp->mems_allowed)) {
-			/* Move tasks from the empty cpuset to a parent */
-			mutex_unlock(&callback_mutex);
+		     nodes_empty(cp->mems_allowed))
 			remove_tasks_in_empty_cpuset(cp);
-			mutex_lock(&callback_mutex);
-		}
 	}
-	mutex_unlock(&callback_mutex);
 }
 
 /*


[PATCH 1/4] hotplug cpu move tasks in empty cpusets to parent node_online_map fix

2008-01-29 Thread Cliff Wickman

As of the October 2007 kernel/cpuset.c patch "Memoryless nodes:
Use N_HIGH_MEMORY for cpusets", cpuset nodes are relative to
the nodes with (HIGH) memory, not relative to all nodes in
node_online_map.

Signed-off-by: Paul Jackson <[EMAIL PROTECTED]>
Acked-by: Cliff Wickman <[EMAIL PROTECTED]>

---

 kernel/cpuset.c |7 ---
 1 file changed, 4 insertions(+), 3 deletions(-)

Index: linux-2.6/kernel/cpuset.c
===
--- linux-2.6.orig/kernel/cpuset.c
+++ linux-2.6/kernel/cpuset.c
@@ -1801,7 +1801,8 @@ static void scan_for_empty_cpusets(const
cont = cp->css.cgroup;
/* Remove offline cpus and mems from this cpuset. */
cpus_and(cp->cpus_allowed, cp->cpus_allowed, cpu_online_map);
-   nodes_and(cp->mems_allowed, cp->mems_allowed, node_online_map);
+   nodes_and(cp->mems_allowed, cp->mems_allowed,
+   node_states[N_HIGH_MEMORY]);
if ((cpus_empty(cp->cpus_allowed) ||
 nodes_empty(cp->mems_allowed))) {
/* Move tasks from the empty cpuset to a parent */
@@ -1816,8 +1817,8 @@ static void scan_for_empty_cpusets(const
 
 /*
  * The cpus_allowed and mems_allowed nodemasks in the top_cpuset track
- * cpu_online_map and node_online_map.  Force the top cpuset to track
- * whats online after any CPU or memory node hotplug or unplug event.
+ * cpu_online_map and node_states[N_HIGH_MEMORY].  Force the top cpuset to
+ * track what's online after any CPU or memory node hotplug or unplug event.
  *
  * Since there are two callers of this routine, one for CPU hotplug
  * events and one for memory node hotplug events, we could have coded


[PATCH 3/4] cpusets: update_cpumask documentation fix

2008-01-29 Thread Cliff Wickman

Update cpuset documentation to match the October 2007
"Fix cpusets update_cpumask" changes that now apply
changes to a cpusets 'cpus' allowed mask immediately
to the cpus_allowed of the tasks in that cpuset.

Signed-off-by: Paul Jackson <[EMAIL PROTECTED]>
Acked-by: Cliff Wickman <[EMAIL PROTECTED]>

---

 Documentation/cpusets.txt |   23 ---
 1 file changed, 8 insertions(+), 15 deletions(-)

Index: linux-2.6/Documentation/cpusets.txt
===
--- linux-2.6.orig/Documentation/cpusets.txt
+++ linux-2.6/Documentation/cpusets.txt
@@ -523,21 +523,14 @@ from one cpuset to another, then the ker
 memory placement, as above, the next time that the kernel attempts
 to allocate a page of memory for that task.
 
-If a cpuset has its CPUs modified, then each task using that
-cpuset does _not_ change its behavior automatically.  In order to
-minimize the impact on the critical scheduling code in the kernel,
-tasks will continue to use their prior CPU placement until they
-are rebound to their cpuset, by rewriting their pid to the 'tasks'
-file of their cpuset.  If a task had been bound to some subset of its
-cpuset using the sched_setaffinity() call, and if any of that subset
-is still allowed in its new cpuset settings, then the task will be
-restricted to the intersection of the CPUs it was allowed on before,
-and its new cpuset CPU placement.  If, on the other hand, there is
-no overlap between a tasks prior placement and its new cpuset CPU
-placement, then the task will be allowed to run on any CPU allowed
-in its new cpuset.  If a task is moved from one cpuset to another,
-its CPU placement is updated in the same way as if the tasks pid is
-rewritten to the 'tasks' file of its current cpuset.
+If a cpuset has its 'cpus' modified, then each task in that cpuset
+will have its allowed CPU placement changed immediately.  Similarly,
+if a tasks pid is written to a cpusets 'tasks' file, in either its
+current cpuset or another cpuset, then its allowed CPU placement is
+changed immediately.  If such a task had been bound to some subset
+of its cpuset using the sched_setaffinity() call, the task will be
+allowed to run on any CPU allowed in its new cpuset, negating the
+affect of the prior sched_setaffinity() call.
 
 In summary, the memory placement of a task whose cpuset is changed is
 updated by the kernel, on the next allocation of a page for that task,


[PATCH 2/4] hotplug cpu move tasks in empty cpusets to parent various other fixes

2008-01-29 Thread Cliff Wickman

Various minor formatting and comment tweaks to Cliff Wickman's
[PATCH_3_of_3]_cpusets__update_cpumask_revision.patch

I had had "iff", meaning "if and only if" in a comment.
However, except for ancient mathematicians, the abbreviation
"iff" was a tad too cryptic.  Cliff changed it to "if",
presumably figuring that the "iff" was a typo.  However, it
was the "only if" half of the conjunction that was most
interesting.  Reword to emphasize the "only if" aspect.

The locking comment for remove_tasks_in_empty_cpuset() was wrong;
it said callback_mutex had to be held on entry.  The opposite
is true.

Several mentions of attach_task() in comments needed to be
changed to cgroup_attach_task().

A comment about notify_on_release was no longer relevant,
as the line of code it had commented, namely:
set_bit(CS_RELEASED_RESOURCE, &parent->flags);
is no longer present in that place in the cpuset.c code.

Similarly a comment about notify_on_release before the
scan_for_empty_cpusets() routine was no longer relevant.

Removed extra parentheses and unnecessary return statement.

Renamed attach_task() to cpuset_attach() in various comments.

Removed comment about not needing memory migration, as it
seems the migration is done anyway, via the cpuset_attach()
callback from cgroup_attach_task().

Signed-off-by: Paul Jackson <[EMAIL PROTECTED]>
Acked-by: Cliff Wickman <[EMAIL PROTECTED]>

---

 kernel/cpuset.c |   53 +
 1 file changed, 21 insertions(+), 32 deletions(-)

Index: linux-2.6/kernel/cpuset.c
===
--- linux-2.6.orig/kernel/cpuset.c
+++ linux-2.6/kernel/cpuset.c
@@ -167,7 +167,7 @@ static inline int is_spread_slab(const s
  * number, and avoid having to lock and reload mems_allowed unless
  * the cpuset they're using changes generation.
  *
- * A single, global generation is needed because attach_task() could
+ * A single, global generation is needed because cpuset_attach() could
  * reattach a task to a different cpuset, which must not have its
  * generation numbers aliased with those of that tasks previous cpuset.
  *
@@ -218,7 +218,7 @@ static struct cpuset top_cpuset = {
  * Any task can increment and decrement the count field without lock.
  * So in general, code holding manage_mutex or callback_mutex can't rely
  * on the count field not changing.  However, if the count goes to
- * zero, then only attach_task(), which holds both mutexes, can
+ * zero, then only cpuset_attach(), which holds both mutexes, can
  * increment it again.  Because a count of zero means that no tasks
  * are currently attached, therefore there is no way a task attached
  * to that cpuset can fork (the other way to increment the count).
@@ -255,18 +255,18 @@ static struct cpuset top_cpuset = {
  *
  * The task_lock() exception
  *
- * The need for this exception arises from the action of attach_task(),
+ * The need for this exception arises from the action of cpuset_attach(),
  * which overwrites one tasks cpuset pointer with another.  It does
  * so using both mutexes, however there are several performance
  * critical places that need to reference task->cpuset without the
  * expense of grabbing a system global mutex.  Therefore except as
- * noted below, when dereferencing or, as in attach_task(), modifying
+ * noted below, when dereferencing or, as in cpuset_attach(), modifying
  * a tasks cpuset pointer we use task_lock(), which acts on a spinlock
  * (task->alloc_lock) already in the task_struct routinely used for
  * such matters.
  *
  * P.S.  One more locking exception.  RCU is used to guard the
- * update of a tasks cpuset pointer by attach_task() and the
+ * update of a tasks cpuset pointer by cpuset_attach() and the
  * access of task->cpuset->mems_generation via that pointer in
  * the routine cpuset_update_task_memory_state().
  */
@@ -368,7 +368,7 @@ static void guarantee_online_mems(const 
  *
  * Reading current->cpuset->mems_generation doesn't need task_lock
  * to guard the current->cpuset derefence, because it is guarded
- * from concurrent freeing of current->cpuset by attach_task(),
+ * from concurrent freeing of current->cpuset by cpuset_attach(),
  * using RCU.
  *
  * The rcu_dereference() is technically probably not needed,
@@ -790,7 +790,7 @@ static int update_cpumask(struct cpuset 
trialcs = *cs;
 
/*
-* An empty cpus_allowed is ok if there are no tasks in the cpuset.
+* An empty cpus_allowed is ok only if the cpuset has no tasks.
 * Since cpulist_parse() fails on an empty mask, we special case
 * that parsing.  The validate_change() call ensures that cpusets
 * with tasks have cpus.
@@ -847,7 +847,7 @@ static int update_cpumask(struct cpuset 
  *so that the migration code can allocate pages on these nodes.
  *
  *Call 

[PATCH 2/4] hotplug cpu move tasks in empty cpusets to parent various other fixes

2008-01-29 Thread Cliff Wickman

Various minor formatting and comment tweaks to Cliff Wickman's
[PATCH_3_of_3]_cpusets__update_cpumask_revision.patch

I had had "iff", meaning "if and only if" in a comment.
However, except for ancient mathematicians, the abbreviation
"iff" was a tad too cryptic.  Cliff changed it to "if",
presumably figuring that the "iff" was a typo.  However, it
was the "only if" half of the conjunction that was most
interesting.  Reword to emphasis the "only if" aspect.

The locking comment for remove_tasks_in_empty_cpuset() was wrong;
it said callback_mutex had to be held on entry.  The opposite
is true.

Several mentions of attach_task() in comments needed to be
changed to cgroup_attach_task().

A comment about notify_on_release was no longer relevant,
as the line of code it had commented, namely:
set_bit(CS_RELEASED_RESOURCE, >flags);
is no longer present in that place in the cpuset.c code.

Similarly a comment about notify_on_release before the
scan_for_empty_cpusets() routine was no longer relevant.

Removed extra parentheses and unnecessary return statement.

Renamed attach_task() to cpuset_attach() in various comments.

Removed comment about not needing memory migration, as it
seems the migration is done anyway, via the cpuset_attach()
callback from cgroup_attach_task().

Signed-off-by: Paul Jackson <[EMAIL PROTECTED]>
Acked-by: Cliff Wickman <[EMAIL PROTECTED]>

---

 kernel/cpuset.c |   53 +
 1 file changed, 21 insertions(+), 32 deletions(-)

Index: linux-2.6/kernel/cpuset.c
===
--- linux-2.6.orig/kernel/cpuset.c
+++ linux-2.6/kernel/cpuset.c
@@ -167,7 +167,7 @@ static inline int is_spread_slab(const s
  * number, and avoid having to lock and reload mems_allowed unless
  * the cpuset they're using changes generation.
  *
- * A single, global generation is needed because attach_task() could
+ * A single, global generation is needed because cpuset_attach() could
  * reattach a task to a different cpuset, which must not have its
  * generation numbers aliased with those of that tasks previous cpuset.
  *
@@ -218,7 +218,7 @@ static struct cpuset top_cpuset = {
  * Any task can increment and decrement the count field without lock.
  * So in general, code holding manage_mutex or callback_mutex can't rely
  * on the count field not changing.  However, if the count goes to
- * zero, then only attach_task(), which holds both mutexes, can
+ * zero, then only cpuset_attach(), which holds both mutexes, can
  * increment it again.  Because a count of zero means that no tasks
  * are currently attached, therefore there is no way a task attached
  * to that cpuset can fork (the other way to increment the count).
@@ -255,18 +255,18 @@ static struct cpuset top_cpuset = {
  *
  * The task_lock() exception
  *
- * The need for this exception arises from the action of attach_task(),
+ * The need for this exception arises from the action of cpuset_attach(),
  * which overwrites one tasks cpuset pointer with another.  It does
  * so using both mutexes, however there are several performance
  * critical places that need to reference task->cpuset without the
  * expense of grabbing a system global mutex.  Therefore except as
- * noted below, when dereferencing or, as in attach_task(), modifying
+ * noted below, when dereferencing or, as in cpuset_attach(), modifying
  * a tasks cpuset pointer we use task_lock(), which acts on a spinlock
  * (task->alloc_lock) already in the task_struct routinely used for
  * such matters.
  *
  * P.S.  One more locking exception.  RCU is used to guard the
- * update of a tasks cpuset pointer by attach_task() and the
+ * update of a tasks cpuset pointer by cpuset_attach() and the
  * access of task->cpuset->mems_generation via that pointer in
  * the routine cpuset_update_task_memory_state().
  */
@@ -368,7 +368,7 @@ static void guarantee_online_mems(const 
  *
  * Reading current->cpuset->mems_generation doesn't need task_lock
  * to guard the current->cpuset derefence, because it is guarded
- * from concurrent freeing of current->cpuset by attach_task(),
+ * from concurrent freeing of current->cpuset by cpuset_attach(),
  * using RCU.
  *
  * The rcu_dereference() is technically probably not needed,
@@ -790,7 +790,7 @@ static int update_cpumask(struct cpuset 
trialcs = *cs;
 
/*
-* An empty cpus_allowed is ok if there are no tasks in the cpuset.
+* An empty cpus_allowed is ok only if the cpuset has no tasks.
 * Since cpulist_parse() fails on an empty mask, we special case
 * that parsing.  The validate_change() call ensures that cpusets
 * with tasks have cpus.
@@ -847,7 +847,7 @@ static int update_cpumask(struct cpuset 
  *so that the migration code can allocate pages on these nodes.
  *
  *Call 

[PATCH 1/4] hotplug cpu move tasks in empty cpusets to parent node_online_map fix

2008-01-29 Thread Cliff Wickman

As of the October 2007 kernel/cpuset.c patch "Memoryless nodes:
Use N_HIGH_MEMORY for cpusets", cpuset nodes are relative to
the nodes with (HIGH) memory, not relative to all nodes in
node_online_map.
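
For illustration, a minimal userspace sketch of the mask change; plain
bitmasks stand in for nodemask_t and the sample values are invented
(this is not kernel code):

/*
 * Intersect a cpuset's memory nodes with "nodes that have (HIGH) memory"
 * rather than with "nodes online".
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint64_t node_online_map   = 0x0F; /* nodes 0-3 online */
        uint64_t nodes_high_memory = 0x07; /* node 3 online but memoryless */
        uint64_t mems_allowed      = 0x0C; /* cpuset allows nodes 2 and 3 */

        uint64_t old_mask = mems_allowed & node_online_map;   /* keeps node 3 */
        uint64_t new_mask = mems_allowed & nodes_high_memory; /* drops node 3 */

        printf("old: 0x%llx  new: 0x%llx\n",
               (unsigned long long)old_mask, (unsigned long long)new_mask);
        return 0;
}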

Signed-off-by: Paul Jackson [EMAIL PROTECTED]
Acked-by: Cliff Wickman [EMAIL PROTECTED]

---

 kernel/cpuset.c |7 ---
 1 file changed, 4 insertions(+), 3 deletions(-)

Index: linux-2.6/kernel/cpuset.c
===
--- linux-2.6.orig/kernel/cpuset.c
+++ linux-2.6/kernel/cpuset.c
@@ -1801,7 +1801,8 @@ static void scan_for_empty_cpusets(const
cont = cp->css.cgroup;
/* Remove offline cpus and mems from this cpuset. */
cpus_and(cp->cpus_allowed, cp->cpus_allowed, cpu_online_map);
-   nodes_and(cp->mems_allowed, cp->mems_allowed, node_online_map);
+   nodes_and(cp->mems_allowed, cp->mems_allowed,
+   node_states[N_HIGH_MEMORY]);
if ((cpus_empty(cp->cpus_allowed) ||
 nodes_empty(cp->mems_allowed))) {
/* Move tasks from the empty cpuset to a parent */
@@ -1816,8 +1817,8 @@ static void scan_for_empty_cpusets(const
 
 /*
  * The cpus_allowed and mems_allowed nodemasks in the top_cpuset track
- * cpu_online_map and node_online_map.  Force the top cpuset to track
- * whats online after any CPU or memory node hotplug or unplug event.
+ * cpu_online_map and node_states[N_HIGH_MEMORY].  Force the top cpuset to
+ * track what's online after any CPU or memory node hotplug or unplug event.
  *
  * Since there are two callers of this routine, one for CPU hotplug
  * events and one for memory node hotplug events, we could have coded


[PATCH 4/4] hotplug cpu move tasks in empty cpusets - refinements

2008-01-29 Thread Cliff Wickman

Narrow the scope of callback_mutex in scan_for_empty_cpusets().
Avoid rewriting the cpus, mems of cpusets except when it is
likely that we'll be changing them.
Have remove_tasks_in_empty_cpuset() also check for empty mems.
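
As a rough userspace sketch of the first two changes above, with a
pthread mutex and invented types standing in for callback_mutex and
struct cpuset (not the kernel code):

#include <pthread.h>
#include <stdio.h>

struct entry {
        unsigned int cpus;      /* stand-in for cpus_allowed */
        unsigned int mems;      /* stand-in for mems_allowed */
};

static pthread_mutex_t callback_lock = PTHREAD_MUTEX_INITIALIZER;

static void scan(struct entry *e, int n, unsigned int online_cpus,
                 unsigned int mem_nodes)
{
        for (int i = 0; i < n; i++) {
                /* Continue past entries that are already fully online. */
                if ((e[i].cpus & ~online_cpus) == 0 &&
                    (e[i].mems & ~mem_nodes) == 0)
                        continue;

                /* Lock only around the rewrite, not the whole walk. */
                pthread_mutex_lock(&callback_lock);
                e[i].cpus &= online_cpus;
                e[i].mems &= mem_nodes;
                pthread_mutex_unlock(&callback_lock);

                if (e[i].cpus == 0 || e[i].mems == 0)
                        printf("entry %d is now empty: move its tasks\n", i);
        }
}

int main(void)
{
        struct entry e[] = { { 0x3, 0x1 }, { 0xc, 0x2 } };

        scan(e, 2, 0x3, 0x1);   /* only cpus 0-1 and node 0 remain online */
        return 0;
}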

Signed-off-by: Paul Jackson [EMAIL PROTECTED]
Acked-by: Cliff Wickman [EMAIL PROTECTED]


---

 kernel/cpuset.c |   21 +
 1 file changed, 13 insertions(+), 8 deletions(-)

Index: linux-2.6/kernel/cpuset.c
===
--- linux-2.6.orig/kernel/cpuset.c
+++ linux-2.6/kernel/cpuset.c
@@ -1748,7 +1748,8 @@ static void remove_tasks_in_empty_cpuset
 * has online cpus, so can't be empty).
 */
parent = cs->parent;
-   while (cpus_empty(parent->cpus_allowed))
+   while (cpus_empty(parent->cpus_allowed) ||
+   nodes_empty(parent->mems_allowed))
parent = parent->parent;
 
move_member_tasks_to_cpuset(cs, parent);
@@ -1780,7 +1781,6 @@ static void scan_for_empty_cpusets(const
 
list_add_tail((struct list_head *)&root->stack_list, &queue);
 
-   mutex_lock(&callback_mutex);
while (!list_empty(&queue)) {
cp = container_of(queue.next, struct cpuset, stack_list);
list_del(queue.next);
@@ -1789,19 +1789,24 @@ static void scan_for_empty_cpusets(const
list_add_tail(&child->stack_list, &queue);
}
cont = cp->css.cgroup;
+
+   /* Continue past cpusets with all cpus, mems online */
+   if (cpus_subset(cp->cpus_allowed, cpu_online_map) &&
+   nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY]))
+   continue;
+
/* Remove offline cpus and mems from this cpuset. */
+   mutex_lock(&callback_mutex);
cpus_and(cp->cpus_allowed, cp->cpus_allowed, cpu_online_map);
nodes_and(cp->mems_allowed, cp->mems_allowed,
node_states[N_HIGH_MEMORY]);
+   mutex_unlock(&callback_mutex);
+
+   /* Move tasks from the empty cpuset to a parent */
if (cpus_empty(cp->cpus_allowed) ||
-nodes_empty(cp->mems_allowed)) {
-   /* Move tasks from the empty cpuset to a parent */
-   mutex_unlock(&callback_mutex);
+nodes_empty(cp->mems_allowed))
remove_tasks_in_empty_cpuset(cp);
-   mutex_lock(&callback_mutex);
-   }
}
-   mutex_unlock(&callback_mutex);
 }
 
 /*


[PATCH 3/4] cpusets: update_cpumask documentation fix

2008-01-29 Thread Cliff Wickman

Update the cpuset documentation to match the October 2007
"Fix cpusets update_cpumask" changes, which now apply
changes to a cpuset's 'cpus' allowed mask immediately
to the cpus_allowed of the tasks in that cpuset.
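
A hypothetical userspace check of that behavior; the /dev/cpuset mount
point and the 'demo' cpuset name are assumptions of the sketch, not part
of the patch:

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <unistd.h>

static int count_allowed_cpus(void)
{
        cpu_set_t set;

        CPU_ZERO(&set);
        if (sched_getaffinity(0, sizeof(set), &set) != 0)
                return -1;
        return CPU_COUNT(&set);
}

int main(void)
{
        char pid[32];
        FILE *f;

        printf("allowed cpus before: %d\n", count_allowed_cpus());

        /* Move ourselves into an already-configured cpuset. */
        f = fopen("/dev/cpuset/demo/tasks", "w");
        if (!f) {
                perror("fopen");
                return 1;
        }
        snprintf(pid, sizeof(pid), "%d\n", getpid());
        fputs(pid, f);
        fclose(f);

        /* With the October 2007 change the new placement applies at once. */
        printf("allowed cpus after:  %d\n", count_allowed_cpus());
        return 0;
}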

Signed-off-by: Paul Jackson [EMAIL PROTECTED]
Acked-by: Cliff Wickman [EMAIL PROTECTED]

---

 Documentation/cpusets.txt |   23 ---
 1 file changed, 8 insertions(+), 15 deletions(-)

Index: linux-2.6/Documentation/cpusets.txt
===
--- linux-2.6.orig/Documentation/cpusets.txt
+++ linux-2.6/Documentation/cpusets.txt
@@ -523,21 +523,14 @@ from one cpuset to another, then the ker
 memory placement, as above, the next time that the kernel attempts
 to allocate a page of memory for that task.
 
-If a cpuset has its CPUs modified, then each task using that
-cpuset does _not_ change its behavior automatically.  In order to
-minimize the impact on the critical scheduling code in the kernel,
-tasks will continue to use their prior CPU placement until they
-are rebound to their cpuset, by rewriting their pid to the 'tasks'
-file of their cpuset.  If a task had been bound to some subset of its
-cpuset using the sched_setaffinity() call, and if any of that subset
-is still allowed in its new cpuset settings, then the task will be
-restricted to the intersection of the CPUs it was allowed on before,
-and its new cpuset CPU placement.  If, on the other hand, there is
-no overlap between a tasks prior placement and its new cpuset CPU
-placement, then the task will be allowed to run on any CPU allowed
-in its new cpuset.  If a task is moved from one cpuset to another,
-its CPU placement is updated in the same way as if the tasks pid is
-rewritten to the 'tasks' file of its current cpuset.
+If a cpuset has its 'cpus' modified, then each task in that cpuset
+will have its allowed CPU placement changed immediately.  Similarly,
+if a tasks pid is written to a cpusets 'tasks' file, in either its
+current cpuset or another cpuset, then its allowed CPU placement is
+changed immediately.  If such a task had been bound to some subset
+of its cpuset using the sched_setaffinity() call, the task will be
+allowed to run on any CPU allowed in its new cpuset, negating the
+affect of the prior sched_setaffinity() call.
 
 In summary, the memory placement of a task whose cpuset is changed is
 updated by the kernel, on the next allocation of a page for that task,

[RFC] hotplug cpu move tasks in empty cpusets - possible refinements

2008-01-18 Thread Cliff Wickman


Hi Paul,

> Query for Cliff:
> 1) Can we narrow the scope of callback_mutex in scan_for_empty_cpusets()?
> 2) Can we avoid rewriting the cpus, mems of cpusets except when it is
>likely that we'll be changing them?
> 3) Should not remove_tasks_in_empty_cpuset() also check for empty mems?
> -pj

I agree with all of the above refinements.
And I just tested the below patch and find no problem.

So this is an ACK from me.

-Cliff
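
A minimal sketch of point 3 above, with invented types standing in for
struct cpuset (not the kernel code):

#include <stdio.h>

struct node {
        struct node *parent;
        unsigned int cpus;
        unsigned int mems;
};

static struct node *find_target(struct node *cs)
{
        struct node *parent = cs->parent;

        /* Check mems as well as cpus, per refinement 3. */
        while (parent->cpus == 0 || parent->mems == 0)
                parent = parent->parent;
        return parent;
}

int main(void)
{
        struct node root = { NULL,  0x3, 0x1 };
        struct node mid  = { &root, 0x0, 0x1 };  /* no cpus left */
        struct node leaf = { &mid,  0x0, 0x0 };  /* fully empty  */

        printf("tasks move to the %s cpuset\n",
               find_target(&leaf) == &root ? "root" : "mid");
        return 0;
}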

---

 kernel/cpuset.c |   21 +
 1 file changed, 13 insertions(+), 8 deletions(-)

Index: linux-2.6/kernel/cpuset.c
===
--- linux-2.6.orig/kernel/cpuset.c
+++ linux-2.6/kernel/cpuset.c
@@ -1748,7 +1748,8 @@ static void remove_tasks_in_empty_cpuset
 * has online cpus, so can't be empty).
 */
parent = cs->parent;
-   while (cpus_empty(parent->cpus_allowed))
+   while (cpus_empty(parent->cpus_allowed) ||
+   nodes_empty(parent->mems_allowed))
parent = parent->parent;
 
move_member_tasks_to_cpuset(cs, parent);
@@ -1780,7 +1781,6 @@ static void scan_for_empty_cpusets(const
 
list_add_tail((struct list_head *)&root->stack_list, &queue);
 
-   mutex_lock(&callback_mutex);
while (!list_empty(&queue)) {
cp = container_of(queue.next, struct cpuset, stack_list);
list_del(queue.next);
@@ -1789,19 +1789,24 @@ static void scan_for_empty_cpusets(const
list_add_tail(&child->stack_list, &queue);
}
cont = cp->css.cgroup;
+
+   /* Continue past cpusets with all cpus, mems online */
+   if (cpus_subset(cp->cpus_allowed, cpu_online_map) &&
+   nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY]))
+   continue;
+
/* Remove offline cpus and mems from this cpuset. */
+   mutex_lock(&callback_mutex);
cpus_and(cp->cpus_allowed, cp->cpus_allowed, cpu_online_map);
nodes_and(cp->mems_allowed, cp->mems_allowed,
node_states[N_HIGH_MEMORY]);
+   mutex_unlock(&callback_mutex);
+
+   /* Move tasks from the empty cpuset to a parent */
if (cpus_empty(cp->cpus_allowed) ||
-nodes_empty(cp->mems_allowed)) {
-   /* Move tasks from the empty cpuset to a parent */
-   mutex_unlock(&callback_mutex);
+nodes_empty(cp->mems_allowed))
remove_tasks_in_empty_cpuset(cp);
-   mutex_lock(&callback_mutex);
-   }
}
-   mutex_unlock(&callback_mutex);
 }
 
 /*


[PATCH][VER 4] mspec: handle shrinking virtual memory areas

2007-09-20 Thread Cliff Wickman

Stress testing revealed the need for (yet more) revision. sorry.

This is a revision of Andrew's mspec-handle-shrinking-virtual-memory-areas.patch

Version 4: clear/release fetchop pages only when vma_data is no longer shared

The vma_data structure may be shared by vma's from multiple tasks, with
no way of knowing which areas are shared or not shared, so release/clear
pages only when the refcount (of vma's) goes to zero.
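
A small userspace sketch of that rule; C11 atomics and invented names
stand in for the driver's atomic_t refcount and page handling (not the
driver code):

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct shared_data {
        atomic_int refcnt;
        int        npages;
        long      *pages;
};

static void close_one(struct shared_data *d)
{
        /* Not the last user yet: leave everything in place. */
        if (atomic_fetch_sub(&d->refcnt, 1) != 1)
                return;

        /* Last reference gone: now it is safe to clear and free the pages. */
        for (int i = 0; i < d->npages; i++)
                d->pages[i] = 0;
        free(d->pages);
        free(d);
        printf("released on last close\n");
}

int main(void)
{
        struct shared_data *d = malloc(sizeof(*d));

        d->npages = 4;
        d->pages  = calloc(d->npages, sizeof(long));
        atomic_init(&d->refcnt, 2);     /* two vmas share the data */

        close_one(d);   /* first close: nothing freed        */
        close_one(d);   /* second close: everything released */
        return 0;
}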

Diffed against 2.6.23-rc7

Signed-off-by: Cliff Wickman <[EMAIL PROTECTED]>
---
 drivers/char/mspec.c |   26 --
 1 file changed, 8 insertions(+), 18 deletions(-)

Index: linus.070920/drivers/char/mspec.c
===
--- linus.070920.orig/drivers/char/mspec.c
+++ linus.070920/drivers/char/mspec.c
@@ -155,23 +155,22 @@ mspec_open(struct vm_area_struct *vma)
  * mspec_close
  *
  * Called when unmapping a device mapping. Frees all mspec pages
- * belonging to the vma.
+ * belonging to all the vma's sharing this vma_data structure.
  */
 static void
 mspec_close(struct vm_area_struct *vma)
 {
struct vma_data *vdata;
-   int index, last_index, result;
+   int index, last_index;
unsigned long my_page;
 
vdata = vma->vm_private_data;
 
-   BUG_ON(vma->vm_start < vdata->vm_start || vma->vm_end > vdata->vm_end);
+   if (!atomic_dec_and_test(&vdata->refcnt))
+   return;
 
-   spin_lock(&vdata->lock);
-   index = (vma->vm_start - vdata->vm_start) >> PAGE_SHIFT;
-   last_index = (vma->vm_end - vdata->vm_start) >> PAGE_SHIFT;
-   for (; index < last_index; index++) {
+   last_index = (vdata->vm_end - vdata->vm_start) >> PAGE_SHIFT;
+   for (index=0; index < last_index; index++) {
if (vdata->maddr[index] == 0)
continue;
/*
@@ -180,20 +179,12 @@ mspec_close(struct vm_area_struct *vma)
 */
my_page = vdata->maddr[index];
vdata->maddr[index] = 0;
-   spin_unlock(&vdata->lock);
-   result = mspec_zero_block(my_page, PAGE_SIZE);
-   if (!result)
+   if (!mspec_zero_block(my_page, PAGE_SIZE))
uncached_free_page(my_page);
else
printk(KERN_WARNING "mspec_close(): "
-  "failed to zero page %i\n",
-  result);
-   spin_lock(&vdata->lock);
+  "failed to zero page %ld\n", my_page);
}
-   spin_unlock(&vdata->lock);
-
-   if (!atomic_dec_and_test(&vdata->refcnt))
-   return;
 
if (vdata->flags & VMD_VMALLOCED)
vfree(vdata);
@@ -201,7 +192,6 @@ mspec_close(struct vm_area_struct *vma)
kfree(vdata);
 }
 
-
 /*
  * mspec_nopfn
  *


[PATCH][VER 4] mspec: handle shrinking virtual memory areas

2007-09-20 Thread Cliff Wickman

Stress testing revealed the need for more revision.
This is a revision of Andrew's mspec-handle-shrinking-virtual-memory-areas.patch

Version 4: clear/release fetchop pages only when vma_data is no longer shared

Version 3: single thread the clearing of vma_data maddr[]
Version 2: refcount maintained as atomic_t (as before the version 1 patch)

The shrinking of a virtual memory area that is mmap(2)'d to a memory
special file (device drivers/char/mspec.c) can cause a panic.

If the mapped size of the vma (vm_area_struct) is very large, mspec allocates
a large vma_data structure with vmalloc(). But such a vma can be shrunk by
an munmap(2).  The current driver uses the current size of each vma to
deduce whether its vma_data structure was allocated by kmalloc() or vmalloc().
So if the vma was shrunk it appears to have been allocated by kmalloc(),
and mspec attempts to free it with kfree().  This results in a panic.
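
Sketched in plain userspace C, the idea of the fix is to record the
allocator at allocation time and dispatch on that record at free time;
apart from the VMD_VMALLOCED flag the names and the size threshold are
invented:

#include <stdio.h>
#include <stdlib.h>

#define VMD_VMALLOCED 0x1       /* "allocated by the large-area allocator" */

struct vma_data_sketch {
        int    flags;
        size_t mapped_size;     /* can shrink later, e.g. after munmap() */
};

static struct vma_data_sketch *alloc_vdata(size_t mapped_size)
{
        struct vma_data_sketch *v = malloc(sizeof(*v));

        v->mapped_size = mapped_size;
        /* Decide the allocator once, and remember the decision. */
        v->flags = (mapped_size > 64 * 1024) ? VMD_VMALLOCED : 0;
        return v;
}

static void free_vdata(struct vma_data_sketch *v)
{
        /* Dispatch on the recorded flag, never on the current size. */
        if (v->flags & VMD_VMALLOCED)
                printf("would vfree() this structure\n");
        else
                printf("would kfree() this structure\n");
        free(v);
}

int main(void)
{
        struct vma_data_sketch *v = alloc_vdata(1 << 20); /* large mapping */

        v->mapped_size = 4096;  /* mapping later shrunk by munmap()       */
        free_vdata(v);          /* still routed to vfree(): no mismatch   */
        return 0;
}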

This patch avoids the panic (by preserving the type of the allocation) and
also makes mspec work correctly as the vma is split into pieces by the
munmap(2)'s.

All vma's derived from such a split vma share the same vma_data structure that
represents all the pages mapped into this set of vma's.  The mspec driver
must be made capable of using the right portion of the structure for each
member vma.  In other words, it must index into the array of page addresses
using the portion of the array that represents the current vma. This is
enabled by storing the vma group's vm_start in the vma_data structure.
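
A tiny sketch of that index arithmetic; PAGE_SHIFT and the sample
addresses are assumptions of the illustration:

#include <stdio.h>

#define PAGE_SHIFT 12   /* 4 KiB pages assumed */

int main(void)
{
        unsigned long group_vm_start = 0x40000000UL; /* unsplit base  */
        unsigned long vma_vm_start   = 0x40004000UL; /* split-off vma */
        unsigned long vma_vm_end     = 0x40008000UL;

        unsigned long first = (vma_vm_start - group_vm_start) >> PAGE_SHIFT;
        unsigned long last  = (vma_vm_end   - group_vm_start) >> PAGE_SHIFT;

        printf("this vma covers maddr[%lu..%lu)\n", first, last);
        return 0;
}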

The shared vma_data's are not protected by mm->mmap_sem in the fork() case
so the reference count is left as atomic_t.

The vma_data structure may be shared by vma's from multiple tasks, with
no way of knowing which areas are shared or not shared, so release/clear
pages only when the refcount (of vma's) goes to zero.

Diffed against 2.6.23-rc5

Signed-off-by: Cliff Wickman <[EMAIL PROTECTED]>
Acked-by: Jes Sorensen <[EMAIL PROTECTED]>
---
 drivers/char/mspec.c |   64 ---
 1 file changed, 41 insertions(+), 23 deletions(-)

Index: linus.070912/drivers/char/mspec.c
===
--- linus.070912.orig/drivers/char/mspec.c
+++ linus.070912/drivers/char/mspec.c
@@ -67,7 +67,7 @@
 /*
  * Page types allocated by the device.
  */
-enum {
+enum mspec_page_type {
MSPEC_FETCHOP = 1,
MSPEC_CACHED,
MSPEC_UNCACHED
@@ -83,15 +83,25 @@ static int is_sn2;
  * One of these structures is allocated when an mspec region is mmaped. The
  * structure is pointed to by the vma->vm_private_data field in the vma struct.
  * This structure is used to record the addresses of the mspec pages.
+ * This structure is shared by all vma's that are split off from the
+ * original vma when split_vma()'s are done.
+ *
+ * The refcnt is incremented atomically because mm->mmap_sem does not
+ * protect in fork case where multiple tasks share the vma_data.
  */
 struct vma_data {
atomic_t refcnt;/* Number of vmas sharing the data. */
-   spinlock_t lock;/* Serialize access to the vma. */
+   spinlock_t lock;/* Serialize access to this structure. */
int count;  /* Number of pages allocated. */
-   int type;   /* Type of pages allocated. */
+   enum mspec_page_type type; /* Type of pages allocated. */
+   int flags;  /* See VMD_xxx below. */
+   unsigned long vm_start; /* Original (unsplit) base. */
+   unsigned long vm_end;   /* Original (unsplit) end. */
unsigned long maddr[0]; /* Array of MSPEC addresses. */
 };
 
+#define VMD_VMALLOCED 0x1  /* vmalloc'd rather than kmalloc'd */
+
 /* used on shub2 to clear FOP cache in the HUB */
 static unsigned long scratch_page[MAX_NUMNODES];
 #define SH2_AMO_CACHE_ENTRIES  4
@@ -129,8 +139,8 @@ mspec_zero_block(unsigned long addr, int
  * mspec_open
  *
  * Called when a device mapping is created by a means other than mmap
- * (via fork, etc.).  Increments the reference count on the underlying
- * mspec data so it is not freed prematurely.
+ * (via fork, munmap, etc.).  Increments the reference count on the
+ * underlying mspec data so it is not freed prematurely.
  */
 static void
 mspec_open(struct vm_area_struct *vma)
@@ -145,40 +155,41 @@ mspec_open(struct vm_area_struct *vma)
  * mspec_close
  *
  * Called when unmapping a device mapping. Frees all mspec pages
- * belonging to the vma.
+ * belonging to all the vma's sharing this vma_data structure.
  */
 static void
 mspec_close(struct vm_area_struct *vma)
 {
struct vma_data *vdata;
-   int i, pages, result, vdata_size;
+   int index, last_index;
+   unsigned long my_page;
 
vdata = vma->vm_private_data;
+
if (!atomic_dec_and_test(&vdata->refcnt))
return;
 
-   pages = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
-   vdata


[PATCH][VER 3] mspec: handle shrinking virtual memory areas

2007-09-14 Thread Cliff Wickman

Stress testing revealed the need for more revision:

Version 3: single thread the clearing of vma_data maddr[]

Version 2: refcount maintained as atomic_t (as before the version 1 patch)

The shrinking of a virtual memory area that is mmap(2)'d to a memory
special file (device drivers/char/mspec.c) can cause a panic.

If the mapped size of the vma (vm_area_struct) is very large, mspec allocates
a large vma_data structure with vmalloc(). But such a vma can be shrunk by
an munmap(2).  The current driver uses the current size of each vma to
deduce whether its vma_data structure was allocated by kmalloc() or vmalloc().
So if the vma was shrunk it appears to have been allocated by kmalloc(),
and mspec attempts to free it with kfree().  This results in a panic.

This patch avoids the panic (by preserving the type of the allocation) and
also makes mspec work correctly as the vma is split into pieces by the
munmap(2)'s.

All vma's derived from such a split vma share the same vma_data structure that
represents all the pages mapped into this set of vma's.  The mspec driver
must be made capable of using the right portion of the structure for each
member vma.  In other words, it must index into the array of page addresses
using the portion of the array that represents the current vma. This is
enabled by storing the vma group's vm_start in the vma_data structure.

The shared vma_data's are not protected by mm->mmap_sem in the fork() case
so the reference count is left as atomic_t.
Each section of the vma_data structure may be shared by multiple tasks
(forked from the same parent). So single thread mspec_close() during the
zeroing of a vma's section.
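
Roughly the pattern version 3 adds, sketched with a pthread mutex in
place of the driver's spinlock; the names are invented and this is not
the driver code:

#include <pthread.h>
#include <stdio.h>

#define NPAGES 4

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static long maddr[NPAGES] = { 11, 0, 33, 44 };  /* 0 = never allocated */

static void slow_zero(long page)
{
        printf("zeroing page %ld outside the lock\n", page);
}

static void close_section(int first, int last)
{
        pthread_mutex_lock(&lock);
        for (int i = first; i < last; i++) {
                long page;

                if (maddr[i] == 0)
                        continue;
                page = maddr[i];
                maddr[i] = 0;           /* claim the slot under the lock  */
                pthread_mutex_unlock(&lock);
                slow_zero(page);        /* blocking work without the lock */
                pthread_mutex_lock(&lock);
        }
        pthread_mutex_unlock(&lock);
}

int main(void)
{
        close_section(0, NPAGES);
        return 0;
}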

Diffed against 2.6.23-rc5

Signed-off-by: Cliff Wickman <[EMAIL PROTECTED]>
Acked-by: Jes Sorensen <[EMAIL PROTECTED]>
---

 drivers/char/mspec.c |   69 +++
 1 file changed, 48 insertions(+), 21 deletions(-)

Index: mspec_community/drivers/char/mspec.c
===
--- mspec_community.orig/drivers/char/mspec.c
+++ mspec_community/drivers/char/mspec.c
@@ -67,7 +67,7 @@
 /*
  * Page types allocated by the device.
  */
-enum {
+enum mspec_page_type {
MSPEC_FETCHOP = 1,
MSPEC_CACHED,
MSPEC_UNCACHED
@@ -83,15 +83,25 @@ static int is_sn2;
  * One of these structures is allocated when an mspec region is mmaped. The
  * structure is pointed to by the vma->vm_private_data field in the vma struct.
  * This structure is used to record the addresses of the mspec pages.
+ * This structure is shared by all vma's that are split off from the
+ * original vma when split_vma()'s are done.
+ *
+ * The refcnt is incremented atomically because mm->mmap_sem does not
+ * protect in fork case where multiple tasks share the vma_data.
  */
 struct vma_data {
atomic_t refcnt;/* Number of vmas sharing the data. */
-   spinlock_t lock;/* Serialize access to the vma. */
+   spinlock_t lock;/* Serialize access to this structure. */
int count;  /* Number of pages allocated. */
-   int type;   /* Type of pages allocated. */
+   enum mspec_page_type type; /* Type of pages allocated. */
+   int flags;  /* See VMD_xxx below. */
+   unsigned long vm_start; /* Original (unsplit) base. */
+   unsigned long vm_end;   /* Original (unsplit) end. */
unsigned long maddr[0]; /* Array of MSPEC addresses. */
 };
 
+#define VMD_VMALLOCED 0x1  /* vmalloc'd rather than kmalloc'd */
+
 /* used on shub2 to clear FOP cache in the HUB */
 static unsigned long scratch_page[MAX_NUMNODES];
 #define SH2_AMO_CACHE_ENTRIES  4
@@ -129,8 +139,8 @@ mspec_zero_block(unsigned long addr, int
  * mspec_open
  *
  * Called when a device mapping is created by a means other than mmap
- * (via fork, etc.).  Increments the reference count on the underlying
- * mspec data so it is not freed prematurely.
+ * (via fork, munmap, etc.).  Increments the reference count on the
+ * underlying mspec data so it is not freed prematurely.
  */
 static void
 mspec_open(struct vm_area_struct *vma)
@@ -151,34 +161,44 @@ static void
 mspec_close(struct vm_area_struct *vma)
 {
struct vma_data *vdata;
-   int i, pages, result, vdata_size;
+   int index, last_index, result;
+   unsigned long my_page;
 
vdata = vma->vm_private_data;
-   if (!atomic_dec_and_test(&vdata->refcnt))
-   return;
 
-   pages = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
-   vdata_size = sizeof(struct vma_data) + pages * sizeof(long);
-   for (i = 0; i < pages; i++) {
-   if (vdata->maddr[i] == 0)
+   BUG_ON(vma->vm_start < vdata->vm_start || vma->vm_end > vdata->vm_end);
+
+   spin_lock(&vdata->lock);
+   index = (vma->


[PATCH][REVISED] mspec: handle shrinking virtual memory areas

2007-09-12 Thread Cliff Wickman

Version 2: refcount maintained as atomic_t (as before the version 1 patch)
   (Diffed against 2.6.23-rc5, not "2.6.13-rc5" !)

The shrinking of a virtual memory area that is mmap(2)'d to a memory
special file (device drivers/char/mspec.c) can cause a panic.

If the mapped size of the vma (vm_area_struct) is very large, mspec allocates
a large vma_data structure with vmalloc(). But such a vma can be shrunk by
an munmap(2).  The current driver uses the current size of each vma to
deduce whether its vma_data structure was allocated by kmalloc() or vmalloc().
So if the vma was shrunk it appears to have been allocated by kmalloc(),
and mspec attempts to free it with kfree().  This results in a panic.

This patch avoids the panic (by preserving the type of the allocation) and
also makes mspec work correctly as the vma is split into pieces by the
munmap(2)'s.

All vma's derived from such a split vma share the same vma_data structure that
represents all the pages mapped into this set of vma's.  The mspec driver
must be made capable of using the right portion of the structure for each
member vma.  In other words, it must index into the array of page addresses
using the portion of the array that represents the current vma. This is
enabled by storing the vma group's vm_start in the vma_data structure.

The shared vma_data's are not protected by mm->mmap_sem in the fork() case
so the reference count is left as atomic_t.

Diffed against 2.6.23-rc5

Signed-off-by: Cliff Wickman <[EMAIL PROTECTED]>
Acked-by: Jes Sorensen <[EMAIL PROTECTED]>
-
---
 drivers/char/mspec.c |   61 ++-
 1 file changed, 41 insertions(+), 20 deletions(-)

Index: mspec_community/drivers/char/mspec.c
===
--- mspec_community.orig/drivers/char/mspec.c
+++ mspec_community/drivers/char/mspec.c
@@ -67,7 +67,7 @@
 /*
  * Page types allocated by the device.
  */
-enum {
+enum mspec_page_type {
MSPEC_FETCHOP = 1,
MSPEC_CACHED,
MSPEC_UNCACHED
@@ -83,15 +83,25 @@ static int is_sn2;
  * One of these structures is allocated when an mspec region is mmaped. The
  * structure is pointed to by the vma->vm_private_data field in the vma struct.
  * This structure is used to record the addresses of the mspec pages.
+ * This structure is shared by all vma's that are split off from the
+ * original vma when split_vma()'s are done.
+ *
+ * The refcnt is incremented atomically because mm->mmap_sem does not
+ * protect in fork case where multiple tasks share the vma_data.
  */
 struct vma_data {
atomic_t refcnt;/* Number of vmas sharing the data. */
spinlock_t lock;/* Serialize access to the vma. */
int count;  /* Number of pages allocated. */
-   int type;   /* Type of pages allocated. */
+   enum mspec_page_type type; /* Type of pages allocated. */
+   int flags;  /* See VMD_xxx below. */
+   unsigned long vm_start; /* Original (unsplit) base. */
+   unsigned long vm_end;   /* Original (unsplit) end. */
unsigned long maddr[0]; /* Array of MSPEC addresses. */
 };
 
+#define VMD_VMALLOCED 0x1  /* vmalloc'd rather than kmalloc'd */
+
 /* used on shub2 to clear FOP cache in the HUB */
 static unsigned long scratch_page[MAX_NUMNODES];
 #define SH2_AMO_CACHE_ENTRIES  4
@@ -129,8 +139,8 @@ mspec_zero_block(unsigned long addr, int
  * mspec_open
  *
  * Called when a device mapping is created by a means other than mmap
- * (via fork, etc.).  Increments the reference count on the underlying
- * mspec data so it is not freed prematurely.
+ * (via fork, munmap, etc.).  Increments the reference count on the
+ * underlying mspec data so it is not freed prematurely.
  */
 static void
 mspec_open(struct vm_area_struct *vma)
@@ -151,34 +161,38 @@ static void
 mspec_close(struct vm_area_struct *vma)
 {
struct vma_data *vdata;
-   int i, pages, result, vdata_size;
+   int index, last_index, result;
 
vdata = vma->vm_private_data;
-   if (!atomic_dec_and_test(&vdata->refcnt))
-   return;
 
-   pages = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
-   vdata_size = sizeof(struct vma_data) + pages * sizeof(long);
-   for (i = 0; i < pages; i++) {
-   if (vdata->maddr[i] == 0)
+   BUG_ON(vma->vm_start < vdata->vm_start || vma->vm_end > vdata->vm_end);
+
+   index = (vma->vm_start - vdata->vm_start) >> PAGE_SHIFT;
+   last_index = (vma->vm_end - vdata->vm_start) >> PAGE_SHIFT;
+   for (; index < last_index; index++) {
+   if (vdata->maddr[index] == 0)
continue;
/*
 * Clear the page before sticking it back
 * into the pool.
 */
-   result = mspec_zero_bloc

[PATCH][REVISED] mspec: handle shrinking virtual memory areas

2007-09-12 Thread Cliff Wickman

Version 2: refcount maintained as atomic_t (as before the version 1 patch)

The shrinking of a virtual memory area that is mmap(2)'d to a memory
special file (device drivers/char/mspec.c) can cause a panic.

If the mapped size of the vma (vm_area_struct) is very large, mspec allocates
a large vma_data structure with vmalloc(). But such a vma can be shrunk by
an munmap(2).  The current driver uses the current size of each vma to
deduce whether its vma_data structure was allocated by kmalloc() or vmalloc().
So if the vma was shrunk it appears to have been allocated by kmalloc(),
and mspec attempts to free it with kfree().  This results in a panic.

This patch avoids the panic (by preserving the type of the allocation) and
also makes mspec work correctly as the vma is split into pieces by the
munmap(2)'s.

All vma's derived from such a split vma share the same vma_data structure that
represents all the pages mapped into this set of vma's.  The mspec driver
must be made capable of using the right portion of the structure for each
member vma.  In other words, it must index into the array of page addresses
using the portion of the array that represents the current vma. This is
enabled by storing the vma group's vm_start in the vma_data structure.

The shared vma_data's are not protected by mm->mmap_sem in the fork() case
so the reference count is left as atomic_t.

Diffed against 2.6.13-rc5

Signed-off-by: Cliff Wickman <[EMAIL PROTECTED]>
Acked-by: Jes Sorensen <[EMAIL PROTECTED]>
-
---
 drivers/char/mspec.c |   61 ++-
 1 file changed, 41 insertions(+), 20 deletions(-)

Index: mspec_community/drivers/char/mspec.c
===
--- mspec_community.orig/drivers/char/mspec.c
+++ mspec_community/drivers/char/mspec.c
@@ -67,7 +67,7 @@
 /*
  * Page types allocated by the device.
  */
-enum {
+enum mspec_page_type {
MSPEC_FETCHOP = 1,
MSPEC_CACHED,
MSPEC_UNCACHED
@@ -83,15 +83,25 @@ static int is_sn2;
  * One of these structures is allocated when an mspec region is mmaped. The
  * structure is pointed to by the vma->vm_private_data field in the vma struct.
  * This structure is used to record the addresses of the mspec pages.
+ * This structure is shared by all vma's that are split off from the
+ * original vma when split_vma()'s are done.
+ *
+ * The refcnt is incremented atomically because mm->mmap_sem does not
+ * protect in fork case where multiple tasks share the vma_data.
  */
 struct vma_data {
atomic_t refcnt;/* Number of vmas sharing the data. */
spinlock_t lock;/* Serialize access to the vma. */
int count;  /* Number of pages allocated. */
-   int type;   /* Type of pages allocated. */
+   enum mspec_page_type type; /* Type of pages allocated. */
+   int flags;  /* See VMD_xxx below. */
+   unsigned long vm_start; /* Original (unsplit) base. */
+   unsigned long vm_end;   /* Original (unsplit) end. */
unsigned long maddr[0]; /* Array of MSPEC addresses. */
 };
 
+#define VMD_VMALLOCED 0x1  /* vmalloc'd rather than kmalloc'd */
+
 /* used on shub2 to clear FOP cache in the HUB */
 static unsigned long scratch_page[MAX_NUMNODES];
 #define SH2_AMO_CACHE_ENTRIES  4
@@ -129,8 +139,8 @@ mspec_zero_block(unsigned long addr, int
  * mspec_open
  *
  * Called when a device mapping is created by a means other than mmap
- * (via fork, etc.).  Increments the reference count on the underlying
- * mspec data so it is not freed prematurely.
+ * (via fork, munmap, etc.).  Increments the reference count on the
+ * underlying mspec data so it is not freed prematurely.
  */
 static void
 mspec_open(struct vm_area_struct *vma)
@@ -151,34 +161,38 @@ static void
 mspec_close(struct vm_area_struct *vma)
 {
struct vma_data *vdata;
-   int i, pages, result, vdata_size;
+   int index, last_index, result;
 
vdata = vma->vm_private_data;
-   if (!atomic_dec_and_test(&vdata->refcnt))
-   return;
 
-   pages = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
-   vdata_size = sizeof(struct vma_data) + pages * sizeof(long);
-   for (i = 0; i < pages; i++) {
-   if (vdata->maddr[i] == 0)
+   BUG_ON(vma->vm_start < vdata->vm_start || vma->vm_end > vdata->vm_end);
+
+   index = (vma->vm_start - vdata->vm_start) >> PAGE_SHIFT;
+   last_index = (vma->vm_end - vdata->vm_start) >> PAGE_SHIFT;
+   for (; index < last_index; index++) {
+   if (vdata->maddr[index] == 0)
continue;
/*
 * Clear the page before sticking it back
 * into the pool.
 */
-   result = mspec_zero_block(vdata->maddr[i], PAGE_SIZE);
+ 

[PATCH][REVISED] mspec: handle shrinking virtual memory areas

2007-09-12 Thread Cliff Wickman

Version 2: refcount maintained as atomic_t (as before the version 1 patch)
   (Diffed against 2.6.23-rc5, not 2.6.13-rc5 !)

The shrinking of a virtual memory area that is mmap(2)'d to a memory
special file (device drivers/char/mspec.c) can cause a panic.

If the mapped size of the vma (vm_area_struct) is very large, mspec allocates
a large vma_data structure with vmalloc(). But such a vma can be shrunk by
an munmap(2).  The current driver uses the current size of each vma to
deduce whether its vma_data structure was allocated by kmalloc() or vmalloc().
So if the vma was shrunk it appears to have been allocated by kmalloc(),
and mspec attempts to free it with kfree().  This results in a panic.

This patch avoids the panic (by preserving the type of the allocation) and
also makes mspec work correctly as the vma is split into pieces by the
munmap(2)'s.

All vma's derived from such a split vma share the same vma_data structure that
represents all the pages mapped into this set of vma's.  The mspec driver
must be made capable of using the right portion of the structure for each
member vma.  In other words, it must index into the array of page addresses
using the portion of the array that represents the current vma. This is
enabled by storing the vma group's vm_start in the vma_data structure.

The shared vma_data's are not protected by mm->mmap_sem in the fork() case
so the reference count is left as atomic_t.

Diffed against 2.6.23-rc5

Signed-off-by: Cliff Wickman <[EMAIL PROTECTED]>
Acked-by: Jes Sorensen <[EMAIL PROTECTED]>
-
---
 drivers/char/mspec.c |   61 ++-
 1 file changed, 41 insertions(+), 20 deletions(-)

Index: mspec_community/drivers/char/mspec.c
===
--- mspec_community.orig/drivers/char/mspec.c
+++ mspec_community/drivers/char/mspec.c
@@ -67,7 +67,7 @@
 /*
  * Page types allocated by the device.
  */
-enum {
+enum mspec_page_type {
MSPEC_FETCHOP = 1,
MSPEC_CACHED,
MSPEC_UNCACHED
@@ -83,15 +83,25 @@ static int is_sn2;
  * One of these structures is allocated when an mspec region is mmaped. The
 * structure is pointed to by the vma->vm_private_data field in the vma struct.
 * This structure is used to record the addresses of the mspec pages.
+ * This structure is shared by all vma's that are split off from the
+ * original vma when split_vma()'s are done.
+ *
+ * The refcnt is incremented atomically because mm->mmap_sem does not
+ * protect in fork case where multiple tasks share the vma_data.
  */
 struct vma_data {
atomic_t refcnt;/* Number of vmas sharing the data. */
spinlock_t lock;/* Serialize access to the vma. */
int count;  /* Number of pages allocated. */
-   int type;   /* Type of pages allocated. */
+   enum mspec_page_type type; /* Type of pages allocated. */
+   int flags;  /* See VMD_xxx below. */
+   unsigned long vm_start; /* Original (unsplit) base. */
+   unsigned long vm_end;   /* Original (unsplit) end. */
unsigned long maddr[0]; /* Array of MSPEC addresses. */
 };
 
+#define VMD_VMALLOCED 0x1  /* vmalloc'd rather than kmalloc'd */
+
 /* used on shub2 to clear FOP cache in the HUB */
 static unsigned long scratch_page[MAX_NUMNODES];
 #define SH2_AMO_CACHE_ENTRIES  4
@@ -129,8 +139,8 @@ mspec_zero_block(unsigned long addr, int
  * mspec_open
  *
  * Called when a device mapping is created by a means other than mmap
- * (via fork, etc.).  Increments the reference count on the underlying
- * mspec data so it is not freed prematurely.
+ * (via fork, munmap, etc.).  Increments the reference count on the
+ * underlying mspec data so it is not freed prematurely.
  */
 static void
 mspec_open(struct vm_area_struct *vma)
@@ -151,34 +161,38 @@ static void
 mspec_close(struct vm_area_struct *vma)
 {
struct vma_data *vdata;
-   int i, pages, result, vdata_size;
+   int index, last_index, result;
 
vdata = vma->vm_private_data;
-   if (!atomic_dec_and_test(&vdata->refcnt))
-   return;

-   pages = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
-   vdata_size = sizeof(struct vma_data) + pages * sizeof(long);
-   for (i = 0; i < pages; i++) {
-   if (vdata->maddr[i] == 0)
+   BUG_ON(vma->vm_start < vdata->vm_start || vma->vm_end > vdata->vm_end);
+
+   index = (vma->vm_start - vdata->vm_start) >> PAGE_SHIFT;
+   last_index = (vma->vm_end - vdata->vm_start) >> PAGE_SHIFT;
+   for (; index < last_index; index++) {
+   if (vdata->maddr[index] == 0)
continue;
/*
 * Clear the page before sticking it back
 * into the pool.
 */
-   result = mspec_zero_block(vdata->maddr[i], PAGE_SIZE);
+   result = mspec_zero_block(vdata->maddr[index], PAGE_SIZE);
if (!result

[PATCH 1] mspec: handle shrinking virtual memory areas

2007-09-07 Thread Cliff Wickman

The shrinking of a virtual memory area that is mmap(2)'d to a memory
special file (device drivers/char/mspec.c) can cause a panic.

If the mapped size of the vma (vm_area_struct) is very large, mspec allocates
a large vma_data structure with vmalloc(). But such a vma can be shrunk by
an munmap(2).  The current driver uses the current size of each vma to
deduce whether its vma_data structure was allocated by kmalloc() or vmalloc().
So if the vma was shrunk it appears to have been allocated by kmalloc(),
and mspec attempts to free it with kfree().  This results in a panic.

This patch avoids the panic (by preserving the type of the allocation) and
also makes mspec work correctly as the vma is split into pieces by the
munmap(2)'s.

All vma's derived from such a split vma share the same vma_data structure that
represents all the pages mapped into this set of vma's.  The mspec driver
must be made capable of using the right portion of the structure for each
member vma.  In other words, it must index into the array of page addresses
using the portion of the array that represents the current vma. This is
enabled by storing the vma group's vm_start in the vma_data structure.

The vma's are protected by mm->mmap_sem, so the reference count was changed
from an atomic_t to an int.
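
(Sketch, for illustration only: with mm->mmap_sem serializing every open and
close of the mapping, an ordinary increment is sufficient.)

        vdata->refcnt++;        /* in mspec_open(); was atomic_inc(&vdata->refcnt) */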

Diffed against 2.6.13-rc5

Signed-off-by: Cliff Wickman <[EMAIL PROTECTED]>
Acked-by: Jes Sorensen <[EMAIL PROTECTED]>
-
---
 drivers/char/mspec.c |   68 +--
 1 file changed, 45 insertions(+), 23 deletions(-)

Index: mspec_community/drivers/char/mspec.c
===
--- mspec_community.orig/drivers/char/mspec.c
+++ mspec_community/drivers/char/mspec.c
@@ -67,7 +67,7 @@
 /*
  * Page types allocated by the device.
  */
-enum {
+enum mspec_page_type {
MSPEC_FETCHOP = 1,
MSPEC_CACHED,
MSPEC_UNCACHED
@@ -83,15 +83,26 @@ static int is_sn2;
  * One of these structures is allocated when an mspec region is mmaped. The
  * structure is pointed to by the vma->vm_private_data field in the vma struct.
  * This structure is used to record the addresses of the mspec pages.
+ * This structure is shared by all vma's that are split off from the
+ * original vma when split_vma()'s are done.
+ *
+ * The refcnt is incremented non-atomically because all paths leading
+ * to mspec_open() and mspec_close() are single threaded by the exclusive
+ * locking of mm->mmap_sem.
  */
 struct vma_data {
-   atomic_t refcnt;/* Number of vmas sharing the data. */
+   int refcnt; /* Number of vmas sharing the data. */
spinlock_t lock;/* Serialize access to the vma. */
int count;  /* Number of pages allocated. */
-   int type;   /* Type of pages allocated. */
+   enum mspec_page_type type; /* Type of pages allocated. */
+   int flags;  /* See VMD_xxx below. */
+   unsigned long vm_start; /* Original (unsplit) base. */
+   unsigned long vm_end;   /* Original (unsplit) end. */
unsigned long maddr[0]; /* Array of MSPEC addresses. */
 };
 
+#define VMD_VMALLOCED 0x1  /* vmalloc'd rather than kmalloc'd */
+
 /* used on shub2 to clear FOP cache in the HUB */
 static unsigned long scratch_page[MAX_NUMNODES];
 #define SH2_AMO_CACHE_ENTRIES  4
@@ -129,8 +140,8 @@ mspec_zero_block(unsigned long addr, int
  * mspec_open
  *
  * Called when a device mapping is created by a means other than mmap
- * (via fork, etc.).  Increments the reference count on the underlying
- * mspec data so it is not freed prematurely.
+ * (via fork, munmap, etc.).  Increments the reference count on the
+ * underlying mspec data so it is not freed prematurely.
  */
 static void
 mspec_open(struct vm_area_struct *vma)
@@ -138,7 +149,7 @@ mspec_open(struct vm_area_struct *vma)
struct vma_data *vdata;
 
vdata = vma->vm_private_data;
-   atomic_inc(&vdata->refcnt);
+   vdata->refcnt++;
 }
 
 /*
@@ -151,34 +162,38 @@ static void
 mspec_close(struct vm_area_struct *vma)
 {
struct vma_data *vdata;
-   int i, pages, result, vdata_size;
+   int index, last_index, result;
 
vdata = vma->vm_private_data;
-   if (!atomic_dec_and_test(&vdata->refcnt))
-   return;
 
-   pages = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
-   vdata_size = sizeof(struct vma_data) + pages * sizeof(long);
-   for (i = 0; i < pages; i++) {
-   if (vdata->maddr[i] == 0)
+   BUG_ON(vma->vm_start < vdata->vm_start || vma->vm_end > vdata->vm_end);
+
+   index = (vma->vm_start - vdata->vm_start) >> PAGE_SHIFT;
+   last_index = (vma->vm_end - vdata->vm_start) >> PAGE_SHIFT;
+   for (; index < last_index; index++) {
+   if (vdata->maddr[index] == 0)
continue;
   

[PATCH 1/1] hotplug cpu: documentation addition to downing a cpu

2007-08-27 Thread Cliff Wickman

In answer to Andrew:
> How do we communicate this new design/feature to our users?
> Documentation/cpusets.txt, perhaps?  Documentation/cpu-hotplug.txt?
> git-log?  ;)

Patch "[PATCH 1/1] V4: hotplug cpu: migrate a task within its cpuset" may
warrant an addition to the documentation.  I would propose this note
in cpu-hotplug.txt.

Diffed against 2.6.23-rc3

Signed-off-by: Cliff Wickman <[EMAIL PROTECTED]>

---
 Documentation/cpu-hotplug.txt |4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

Index: linus.070821/Documentation/cpu-hotplug.txt
===
--- linus.070821.orig/Documentation/cpu-hotplug.txt
+++ linus.070821/Documentation/cpu-hotplug.txt
@@ -220,7 +220,9 @@ A: The following happen, listed in no pa
   CPU_DOWN_PREPARE or CPU_DOWN_PREPARE_FROZEN, depending on whether or not the
   CPU is being offlined while tasks are frozen due to a suspend operation in
   progress
-- All process is migrated away from this outgoing CPU to a new CPU
+- All processes are migrated away from this outgoing CPU to new CPUs.
+  The new CPU is chosen from each process' current cpuset, which may be
+  a subset of all online CPUs.
 - All interrupts targeted to this CPU is migrated to a new CPU
 - timers/bottom half/task lets are also migrated to a new CPU
 - Once all services are migrated, kernel calls an arch specific routine
-- 
Cliff Wickman
Silicon Graphics, Inc.
[EMAIL PROTECTED]
(651) 683-3824
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 1/1] V4: hotplug cpu: migrate a task within its cpuset

2007-08-27 Thread Cliff Wickman

Version 4 calls cpuset_cpus_allowed_locked() outside of the
task_rq_lock to avoid the lock conflict that Oleg noticed.
And it consolidates what would have been very similar "if (dest_cpu == NR_CPUS)"
paths.  Also Oleg's observation.


Version 3 added a missing task_rq_lock()/task_rq_unlock() pair. (Oleg found)

There was discussion about this patch among:
Andrew Morton, Oleg Nesterov, Gautham Shenoy, Rusty Russell
regarding other approaches:
  refusing to offline a cpu with tasks pinned to it, or
  providing an administrator the ability to assign such tasks to other cpus

There is indeed an "assumption" in my patch that the cpuset containing a
pinned task's cpu is a better choice than any online cpu. I think that is
a reasonable assumption given the typical construction of a cpuset and the
reason a task is running in a cpuset.

And there will be coming cases, at least on some architectures, where a
cpu is offlined as a kernel reaction to a hardware error.  In that case
would it not be preferable to re-pin such tasks and let them proceed?



When a cpu is disabled, move_task_off_dead_cpu() is called for tasks
that have been running on that cpu.

Currently, such a task is migrated:
 1) to any cpu on the same node as the disabled cpu, which is both online
and among that task's cpus_allowed
 2) to any cpu which is both online and among that task's cpus_allowed

It is typical of a multithreaded application running on a large NUMA system
to have its tasks confined to a cpuset so as to cluster them near the
memory that they share. Furthermore, it is typical to explicitly place such
a task on a specific cpu in that cpuset.  And in that case the task's
cpus_allowed includes only a single cpu.

This patch would insert a preference to migrate such a task to some cpu within
its cpuset (and set its cpus_allowed to its entire cpuset).

With this patch, migrate the task to:
 1) to any cpu on the same node as the disabled cpu, which is both online
and among that task's cpus_allowed
 2) to any online cpu within the task's cpuset
 3) to any cpu which is both online and among that task's cpus_allowed


In order to do this, move_task_off_dead_cpu() must make a call to
cpuset_cpus_allowed_locked(), a new subset of cpuset_cpus_allowed(), that
will not block. (name change - per Oleg's suggestion)

Calls are made to cpuset_lock() and cpuset_unlock() in migration_call()
to set the cpuset mutex during the whole migrate_live_tasks() and
migrate_dead_tasks() procedure.
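
(Sketch, for illustration only, condensing the description above; not the
literal patch code.)

        cpuset_lock();                  /* in migration_call(), CPU_DEAD case */
        migrate_live_tasks(cpu);        /* may reach move_task_off_dead_cpu(),
                                           which uses the non-blocking
                                           cpuset_cpus_allowed_locked() */
        migrate_dead_tasks(cpu);
        cpuset_unlock();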

This patch depends on 2 patches from Oleg Nesterov:
  [PATCH 1/2] do CPU_DEAD migrating under read_lock(tasklist) instead of 
write_lock_irq(tasklist)
  [PATCH 2/2] migration_call(CPU_DEAD): use spin_lock_irq() instead of 
task_rq_lock()

Diffed against 2.6.23-rc3

Signed-off-by: Cliff Wickman <[EMAIL PROTECTED]>

---
 include/linux/cpuset.h |5 +
 kernel/cpuset.c|   15 ++-
 kernel/sched.c |   13 -
 3 files changed, 31 insertions(+), 2 deletions(-)

Index: linus.070821/kernel/sched.c
===
--- linus.070821.orig/kernel/sched.c
+++ linus.070821/kernel/sched.c
@@ -61,6 +61,7 @@
 #include <linux/delayacct.h>
 #include <linux/reciprocal_div.h>
 #include <linux/unistd.h>
+#include <linux/cpuset.h>
 
 #include <asm/tlb.h>
 
@@ -5093,8 +5094,16 @@ restart:
 
/* No more Mr. Nice Guy. */
if (dest_cpu == NR_CPUS) {
+   cpumask_t cpus_allowed = cpuset_cpus_allowed_locked();
+   /*
+* Try to stay on the same cpuset, where the current cpuset
+* may be a subset of all cpus.
+* The cpuset_cpus_allowed_locked() variant of
+* cpuset_cpus_allowed() will not block
+* It must be called within calls to cpuset_lock/cpuset_unlock.
+*/
rq = task_rq_lock(p, &flags);
-   cpus_setall(p->cpus_allowed);
+   p->cpus_allowed = cpus_allowed;
dest_cpu = any_online_cpu(p->cpus_allowed);
task_rq_unlock(rq, &flags);
 
@@ -5412,6 +5421,7 @@ migration_call(struct notifier_block *nf
 
case CPU_DEAD:
case CPU_DEAD_FROZEN:
+   cpuset_lock(); /* around calls to cpuset_cpus_allowed_lock() */
migrate_live_tasks(cpu);
rq = cpu_rq(cpu);
kthread_stop(rq->migration_thread);
@@ -5425,6 +5435,7 @@ migration_call(struct notifier_block *nf
rq->idle->sched_class = &idle_sched_class;
migrate_dead_tasks(cpu);
spin_unlock_irq(&rq->lock);
+   cpuset_unlock();
migrate_nr_uninterruptible(rq);
BUG_ON(rq->nr_running != 0);
 
Index: linus.070821/include/linux/cpuset.h
===
--- linus.070821.orig/include/linux/cpuset.h
+++ linus.070821/include/linux/cpuset.h
@@ -22,6 +22,7 @@ extern void cpuset_init_smp(void);
 extern void cpuset_for

[PATCH 1/1] V3: hotplug cpu: migrate a task within its cpuset

2007-08-27 Thread Cliff Wickman

Version 3 adds a missing task_rq_lock()/task_rq_unlock() pair. (Oleg found)

There was discussion about this patch among:
Andrew Morton, Oleg Nesterov, Gautham Shenoy, Rusty Russell
regarding other approaches:
  refusing to offline a cpu with tasks pinned to it, or
  providing an administrator the ability to assign such tasks to other cpus

There is indeed an "assumption" in my patch that the cpuset containing a
pinned task's cpu is a better choice than any online cpu. I think that is
a reasonable assumption given the typical construction of a cpuset and the
reason a task is running in a cpuset.

And there will be coming cases, at least on some architectures, where a
cpu is offlined as a kernel reaction to a hardware error.  In that case
would it not be preferable to re-pin such tasks and let them proceed?



When a cpu is disabled, move_task_off_dead_cpu() is called for tasks
that have been running on that cpu.

Currently, such a task is migrated:
 1) to any cpu on the same node as the disabled cpu, which is both online
and among that task's cpus_allowed
 2) to any cpu which is both online and among that task's cpus_allowed

It is typical of a multithreaded application running on a large NUMA system
to have its tasks confined to a cpuset so as to cluster them near the
memory that they share. Furthermore, it is typical to explicitly place such
a task on a specific cpu in that cpuset.  And in that case the task's
cpus_allowed includes only a single cpu.

This patch would insert a preference to migrate such a task to some cpu within
its cpuset (and set its cpus_allowed to its entire cpuset).

With this patch, migrate the task to:
 1) to any cpu on the same node as the disabled cpu, which is both online
and among that task's cpus_allowed
 2) to any online cpu within the task's cpuset
 3) to any cpu which is both online and among that task's cpus_allowed


In order to do this, move_task_off_dead_cpu() must make a call to
cpuset_cpus_allowed_locked(), a new subset of cpuset_cpus_allowed(), that
will not block. (name change - per Oleg's suggestion)

Calls are made to cpuset_lock() and cpuset_unlock() in migration_call()
to set the cpuset mutex during the whole migrate_live_tasks() and
migrate_dead_tasks() procedure.

This patch depends on 2 patches from Oleg Nesterov:
  [PATCH 1/2] do CPU_DEAD migrating under read_lock(tasklist) instead of 
write_lock_irq(tasklist)
  [PATCH 2/2] migration_call(CPU_DEAD): use spin_lock_irq() instead of 
task_rq_lock()

Diffed against 2.6.23-rc3

Signed-off-by: Cliff Wickman <[EMAIL PROTECTED]>

---
 include/linux/cpuset.h |5 +
 kernel/cpuset.c|   15 ++-
 kernel/sched.c |   16 
 3 files changed, 35 insertions(+), 1 deletion(-)

Index: linus.070821/kernel/sched.c
===
--- linus.070821.orig/kernel/sched.c
+++ linus.070821/kernel/sched.c
@@ -61,6 +61,7 @@
 #include <linux/delayacct.h>
 #include <linux/reciprocal_div.h>
 #include <linux/unistd.h>
+#include <linux/cpuset.h>
 
 #include <asm/tlb.h>
 
@@ -5091,6 +5092,19 @@ restart:
if (dest_cpu == NR_CPUS)
dest_cpu = any_online_cpu(p->cpus_allowed);
 
+   /* try to stay on the same cpuset */
+   if (dest_cpu == NR_CPUS) {
+   rq = task_rq_lock(p, &flags);
+   /*
+* The cpuset_cpus_allowed_locked() variant of
+* cpuset_cpus_allowed() will not block
+* It must be called within calls to cpuset_lock/cpuset_unlock.
+*/
+   p->cpus_allowed = cpuset_cpus_allowed_locked(p);
+   dest_cpu = any_online_cpu(p->cpus_allowed);
+   task_rq_unlock(rq, &flags);
+   }
+
/* No more Mr. Nice Guy. */
if (dest_cpu == NR_CPUS) {
rq = task_rq_lock(p, &flags);
@@ -5412,6 +5426,7 @@ migration_call(struct notifier_block *nf
 
case CPU_DEAD:
case CPU_DEAD_FROZEN:
+   cpuset_lock(); /* around calls to cpuset_cpus_allowed_lock() */
migrate_live_tasks(cpu);
rq = cpu_rq(cpu);
kthread_stop(rq->migration_thread);
@@ -5425,6 +5440,7 @@ migration_call(struct notifier_block *nf
rq->idle->sched_class = &idle_sched_class;
migrate_dead_tasks(cpu);
spin_unlock_irq(&rq->lock);
+   cpuset_unlock();
migrate_nr_uninterruptible(rq);
BUG_ON(rq->nr_running != 0);
 
Index: linus.070821/include/linux/cpuset.h
===
--- linus.070821.orig/include/linux/cpuset.h
+++ linus.070821/include/linux/cpuset.h
@@ -22,6 +22,7 @@ extern void cpuset_init_smp(void);
 extern void cpuset_fork(struct task_struct *p);
 extern void cpuset_exit(struct task_struct *p);
 extern cpumask_t cpuset_cpus_allowed(struct task_struct *p);
+extern cpumask_t cpuset_cpus_allowed_locked(struct task_struct *p);
 extern nodemask_t cpuset_mems_

[PATCH 1/1] hotplug cpu: migrate a task within its cpuset

2007-08-24 Thread Cliff Wickman

When a cpu is disabled, move_task_off_dead_cpu() is called for tasks
that have been running on that cpu.

Currently, such a task is migrated:
 1) to any cpu on the same node as the disabled cpu, which is both online
and among that task's cpus_allowed
 2) to any cpu which is both online and among that task's cpus_allowed

It is typical of a multithreaded application running on a large NUMA system
to have its tasks confined to a cpuset so as to cluster them near the
memory that they share. Furthermore, it is typical to explicitly place such
a task on a specific cpu in that cpuset.  And in that case the task's
cpus_allowed includes only a single cpu.

This patch would insert a preference to migrate such a task to some cpu within
its cpuset (and set its cpus_allowed to its entire cpuset).

With this patch, migrate the task to:
 1) to any cpu on the same node as the disabled cpu, which is both online
and among that task's cpus_allowed
 2) to any online cpu within the task's cpuset
 3) to any cpu which is both online and among that task's cpus_allowed


In order to do this, move_task_off_dead_cpu() must make a call to
cpuset_cpus_allowed_lock(), a new variant of cpuset_cpus_allowed() that
will not block.
Calls are made to cpuset_lock() and cpuset_unlock() in migration_call()
to set the cpuset mutex during the whole migrate_live_tasks() and
migrate_dead_tasks() procedure.

This patch depends on 2 patches from Oleg Nesterov:
  [PATCH 1/2] do CPU_DEAD migrating under read_lock(tasklist) instead of 
write_lock_irq(tasklist)
  [PATCH 2/2] migration_call(CPU_DEAD): use spin_lock_irq() instead of 
task_rq_lock()

Diffed against 2.6.23-rc3

Signed-off-by: Cliff Wickman <[EMAIL PROTECTED]>

---
 include/linux/cpuset.h |5 +
 kernel/cpuset.c|   19 +++
 kernel/sched.c |   14 ++
 3 files changed, 38 insertions(+)

Index: linus.070821/kernel/sched.c
===
--- linus.070821.orig/kernel/sched.c
+++ linus.070821/kernel/sched.c
@@ -61,6 +61,7 @@
 #include <linux/delayacct.h>
 #include <linux/reciprocal_div.h>
 #include <linux/unistd.h>
+#include <linux/cpuset.h>
 
 #include <asm/tlb.h>
 
@@ -5091,6 +5092,17 @@ restart:
if (dest_cpu == NR_CPUS)
dest_cpu = any_online_cpu(p->cpus_allowed);
 
+   /* try to stay on the same cpuset */
+   if (dest_cpu == NR_CPUS) {
+   /*
+* The cpuset_cpus_allowed_lock() variant of
+* cpuset_cpus_allowed() will not block
+* It must be called within calls to cpuset_lock/cpuset_unlock.
+*/
+   p->cpus_allowed = cpuset_cpus_allowed_lock(p);
+   dest_cpu = any_online_cpu(p->cpus_allowed);
+   }
+
/* No more Mr. Nice Guy. */
if (dest_cpu == NR_CPUS) {
rq = task_rq_lock(p, &flags);
@@ -5412,6 +5424,7 @@ migration_call(struct notifier_block *nf
 
case CPU_DEAD:
case CPU_DEAD_FROZEN:
+   cpuset_lock(); /* around calls to cpuset_cpus_allowed_lock() */
migrate_live_tasks(cpu);
rq = cpu_rq(cpu);
kthread_stop(rq->migration_thread);
@@ -5425,6 +5438,7 @@ migration_call(struct notifier_block *nf
rq->idle->sched_class = &idle_sched_class;
migrate_dead_tasks(cpu);
spin_unlock_irq(&rq->lock);
+   cpuset_unlock();
migrate_nr_uninterruptible(rq);
BUG_ON(rq->nr_running != 0);
 
Index: linus.070821/include/linux/cpuset.h
===
--- linus.070821.orig/include/linux/cpuset.h
+++ linus.070821/include/linux/cpuset.h
@@ -22,6 +22,7 @@ extern void cpuset_init_smp(void);
 extern void cpuset_fork(struct task_struct *p);
 extern void cpuset_exit(struct task_struct *p);
 extern cpumask_t cpuset_cpus_allowed(struct task_struct *p);
+extern cpumask_t cpuset_cpus_allowed_lock(struct task_struct *p);
 extern nodemask_t cpuset_mems_allowed(struct task_struct *p);
 #define cpuset_current_mems_allowed (current->mems_allowed)
 void cpuset_init_current_mems_allowed(void);
@@ -87,6 +88,10 @@ static inline cpumask_t cpuset_cpus_allo
 {
return cpu_possible_map;
 }
+static inline cpumask_t cpuset_cpus_allowed_lock(struct task_struct *p)
+{
+   return cpu_possible_map;
+}
 
 static inline nodemask_t cpuset_mems_allowed(struct task_struct *p)
 {
Index: linus.070821/kernel/cpuset.c
===
--- linus.070821.orig/kernel/cpuset.c
+++ linus.070821/kernel/cpuset.c
@@ -2333,6 +2333,25 @@ cpumask_t cpuset_cpus_allowed(struct tas
return mask;
 }
 
+/**
+ * cpuset_cpus_allowed_lock - return cpus_allowed mask from a tasks cpuset.
+ * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed.
+ *
+ * Description: Same as cpuset_cpus_allowed, but called with callback_mutex
+ * already held.
+ **/
+
+cpumask_t cpuset_cpus_

ATA scsi driver misbehavior under kdump capture kernel

2007-07-27 Thread Cliff Wickman



I've run into a problem with the ATA SCSI disk driver when running in a
kdump dump-capture kernel.

I'm running on a 2-processor x86_64 box.  It has 2 scsi disks, /dev/sda and
/dev/sdb

My kernel is 2.6.22, and built to be a dump capturing kernel loaded by kexec.
When I boot this kernel by itself, it finds both sda and sdb.

But when it is loaded by kexec and booted on a panic it only finds sda.

Any ideas from those familiar with the ATA driver?


-Cliff Wickman
 SGI



I put some printk's into it and get this:

Standalone:

   [nv_adma_error_handler]
cpw: ata_host_register probe port 1 (error_handler:81348625)
cpw: ata_host_register call ata_port_probe
cpw: ata_host_register call ata_port_schedule
cpw: ata_host_register call ata_port_wait_eh
cpw: ata_port_wait_eh entered
cpw: ata_port_wait_eh, preparing to wait
ata2: SATA link up 1.5 Gbps (SStatus 113 SControl 300)
cpw: ata_dev_configure entered
cpw: ata_dev_configure testing class
cpw: ata_dev_configure class is ATA_DEV_ATA
ata2.00: ATA-6: ST3200822AS, 3.01, max UDMA/133
ata2.00: 390721968 sectors, multi 16: LBA48
cpw: ata_dev_configure exiting
cpw: ata_dev_configure entered
cpw: ata_dev_configure testing class
cpw: ata_dev_configure class is ATA_DEV_ATA
cpw: ata_dev_configure exiting
cpw: ata_dev_set_mode printing:
ata2.00: configured for UDMA/133
cpw: ata_port_wait_eh, finished wait
cpw: ata_port_wait_eh exiting
cpw: ata_host_register done with probe port 1


When loaded with kexec and booted on a panic:

cpw: ata_host_register probe port 1 (error_handler:81348625)
cpw: ata_host_register call ata_port_probe
cpw: ata_host_register call ata_port_schedule
cpw: ata_host_register call ata_port_wait_eh
cpw: ata_port_wait_eh entered
cpw: ata_port_wait_eh, preparing to wait
ata2: SATA link up 1.5 Gbps (SStatus 113 SControl 300)
cpw: ata_port_wait_eh, finished wait
cpw: ata_port_wait_eh exiting
cpw: ata_host_register done with probe port 1

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 1/1] hotplug cpu: migrate a task within its cpuset

2007-05-23 Thread Cliff Wickman

On Thu, May 24, 2007 at 01:29:02AM +0400, Oleg Nesterov wrote:
> Cliff Wickman wrote:
> >
> > In order to do this, move_task_off_dead_cpu() must make a call to
> > cpuset_cpus_allowed(), which may block.
> >
> > move_task_off_dead_cpu() has been within a critical region when called
> > from migrate_live_tasks().  So this patch also changes migrate_live_tasks()
> > to enable interrupts before calling move_task_off_dead_cpu().
> > Since the tasklist_lock is dropped, the list scan must be restarted from
> > the top.
> >
> > [... snip ...]
> >
> > - * NOTE: interrupts should be disabled by the caller
> > + * NOTE: interrupts are not disabled by the caller
> >   */
> >  static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
> >  {
> > @@ -5008,6 +5008,17 @@ restart:
> > if (dest_cpu == NR_CPUS)
> > dest_cpu = any_online_cpu(p->cpus_allowed);
> >
> > +   /* try to stay on the same cpuset */
> > +   if (dest_cpu == NR_CPUS) {
> > +   /*
> > +* Call to cpuset_cpus_allowed may sleep, so we depend
> > +* on move_task_off_dead_cpu() being called in a non-critical
> > +* region.
> > +*/
> > +   p->cpus_allowed = cpuset_cpus_allowed(p);
> > +   dest_cpu = any_online_cpu(p->cpus_allowed);
> > +   }
> 
> I know nothing about cpuset.c, a _very_ naive question.

Paul Jackson is the cpuset guru.
 
> Do we really need task_lock() (used by cpuset_cpus_allowed) here ?

According to Paul's comment in kernel/cpuset.c
 * It is ok to first take manage_sem, then nest callback_sem.  We also
 * require taking task_lock() when dereferencing a tasks cpuset pointer.
So I'm afraid it is not safe to call guarantee_online_cpus(tsk->cpuset, &mask);
without it.  Could the task not be exiting?
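
(Illustration only, not from any of the patches: the rule quoted above amounts
to dereferencing the cpuset pointer only while holding task_lock().)

        task_lock(tsk);
        guarantee_online_cpus(tsk->cpuset, &mask);
        task_unlock(tsk);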

> If not, probably we can make this simpler. CPU_DEAD takes cpuset_lock(),
> move_task_off_dead_cpu() uses guarantee_online_cpus() which doesn't sleep,
> so we don't need other changes.
> 
> Possible?
> 
> If not, this patch should also change migrate_dead(), it still calls
> move_task_off_dead_cpu() with irqs disabled, no?

Right, the lock is released but I indeed didn't reenable irqs.
How would you suggest doing that?  The irq state was saved in local
variable "flags" back in migration_call().

> 
> Oleg.

-- 
Cliff Wickman
Silicon Graphics, Inc.
[EMAIL PROTECTED]
(651) 683-3824
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/



[PATCH 1/1] hotplug cpu: migrate a task within its cpuset

2007-05-21 Thread Cliff Wickman


(this is a third submission -- corrects a locking/blocking issue pointed
 out by Nathan Lynch)

When a cpu is disabled, move_task_off_dead_cpu() is called for tasks
that have been running on that cpu.

Currently, such a task is migrated:
 1) to any cpu on the same node as the disabled cpu, which is both online
and among that task's cpus_allowed
 2) to any cpu which is both online and among that task's cpus_allowed

It is typical of a multithreaded application running on a large NUMA system
to have its tasks confined to a cpuset so as to cluster them near the
memory that they share. Furthermore, it is typical to explicitly place such
a task on a specific cpu in that cpuset.  And in that case the task's
cpus_allowed includes only a single cpu.
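
For illustration, one common way a task ends up with a single-cpu
cpus_allowed is an explicit affinity call from user space (ordinary
sched_setaffinity(2); the cpu number below is arbitrary):

	#define _GNU_SOURCE
	#include <sched.h>
	#include <stdio.h>

	int main(void)
	{
		cpu_set_t set;

		CPU_ZERO(&set);
		CPU_SET(17, &set);	/* pin the caller to cpu 17 only */
		if (sched_setaffinity(0, sizeof(set), &set) < 0)
			perror("sched_setaffinity");
		return 0;
	}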

This patch inserts a preference to migrate such a task to some cpu within
its cpuset (and set its cpus_allowed to its entire cpuset).

With this patch, migrate the task to:
 1) to any cpu on the same node as the disabled cpu, which is both online
and among that task's cpus_allowed
 2) to any online cpu within the task's cpuset
 3) to any cpu which is both online and among that task's cpus_allowed


In order to do this, move_task_off_dead_cpu() must make a call to
cpuset_cpus_allowed(), which may block.

move_task_off_dead_cpu() has been within a critical region when called
from migrate_live_tasks().  So this patch also changes migrate_live_tasks()
to enable interrupts before calling move_task_off_dead_cpu().
Since the tasklist_lock is dropped, the list scan must be restarted from
the top.
It locks the migrating task by bumping its usage count.
It disables interrupts in move_task_off_dead_cpu() before the
 call to __migrate_task().

This is the outline of the locking surrounding calls to
move_task_off_dead_cpu(), after applying this patch:

  migration_call()
  | case CPU_DEAD
  |   migrate_live_tasks(cpu)
  |   | recheck:
  |   | write_lock_irq(&tasklist_lock)
  |   | do_each_thread(t, p) {
  |   | if (task_cpu(p) == src_cpu)
  |   | get_task_struct(p)
  |   | write_unlock_irq(&tasklist_lock)
  |   | move_task_off_dead_cpu(src_cpu, p) <<<< noncritical
  |   | put_task_struct(p);
  |   | goto recheck
  |   | } while_each_thread(t, p)
  |   | write_unlock_irq(&tasklist_lock)
  |
  |   rq = task_rq_lock(rq->idle, &flags)
  |
  |   migrate_dead_tasks(cpu)
  |   | for (arr = 0; arr < 2; arr++) {
  |   |   for (i = 0; i < MAX_PRIO; i++) {
  |   | while (!list_empty(list))
  |   |   migrate_dead(dead_cpu
  |   | get_task_struct(p)
  |   | spin_unlock_irq(&rq->lock)
  |   | move_task_off_dead_cpu(dead_cpu, p) <<<< noncritical
  |   | spin_lock_irq(&rq->lock)
  |   | put_task_struct(p)
  |
  |   task_rq_unlock(rq, &flags)

[Side note: a task may be migrated off of its cpuset, but is still attached to
 that cpuset (by pointer and reference count).  The cpuset will not be
 released.  This patch does not change that.]

Diffed against 2.6.21

Signed-off-by: Cliff Wickman <[EMAIL PROTECTED]>

 kernel/sched.c |   31 ---
 1 file changed, 28 insertions(+), 3 deletions(-)

Index: linus.070504/kernel/sched.c
===
--- linus.070504.orig/kernel/sched.c
+++ linus.070504/kernel/sched.c
@@ -4989,7 +4989,7 @@ wait_to_die:
 #ifdef CONFIG_HOTPLUG_CPU
 /*
  * Figure out where task on dead CPU should go, use force if neccessary.
- * NOTE: interrupts should be disabled by the caller
+ * NOTE: interrupts are not disabled by the caller
  */
 static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
 {
@@ -5008,6 +5008,17 @@ restart:
if (dest_cpu == NR_CPUS)
dest_cpu = any_online_cpu(p->cpus_allowed);
 
+   /* try to stay on the same cpuset */
+   if (dest_cpu == NR_CPUS) {
+   /*
+* Call to cpuset_cpus_allowed may sleep, so we depend
+* on move_task_off_dead_cpu() being called in a non-critical
+* region.
+*/
+   p->cpus_allowed = cpuset_cpus_allowed(p);
+   dest_cpu = any_online_cpu(p->cpus_allowed);
+   }
+
/* No more Mr. Nice Guy. */
if (dest_cpu == NR_CPUS) {
rq = task_rq_lock(p, &flags);
@@ -5025,8 +5036,16 @@ restart:
   "longer affine to cpu%d\n",
   p->pid, p->comm, dead_cpu);
}
-   if (!__migrate_task(p, dead_cpu, dest_cpu))
+   /*
+* __migrate_task() requires interrupts to be disabled
+*/
+   local_irq_disable();
+   if (!__migrate_task(p, dead_cpu, dest_cpu)) {
+   local_irq_enable();
goto restart;
+   }
+   local_irq_enable();
+   return;
 }
 
 /*
@@ -5054,14 +5073,20 @@ static void migrate_live_tasks(int src_c
{
	struct task_struct *p, *t;

[PATCH 1/1] hotplug cpu: move tasks in empty cpusets to parent

2007-05-21 Thread Cliff Wickman


This patch corrects a situation that occurs when one disables all the cpus
in a cpuset.

At that point, any tasks in that cpuset were incorrectly moved.
(Disabling all cpus in a cpuset caused it to inherit the cpus
 of its parent, which may overlap its exclusive sibling.)

Such tasks should be moved to the parent of their current cpuset. Or if the
parent cpuset has no cpus, to its parent, etc.
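
The walk up the hierarchy is simple; a minimal sketch (the parent pointer
and cpus_allowed field follow 2.6.21 kernel/cpuset.c, the helper name is
illustrative, and this is not the patch itself):

	/* find the nearest ancestor that still has cpus, if any */
	static struct cpuset *nearest_ancestor_with_cpus(struct cpuset *cs)
	{
		struct cpuset *parent = cs->parent;

		while (parent && cpus_empty(parent->cpus_allowed))
			parent = parent->parent;
		return parent;
	}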

And the empty cpuset should be removed (if it is flagged notify_on_release).

This patch uses a workqueue thread to call the function that deletes the cpuset.
That way we avoid the complexity of the cpuset locks.
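
The deferral itself is the usual workqueue pattern; a hedged sketch (the
work function and work item names here are illustrative, and the real patch
decides which cpuset to remove before scheduling the work):

	static void cpuset_release_work_fn(struct work_struct *work)
	{
		/* runs later in process context with no cpuset locks held,
		 * so it is free to take the cpuset mutexes and remove the
		 * empty cpuset */
	}
	static DECLARE_WORK(cpuset_release_work, cpuset_release_work_fn);

	/* from the hotplug path, while the cpuset locks are still held: */
	schedule_work(&cpuset_release_work);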

(I've been working with Paul Jackson on this patch, and there is still a
 little functional subtlety to work out. Can be tweaked later.)

Diffed against 2.6.21

Signed-off-by: Cliff Wickman <[EMAIL PROTECTED]>

---
 kernel/cpuset.c |  221 
 1 file changed, 191 insertions(+), 30 deletions(-)

Index: linus.070504/kernel/cpuset.c
===
--- linus.070504.orig/kernel/cpuset.c
+++ linus.070504/kernel/cpuset.c
@@ -54,6 +54,7 @@
 #include <asm/atomic.h>
 #include <linux/mutex.h>
 #include <linux/kfifo.h>
+#include <linux/workqueue.h>
 
 #define CPUSET_SUPER_MAGIC 0x27e0eb
 
@@ -111,6 +112,7 @@ typedef enum {
CS_NOTIFY_ON_RELEASE,
CS_SPREAD_PAGE,
CS_SPREAD_SLAB,
+   CS_RELEASED_RESOURCE,
 } cpuset_flagbits_t;
 
 /* convenient tests for these bits */
@@ -149,6 +151,11 @@ static inline int is_spread_slab(const s
return test_bit(CS_SPREAD_SLAB, &cs->flags);
 }
 
+static inline int has_released_a_resource(const struct cpuset *cs)
+{
+   return test_bit(CS_RELEASED_RESOURCE, &cs->flags);
+}
+
 /*
  * Increment this integer everytime any cpuset changes its
  * mems_allowed value.  Users of cpusets can track this generation
@@ -543,7 +550,7 @@ static void cpuset_release_agent(const c
 static void check_for_release(struct cpuset *cs, char **ppathbuf)
 {
if (notify_on_release(cs) && atomic_read(&cs->count) == 0 &&
-   list_empty(&cs->children)) {
+   list_empty(&cs->children)) {
char *buf;
 
buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
@@ -835,7 +842,8 @@ update_cpu_domains_tree(struct cpuset *r
 
while (__kfifo_get(queue, (unsigned char *)&cp, sizeof(cp))) {
list_for_each_entry(child, &cp->children, sibling)
-   __kfifo_put(queue,(unsigned char *)&child,sizeof(child));
+   __kfifo_put(queue, (unsigned char *)&child,
+   sizeof(child));
update_cpu_domains(cp);
}
 
@@ -1101,7 +1109,7 @@ static int update_flag(cpuset_flagbits_t
mutex_unlock(&callback_mutex);
 
if (cpu_exclusive_changed)
-update_cpu_domains_tree(cs);
+   update_cpu_domains_tree(cs);
return 0;
 }
 
@@ -1279,6 +1287,7 @@ static int attach_task(struct cpuset *cs
 
from = oldcs->mems_allowed;
to = cs->mems_allowed;
+   set_bit(CS_RELEASED_RESOURCE, &oldcs->flags);
 
mutex_unlock(&callback_mutex);
 
@@ -1361,6 +1370,10 @@ static ssize_t cpuset_common_file_write(
retval = update_flag(CS_MEM_EXCLUSIVE, cs, buffer);
break;
case FILE_NOTIFY_ON_RELEASE:
+   /* Even if the cpuset had been emptied in the past
+  it must not be considered for release until it has
+  become non-empty again. */
+   clear_bit(CS_RELEASED_RESOURCE, &cs->flags);
retval = update_flag(CS_NOTIFY_ON_RELEASE, cs, buffer);
break;
case FILE_MEMORY_MIGRATE:
@@ -2014,6 +2027,7 @@ static int cpuset_rmdir(struct inode *un
cpuset_d_remove_dir(d);
dput(d);
number_of_cpusets--;
+   set_bit(CS_RELEASED_RESOURCE, >flags);
mutex_unlock(&callback_mutex);
if (list_empty(&parent->children))
check_for_release(parent, &pathbuf);
@@ -2081,50 +2095,188 @@ out:
 }
 
 /*
+ * Move every task that is a member of cpuset "from" to cpuset "to".
+ *
+ * Called with both manage_sem and callback_sem held
+ */
+static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to)
+{
+   int moved=0;
+   struct task_struct *g, *tsk;
+
+   read_lock(&tasklist_lock);
+   do_each_thread(g, tsk) {
+   if (tsk->cpuset == from) {
+   moved++;
+   task_lock(tsk);
+   tsk->cpuset = to;
+   task_unlock(tsk);
+   }
+   } while_each_thread(g, tsk);
+   read_unlock(&tasklist_lock);
+   atomic_add(moved, &to->count);
+   atomic_set(&from->count, 0);
+}
+
+/*
  * If common_cpu_mem_hotplug_unplug(), below, unplugs any CPUs
  * or memory nodes, we need to walk over the cpuset hierarchy,
  * removing that CPU or node from all cpuset

[PATCH 1/1] hotplug cpu: cpusets/sched_domain reconciliation

2007-05-21 Thread Cliff Wickman


This patch reconciles cpusets and sched_domains that get out of sync
due to hotplug disabling and re-enabling of cpu's.

Here is an example of how the problem can occur:

   system of cpu's 0-31
   create cpuset /x  16-31
   create cpuset /x/y  16-23
   all cpu_exclusive

   disable cpu 17
 x is now16,18-31
 x/y is now 16,18-23
   enable cpu 17
 x and x/y are unchanged

   to restore the cpusets:
 echo 16-31 > /dev/cpuset/x
 echo 16-23 > /dev/cpuset/x/y

   At the first echo, update_cpu_domains() is called for cpuset x/.

   The system is partitioned between:
its parent, the root cpuset of 0-31, minus its
children (x/ is 16-31): 0-15
and x/ (16-31), minus its children (x/y/ 16,18-23): 17,24-31

   The sched_domain's for parent 0-15 are updated.
   The sched_domain's for current 17,24-31 are updated.

   But 16 has been untouched.
   As a result, 17's SD points to sched_group_phys[17] which is the only
   sched_group_phys on 17's list.  It points to itself.
   But 16's SD points to sched_group_phys[16], which still points to
   sched_group_phys[17].
   When cpu 16 executes find_busiest_group() it will hang on the non-
   circular sched_group list.
   
This solution is to update the sched_domain's for the cpuset
whose cpu's were changed and, in addition, all its children.
Instead of calling update_cpu_domains(), call update_cpu_domains_tree(),
which calls update_cpu_domains() for every node from the one specified
down to all its children.
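
Condensed, the traversal in update_cpu_domains_tree() is just a kfifo-based
breadth-first walk; this restates the update_cpu_domains_tree() hunk shown
in the cpuset.c patch earlier in this series (queue, cp and child are the
names used there; the initial push of a "root" cpuset and the queue
setup/teardown are sketched here, not quoted):

	/* push the starting cpuset, then repeatedly pop one, queue its
	 * children and rebuild its sched domains */
	__kfifo_put(queue, (unsigned char *)&root, sizeof(root));
	while (__kfifo_get(queue, (unsigned char *)&cp, sizeof(cp))) {
		list_for_each_entry(child, &cp->children, sibling)
			__kfifo_put(queue, (unsigned char *)&child,
				    sizeof(child));
		update_cpu_domains(cp);
	}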

The extra sched_domain reconstruction is overhead, but only at the
frequency of administrative change to the cpuset.

There seems to be no administrative procedural work-around.  In the
example above one could not reverse the two echo's and set x/y before
x/.  It is not logical, so not allowed (Permission denied).

Thus the patch to cpuset.c makes the sched_domain's correct.

This patch also includes checks in find_busiest_group() and
find_idlest_group() that break from their loops on a sched_group that
points to itself.  This is needed because cpu's are going through
load balancing before all sched_domains have been reconstructed (see
the example above).

Thus the patch to sched.c prevents the hangs that would otherwise occur
until the sched_domain's are made correct.
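
Condensed from the sched.c hunks below, the guard just remembers the two
previously visited groups and stops if the list ever stops advancing (a
sketch of the loop shape only, not the full function bodies):

	prev = group;
	self = group;
	do {
		/* ... examine this sched_group as before ... */
		prev = self;
		self = group;
		group = group->next;
	} while (group != sd->groups && group != self && group != prev);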

Diffed against 2.6.21

Signed-off-by: Cliff Wickman <[EMAIL PROTECTED]>

---
 kernel/cpuset.c |   43 +++
 kernel/sched.c  |   18 ++
 2 files changed, 53 insertions(+), 8 deletions(-)

Index: linus.070504/kernel/sched.c
===
--- linus.070504.orig/kernel/sched.c
+++ linus.070504/kernel/sched.c
@@ -1211,11 +1211,14 @@ static inline unsigned long cpu_avg_load
 static struct sched_group *
 find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
 {
-   struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups;
+   struct sched_group *idlest = NULL, *this = sd->groups, *group = 
sd->groups;
+   struct sched_group *self, *prev;
unsigned long min_load = ULONG_MAX, this_load = 0;
int load_idx = sd->forkexec_idx;
int imbalance = 100 + (sd->imbalance_pct-100)/2;
 
+   prev = group;
+   self = group;
do {
unsigned long load, avg_load;
int local_group;
@@ -1251,8 +1254,10 @@ find_idlest_group(struct sched_domain *s
idlest = group;
}
 nextgroup:
+   prev = self;
+   self = group;
group = group->next;
-   } while (group != sd->groups);
+   } while (group != sd->groups && group != self && group != prev);
 
if (!idlest || 100*this_load < imbalance*min_load)
return NULL;
@@ -2276,7 +2281,8 @@ find_busiest_group(struct sched_domain *
   unsigned long *imbalance, enum idle_type idle, int *sd_idle,
   cpumask_t *cpus, int *balance)
 {
-   struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
+   struct sched_group *busiest = NULL, *this = sd->groups, *group = 
sd->groups;
+   struct sched_group *self, *prev;
unsigned long max_load, avg_load, total_load, this_load, total_pwr;
unsigned long max_pull;
unsigned long busiest_load_per_task, busiest_nr_running;
@@ -2299,6 +2305,8 @@ find_busiest_group(struct sched_domain *
else
load_idx = sd->idle_idx;
 
+   prev = group;
+   self = group;
do {
unsigned long load, group_capacity;
int local_group;
@@ -2427,8 +2435,10 @@ find_busiest_group(struct sched_domain *
}
 group_next:
 #endif
+   prev = self;
+   self = group;
group = group->next;
-   } while (group != sd->groups);
+   } while (group != sd->groups && group != self && group != prev);

hotplug cpu: PATCHes for 3 issues

2007-05-21 Thread Cliff Wickman


In the 2.6.21 kernel there are still 3 hotplug issues that are cpuset-
related, and that I find to still be problems.   And for which I offer
patches.

These have been submitted before, and subsequently cleaned up per
comments received.  I'm resubmitting all 3 for consideration and further
comment.

1)  [PATCH 1/1] hotplug cpu: cpusets/sched_domain reconciliation
2)  [PATCH 1/1] hotplug cpu: move tasks in empty cpusets to parent
3)  [PATCH 1/1] hotplug cpu: migrate a task within its cpuset

1) Reconciles cpusets and sched_domains that get out of sync
   due to hotplug disabling and re-enabling of cpu's.
   Tasks can get into infinite hangs without this fix.
 kernel/cpuset.c
 kernel/sched.c 

2) When a cpuset is emptied by disabling its cpus, move tasks to 
   a parent cpuset.
   This is a correction of the current procedure, which moves such
   tasks to the wrong cpuset.
 kernel/cpuset.c

3) Causes a task running on a disabled cpu to migrate to a cpu within
   its cpuset.
   This behavior is particularly important for a NUMA system on which
   tasks have been explicitly placed.
 kernel/sched.c
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: getting processor numbers

2007-04-04 Thread Cliff Wickman

On Wed, Apr 04, 2007 at 02:47:32AM -0400, Jakub Jelinek wrote:
> On Tue, Apr 03, 2007 at 07:04:58PM -0700, Paul Jackson wrote:
> > There are really at least four "number of CPUs" answers here, and we
> > should be aware of which we are providing.  There are, in order of
> > decreasing size:
> >  1) the size of the kernels cpumask_t (NR_CPUS),
> >  2) the maximum number of CPUs that might ever be hotplugged into a
> > booted system,
> >  3) the current number of CPUs online in that system, and
> >  4) the number of CPUs that the current task is allowed to use.
>
> sysconf(_SC_NPROCESSORS_CONF) should IMHO return (2) (this currently
> scans /proc/cpuinfo on alpha and sparc{,64} for ((ncpus|CPUs) probed|cpus 
> detected)
> and for the rest just returns sysconf(_SC_NPROCESSORS_ONLN)).
> Neither of the sysconf returned values should be affected by affinity.

I'm looking at an ia64 system, and when a cpu is hot-unplugged it is removed
from /proc/cpuinfo.  Wouldn't /sys/devices/system/cpu/ be a better
source for 2) ?
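
For what it's worth, a small user-space sketch of that approach (it counts
the cpuN entries under /sys/devices/system/cpu; the matching logic is
illustrative):

	#include <ctype.h>
	#include <dirent.h>
	#include <stdio.h>
	#include <string.h>

	int main(void)
	{
		DIR *dir = opendir("/sys/devices/system/cpu");
		struct dirent *de;
		int count = 0;

		if (!dir) {
			perror("opendir");
			return 1;
		}
		while ((de = readdir(dir)) != NULL)
			if (!strncmp(de->d_name, "cpu", 3) &&
			    isdigit((unsigned char)de->d_name[3]))
				count++;
		closedir(dir);
		printf("%d cpus configured\n", count);
		return 0;
	}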

-- 
Cliff Wickman
Silicon Graphics, Inc.
[EMAIL PROTECTED]
(651) 683-3824
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 1/1] cpusets/sched_domain reconciliation

2007-03-22 Thread Cliff Wickman
Hello Andrew,

On Thu, Mar 22, 2007 at 02:21:52PM -0700, Andrew Morton wrote:
> On Tue, 20 Mar 2007 13:14:35 -0600
> [EMAIL PROTECTED] (Cliff Wickman) wrote:
> 
> > This patch reconciles cpusets and sched_domains that get out of sync
> > due to disabling and re-enabling of cpu's.
> 
> I get three-out-of-three rejects in cpuset.c.  I could fix them, but I
> wouldn't be very confident that the result works at runtime.  2.6.20-rc6 was
> a long time ago - please, always raise patches against the latest mainline
> kernel (the daily git snapshot suffices).

Will do.
 
> Recursion is a big no-no in kernel.  Is there any way in which it can be
> avoided?  Is Dinakar's implementation also recursive?

I was a little reluctant to use recursion, but this use parallels another,
existing such use in cpuset.c.  The depth of the recursion is only the depth of
the cpuset hierarchy, which is set up by an administrator, and which is
logically limited by the number of cpus in the system.  e.g. it would be
hard to even deliberately organize 16 cpus into a hierarchy greater
than 16 layers deep, even if you wanted cpusets of single cpus.
We've not run into such a problem on systems of hundreds of cpus.
I would think it's safe.  What do you think?

Dinakar's solution is not written yet, as far as I know.  I'll copy him
for his status.

-- 
Cliff Wickman
Silicon Graphics, Inc.
[EMAIL PROTECTED]
(651) 683-3824
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 1/1] hotplug cpu: move tasks in empty cpusets to parent

2007-03-21 Thread Cliff Wickman


Submission #2: after style changes recommended by Randy Dunlap


This patch corrects a situation that occurs when one disables all the cpus
in a cpuset.

At that point, any tasks in that cpuset are incorrectly moved.
(Disabling all cpus in a cpuset caused it to inherit the cpus
 of its parent, which may overlap its exclusive sibling.)

Such tasks should be moved to the parent of their current cpuset. Or if the
parent cpuset has no cpus, to its parent, etc.

And the empty cpuset should be removed (if it is flagged notify_on_release).

This patch contains the added complexity of taking care not to do memory
allocation while holding the cpusets callback_mutex. And it makes use of the
"cpuset_release_agent" to do the cpuset removals.

It might be simpler to use a separate thread or workqueue. But such code
has not yet been written.

Diffed against 2.6.20-rc6

Signed-off-by: Cliff Wickman <[EMAIL PROTECTED]>

---
 kernel/cpuset.c |  200 ++--
 1 file changed, 180 insertions(+), 20 deletions(-)

Index: morton.070205/kernel/cpuset.c
===
--- morton.070205.orig/kernel/cpuset.c
+++ morton.070205/kernel/cpuset.c
@@ -112,6 +112,12 @@ typedef enum {
CS_SPREAD_SLAB,
 } cpuset_flagbits_t;
 
+struct path_list_element {
+   struct list_head list;
+   struct cpuset *cs;
+   char *path;
+};
+
 /* convenient tests for these bits */
 static inline int is_cpu_exclusive(const struct cpuset *cs)
 {
@@ -498,7 +504,7 @@ static int cpuset_path(const struct cpus
  * the time manage_mutex is held.
  */
 
-static void cpuset_release_agent(const char *pathbuf)
+static void cpuset_release_agent(const char *pathbuf, int releasepath)
 {
char *argv[3], *envp[3];
int i;
@@ -518,7 +524,8 @@ static void cpuset_release_agent(const c
envp[i] = NULL;
 
call_usermodehelper(argv[0], argv, envp, 0);
-   kfree(pathbuf);
+   if (releasepath)
+   kfree(pathbuf);
 }
 
 /*
@@ -1364,7 +1371,7 @@ static ssize_t cpuset_common_file_write(
retval = nbytes;
 out2:
mutex_unlock(&manage_mutex);
-   cpuset_release_agent(pathbuf);
+   cpuset_release_agent(pathbuf, 1);
 out1:
kfree(buffer);
return retval;
@@ -1990,7 +1997,7 @@ static int cpuset_rmdir(struct inode *un
if (list_empty(&parent->children))
check_for_release(parent, &pathbuf);
mutex_unlock(&manage_mutex);
-   cpuset_release_agent(pathbuf);
+   cpuset_release_agent(pathbuf, 1);
return 0;
 }
 
@@ -2053,13 +2060,33 @@ out:
 }
 
 /*
+ * move every task that is a member of cpuset "from" to cpuset "to"
+ */
+static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to)
+{
+   int moved=0;
+   struct task_struct *g, *tsk;
+
+   read_lock(&tasklist_lock);
+   do_each_thread(g, tsk) {
+   if (tsk->cpuset == from) {
+   moved++;
+   task_lock(tsk);
+   tsk->cpuset = to;
+   task_unlock(tsk);
+   }
+   } while_each_thread(g, tsk);
+   read_unlock(&tasklist_lock);
+   atomic_add(moved, &to->count);
+   atomic_set(&from->count, 0);
+}
+
+/*
  * If common_cpu_mem_hotplug_unplug(), below, unplugs any CPUs
  * or memory nodes, we need to walk over the cpuset hierarchy,
  * removing that CPU or node from all cpusets.  If this removes the
- * last CPU or node from a cpuset, then the guarantee_online_cpus()
- * or guarantee_online_mems() code will use that emptied cpusets
- * parent online CPUs or nodes.  Cpusets that were already empty of
- * CPUs or nodes are left empty.
+ * last CPU or node from a cpuset, then move the tasks in the empty cpuset
+ * to its next-highest non-empty parent.  And remove the empty cpuset.
  *
  * This routine is intentionally inefficient in a couple of regards.
  * It will check all cpusets in a subtree even if the top cpuset of
@@ -2070,20 +2097,104 @@ out:
  *
  * Call with both manage_mutex and callback_mutex held.
  *
+ * Takes tasklist_lock, and task_lock() for cpuset members that are
+ * moved to another cpuset.
+ *
+ * Recursive, on depth of cpuset subtree.
+ */
+
+static void remove_tasks_in_empty_cpusets_in_subtree(
+   const struct cpuset *cur,
+   struct list_head *empty_list,
+   struct path_list_element **ple_array,
+   int *ple_availp, int ple_count)
+{
+   int npids, ple_used=0;
+   struct cpuset *c, *parent;
+   struct path_list_element *ple;
+
+   /* If a cpuset's mems or cpus are empty, move its tasks to its parent */
+   list_for_each_entry(c, &cur->children, sibling) {
+   remove_tasks_in_empty_cpusets_in_subtree(c, empty_list,
+   ple_array, ple_availp, ple_count);
+   


[PATCH 1/1] hotplug cpu: move tasks in empty cpusets to parent

2007-03-20 Thread Cliff Wickman

From: Cliff Wickman <[EMAIL PROTECTED]>

This patch corrects a situation that occurs when one disables all the cpus
in a cpuset.

At that point, any tasks in that cpuset are incorrectly moved (as I recall,
they were move to a sibling cpuset).
Such tasks should be move the parent of their current cpuset. Or if the
parent cpuset has no cpus, to its parent, etc.

And the empty cpuset should be removed (if it is flagged notify_on_release).

This patch contains the added complexity of taking care not to do memory
allocation while holding the cpusets callback_mutex. And it makes use of the
"cpuset_release_agent" to do the cpuset removals.

It might be simpler to use a separate thread or workqueue. But such code
has not yet been written.

Diffed against 2.6.20-rc6

Signed-off-by: Cliff Wickman <[EMAIL PROTECTED]>

---
 kernel/cpuset.c |  200 ++--
 1 file changed, 180 insertions(+), 20 deletions(-)

Index: morton.070205/kernel/cpuset.c
===
--- morton.070205.orig/kernel/cpuset.c
+++ morton.070205/kernel/cpuset.c
@@ -112,6 +112,12 @@ typedef enum {
CS_SPREAD_SLAB,
 } cpuset_flagbits_t;
 
+struct path_list_element {
+   struct list_head list;
+   struct cpuset *cs;
+   char *path;
+};
+
 /* convenient tests for these bits */
 static inline int is_cpu_exclusive(const struct cpuset *cs)
 {
@@ -498,7 +504,7 @@ static int cpuset_path(const struct cpus
  * the time manage_mutex is held.
  */
 
-static void cpuset_release_agent(const char *pathbuf)
+static void cpuset_release_agent(const char *pathbuf, int releasepath)
 {
char *argv[3], *envp[3];
int i;
@@ -518,7 +524,8 @@ static void cpuset_release_agent(const c
envp[i] = NULL;
 
call_usermodehelper(argv[0], argv, envp, 0);
-   kfree(pathbuf);
+   if (releasepath)
+   kfree(pathbuf);
 }
 
 /*
@@ -1364,7 +1371,7 @@ static ssize_t cpuset_common_file_write(
retval = nbytes;
 out2:
mutex_unlock(&manage_mutex);
-   cpuset_release_agent(pathbuf);
+   cpuset_release_agent(pathbuf, 1);
 out1:
kfree(buffer);
return retval;
@@ -1990,7 +1997,7 @@ static int cpuset_rmdir(struct inode *un
if (list_empty(&parent->children))
check_for_release(parent, &pathbuf);
mutex_unlock(&manage_mutex);
-   cpuset_release_agent(pathbuf);
+   cpuset_release_agent(pathbuf, 1);
return 0;
 }
 
@@ -2053,13 +2060,33 @@ out:
 }
 
 /*
+ * move every task that is a member of cpuset "from" to cpuset "to"
+ */
+static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to)
+{
+   int moved=0;
+   struct task_struct *g, *tsk;
+
+   read_lock(&tasklist_lock);
+   do_each_thread(g, tsk) {
+   if (tsk->cpuset == from) {
+   moved++;
+   task_lock(tsk);
+   tsk->cpuset = to;
+   task_unlock(tsk);
+   }
+   } while_each_thread(g, tsk);
+   read_unlock(&tasklist_lock);
+   atomic_add(moved, &to->count);
+   atomic_set(&from->count, 0);
+}
+
+/*
  * If common_cpu_mem_hotplug_unplug(), below, unplugs any CPUs
  * or memory nodes, we need to walk over the cpuset hierarchy,
  * removing that CPU or node from all cpusets.  If this removes the
- * last CPU or node from a cpuset, then the guarantee_online_cpus()
- * or guarantee_online_mems() code will use that emptied cpusets
- * parent online CPUs or nodes.  Cpusets that were already empty of
- * CPUs or nodes are left empty.
+ * last CPU or node from a cpuset, then move the tasks in the empty cpuset
+ * to its next-highest non-empty parent.  And remove the empty cpuset.
  *
  * This routine is intentionally inefficient in a couple of regards.
  * It will check all cpusets in a subtree even if the top cpuset of
@@ -2070,20 +2097,100 @@ out:
  *
  * Call with both manage_mutex and callback_mutex held.
  *
+ * Takes tasklist_lock, and task_lock() for cpuset members that are
+ * moved to another cpuset.
+ *
  * Recursive, on depth of cpuset subtree.
  */
 
-static void guarantee_online_cpus_mems_in_subtree(const struct cpuset *cur)
+static void remove_tasks_in_empty_cpusets_in_subtree(const struct cpuset *cur, 
struct list_head *empty_list, struct path_list_element **ple_array, int 
*ple_availp, int ple_count)
+{
+   int npids, ple_used=0;
+   struct cpuset *c, *parent;
+   struct path_list_element *ple;
+
+   /* If a cpuset's mems or cpus are empty, move its tasks to its parent */
+   list_for_each_entry(c, &cur->children, sibling) {
+   remove_tasks_in_empty_cpusets_in_subtree(c, empty_list,
+   ple_array, ple_availp, ple_count);
+   /*
+* If it has no online cpus or no online mems, move its tasks
+* to its next-highest non-empty paren

[PATCH 1/1] cpusets/sched_domain reconciliation

2007-03-20 Thread Cliff Wickman

From: Cliff Wickman <[EMAIL PROTECTED]>

This patch reconciles cpusets and sched_domains that get out of sync
due to disabling and re-enabling of cpu's.

Dinakar Guniguntala (IBM) is working on his own version of fixing this.
But as of this date that fix doesn't seem to be ready.

Here is an example of how the problem can occur:

   system of cpu's 0-31
   create cpuset /x  16-31
   create cpuset /x/y  16-23
   all cpu_exclusive

   disable cpu 17
 x is now16,18-31
 x/y is now 16,18-23
   enable cpu 17
 x and x/y are unchanged

   to restore the cpusets:
 echo 16-31 > /dev/cpuset/x
 echo 16-23 > /dev/cpuset/x/y

   At the first echo, update_cpu_domains() is called for cpuset x/.

   The system is partitioned between:
its parent, the root cpuset of 0-31, minus its
children (x/ is 16-31): 0-15
and x/ (16-31), minus its children (x/y/ 16,18-23): 17,24-31

   The sched_domain's for parent 0-15 are updated.
   The sched_domain's for current 17,24-31 are updated.

   But 16 has been untouched.
   As a result, 17's SD points to sched_group_phys[17] which is the only
   sched_group_phys on 17's list.  It points to itself.
   But 16's SD points to sched_group_phys[16], which still points to
   sched_group_phys[17].
   When cpu 16 executes find_busiest_group() it will hang on the non-
   circular sched_group list.
   
This solution is to update the sched_domain's for the cpuset
whose cpu's were changed and, in addition, all its children.
The update_cpu_domains() will end with a (recursive) call to itself
for each child.
The extra sched_domain reconstruction is overhead, but only at the
frequency of administrative change to the cpusets.
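
The recursion is shallow and regular; a minimal sketch of its shape (the
wrapper name is illustrative, update_cpu_domains() is the existing cpuset.c
function, and the patch itself folds the recursion into update_cpu_domains()
rather than using a separate wrapper):

	/* rebuild this cpuset's domains, then recurse into each child;
	 * depth is bounded by the depth of the cpuset hierarchy */
	static void update_cpu_domains_down_sketch(struct cpuset *cs)
	{
		struct cpuset *child;

		update_cpu_domains(cs);
		list_for_each_entry(child, &cs->children, sibling)
			update_cpu_domains_down_sketch(child);
	}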

This patch also includes checks in find_busiest_group() and
find_idlest_group() that break from their loops on a sched_group that
points to itself.  This is needed because other cpu's are going through
load balancing while the sched_domains are being reconstructed.

There seems to be no administrative procedural work-around.  In the
example above one could not reverse the two echo's and set x/y before
x/.  It is not logical, so not allowed (Permission denied).

Diffed against 2.6.20-rc6

Signed-off-by: Cliff Wickman <[EMAIL PROTECTED]>

---
 kernel/cpuset.c |   11 +--
 kernel/sched.c  |   19 +++
 2 files changed, 24 insertions(+), 6 deletions(-)

Index: morton.070205/kernel/sched.c
===
--- morton.070205.orig/kernel/sched.c
+++ morton.070205/kernel/sched.c
@@ -1201,11 +1201,14 @@ static inline unsigned long cpu_avg_load
 static struct sched_group *
 find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
 {
-   struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups;
+   struct sched_group *idlest = NULL, *this = sd->groups, *group = 
sd->groups;
+   struct sched_group *self, *prev;
unsigned long min_load = ULONG_MAX, this_load = 0;
int load_idx = sd->forkexec_idx;
int imbalance = 100 + (sd->imbalance_pct-100)/2;
 
+   prev = group;
+   self = group;
do {
unsigned long load, avg_load;
int local_group;
@@ -1241,8 +1244,10 @@ find_idlest_group(struct sched_domain *s
idlest = group;
}
 nextgroup:
+   prev = self;
+   self = group;
group = group->next;
-   } while (group != sd->groups);
+   } while (group != sd->groups && group != self && group != prev);
 
if (!idlest || 100*this_load < imbalance*min_load)
return NULL;
@@ -2259,7 +2264,8 @@ find_busiest_group(struct sched_domain *
   unsigned long *imbalance, enum idle_type idle, int *sd_idle,
   cpumask_t *cpus, int *balance)
 {
-   struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
+   struct sched_group *busiest = NULL, *this = sd->groups, *group = 
sd->groups;
+   struct sched_group *self, *prev;
unsigned long max_load, avg_load, total_load, this_load, total_pwr;
unsigned long max_pull;
unsigned long busiest_load_per_task, busiest_nr_running;
@@ -2282,6 +2288,8 @@ find_busiest_group(struct sched_domain *
else
load_idx = sd->idle_idx;
 
+   prev = group;
+   self = group;
do {
unsigned long load, group_capacity;
int local_group;
@@ -2410,8 +2418,11 @@ find_busiest_group(struct sched_domain *
}
 group_next:
 #endif
+   prev = self;
+   self = group;
group = group->next;
-   } while (group != sd->groups);
+   /* careful, a printk here can cause a spinlock hang */
+   } while (group != sd->groups && group != self && group != prev);
