[tip:x86/uv] x86/uv: Update the UV3 TLB shootdown logic
Commit-ID: a26fd71953711acb4884df84e393d52de57e4f17 Gitweb: http://git.kernel.org/tip/a26fd71953711acb4884df84e393d52de57e4f17 Author: Cliff Wickman AuthorDate: Wed, 14 May 2014 16:15:47 -0500 Committer: Ingo Molnar CommitDate: Thu, 5 Jun 2014 14:17:20 +0200 x86/uv: Update the UV3 TLB shootdown logic Update of TLB shootdown code for UV3. Kernel function native_flush_tlb_others() calls uv_flush_tlb_others() on UV to invalidate tlb page definitions on remote cpus. The UV systems have a hardware 'broadcast assist unit' which can be used to broadcast shootdown messages to all cpu's of selected nodes. The behavior of the BAU has changed only slightly with UV3: - UV3 is recognized with is_uv3_hub(). - UV2 functions and structures (uv2_xxx) are in most cases simply renamed to uv2_3_xxx. - Some UV2 error workarounds are not needed for UV3. (see uv_bau_message_interrupt and enable_timeouts) Signed-off-by: Cliff Wickman Link: http://lkml.kernel.org/r/e1wkgwh-0001yj...@eag09.americas.sgi.com [ Removed a few linebreak uglies. ] Signed-off-by: Ingo Molnar --- arch/x86/include/asm/uv/uv_bau.h | 19 ++- arch/x86/platform/uv/tlb_uv.c| 69 ++-- 2 files changed, 49 insertions(+), 39 deletions(-) diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h index 0b46ef2..2d60a78 100644 --- a/arch/x86/include/asm/uv/uv_bau.h +++ b/arch/x86/include/asm/uv/uv_bau.h @@ -73,6 +73,7 @@ #define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD(is_uv1_hub() ? 
\ UV1_INTD_SOFT_ACK_TIMEOUT_PERIOD : \ UV2_INTD_SOFT_ACK_TIMEOUT_PERIOD) +/* assuming UV3 is the same */ #define BAU_MISC_CONTROL_MULT_MASK 3 @@ -93,6 +94,8 @@ #define SOFTACK_MSHIFT UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT #define SOFTACK_PSHIFT UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT #define SOFTACK_TIMEOUT_PERIOD UV_INTD_SOFT_ACK_TIMEOUT_PERIOD +#define PREFETCH_HINT_SHFT UV3H_LB_BAU_MISC_CONTROL_ENABLE_INTD_PREFETCH_HINT_SHFT +#define SB_STATUS_SHFT UV3H_LB_BAU_MISC_CONTROL_ENABLE_EXTENDED_SB_STATUS_SHFT #define write_gmmr uv_write_global_mmr64 #define write_lmmr uv_write_local_mmr #define read_lmmr uv_read_local_mmr @@ -322,8 +325,9 @@ struct uv1_bau_msg_header { /* * UV2 Message header: 16 bytes (128 bits) (bytes 0x30-0x3f of descriptor) * see figure 9-2 of harp_sys.pdf + * assuming UV3 is the same */ -struct uv2_bau_msg_header { +struct uv2_3_bau_msg_header { unsigned intbase_dest_nasid:15; /* nasid of the first bit */ /* bits 14:0 */ /* in uvhub map */ unsigned intdest_subnodeid:5; /* must be 0x10, for the LB */ @@ -395,7 +399,7 @@ struct bau_desc { */ union bau_msg_header { struct uv1_bau_msg_header uv1_hdr; - struct uv2_bau_msg_header uv2_hdr; + struct uv2_3_bau_msg_header uv2_3_hdr; } header; struct bau_msg_payload payload; @@ -631,11 +635,6 @@ struct bau_control { struct hub_and_pnode*thp; }; -static inline unsigned long read_mmr_uv2_status(void) -{ - return read_lmmr(UV2H_LB_BAU_SB_ACTIVATION_STATUS_2); -} - static inline void write_mmr_data_broadcast(int pnode, unsigned long mmr_image) { write_gmmr(pnode, UVH_BAU_DATA_BROADCAST, mmr_image); @@ -760,7 +759,11 @@ static inline int atomic_read_short(const struct atomic_short *v) */ static inline int atom_asr(short i, struct atomic_short *v) { - return i + xadd(>counter, i); + short __i = i; + asm volatile(LOCK_PREFIX "xaddw %0, %1" + : "+r" (i), "+m" (v->counter) + : : "memory"); + return i + __i; } /* diff --git a/arch/x86/platform/uv/tlb_uv.c 
b/arch/x86/platform/uv/tlb_uv.c index dfe605a..ed161c6 100644 --- a/arch/x86/platform/uv/tlb_uv.c +++ b/arch/x86/platform/uv/tlb_uv.c @@ -1,7 +1,7 @@ /* * SGI UltraViolet TLB flush routines. * - * (c) 2008-2012 Cliff Wickman , SGI. + * (c) 2008-2014 Cliff Wickman , SGI. * * This code is released under the GNU General Public License version 2 or * later. @@ -451,7 +451,7 @@ static inline unsigned long long cycles_2_ns(unsigned long long cyc) /* * The reverse of the above; converts a duration in ns to a duration in cycles. - */ + */ static inline unsigned long long ns_2_cycles(unsigned long long ns) { struct cyc2ns_data *data = cyc2ns_read_begin(); @@ -563,7 +563,7 @@ static int uv1_wait_completion(struct bau_desc *bau_desc, * UV2 could have an extra bit of status in the ACTIVATION_STATUS_2 register. * But not currently used. */ -static unsigned long uv2_read_status(unsigned long offset, int rshft, int desc) +static unsigned long uv2_3_read_status(unsigned long off
[tip:x86/uv] x86/uv: Update the UV3 TLB shootdown logic
Commit-ID: a26fd71953711acb4884df84e393d52de57e4f17 Gitweb: http://git.kernel.org/tip/a26fd71953711acb4884df84e393d52de57e4f17 Author: Cliff Wickman c...@sgi.com AuthorDate: Wed, 14 May 2014 16:15:47 -0500 Committer: Ingo Molnar mi...@kernel.org CommitDate: Thu, 5 Jun 2014 14:17:20 +0200 x86/uv: Update the UV3 TLB shootdown logic Update of TLB shootdown code for UV3. Kernel function native_flush_tlb_others() calls uv_flush_tlb_others() on UV to invalidate tlb page definitions on remote cpus. The UV systems have a hardware 'broadcast assist unit' which can be used to broadcast shootdown messages to all cpu's of selected nodes. The behavior of the BAU has changed only slightly with UV3: - UV3 is recognized with is_uv3_hub(). - UV2 functions and structures (uv2_xxx) are in most cases simply renamed to uv2_3_xxx. - Some UV2 error workarounds are not needed for UV3. (see uv_bau_message_interrupt and enable_timeouts) Signed-off-by: Cliff Wickman c...@sgi.com Link: http://lkml.kernel.org/r/e1wkgwh-0001yj...@eag09.americas.sgi.com [ Removed a few linebreak uglies. ] Signed-off-by: Ingo Molnar mi...@kernel.org --- arch/x86/include/asm/uv/uv_bau.h | 19 ++- arch/x86/platform/uv/tlb_uv.c| 69 ++-- 2 files changed, 49 insertions(+), 39 deletions(-) diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h index 0b46ef2..2d60a78 100644 --- a/arch/x86/include/asm/uv/uv_bau.h +++ b/arch/x86/include/asm/uv/uv_bau.h @@ -73,6 +73,7 @@ #define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD(is_uv1_hub() ? 
\ UV1_INTD_SOFT_ACK_TIMEOUT_PERIOD : \ UV2_INTD_SOFT_ACK_TIMEOUT_PERIOD) +/* assuming UV3 is the same */ #define BAU_MISC_CONTROL_MULT_MASK 3 @@ -93,6 +94,8 @@ #define SOFTACK_MSHIFT UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT #define SOFTACK_PSHIFT UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT #define SOFTACK_TIMEOUT_PERIOD UV_INTD_SOFT_ACK_TIMEOUT_PERIOD +#define PREFETCH_HINT_SHFT UV3H_LB_BAU_MISC_CONTROL_ENABLE_INTD_PREFETCH_HINT_SHFT +#define SB_STATUS_SHFT UV3H_LB_BAU_MISC_CONTROL_ENABLE_EXTENDED_SB_STATUS_SHFT #define write_gmmr uv_write_global_mmr64 #define write_lmmr uv_write_local_mmr #define read_lmmr uv_read_local_mmr @@ -322,8 +325,9 @@ struct uv1_bau_msg_header { /* * UV2 Message header: 16 bytes (128 bits) (bytes 0x30-0x3f of descriptor) * see figure 9-2 of harp_sys.pdf + * assuming UV3 is the same */ -struct uv2_bau_msg_header { +struct uv2_3_bau_msg_header { unsigned intbase_dest_nasid:15; /* nasid of the first bit */ /* bits 14:0 */ /* in uvhub map */ unsigned intdest_subnodeid:5; /* must be 0x10, for the LB */ @@ -395,7 +399,7 @@ struct bau_desc { */ union bau_msg_header { struct uv1_bau_msg_header uv1_hdr; - struct uv2_bau_msg_header uv2_hdr; + struct uv2_3_bau_msg_header uv2_3_hdr; } header; struct bau_msg_payload payload; @@ -631,11 +635,6 @@ struct bau_control { struct hub_and_pnode*thp; }; -static inline unsigned long read_mmr_uv2_status(void) -{ - return read_lmmr(UV2H_LB_BAU_SB_ACTIVATION_STATUS_2); -} - static inline void write_mmr_data_broadcast(int pnode, unsigned long mmr_image) { write_gmmr(pnode, UVH_BAU_DATA_BROADCAST, mmr_image); @@ -760,7 +759,11 @@ static inline int atomic_read_short(const struct atomic_short *v) */ static inline int atom_asr(short i, struct atomic_short *v) { - return i + xadd(v-counter, i); + short __i = i; + asm volatile(LOCK_PREFIX xaddw %0, %1 + : +r (i), +m (v-counter) + : : memory); + return i + __i; } /* diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c 
index dfe605a..ed161c6 100644 --- a/arch/x86/platform/uv/tlb_uv.c +++ b/arch/x86/platform/uv/tlb_uv.c @@ -1,7 +1,7 @@ /* * SGI UltraViolet TLB flush routines. * - * (c) 2008-2012 Cliff Wickman c...@sgi.com, SGI. + * (c) 2008-2014 Cliff Wickman c...@sgi.com, SGI. * * This code is released under the GNU General Public License version 2 or * later. @@ -451,7 +451,7 @@ static inline unsigned long long cycles_2_ns(unsigned long long cyc) /* * The reverse of the above; converts a duration in ns to a duration in cycles. - */ + */ static inline unsigned long long ns_2_cycles(unsigned long long ns) { struct cyc2ns_data *data = cyc2ns_read_begin(); @@ -563,7 +563,7 @@ static int uv1_wait_completion(struct bau_desc *bau_desc, * UV2 could have an extra bit of status in the ACTIVATION_STATUS_2 register. * But not currently used. */ -static unsigned long uv2_read_status(unsigned long offset, int rshft, int desc) +static unsigned long
Re: [PATCH v5 1/5] vmcore: Introduce ELF header in new memory feature
notes_section = kmalloc(max_sz, GFP_KERNEL); > > if (!notes_section) > > return -ENOMEM; > > - rc = read_from_oldmem(notes_section, max_sz, , 0); > > + rc = elfcorehdr_read_notes(notes_section, max_sz, ); > > if (rc < 0) { > > kfree(notes_section); > > return rc; > > @@ -409,7 +439,8 @@ static int __init copy_notes_elf64(const > > if (phdr_ptr->p_type != PT_NOTE) > > continue; > > offset = phdr_ptr->p_offset; > > - rc = read_from_oldmem(notes_buf, phdr_ptr->p_memsz, , 0); > > + rc = elfcorehdr_read_notes(notes_buf, phdr_ptr->p_memsz, > > + ); > > if (rc < 0) > > return rc; > > notes_buf += phdr_ptr->p_memsz; > > @@ -510,7 +541,7 @@ static int __init update_note_header_siz > > notes_section = kmalloc(max_sz, GFP_KERNEL); > > if (!notes_section) > > return -ENOMEM; > > - rc = read_from_oldmem(notes_section, max_sz, , 0); > > + rc = elfcorehdr_read_notes(notes_section, max_sz, ); > > if (rc < 0) { > > kfree(notes_section); > > return rc; > > @@ -597,7 +628,8 @@ static int __init copy_notes_elf32(const > > if (phdr_ptr->p_type != PT_NOTE) > > continue; > > offset = phdr_ptr->p_offset; > > - rc = read_from_oldmem(notes_buf, phdr_ptr->p_memsz, , 0); > > + rc = elfcorehdr_read_notes(notes_buf, phdr_ptr->p_memsz, > > + ); > > if (rc < 0) > > return rc; > > notes_buf += phdr_ptr->p_memsz; > > @@ -793,7 +825,7 @@ static int __init parse_crash_elf64_head > > addr = elfcorehdr_addr; > > > > /* Read Elf header */ > > - rc = read_from_oldmem((char*), sizeof(Elf64_Ehdr), , 0); > > + rc = elfcorehdr_read((char *), sizeof(Elf64_Ehdr), ); > > if (rc < 0) > > return rc; > > > > @@ -820,7 +852,7 @@ static int __init parse_crash_elf64_head > > if (!elfcorebuf) > > return -ENOMEM; > > addr = elfcorehdr_addr; > > - rc = read_from_oldmem(elfcorebuf, elfcorebuf_sz_orig, , 0); > > + rc = elfcorehdr_read(elfcorebuf, elfcorebuf_sz_orig, ); > > if (rc < 0) > > goto fail; > > > > @@ -849,7 +881,7 @@ static int __init parse_crash_elf32_head > > addr = elfcorehdr_addr; > > > > /* Read Elf header 
*/ > > - rc = read_from_oldmem((char*), sizeof(Elf32_Ehdr), , 0); > > + rc = elfcorehdr_read((char *), sizeof(Elf32_Ehdr), ); > > if (rc < 0) > > return rc; > > > > @@ -875,7 +907,7 @@ static int __init parse_crash_elf32_head > > if (!elfcorebuf) > > return -ENOMEM; > > addr = elfcorehdr_addr; > > - rc = read_from_oldmem(elfcorebuf, elfcorebuf_sz_orig, , 0); > > + rc = elfcorehdr_read(elfcorebuf, elfcorebuf_sz_orig, ); > > if (rc < 0) > > goto fail; > > > > @@ -902,7 +934,7 @@ static int __init parse_crash_elf_header > > int rc=0; > > > > addr = elfcorehdr_addr; > > - rc = read_from_oldmem(e_ident, EI_NIDENT, , 0); > > + rc = elfcorehdr_read(e_ident, EI_NIDENT, ); > > if (rc < 0) > > return rc; > > if (memcmp(e_ident, ELFMAG, SELFMAG) != 0) { > > @@ -935,7 +967,14 @@ static int __init vmcore_init(void) > > { > > int rc = 0; > > > > - /* If elfcorehdr= has been passed in cmdline, then capture the dump.*/ > > + /* Allow architectures to allocate ELF header in 2nd kernel */ > > + rc = elfcorehdr_alloc(_addr, _size); > > + if (rc) > > + return rc; > > + /* > > +* If elfcorehdr= has been passed in cmdline or created in 2nd kernel, > > +* then capture the dump. > > +*/ > > if (!(is_vmcore_usable())) > > return rc; > > rc = parse_crash_elf_headers(); > > @@ -943,7 +982,11 @@ static int __init vmcore_init(void) > > pr_warn("Kdump: vmcore not initialized\n"); > > return rc; > > } > > - > > + elfcorehdr_free(elfcorehdr_addr); > > + /* > > +* elfcorehdr_addr must not be set to NULL here to keep > > +* is_kdump_kernel() working. 
> > +*/ > > proc_vmcore = proc_create("vmcore", S_IRUSR, NULL, > > _vmcore_operations); > > if (proc_vmcore) > > proc_vmcore->size = vmcore_size; > > --- a/include/linux/crash_dump.h > > +++ b/include/linux/crash_dump.h > > @@ -12,6 +12,12 @@ > > extern unsigned long long elfcorehdr_addr; > > extern unsigned long long elfcorehdr_size; > > > > +extern int __weak elfcorehdr_alloc(unsigned long long *addr, > > + unsigned long long *size); > > +extern void __weak elfcorehdr_free(unsigned long long addr); > > +extern ssize_t __weak elfcorehdr_read(char *buf, size_t count, u64 *ppos); > > +extern ssize_t __weak elfcorehdr_read_notes(char *buf, size_t count, u64 > > *ppos); > > + > > extern ssize_t copy_oldmem_page(unsigned long, char *, size_t, > > unsigned long, int); > > > > ___ > kexec mailing list > ke...@lists.infradead.org > http://lists.infradead.org/mailman/listinfo/kexec -- Cliff Wickman SGI c...@sgi.com (651) 683-3824 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH v5 1/5] vmcore: Introduce ELF header in new memory feature
= read_from_oldmem(notes_section, max_sz, offset, 0); + rc = elfcorehdr_read_notes(notes_section, max_sz, offset); if (rc 0) { kfree(notes_section); return rc; @@ -597,7 +628,8 @@ static int __init copy_notes_elf32(const if (phdr_ptr-p_type != PT_NOTE) continue; offset = phdr_ptr-p_offset; - rc = read_from_oldmem(notes_buf, phdr_ptr-p_memsz, offset, 0); + rc = elfcorehdr_read_notes(notes_buf, phdr_ptr-p_memsz, + offset); if (rc 0) return rc; notes_buf += phdr_ptr-p_memsz; @@ -793,7 +825,7 @@ static int __init parse_crash_elf64_head addr = elfcorehdr_addr; /* Read Elf header */ - rc = read_from_oldmem((char*)ehdr, sizeof(Elf64_Ehdr), addr, 0); + rc = elfcorehdr_read((char *)ehdr, sizeof(Elf64_Ehdr), addr); if (rc 0) return rc; @@ -820,7 +852,7 @@ static int __init parse_crash_elf64_head if (!elfcorebuf) return -ENOMEM; addr = elfcorehdr_addr; - rc = read_from_oldmem(elfcorebuf, elfcorebuf_sz_orig, addr, 0); + rc = elfcorehdr_read(elfcorebuf, elfcorebuf_sz_orig, addr); if (rc 0) goto fail; @@ -849,7 +881,7 @@ static int __init parse_crash_elf32_head addr = elfcorehdr_addr; /* Read Elf header */ - rc = read_from_oldmem((char*)ehdr, sizeof(Elf32_Ehdr), addr, 0); + rc = elfcorehdr_read((char *)ehdr, sizeof(Elf32_Ehdr), addr); if (rc 0) return rc; @@ -875,7 +907,7 @@ static int __init parse_crash_elf32_head if (!elfcorebuf) return -ENOMEM; addr = elfcorehdr_addr; - rc = read_from_oldmem(elfcorebuf, elfcorebuf_sz_orig, addr, 0); + rc = elfcorehdr_read(elfcorebuf, elfcorebuf_sz_orig, addr); if (rc 0) goto fail; @@ -902,7 +934,7 @@ static int __init parse_crash_elf_header int rc=0; addr = elfcorehdr_addr; - rc = read_from_oldmem(e_ident, EI_NIDENT, addr, 0); + rc = elfcorehdr_read(e_ident, EI_NIDENT, addr); if (rc 0) return rc; if (memcmp(e_ident, ELFMAG, SELFMAG) != 0) { @@ -935,7 +967,14 @@ static int __init vmcore_init(void) { int rc = 0; - /* If elfcorehdr= has been passed in cmdline, then capture the dump.*/ + /* Allow architectures to allocate ELF header in 2nd kernel 
*/ + rc = elfcorehdr_alloc(elfcorehdr_addr, elfcorehdr_size); + if (rc) + return rc; + /* +* If elfcorehdr= has been passed in cmdline or created in 2nd kernel, +* then capture the dump. +*/ if (!(is_vmcore_usable())) return rc; rc = parse_crash_elf_headers(); @@ -943,7 +982,11 @@ static int __init vmcore_init(void) pr_warn(Kdump: vmcore not initialized\n); return rc; } - + elfcorehdr_free(elfcorehdr_addr); + /* +* elfcorehdr_addr must not be set to NULL here to keep +* is_kdump_kernel() working. +*/ proc_vmcore = proc_create(vmcore, S_IRUSR, NULL, proc_vmcore_operations); if (proc_vmcore) proc_vmcore-size = vmcore_size; --- a/include/linux/crash_dump.h +++ b/include/linux/crash_dump.h @@ -12,6 +12,12 @@ extern unsigned long long elfcorehdr_addr; extern unsigned long long elfcorehdr_size; +extern int __weak elfcorehdr_alloc(unsigned long long *addr, + unsigned long long *size); +extern void __weak elfcorehdr_free(unsigned long long addr); +extern ssize_t __weak elfcorehdr_read(char *buf, size_t count, u64 *ppos); +extern ssize_t __weak elfcorehdr_read_notes(char *buf, size_t count, u64 *ppos); + extern ssize_t copy_oldmem_page(unsigned long, char *, size_t, unsigned long, int); ___ kexec mailing list ke...@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec -- Cliff Wickman SGI c...@sgi.com (651) 683-3824 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH v3] mm/pagewalk.c: walk_page_range should avoid VM_PFNMAP areas
> On Thur, 23 May 2013 Andrew Morton wrote: > > On Wed, 15 May 2013 07:46:36 -0500 Cliff Wickman wrote: > > Certain tests in walk_page_range() (specifically split_huge_page_pmd()) > > assume that all the mapped PFN's are backed with page structures. And this > > is > > not usually true for VM_PFNMAP areas. This can result in panics on kernel > > page faults when attempting to address those page structures. > > > > There are a half dozen callers of walk_page_range() that walk through > > a task's entire page table (as N. Horiguchi pointed out). So rather than > > change all of them, this patch changes just walk_page_range() to ignore > > VM_PFNMAP areas. > > > > The logic of hugetlb_vma() is moved back into walk_page_range(), as we > > want to test any vma in the range. > > > > VM_PFNMAP areas are used by: > > - graphics memory manager gpu/drm/drm_gem.c > > - global reference unit sgi-gru/grufile.c > > - sgi special memorychar/mspec.c > > - and probably several out-of-tree modules > > What are your thoughts on the urgency/scheduling of this fix? The panic can be caused by simply cat'ing /proc//smaps while an application has a VM_PFNMAP range. It happened in-house when a benchmarker was trying to decipher the memory layout of his program. So that makes it rather urgent from our point of view. We would like to see the fix included in upcoming distro releases, and having it upstream makes that much easier to accomplish. 
> (Just to be irritating: "When writing a changelog, please describe the > end-user-visible effects of the bug, so that others can more easily > decide which kernel version(s) should be fixed, and so that downstream > kernel maintainers can more easily work out whether this patch will fix > a problem which they or their customers are observing.") -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH v3] mm/pagewalk.c: walk_page_range should avoid VM_PFNMAP areas
On Thur, 23 May 2013 Andrew Morton a...@linux-foundation.org wrote: On Wed, 15 May 2013 07:46:36 -0500 Cliff Wickman c...@sgi.com wrote: Certain tests in walk_page_range() (specifically split_huge_page_pmd()) assume that all the mapped PFN's are backed with page structures. And this is not usually true for VM_PFNMAP areas. This can result in panics on kernel page faults when attempting to address those page structures. There are a half dozen callers of walk_page_range() that walk through a task's entire page table (as N. Horiguchi pointed out). So rather than change all of them, this patch changes just walk_page_range() to ignore VM_PFNMAP areas. The logic of hugetlb_vma() is moved back into walk_page_range(), as we want to test any vma in the range. VM_PFNMAP areas are used by: - graphics memory manager gpu/drm/drm_gem.c - global reference unit sgi-gru/grufile.c - sgi special memorychar/mspec.c - and probably several out-of-tree modules What are your thoughts on the urgency/scheduling of this fix? The panic can be caused by simply cat'ing /proc/pid/smaps while an application has a VM_PFNMAP range. It happened in-house when a benchmarker was trying to decipher the memory layout of his program. So that makes it rather urgent from our point of view. We would like to see the fix included in upcoming distro releases, and having it upstream makes that much easier to accomplish. (Just to be irritating: When writing a changelog, please describe the end-user-visible effects of the bug, so that others can more easily decide which kernel version(s) should be fixed, and so that downstream kernel maintainers can more easily work out whether this patch will fix a problem which they or their customers are observing.) -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH v3] mm/pagewalk.c: walk_page_range should avoid VM_PFNMAP areas
/proc//smaps and similar walks through a user page table should not be looking at VM_PFNMAP areas. v2: - moves the VM_BUG_ON out of the loop - adds the needed test for vma->vm_start <= addr v3 adds comments to make this clearer, as N. Horiguchi recommends: > I recommend that you check VM_PFNMAP in the possible callers' side. > But this patch seems to solve your problem, so with properly commenting > this somewhere, I do not oppose it. Certain tests in walk_page_range() (specifically split_huge_page_pmd()) assume that all the mapped PFN's are backed with page structures. And this is not usually true for VM_PFNMAP areas. This can result in panics on kernel page faults when attempting to address those page structures. There are a half dozen callers of walk_page_range() that walk through a task's entire page table (as N. Horiguchi pointed out). So rather than change all of them, this patch changes just walk_page_range() to ignore VM_PFNMAP areas. The logic of hugetlb_vma() is moved back into walk_page_range(), as we want to test any vma in the range. VM_PFNMAP areas are used by: - graphics memory manager gpu/drm/drm_gem.c - global reference unit sgi-gru/grufile.c - sgi special memorychar/mspec.c - and probably several out-of-tree modules I'm copying everyone who has changed this file recently, in case there is some reason that I am not aware of to provide /proc//smaps|clear_refs|maps|numa_maps for these VM_PFNMAP areas. Signed-off-by: Cliff Wickman --- mm/pagewalk.c | 65 -- 1 file changed, 36 insertions(+), 29 deletions(-) Index: linux/mm/pagewalk.c === --- linux.orig/mm/pagewalk.c +++ linux/mm/pagewalk.c @@ -127,22 +127,6 @@ static int walk_hugetlb_range(struct vm_ return 0; } -static struct vm_area_struct* hugetlb_vma(unsigned long addr, struct mm_walk *walk) -{ - struct vm_area_struct *vma; - - /* We don't need vma lookup at all. 
*/ - if (!walk->hugetlb_entry) - return NULL; - - VM_BUG_ON(!rwsem_is_locked(>mm->mmap_sem)); - vma = find_vma(walk->mm, addr); - if (vma && vma->vm_start <= addr && is_vm_hugetlb_page(vma)) - return vma; - - return NULL; -} - #else /* CONFIG_HUGETLB_PAGE */ static struct vm_area_struct* hugetlb_vma(unsigned long addr, struct mm_walk *walk) { @@ -198,30 +182,53 @@ int walk_page_range(unsigned long addr, if (!walk->mm) return -EINVAL; + VM_BUG_ON(!rwsem_is_locked(>mm->mmap_sem)); + pgd = pgd_offset(walk->mm, addr); do { - struct vm_area_struct *vma; + struct vm_area_struct *vma = NULL; next = pgd_addr_end(addr, end); /* -* handle hugetlb vma individually because pagetable walk for -* the hugetlb page is dependent on the architecture and -* we can't handled it in the same manner as non-huge pages. +* This function was not intended to be vma based. +* But there are vma special cases to be handled: +* - hugetlb vma's +* - VM_PFNMAP vma's */ - vma = hugetlb_vma(addr, walk); + vma = find_vma(walk->mm, addr); if (vma) { - if (vma->vm_end < next) + /* +* There are no page structures backing a VM_PFNMAP +* range, so do not allow split_huge_page_pmd(). +*/ + if ((vma->vm_start <= addr) && + (vma->vm_flags & VM_PFNMAP)) { next = vma->vm_end; + pgd = pgd_offset(walk->mm, next); + continue; + } /* -* Hugepage is very tightly coupled with vma, so -* walk through hugetlb entries within a given vma. +* Handle hugetlb vma individually because pagetable +* walk for the hugetlb page is dependent on the +* architecture and we can't handled it in the same +* manner as non-huge pages. */ - err = walk_hugetlb_range(vma, addr, next, walk); - if (err) - break; - pgd = pgd_offset(walk->mm, next); - continue; + if (walk->hugetlb_entry && (vma->vm_start <= addr) && + is_vm_hugetlb_pa
[PATCH v3] mm/pagewalk.c: walk_page_range should avoid VM_PFNMAP areas
/proc/pid/smaps and similar walks through a user page table should not be looking at VM_PFNMAP areas. v2: - moves the VM_BUG_ON out of the loop - adds the needed test for vma-vm_start = addr v3 adds comments to make this clearer, as N. Horiguchi recommends: I recommend that you check VM_PFNMAP in the possible callers' side. But this patch seems to solve your problem, so with properly commenting this somewhere, I do not oppose it. Certain tests in walk_page_range() (specifically split_huge_page_pmd()) assume that all the mapped PFN's are backed with page structures. And this is not usually true for VM_PFNMAP areas. This can result in panics on kernel page faults when attempting to address those page structures. There are a half dozen callers of walk_page_range() that walk through a task's entire page table (as N. Horiguchi pointed out). So rather than change all of them, this patch changes just walk_page_range() to ignore VM_PFNMAP areas. The logic of hugetlb_vma() is moved back into walk_page_range(), as we want to test any vma in the range. VM_PFNMAP areas are used by: - graphics memory manager gpu/drm/drm_gem.c - global reference unit sgi-gru/grufile.c - sgi special memorychar/mspec.c - and probably several out-of-tree modules I'm copying everyone who has changed this file recently, in case there is some reason that I am not aware of to provide /proc/pid/smaps|clear_refs|maps|numa_maps for these VM_PFNMAP areas. Signed-off-by: Cliff Wickman c...@sgi.com --- mm/pagewalk.c | 65 -- 1 file changed, 36 insertions(+), 29 deletions(-) Index: linux/mm/pagewalk.c === --- linux.orig/mm/pagewalk.c +++ linux/mm/pagewalk.c @@ -127,22 +127,6 @@ static int walk_hugetlb_range(struct vm_ return 0; } -static struct vm_area_struct* hugetlb_vma(unsigned long addr, struct mm_walk *walk) -{ - struct vm_area_struct *vma; - - /* We don't need vma lookup at all. 
*/ - if (!walk-hugetlb_entry) - return NULL; - - VM_BUG_ON(!rwsem_is_locked(walk-mm-mmap_sem)); - vma = find_vma(walk-mm, addr); - if (vma vma-vm_start = addr is_vm_hugetlb_page(vma)) - return vma; - - return NULL; -} - #else /* CONFIG_HUGETLB_PAGE */ static struct vm_area_struct* hugetlb_vma(unsigned long addr, struct mm_walk *walk) { @@ -198,30 +182,53 @@ int walk_page_range(unsigned long addr, if (!walk-mm) return -EINVAL; + VM_BUG_ON(!rwsem_is_locked(walk-mm-mmap_sem)); + pgd = pgd_offset(walk-mm, addr); do { - struct vm_area_struct *vma; + struct vm_area_struct *vma = NULL; next = pgd_addr_end(addr, end); /* -* handle hugetlb vma individually because pagetable walk for -* the hugetlb page is dependent on the architecture and -* we can't handled it in the same manner as non-huge pages. +* This function was not intended to be vma based. +* But there are vma special cases to be handled: +* - hugetlb vma's +* - VM_PFNMAP vma's */ - vma = hugetlb_vma(addr, walk); + vma = find_vma(walk-mm, addr); if (vma) { - if (vma-vm_end next) + /* +* There are no page structures backing a VM_PFNMAP +* range, so do not allow split_huge_page_pmd(). +*/ + if ((vma-vm_start = addr) + (vma-vm_flags VM_PFNMAP)) { next = vma-vm_end; + pgd = pgd_offset(walk-mm, next); + continue; + } /* -* Hugepage is very tightly coupled with vma, so -* walk through hugetlb entries within a given vma. +* Handle hugetlb vma individually because pagetable +* walk for the hugetlb page is dependent on the +* architecture and we can't handled it in the same +* manner as non-huge pages. */ - err = walk_hugetlb_range(vma, addr, next, walk); - if (err) - break; - pgd = pgd_offset(walk-mm, next); - continue; + if (walk-hugetlb_entry (vma-vm_start = addr) + is_vm_hugetlb_page(vma)) { + if (vma-vm_end next) + next = vma-vm_end
makedumpfile mmap() benchmark
> Jingbai Ma wrote on 27 Mar 2013: > I have tested the makedumpfile mmap patch on a machine with 2TB memory, > here is testing results: > Test environment: > Machine: HP ProLiant DL980 G7 with 2TB RAM. > CPU: Intel(R) Xeon(R) CPU E7- 2860 @ 2.27GHz (8 sockets, 10 cores) > (Only 1 cpu was enabled the 2nd kernel) > Kernel: 3.9.0-rc3+ with mmap kernel patch v3 > vmcore size: 2.0TB > Dump file size: 3.6GB > makedumpfile mmap branch with parameters: -c --message-level 23 -d 31 > --map-size > All measured time from debug message of makedumpfile. > > As a comparison, I also have tested with original kernel and original > makedumpfile 1.5.1 and 1.5.3. > I added all [Excluding unnecessary pages] and [Excluding free pages] > time together as "Filter Pages", and [Copying Data] as "Copy data" here. > > makedumpfile Kernel map-size (KB) Filter pages (s)Copy data (s) > Total (s) > 1.5.1 3.7.0-0.36.el7.x86_64 N/A 940.28 1269.25 2209.53 > 1.5.3 3.7.0-0.36.el7.x86_64 N/A 380.09 992.77 1372.86 > 1.5.3 v3.9-rc3N/A 197.77 892.27 1090.04 > 1.5.3+mmapv3.9-rc3+mmap 0 164.87 606.06 770.93 > 1.5.3+mmapv3.9-rc3+mmap 4 88.62 576.07 664.69 > 1.5.3+mmapv3.9-rc3+mmap 102483.66 477.23 560.89 > 1.5.3+mmapv3.9-rc3+mmap 204883.44 477.21 560.65 > 1.5.3+mmapv3.9-rc3+mmap 10240 83.84 476.56 560.4 I have also tested the makedumpfile mmap patch on a machine with 2TB memory, here are the results: Test environment: Machine: SGI UV1000 with 2TB RAM. CPU: Intel(R) Xeon(R) CPU E7- 8837 @ 2.67GHz (only 1 cpu was enabled in the 2nd kernel) Kernel: 3.0.13 with mmap kernel patch v3 (I had to tweak the patch a bit) vmcore size: 2.0TB Dump file size: 3.6GB makedumpfile mmap branch with parameters: -c --message-level 23 -d 31 --map-size All measured times are actual clock times. All tests are noncyclic. Crash kernel memory: crashkernel=512M As did Jingbai Ma, I also tested with an unpatched kernel and makedumpfile 1.5.1 and 1.5.3. 
But they do 2 filtering scans: unnecessary pages and free pages; here added together as filter pages time. FilterCopy makedumpfile Kernel map-size(KB) pages(s) data(s) Total(s) 1.5.13.0.13N/A671 5111182 1.5.33.0.13N/A294 535 829 1.5.3+mmap 3.0.13+mmap 0 54 506 560 1.5.3+mmap 3.0.13+mmap 4096 40 416 456 1.5.3+mmap 3.0.13+mmap 10240 37 424 461 Using mmap for the copy data as well as for filtering pages did little: 1.5.3+mmap 3.0.13+mmap 4096 37 414 451 My results are quite similar to Jingbai Ma's. The mmap patch to the kernel greatly speeds the filtering of pages, so we at SGI would very much like to see this patch in the 3.10 kernel. http://marc.info/?l=linux-kernel=136627770125345=2 What puzzles me is that the patch greatly speeds the read's of /proc/vmcore (where map-size is 0) as well as providing the mmap ability. I can now seek/read page structures almost as fast as mmap'ing and copying them. (versus Jingbai Ma's results where mmap almost doubled the speed of reads) I have put counters in to verify, and we are doing several million seek/read's vs. a few thousand mmap's. Yet the performance is similar (54sec vs. 37sec, above). I can't rationalize that much improvement. Thanks, Cliff Wickman -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
makedumpfile mmap() benchmark
Jingbai Ma wrote on 27 Mar 2013: I have tested the makedumpfile mmap patch on a machine with 2TB memory, here is testing results: Test environment: Machine: HP ProLiant DL980 G7 with 2TB RAM. CPU: Intel(R) Xeon(R) CPU E7- 2860 @ 2.27GHz (8 sockets, 10 cores) (Only 1 cpu was enabled in the 2nd kernel) Kernel: 3.9.0-rc3+ with mmap kernel patch v3 vmcore size: 2.0TB Dump file size: 3.6GB makedumpfile mmap branch with parameters: -c --message-level 23 -d 31 --map-size map-size All measured time from debug message of makedumpfile. As a comparison, I also have tested with original kernel and original makedumpfile 1.5.1 and 1.5.3. I added all [Excluding unnecessary pages] and [Excluding free pages] time together as Filter Pages, and [Copying Data] as Copy data here. makedumpfile Kernel map-size (KB) Filter pages (s)Copy data (s) Total (s) 1.5.1 3.7.0-0.36.el7.x86_64 N/A 940.28 1269.25 2209.53 1.5.3 3.7.0-0.36.el7.x86_64 N/A 380.09 992.77 1372.86 1.5.3 v3.9-rc3N/A 197.77 892.27 1090.04 1.5.3+mmapv3.9-rc3+mmap 0 164.87 606.06 770.93 1.5.3+mmapv3.9-rc3+mmap 4 88.62 576.07 664.69 1.5.3+mmapv3.9-rc3+mmap 102483.66 477.23 560.89 1.5.3+mmapv3.9-rc3+mmap 204883.44 477.21 560.65 1.5.3+mmapv3.9-rc3+mmap 10240 83.84 476.56 560.4 I have also tested the makedumpfile mmap patch on a machine with 2TB memory, here are the results: Test environment: Machine: SGI UV1000 with 2TB RAM. CPU: Intel(R) Xeon(R) CPU E7- 8837 @ 2.67GHz (only 1 cpu was enabled in the 2nd kernel) Kernel: 3.0.13 with mmap kernel patch v3 (I had to tweak the patch a bit) vmcore size: 2.0TB Dump file size: 3.6GB makedumpfile mmap branch with parameters: -c --message-level 23 -d 31 --map-size map-size All measured times are actual clock times. All tests are noncyclic. Crash kernel memory: crashkernel=512M As did Jingbai Ma, I also tested with an unpatched kernel and makedumpfile 1.5.1 and 1.5.3. But they do 2 filtering scans: unnecessary pages and free pages; here added together as filter pages time. 
FilterCopy makedumpfile Kernel map-size(KB) pages(s) data(s) Total(s) 1.5.13.0.13N/A671 5111182 1.5.33.0.13N/A294 535 829 1.5.3+mmap 3.0.13+mmap 0 54 506 560 1.5.3+mmap 3.0.13+mmap 4096 40 416 456 1.5.3+mmap 3.0.13+mmap 10240 37 424 461 Using mmap for the copy data as well as for filtering pages did little: 1.5.3+mmap 3.0.13+mmap 4096 37 414 451 My results are quite similar to Jingbai Ma's. The mmap patch to the kernel greatly speeds the filtering of pages, so we at SGI would very much like to see this patch in the 3.10 kernel. http://marc.info/?l=linux-kernel&m=136627770125345&w=2 What puzzles me is that the patch greatly speeds the read's of /proc/vmcore (where map-size is 0) as well as providing the mmap ability. I can now seek/read page structures almost as fast as mmap'ing and copying them. (versus Jingbai Ma's results where mmap almost doubled the speed of reads) I have put counters in to verify, and we are doing several million seek/read's vs. a few thousand mmap's. Yet the performance is similar (54sec vs. 37sec, above). I can't rationalize that much improvement. Thanks, Cliff Wickman -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH v2] mm/pagewalk.c: walk_page_range should avoid VM_PFNMAP areas
On Thu, May 02, 2013 at 12:44:04PM -0400, Naoya Horiguchi wrote: > On Thu, May 02, 2013 at 07:10:48AM -0500, Cliff Wickman wrote: > > > > /proc//smaps and similar walks through a user page table should not > > be looking at VM_PFNMAP areas. > > > > This is v2: > > - moves the VM_BUG_ON out of the loop > > - adds the needed test for vma->vm_start <= addr > > > > Certain tests in walk_page_range() (specifically split_huge_page_pmd()) > > assume that all the mapped PFN's are backed with page structures. And this > > is > > not usually true for VM_PFNMAP areas. This can result in panics on kernel > > page faults when attempting to address those page structures. > > > > There are a half dozen callers of walk_page_range() that walk through > > a task's entire page table (as N. Horiguchi pointed out). So rather than > > change all of them, this patch changes just walk_page_range() to ignore > > VM_PFNMAP areas. > > > > The logic of hugetlb_vma() is moved back into walk_page_range(), as we > > want to test any vma in the range. > > > > VM_PFNMAP areas are used by: > > - graphics memory manager gpu/drm/drm_gem.c > > - global reference unit sgi-gru/grufile.c > > - sgi special memorychar/mspec.c > > - and probably several out-of-tree modules > > > > I'm copying everyone who has changed this file recently, in case > > there is some reason that I am not aware of to provide > > /proc//smaps|clear_refs|maps|numa_maps for these VM_PFNMAP areas. > > > > Signed-off-by: Cliff Wickman > > walk_page_range() does vma-based walk only for address ranges backed by > hugetlbfs, and it doesn't see vma for address ranges backed by normal pages > and thps (in those case we just walk over page table hierarchy). Agreed, walk_page_range() only checks for a hugetlbfs-type vma as it scans an address range. 
The problem I'm seeing comes in when it calls walk_pud_range() for any address range that is not within a hugetlbfs vma: walk_pmd_range() split_huge_page_pmd_mm() split_huge_page_pmd() __split_huge_page_pmd() page = pmd_page(*pmd) And such a page structure does not exist for a VM_PFNMAP area. > I think that vma-based walk was introduced as a kind of dirty hack to > handle hugetlbfs, and it can be cleaned up in the future. So I'm afraid > it's not a good idea to extend or adding code heavily depending on this hack. walk_page_range() looks like generic infrastructure to scan any range of a user's address space - as in /proc//smaps and similar. And the hugetlbfs check seems to have been added as an exception. Huge page exceptional cases occur further down the chain. And when a corresponding page structure is needed for those cases we run into the problem. I'm not depending on walk_page_range(). I'm just trying to survive the case where it is scanning a VM_PFNMAP range. > I recommend that you check VM_PFNMAP in the possible callers' side. > But this patch seems to solve your problem, so with properly commenting > this somewhere, I do not oppose it. Agreed, it could be handled by checking at several points higher up. But checking at this common point seems more straightforward to me. -Cliff > > Thanks, > Naoya Horiguchi -- Cliff Wickman SGI c...@sgi.com (651) 683-3824 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH v2] mm/pagewalk.c: walk_page_range should avoid VM_PFNMAP areas
/proc//smaps and similar walks through a user page table should not be looking at VM_PFNMAP areas. This is v2: - moves the VM_BUG_ON out of the loop - adds the needed test for vma->vm_start <= addr Certain tests in walk_page_range() (specifically split_huge_page_pmd()) assume that all the mapped PFN's are backed with page structures. And this is not usually true for VM_PFNMAP areas. This can result in panics on kernel page faults when attempting to address those page structures. There are a half dozen callers of walk_page_range() that walk through a task's entire page table (as N. Horiguchi pointed out). So rather than change all of them, this patch changes just walk_page_range() to ignore VM_PFNMAP areas. The logic of hugetlb_vma() is moved back into walk_page_range(), as we want to test any vma in the range. VM_PFNMAP areas are used by: - graphics memory manager gpu/drm/drm_gem.c - global reference unit sgi-gru/grufile.c - sgi special memorychar/mspec.c - and probably several out-of-tree modules I'm copying everyone who has changed this file recently, in case there is some reason that I am not aware of to provide /proc//smaps|clear_refs|maps|numa_maps for these VM_PFNMAP areas. Signed-off-by: Cliff Wickman --- mm/pagewalk.c | 62 ++ 1 file changed, 33 insertions(+), 29 deletions(-) Index: linux/mm/pagewalk.c === --- linux.orig/mm/pagewalk.c +++ linux/mm/pagewalk.c @@ -127,22 +127,6 @@ static int walk_hugetlb_range(struct vm_ return 0; } -static struct vm_area_struct* hugetlb_vma(unsigned long addr, struct mm_walk *walk) -{ - struct vm_area_struct *vma; - - /* We don't need vma lookup at all. 
*/ - if (!walk->hugetlb_entry) - return NULL; - - VM_BUG_ON(!rwsem_is_locked(>mm->mmap_sem)); - vma = find_vma(walk->mm, addr); - if (vma && vma->vm_start <= addr && is_vm_hugetlb_page(vma)) - return vma; - - return NULL; -} - #else /* CONFIG_HUGETLB_PAGE */ static struct vm_area_struct* hugetlb_vma(unsigned long addr, struct mm_walk *walk) { @@ -198,30 +182,50 @@ int walk_page_range(unsigned long addr, if (!walk->mm) return -EINVAL; + VM_BUG_ON(!rwsem_is_locked(>mm->mmap_sem)); + pgd = pgd_offset(walk->mm, addr); do { - struct vm_area_struct *vma; + struct vm_area_struct *vma = NULL; next = pgd_addr_end(addr, end); /* -* handle hugetlb vma individually because pagetable walk for -* the hugetlb page is dependent on the architecture and -* we can't handled it in the same manner as non-huge pages. +* Check any special vma's within this range. */ - vma = hugetlb_vma(addr, walk); + vma = find_vma(walk->mm, addr); if (vma) { - if (vma->vm_end < next) + /* +* There are no page structures backing a VM_PFNMAP +* range, so do not allow split_huge_page_pmd(). +*/ + if ((vma->vm_start <= addr) && + (vma->vm_flags & VM_PFNMAP)) { next = vma->vm_end; + pgd = pgd_offset(walk->mm, next); + continue; + } /* -* Hugepage is very tightly coupled with vma, so -* walk through hugetlb entries within a given vma. +* Handle hugetlb vma individually because pagetable +* walk for the hugetlb page is dependent on the +* architecture and we can't handled it in the same +* manner as non-huge pages. */ - err = walk_hugetlb_range(vma, addr, next, walk); - if (err) - break; - pgd = pgd_offset(walk->mm, next); - continue; + if (walk->hugetlb_entry && (vma->vm_start <= addr) && + is_vm_hugetlb_page(vma)) { + if (vma->vm_end < next) + next = vma->vm_end; + /* +* Hugepage is very tightly coupled with vma, +* so walk through hugetlb entries within a +* given vma. +*/ + err = walk
[PATCH v2] mm/pagewalk.c: walk_page_range should avoid VM_PFNMAP areas
/proc/pid/smaps and similar walks through a user page table should not be looking at VM_PFNMAP areas. This is v2: - moves the VM_BUG_ON out of the loop - adds the needed test for vma-vm_start = addr Certain tests in walk_page_range() (specifically split_huge_page_pmd()) assume that all the mapped PFN's are backed with page structures. And this is not usually true for VM_PFNMAP areas. This can result in panics on kernel page faults when attempting to address those page structures. There are a half dozen callers of walk_page_range() that walk through a task's entire page table (as N. Horiguchi pointed out). So rather than change all of them, this patch changes just walk_page_range() to ignore VM_PFNMAP areas. The logic of hugetlb_vma() is moved back into walk_page_range(), as we want to test any vma in the range. VM_PFNMAP areas are used by: - graphics memory manager gpu/drm/drm_gem.c - global reference unit sgi-gru/grufile.c - sgi special memorychar/mspec.c - and probably several out-of-tree modules I'm copying everyone who has changed this file recently, in case there is some reason that I am not aware of to provide /proc/pid/smaps|clear_refs|maps|numa_maps for these VM_PFNMAP areas. Signed-off-by: Cliff Wickman c...@sgi.com --- mm/pagewalk.c | 62 ++ 1 file changed, 33 insertions(+), 29 deletions(-) Index: linux/mm/pagewalk.c === --- linux.orig/mm/pagewalk.c +++ linux/mm/pagewalk.c @@ -127,22 +127,6 @@ static int walk_hugetlb_range(struct vm_ return 0; } -static struct vm_area_struct* hugetlb_vma(unsigned long addr, struct mm_walk *walk) -{ - struct vm_area_struct *vma; - - /* We don't need vma lookup at all. 
*/ - if (!walk-hugetlb_entry) - return NULL; - - VM_BUG_ON(!rwsem_is_locked(walk-mm-mmap_sem)); - vma = find_vma(walk-mm, addr); - if (vma vma-vm_start = addr is_vm_hugetlb_page(vma)) - return vma; - - return NULL; -} - #else /* CONFIG_HUGETLB_PAGE */ static struct vm_area_struct* hugetlb_vma(unsigned long addr, struct mm_walk *walk) { @@ -198,30 +182,50 @@ int walk_page_range(unsigned long addr, if (!walk-mm) return -EINVAL; + VM_BUG_ON(!rwsem_is_locked(walk-mm-mmap_sem)); + pgd = pgd_offset(walk-mm, addr); do { - struct vm_area_struct *vma; + struct vm_area_struct *vma = NULL; next = pgd_addr_end(addr, end); /* -* handle hugetlb vma individually because pagetable walk for -* the hugetlb page is dependent on the architecture and -* we can't handled it in the same manner as non-huge pages. +* Check any special vma's within this range. */ - vma = hugetlb_vma(addr, walk); + vma = find_vma(walk-mm, addr); if (vma) { - if (vma-vm_end next) + /* +* There are no page structures backing a VM_PFNMAP +* range, so do not allow split_huge_page_pmd(). +*/ + if ((vma-vm_start = addr) + (vma-vm_flags VM_PFNMAP)) { next = vma-vm_end; + pgd = pgd_offset(walk-mm, next); + continue; + } /* -* Hugepage is very tightly coupled with vma, so -* walk through hugetlb entries within a given vma. +* Handle hugetlb vma individually because pagetable +* walk for the hugetlb page is dependent on the +* architecture and we can't handled it in the same +* manner as non-huge pages. */ - err = walk_hugetlb_range(vma, addr, next, walk); - if (err) - break; - pgd = pgd_offset(walk-mm, next); - continue; + if (walk-hugetlb_entry (vma-vm_start = addr) + is_vm_hugetlb_page(vma)) { + if (vma-vm_end next) + next = vma-vm_end; + /* +* Hugepage is very tightly coupled with vma, +* so walk through hugetlb entries within a +* given vma. +*/ + err = walk_hugetlb_range(vma, addr, next, walk); + if (err) + break
Re: [PATCH v2] mm/pagewalk.c: walk_page_range should avoid VM_PFNMAP areas
On Thu, May 02, 2013 at 12:44:04PM -0400, Naoya Horiguchi wrote: On Thu, May 02, 2013 at 07:10:48AM -0500, Cliff Wickman wrote: /proc/pid/smaps and similar walks through a user page table should not be looking at VM_PFNMAP areas. This is v2: - moves the VM_BUG_ON out of the loop - adds the needed test for vma-vm_start = addr Certain tests in walk_page_range() (specifically split_huge_page_pmd()) assume that all the mapped PFN's are backed with page structures. And this is not usually true for VM_PFNMAP areas. This can result in panics on kernel page faults when attempting to address those page structures. There are a half dozen callers of walk_page_range() that walk through a task's entire page table (as N. Horiguchi pointed out). So rather than change all of them, this patch changes just walk_page_range() to ignore VM_PFNMAP areas. The logic of hugetlb_vma() is moved back into walk_page_range(), as we want to test any vma in the range. VM_PFNMAP areas are used by: - graphics memory manager gpu/drm/drm_gem.c - global reference unit sgi-gru/grufile.c - sgi special memorychar/mspec.c - and probably several out-of-tree modules I'm copying everyone who has changed this file recently, in case there is some reason that I am not aware of to provide /proc/pid/smaps|clear_refs|maps|numa_maps for these VM_PFNMAP areas. Signed-off-by: Cliff Wickman c...@sgi.com walk_page_range() does vma-based walk only for address ranges backed by hugetlbfs, and it doesn't see vma for address ranges backed by normal pages and thps (in those case we just walk over page table hierarchy). Agreed, walk_page_range() only checks for a hugetlbfs-type vma as it scans an address range. The problem I'm seeing comes in when it calls walk_pud_range() for any address range that is not within a hugetlbfs vma: walk_pmd_range() split_huge_page_pmd_mm() split_huge_page_pmd() __split_huge_page_pmd() page = pmd_page(*pmd) And such a page structure does not exist for a VM_PFNMAP area. 
I think that vma-based walk was introduced as a kind of dirty hack to handle hugetlbfs, and it can be cleaned up in the future. So I'm afraid it's not a good idea to extend or adding code heavily depending on this hack. walk_page_range() looks like generic infrastructure to scan any range of a user's address space - as in /proc/pid/smaps and similar. And the hugetlbfs check seems to have been added as an exception. Huge page exceptional cases occur further down the chain. And when a corresponding page structure is needed for those cases we run into the problem. I'm not depending on walk_page_range(). I'm just trying to survive the case where it is scanning a VM_PFNMAP range. I recommend that you check VM_PFNMAP in the possible callers' side. But this patch seems to solve your problem, so with properly commenting this somewhere, I do not oppose it. Agreed, it could be handled by checking at several points higher up. But checking at this common point seems more straightforward to me. -Cliff Thanks, Naoya Horiguchi -- Cliff Wickman SGI c...@sgi.com (651) 683-3824 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] mm/pagewalk.c: walk_page_range should avoid VM_PFNMAP areas
On Wed, May 01, 2013 at 08:47:02AM -0700, David Rientjes wrote: > On Wed, 1 May 2013, Cliff Wickman wrote: > > > Index: linux/mm/pagewalk.c > > === > > --- linux.orig/mm/pagewalk.c > > +++ linux/mm/pagewalk.c > > @@ -127,22 +127,6 @@ static int walk_hugetlb_range(struct vm_ > > return 0; > > } > > > > -static struct vm_area_struct* hugetlb_vma(unsigned long addr, struct > > mm_walk *walk) > > -{ > > - struct vm_area_struct *vma; > > - > > - /* We don't need vma lookup at all. */ > > - if (!walk->hugetlb_entry) > > - return NULL; > > - > > - VM_BUG_ON(!rwsem_is_locked(>mm->mmap_sem)); > > - vma = find_vma(walk->mm, addr); > > - if (vma && vma->vm_start <= addr && is_vm_hugetlb_page(vma)) > > - return vma; > > - > > - return NULL; > > -} > > - > > #else /* CONFIG_HUGETLB_PAGE */ > > static struct vm_area_struct* hugetlb_vma(unsigned long addr, struct > > mm_walk *walk) > > { > > @@ -200,28 +184,46 @@ int walk_page_range(unsigned long addr, > > > > pgd = pgd_offset(walk->mm, addr); > > do { > > - struct vm_area_struct *vma; > > + struct vm_area_struct *vma = NULL; > > > > next = pgd_addr_end(addr, end); > > > > /* > > -* handle hugetlb vma individually because pagetable walk for > > -* the hugetlb page is dependent on the architecture and > > -* we can't handled it in the same manner as non-huge pages. > > +* Check any special vma's within this range. > > */ > > - vma = hugetlb_vma(addr, walk); > > + VM_BUG_ON(!rwsem_is_locked(>mm->mmap_sem)); > > I think this should be moved out of the iteration. It's currently inside > it even before your patch, but I think it's pointless. I don't follow. We are iterating through a range of addresses. When we come to a range that is VM_PFNMAP we skip it. How can we take that out of the iteration? > > + vma = find_vma(walk->mm, addr); > > if (vma) { > > - if (vma->vm_end < next) > > + /* > > +* There are no page structures backing a VM_PFNMAP > > +* range, so allow no split_huge_page_pmd(). 
> > +*/ > > + if (vma->vm_flags & VM_PFNMAP) { > > next = vma->vm_end; > > + pgd = pgd_offset(walk->mm, next); > > + continue; > > + } > > What if end < vma->vm_end? Yes, a bad omission. Thanks for pointing that out. It should be if ((vma->vm_start <= addr) && (vma->vm_flags & VM_PFNMAP)) as find_vma can return a vma above the addr. -Cliff > > /* > > -* Hugepage is very tightly coupled with vma, so > > -* walk through hugetlb entries within a given vma. > > +* Handle hugetlb vma individually because pagetable > > +* walk for the hugetlb page is dependent on the > > +* architecture and we can't handled it in the same > > +* manner as non-huge pages. > > */ > > - err = walk_hugetlb_range(vma, addr, next, walk); > > - if (err) > > - break; > > - pgd = pgd_offset(walk->mm, next); > > - continue; > > + if (walk->hugetlb_entry && (vma->vm_start <= addr) && > > + is_vm_hugetlb_page(vma)) { > > + if (vma->vm_end < next) > > + next = vma->vm_end; > > + /* > > + * Hugepage is very tightly coupled with vma, > > +* so walk through hugetlb entries within a > > +* given vma. > > +*/ > > + err = walk_hugetlb_range(vma, addr, next, walk); > > + if (err) > > + break; > > + pgd = pgd_offset(walk->mm, next); > > + continue; > > + } > > } > > > > if (pgd_none_or_clear_bad(pgd)) { -- Cliff Wickman SGI c...@sgi.com (651) 683-3824 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH] mm/pagewalk.c: walk_page_range should avoid VM_PFNMAP areas
This patch replaces "[PATCH] fs/proc: smaps should avoid VM_PFNMAP areas". /proc//smaps and similar walks through a user page table should not be looking at VM_PFNMAP areas. Certain tests in walk_page_range() (specifically split_huge_page_pmd()) assume that all the mapped PFN's are backed with page structures. And this is not usually true for VM_PFNMAP areas. This can result in panics on kernel page faults when attempting to address those page structures. There are a half dozen callers of walk_page_range() that walk through a task's entire page table (as N. Horiguchi pointed out). So rather than change all of them, this patch changes just walk_page_range() to ignore VM_PFNMAP areas. The logic of hugetlb_vma() is moved back into walk_page_range(), as we want to test any vma in the range. VM_PFNMAP areas are used by: - graphics memory manager gpu/drm/drm_gem.c - global reference unit sgi-gru/grufile.c - sgi special memorychar/mspec.c - and probably several out-of-tree modules I'm copying everyone who has changed this file recently, in case there is some reason that I am not aware of to provide /proc//smaps|clear_refs|maps|numa_maps for these VM_PFNMAP areas. Signed-off-by: Cliff Wickman --- mm/pagewalk.c | 60 +- 1 file changed, 31 insertions(+), 29 deletions(-) Index: linux/mm/pagewalk.c === --- linux.orig/mm/pagewalk.c +++ linux/mm/pagewalk.c @@ -127,22 +127,6 @@ static int walk_hugetlb_range(struct vm_ return 0; } -static struct vm_area_struct* hugetlb_vma(unsigned long addr, struct mm_walk *walk) -{ - struct vm_area_struct *vma; - - /* We don't need vma lookup at all. 
*/ - if (!walk->hugetlb_entry) - return NULL; - - VM_BUG_ON(!rwsem_is_locked(>mm->mmap_sem)); - vma = find_vma(walk->mm, addr); - if (vma && vma->vm_start <= addr && is_vm_hugetlb_page(vma)) - return vma; - - return NULL; -} - #else /* CONFIG_HUGETLB_PAGE */ static struct vm_area_struct* hugetlb_vma(unsigned long addr, struct mm_walk *walk) { @@ -200,28 +184,46 @@ int walk_page_range(unsigned long addr, pgd = pgd_offset(walk->mm, addr); do { - struct vm_area_struct *vma; + struct vm_area_struct *vma = NULL; next = pgd_addr_end(addr, end); /* -* handle hugetlb vma individually because pagetable walk for -* the hugetlb page is dependent on the architecture and -* we can't handled it in the same manner as non-huge pages. +* Check any special vma's within this range. */ - vma = hugetlb_vma(addr, walk); + VM_BUG_ON(!rwsem_is_locked(>mm->mmap_sem)); + vma = find_vma(walk->mm, addr); if (vma) { - if (vma->vm_end < next) + /* +* There are no page structures backing a VM_PFNMAP +* range, so allow no split_huge_page_pmd(). +*/ + if (vma->vm_flags & VM_PFNMAP) { next = vma->vm_end; + pgd = pgd_offset(walk->mm, next); + continue; + } /* -* Hugepage is very tightly coupled with vma, so -* walk through hugetlb entries within a given vma. +* Handle hugetlb vma individually because pagetable +* walk for the hugetlb page is dependent on the +* architecture and we can't handled it in the same +* manner as non-huge pages. */ - err = walk_hugetlb_range(vma, addr, next, walk); - if (err) - break; - pgd = pgd_offset(walk->mm, next); - continue; + if (walk->hugetlb_entry && (vma->vm_start <= addr) && + is_vm_hugetlb_page(vma)) { + if (vma->vm_end < next) + next = vma->vm_end; + /* +* Hugepage is very tightly coupled with vma, +* so walk through hugetlb entries within a +* given vma. +*/ + err = walk_hugetlb_range(vma, addr, next, walk); + if (err) + b
[PATCH] mm/pagewalk.c: walk_page_range should avoid VM_PFNMAP areas
This patch replaces [PATCH] fs/proc: smaps should avoid VM_PFNMAP areas. /proc/pid/smaps and similar walks through a user page table should not be looking at VM_PFNMAP areas. Certain tests in walk_page_range() (specifically split_huge_page_pmd()) assume that all the mapped PFN's are backed with page structures. And this is not usually true for VM_PFNMAP areas. This can result in panics on kernel page faults when attempting to address those page structures. There are a half dozen callers of walk_page_range() that walk through a task's entire page table (as N. Horiguchi pointed out). So rather than change all of them, this patch changes just walk_page_range() to ignore VM_PFNMAP areas. The logic of hugetlb_vma() is moved back into walk_page_range(), as we want to test any vma in the range. VM_PFNMAP areas are used by: - graphics memory manager gpu/drm/drm_gem.c - global reference unit sgi-gru/grufile.c - sgi special memorychar/mspec.c - and probably several out-of-tree modules I'm copying everyone who has changed this file recently, in case there is some reason that I am not aware of to provide /proc/pid/smaps|clear_refs|maps|numa_maps for these VM_PFNMAP areas. Signed-off-by: Cliff Wickman c...@sgi.com --- mm/pagewalk.c | 60 +- 1 file changed, 31 insertions(+), 29 deletions(-) Index: linux/mm/pagewalk.c === --- linux.orig/mm/pagewalk.c +++ linux/mm/pagewalk.c @@ -127,22 +127,6 @@ static int walk_hugetlb_range(struct vm_ return 0; } -static struct vm_area_struct* hugetlb_vma(unsigned long addr, struct mm_walk *walk) -{ - struct vm_area_struct *vma; - - /* We don't need vma lookup at all. 
*/ - if (!walk-hugetlb_entry) - return NULL; - - VM_BUG_ON(!rwsem_is_locked(walk-mm-mmap_sem)); - vma = find_vma(walk-mm, addr); - if (vma vma-vm_start = addr is_vm_hugetlb_page(vma)) - return vma; - - return NULL; -} - #else /* CONFIG_HUGETLB_PAGE */ static struct vm_area_struct* hugetlb_vma(unsigned long addr, struct mm_walk *walk) { @@ -200,28 +184,46 @@ int walk_page_range(unsigned long addr, pgd = pgd_offset(walk-mm, addr); do { - struct vm_area_struct *vma; + struct vm_area_struct *vma = NULL; next = pgd_addr_end(addr, end); /* -* handle hugetlb vma individually because pagetable walk for -* the hugetlb page is dependent on the architecture and -* we can't handled it in the same manner as non-huge pages. +* Check any special vma's within this range. */ - vma = hugetlb_vma(addr, walk); + VM_BUG_ON(!rwsem_is_locked(walk-mm-mmap_sem)); + vma = find_vma(walk-mm, addr); if (vma) { - if (vma-vm_end next) + /* +* There are no page structures backing a VM_PFNMAP +* range, so allow no split_huge_page_pmd(). +*/ + if (vma-vm_flags VM_PFNMAP) { next = vma-vm_end; + pgd = pgd_offset(walk-mm, next); + continue; + } /* -* Hugepage is very tightly coupled with vma, so -* walk through hugetlb entries within a given vma. +* Handle hugetlb vma individually because pagetable +* walk for the hugetlb page is dependent on the +* architecture and we can't handled it in the same +* manner as non-huge pages. */ - err = walk_hugetlb_range(vma, addr, next, walk); - if (err) - break; - pgd = pgd_offset(walk-mm, next); - continue; + if (walk-hugetlb_entry (vma-vm_start = addr) + is_vm_hugetlb_page(vma)) { + if (vma-vm_end next) + next = vma-vm_end; + /* +* Hugepage is very tightly coupled with vma, +* so walk through hugetlb entries within a +* given vma. +*/ + err = walk_hugetlb_range(vma, addr, next, walk); + if (err) + break; + pgd = pgd_offset(walk-mm, next); + continue
Re: [PATCH] mm/pagewalk.c: walk_page_range should avoid VM_PFNMAP areas
On Wed, May 01, 2013 at 08:47:02AM -0700, David Rientjes wrote: On Wed, 1 May 2013, Cliff Wickman wrote: Index: linux/mm/pagewalk.c === --- linux.orig/mm/pagewalk.c +++ linux/mm/pagewalk.c @@ -127,22 +127,6 @@ static int walk_hugetlb_range(struct vm_ return 0; } -static struct vm_area_struct* hugetlb_vma(unsigned long addr, struct mm_walk *walk) -{ - struct vm_area_struct *vma; - - /* We don't need vma lookup at all. */ - if (!walk-hugetlb_entry) - return NULL; - - VM_BUG_ON(!rwsem_is_locked(walk-mm-mmap_sem)); - vma = find_vma(walk-mm, addr); - if (vma vma-vm_start = addr is_vm_hugetlb_page(vma)) - return vma; - - return NULL; -} - #else /* CONFIG_HUGETLB_PAGE */ static struct vm_area_struct* hugetlb_vma(unsigned long addr, struct mm_walk *walk) { @@ -200,28 +184,46 @@ int walk_page_range(unsigned long addr, pgd = pgd_offset(walk-mm, addr); do { - struct vm_area_struct *vma; + struct vm_area_struct *vma = NULL; next = pgd_addr_end(addr, end); /* -* handle hugetlb vma individually because pagetable walk for -* the hugetlb page is dependent on the architecture and -* we can't handled it in the same manner as non-huge pages. +* Check any special vma's within this range. */ - vma = hugetlb_vma(addr, walk); + VM_BUG_ON(!rwsem_is_locked(walk-mm-mmap_sem)); I think this should be moved out of the iteration. It's currently inside it even before your patch, but I think it's pointless. I don't follow. We are iterating through a range of addresses. When we come to a range that is VM_PFNMAP we skip it. How can we take that out of the iteration? + vma = find_vma(walk-mm, addr); if (vma) { - if (vma-vm_end next) + /* +* There are no page structures backing a VM_PFNMAP +* range, so allow no split_huge_page_pmd(). +*/ + if (vma-vm_flags VM_PFNMAP) { next = vma-vm_end; + pgd = pgd_offset(walk-mm, next); + continue; + } What if end vma-vm_end? Yes, a bad omission. Thanks for pointing that out. 
It should be if ((vma-vm_start = addr) (vma-vm_flags VM_PFNMAP)) as find_vma can return a vma above the addr. -Cliff /* -* Hugepage is very tightly coupled with vma, so -* walk through hugetlb entries within a given vma. +* Handle hugetlb vma individually because pagetable +* walk for the hugetlb page is dependent on the +* architecture and we can't handled it in the same +* manner as non-huge pages. */ - err = walk_hugetlb_range(vma, addr, next, walk); - if (err) - break; - pgd = pgd_offset(walk-mm, next); - continue; + if (walk-hugetlb_entry (vma-vm_start = addr) + is_vm_hugetlb_page(vma)) { + if (vma-vm_end next) + next = vma-vm_end; + /* +* Hugepage is very tightly coupled with vma, +* so walk through hugetlb entries within a +* given vma. +*/ + err = walk_hugetlb_range(vma, addr, next, walk); + if (err) + break; + pgd = pgd_offset(walk-mm, next); + continue; + } } if (pgd_none_or_clear_bad(pgd)) { -- Cliff Wickman SGI c...@sgi.com (651) 683-3824 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] fs/proc: smaps should avoid VM_PFNMAP areas
> On Tue, Apr 30, 2013 at 01:11:45PM -0500, Naoya Horiguchi wrote: > > > > /proc//smaps should not be looking at VM_PFNMAP areas. > > > > Certain tests in show_smap() (especially for huge pages) assume that the > > mapped PFN's are backed with page structures. And this is not usually true > > for VM_PFNMAP areas. This can result in panics on kernel page faults when > > attempting to address those page structures. > > I think it's strange that you mention to hugepages, because in my > understanding > VM_PFNMAP and hugepage related vma (VM_HUGEPAGE or VM_HUGETLB) should not set > at the same time. In what testcase are these flags both set? I don't think VM_PFNMAP and VM_HUGE... set at the same time. The problem is that a VM_PFNMAP'd area might have 2MB mappings in its page table, but they may point to pfn's that are not backed by page structures. Then a sequence like: show_smap show_map_vma walk_page_range walk_pud_range walk_pmd_range split_huge_page_pmd(walk->mm, pmd) __split_huge_page_pmd page = pmd_page(*pmd) can address (vmemmap + (pfn)) and panic Or a sequence like this: walk_pmd_range walk->pmd_entry(pmd, addr, next, walk) smaps_pte_range smaps_pte_entry(*pte, addr, PAGE_SIZE, walk) page = vm_normal_page(vma, addr, ptent) return pfn_to_page(pfn) > > And I guess this race can also happen on reading pagemap or numa_maps because > walk_page_range() is called in those code paths. Are you sure the race doesn't > happen on these paths? If not, please add a few more flag checks for them. Okay. I'll check and submit a version 2 of this patch. -Cliff Wickman > Thanks, > Naoya Horiguchi > > > > VM_PFNMAP areas are used by > > - graphics memory manager gpu/drm/drm_gem.c > > - global reference unit sgi-gru/grufile.c > > - sgi special memorychar/mspec.c > > - probably several out-of-tree modules > > > > I'm copying everyone who has changed fs/proc/task_mmu.c recently, in case > > of some reason to provide /proc//smaps for these areas that I am not > > aware of. 
> > > > Signed-off-by: Cliff Wickman > > --- > > fs/proc/task_mmu.c |3 +++ > > 1 file changed, 3 insertions(+) > > > > Index: linux/fs/proc/task_mmu.c > > === > > --- linux.orig/fs/proc/task_mmu.c > > +++ linux/fs/proc/task_mmu.c > > @@ -589,6 +589,9 @@ static int show_smap(struct seq_file *m, > > .private = , > > }; > > > > + if (vma->vm_flags & VM_PFNMAP) > > + return 0; > > + > > memset(, 0, sizeof mss); > > mss.vma = vma; > > /* mmap_sem is held in m_start */ -- Cliff Wickman SGI c...@sgi.com (651) 683-3824 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] fs/proc: smaps should avoid VM_PFNMAP areas
On Tue, Apr 30, 2013 at 01:11:45PM -0500, Naoya Horiguchi wrote: /proc/pid/smaps should not be looking at VM_PFNMAP areas. Certain tests in show_smap() (especially for huge pages) assume that the mapped PFN's are backed with page structures. And this is not usually true for VM_PFNMAP areas. This can result in panics on kernel page faults when attempting to address those page structures. I think it's strange that you mention to hugepages, because in my understanding VM_PFNMAP and hugepage related vma (VM_HUGEPAGE or VM_HUGETLB) should not set at the same time. In what testcase are these flags both set? I don't think VM_PFNMAP and VM_HUGE... set at the same time. The problem is that a VM_PFNMAP'd area might have 2MB mappings in its page table, but they may point to pfn's that are not backed by page structures. Then a sequence like: show_smap show_map_vma walk_page_range walk_pud_range walk_pmd_range split_huge_page_pmd(walk->mm, pmd) __split_huge_page_pmd page = pmd_page(*pmd) can address (vmemmap + (pfn)) and panic Or a sequence like this: walk_pmd_range walk->pmd_entry(pmd, addr, next, walk) smaps_pte_range smaps_pte_entry(*pte, addr, PAGE_SIZE, walk) page = vm_normal_page(vma, addr, ptent) return pfn_to_page(pfn) And I guess this race can also happen on reading pagemap or numa_maps because walk_page_range() is called in those code paths. Are you sure the race doesn't happen on these paths? If not, please add a few more flag checks for them. Okay. I'll check and submit a version 2 of this patch. -Cliff Wickman Thanks, Naoya Horiguchi VM_PFNMAP areas are used by - graphics memory manager gpu/drm/drm_gem.c - global reference unit sgi-gru/grufile.c - sgi special memorychar/mspec.c - probably several out-of-tree modules I'm copying everyone who has changed fs/proc/task_mmu.c recently, in case of some reason to provide /proc/pid/smaps for these areas that I am not aware of. 
Signed-off-by: Cliff Wickman c...@sgi.com --- fs/proc/task_mmu.c |3 +++ 1 file changed, 3 insertions(+) Index: linux/fs/proc/task_mmu.c === --- linux.orig/fs/proc/task_mmu.c +++ linux/fs/proc/task_mmu.c @@ -589,6 +589,9 @@ static int show_smap(struct seq_file *m, .private = &mss, }; + if (vma->vm_flags & VM_PFNMAP) + return 0; + memset(&mss, 0, sizeof mss); mss.vma = vma; /* mmap_sem is held in m_start */ -- Cliff Wickman SGI c...@sgi.com (651) 683-3824 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH] fs/proc: smaps should avoid VM_PFNMAP areas
/proc//smaps should not be looking at VM_PFNMAP areas. Certain tests in show_smap() (especially for huge pages) assume that the mapped PFN's are backed with page structures. And this is not usually true for VM_PFNMAP areas. This can result in panics on kernel page faults when attempting to address those page structures. VM_PFNMAP areas are used by - graphics memory manager gpu/drm/drm_gem.c - global reference unit sgi-gru/grufile.c - sgi special memorychar/mspec.c - probably several out-of-tree modules I'm copying everyone who has changed fs/proc/task_mmu.c recently, in case of some reason to provide /proc//smaps for these areas that I am not aware of. Signed-off-by: Cliff Wickman --- fs/proc/task_mmu.c |3 +++ 1 file changed, 3 insertions(+) Index: linux/fs/proc/task_mmu.c === --- linux.orig/fs/proc/task_mmu.c +++ linux/fs/proc/task_mmu.c @@ -589,6 +589,9 @@ static int show_smap(struct seq_file *m, .private = , }; + if (vma->vm_flags & VM_PFNMAP) + return 0; + memset(, 0, sizeof mss); mss.vma = vma; /* mmap_sem is held in m_start */ -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH] fs/proc: smaps should avoid VM_PFNMAP areas
/proc/pid/smaps should not be looking at VM_PFNMAP areas. Certain tests in show_smap() (especially for huge pages) assume that the mapped PFN's are backed with page structures. And this is not usually true for VM_PFNMAP areas. This can result in panics on kernel page faults when attempting to address those page structures. VM_PFNMAP areas are used by - graphics memory manager gpu/drm/drm_gem.c - global reference unit sgi-gru/grufile.c - sgi special memorychar/mspec.c - probably several out-of-tree modules I'm copying everyone who has changed fs/proc/task_mmu.c recently, in case of some reason to provide /proc/pid/smaps for these areas that I am not aware of. Signed-off-by: Cliff Wickman c...@sgi.com --- fs/proc/task_mmu.c |3 +++ 1 file changed, 3 insertions(+) Index: linux/fs/proc/task_mmu.c === --- linux.orig/fs/proc/task_mmu.c +++ linux/fs/proc/task_mmu.c @@ -589,6 +589,9 @@ static int show_smap(struct seq_file *m, .private = &mss, }; + if (vma->vm_flags & VM_PFNMAP) + return 0; + memset(&mss, 0, sizeof mss); mss.vma = vma; /* mmap_sem is held in m_start */ -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH v4 0/8] kdump, vmcore: support mmap() on /proc/vmcore
On Fri, Apr 05, 2013 at 12:04:02AM +, HATAYAMA Daisuke wrote: > Currently, read to /proc/vmcore is done by read_oldmem() that uses > ioremap/iounmap per a single page. For example, if memory is 1GB, > ioremap/iounmap is called (1GB / 4KB)-times, that is, 262144 > times. This causes big performance degradation. > > In particular, the current main user of this mmap() is makedumpfile, > which not only reads memory from /proc/vmcore but also does other > processing like filtering, compression and IO work. > > To address the issue, this patch implements mmap() on /proc/vmcore to > improve read performance. > > Benchmark > = > > You can see two benchmarks on terabyte memory system. Both show about > 40 seconds on 2TB system. This is almost equal to performance by > experimtanal kernel-side memory filtering. > > - makedumpfile mmap() benchmark, by Jingbai Ma > https://lkml.org/lkml/2013/3/27/19 > > - makedumpfile: benchmark on mmap() with /proc/vmcore on 2TB memory system > https://lkml.org/lkml/2013/3/26/914 > > ChangeLog > = > > v3 => v4) > > - Rebase 3.9-rc7. > - Drop clean-up patches orthogonal to the main topic of this patch set. > - Copy ELF note segments in the 1st kernel just as in v1. Allocate > vmcore objects per pages. => See [PATCH 5/8] > - Map memory referenced by PT_LOAD entry directly even if the start or > end of the region doesn't fit inside page boundary, no longer copy > them as the previous v3. Then, holes, outside OS memory, are visible > from /proc/vmcore. => See [PATCH 7/8] > > v2 => v3) > > - Rebase 3.9-rc3. > - Copy program headers seprately from e_phoff in ELF note segment > buffer. Now there's no risk to allocate huge memory if program > header table positions after memory segment. > - Add cleanup patch that removes unnecessary variable. > - Fix wrongly using the variable that is buffer size configurable at > runtime. Instead, use the varibale that has original buffer size. 
> > v1 => v2) > > - Clean up the existing codes: use e_phoff, and remove the assumption > on PT_NOTE entries. > - Fix potencial bug that ELF haeader size is not included in exported > vmcoreinfo size. > - Divide patch modifying read_vmcore() into two: clean-up and primary > code change. > - Put ELF note segments in page-size boundary on the 1st kernel > instead of copying them into the buffer on the 2nd kernel. > > Test > > > This patch set is composed based on v3.9-rc7. > > Done on x86-64, x86-32 both with 1GB and over 4GB memory environments. > > --- > > HATAYAMA Daisuke (8): > vmcore: support mmap() on /proc/vmcore > vmcore: treat memory chunks referenced by PT_LOAD program header > entries in \ > page-size boundary in vmcore_list > vmcore: count holes generated by round-up operation for page boudary > for size \ > of /proc/vmcore > vmcore: copy ELF note segments in the 2nd kernel per page vmcore objects > vmcore: Add helper function vmcore_add() > vmcore, procfs: introduce MEM_TYPE_CURRENT_KERNEL flag to distinguish > objects \ > copied in 2nd kernel vmcore: clean up read_vmcore() > vmcore: allocate buffer for ELF headers on page-size alignment > > > fs/proc/vmcore.c| 349 > --- > include/linux/proc_fs.h |8 + > 2 files changed, 245 insertions(+), 112 deletions(-) > > -- > > Thanks. > HATAYAMA, Daisuke This is a very important patch set for speeding the kdump process. (patches 1 - 8) We have found the mmap interface to /proc/vmcore about 80x faster than the read interface. That is, doing mmap's and copying data (in pieces the size of page structures) transfers all of /proc/vmcore about 80 times faster than reading it. This greatly speeds up the capture of a kdump, as the scan of page structures takes the bulk of the time in dumping the OS on a machine with terabytes of memory. We would very much like to see this set make it into the 3.10 release. 
Acked-by: Cliff Wickman -Cliff -- Cliff Wickman SGI c...@sgi.com (651) 683-3824 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH v4 0/8] kdump, vmcore: support mmap() on /proc/vmcore
On Fri, Apr 05, 2013 at 12:04:02AM +, HATAYAMA Daisuke wrote: Currently, read to /proc/vmcore is done by read_oldmem() that uses ioremap/iounmap per a single page. For example, if memory is 1GB, ioremap/iounmap is called (1GB / 4KB)-times, that is, 262144 times. This causes big performance degradation. In particular, the current main user of this mmap() is makedumpfile, which not only reads memory from /proc/vmcore but also does other processing like filtering, compression and IO work. To address the issue, this patch implements mmap() on /proc/vmcore to improve read performance. Benchmark = You can see two benchmarks on terabyte memory system. Both show about 40 seconds on 2TB system. This is almost equal to performance by experimtanal kernel-side memory filtering. - makedumpfile mmap() benchmark, by Jingbai Ma https://lkml.org/lkml/2013/3/27/19 - makedumpfile: benchmark on mmap() with /proc/vmcore on 2TB memory system https://lkml.org/lkml/2013/3/26/914 ChangeLog = v3 = v4) - Rebase 3.9-rc7. - Drop clean-up patches orthogonal to the main topic of this patch set. - Copy ELF note segments in the 1st kernel just as in v1. Allocate vmcore objects per pages. = See [PATCH 5/8] - Map memory referenced by PT_LOAD entry directly even if the start or end of the region doesn't fit inside page boundary, no longer copy them as the previous v3. Then, holes, outside OS memory, are visible from /proc/vmcore. = See [PATCH 7/8] v2 = v3) - Rebase 3.9-rc3. - Copy program headers seprately from e_phoff in ELF note segment buffer. Now there's no risk to allocate huge memory if program header table positions after memory segment. - Add cleanup patch that removes unnecessary variable. - Fix wrongly using the variable that is buffer size configurable at runtime. Instead, use the varibale that has original buffer size. v1 = v2) - Clean up the existing codes: use e_phoff, and remove the assumption on PT_NOTE entries. 
- Fix potencial bug that ELF haeader size is not included in exported vmcoreinfo size. - Divide patch modifying read_vmcore() into two: clean-up and primary code change. - Put ELF note segments in page-size boundary on the 1st kernel instead of copying them into the buffer on the 2nd kernel. Test This patch set is composed based on v3.9-rc7. Done on x86-64, x86-32 both with 1GB and over 4GB memory environments. --- HATAYAMA Daisuke (8): vmcore: support mmap() on /proc/vmcore vmcore: treat memory chunks referenced by PT_LOAD program header entries in \ page-size boundary in vmcore_list vmcore: count holes generated by round-up operation for page boudary for size \ of /proc/vmcore vmcore: copy ELF note segments in the 2nd kernel per page vmcore objects vmcore: Add helper function vmcore_add() vmcore, procfs: introduce MEM_TYPE_CURRENT_KERNEL flag to distinguish objects \ copied in 2nd kernel vmcore: clean up read_vmcore() vmcore: allocate buffer for ELF headers on page-size alignment fs/proc/vmcore.c| 349 --- include/linux/proc_fs.h |8 + 2 files changed, 245 insertions(+), 112 deletions(-) -- Thanks. HATAYAMA, Daisuke This is a very important patch set for speeding the kdump process. (patches 1 - 8) We have found the mmap interface to /proc/vmcore about 80x faster than the read interface. That is, doing mmap's and copying data (in pieces the size of page structures) transfers all of /proc/vmcore about 80 times faster than reading it. This greatly speeds up the capture of a kdump, as the scan of page structures takes the bulk of the time in dumping the OS on a machine with terabytes of memory. We would very much like to see this set make it into the 3.10 release. 
Acked-by: Cliff Wickman c...@sgi.com -Cliff -- Cliff Wickman SGI c...@sgi.com (651) 683-3824 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] mm, x86: no zeroing of hugetlbfs pages at boot
On Thu, Apr 04, 2013 at 08:17:08AM +0800, Simon Jeons wrote: > On 03/07/2013 05:50 AM, Cliff Wickman wrote: >> From: Cliff Wickman >> >> Allocating a large number of 1GB hugetlbfs pages at boot takes a >> very long time. >> >> Large system sites would at times like to allocate a very large amount of >> memory as 1GB pages. They would put this on the kernel boot line: >> default_hugepagesz=1G hugepagesz=1G hugepages=4096 >> [Dynamic allocation of 1G pages is not an option, as zone pages only go >> up to MAX_ORDER, and MAX_ORDER cannot exceed the section size.] >> >> Each page is zeroed as it is allocated, and all allocation is done by >> cpu 0, as this path is early in boot: > > How you confirm they are done by cpu 0? just cpu 0 works during boot? Yes, in kernel_init() you see the call to do_pre_smp_initcalls() just before the call to smp_init(). It is smp_init() that starts the other cpus. They don't come out of reset until then. >>start_kernel >> kernel_init >>do_pre_smp_initcalls >> hugetlb_init >>hugetlb_init_hstates >> hugetlb_hstate_alloc_pages >> >> Zeroing remote (offnode) memory takes ~1GB/sec (and most memory is offnode >> on large numa systems). >> This estimate is approximate (it depends on core frequency & number of hops >> to remote memory) but should be within a factor of 2 on most systems. >> A benchmark attempting to reserve a TB for 1GB pages would thus require >> ~1000 seconds of boot time just for this allocating. 32TB would take 8 >> hours. >> >> I propose passing a flag to the early allocator to indicate that no zeroing >> of a page should be done. 
The 'no zeroing' flag would have to be passed >> down this code path: >> >>hugetlb_hstate_alloc_pages >> alloc_bootmem_huge_page >>__alloc_bootmem_node_nopanic NO_ZERO (nobootmem.c) >> __alloc_memory_core_early NO_ZERO >>if (!(flags & NO_ZERO)) >> memset(ptr, 0, size); >> >> Or this path if CONFIG_NO_BOOTMEM is not set: >> >>hugetlb_hstate_alloc_pages >> alloc_bootmem_huge_page >>__alloc_bootmem_node_nopanic NO_ZERO (bootmem.c) >> alloc_bootmem_core NO_ZERO >>if (!(flags & NO_ZERO)) >> memset(region, 0, size); >> __alloc_bootmem_nopanic NO_ZERO >>___alloc_bootmem_nopanic NO_ZERO >> alloc_bootmem_core NO_ZERO >>if (!(flags & NO_ZERO)) >> memset(region, 0, size); >> >> Signed-off-by: Cliff Wickman >> >> --- >> arch/x86/kernel/setup_percpu.c |4 ++-- >> include/linux/bootmem.h| 23 --- >> mm/bootmem.c | 12 +++- >> mm/hugetlb.c |3 ++- >> mm/nobootmem.c | 41 >> +++-- >> mm/page_cgroup.c |2 +- >> mm/sparse.c|2 +- >> 7 files changed, 52 insertions(+), 35 deletions(-) >> >> Index: linux/include/linux/bootmem.h >> === >> --- linux.orig/include/linux/bootmem.h >> +++ linux/include/linux/bootmem.h >> @@ -8,6 +8,11 @@ >> #include >> /* >> + * allocation flags >> + */ >> +#define NO_ZERO 0x0001 >> + >> +/* >>* simple boot-time physical memory area allocator. >>*/ >> @@ -79,7 +84,8 @@ extern void *__alloc_bootmem(unsigned lo >> unsigned long goal); >> extern void *__alloc_bootmem_nopanic(unsigned long size, >> unsigned long align, >> - unsigned long goal); >> + unsigned long goal, >> + u32 flags); >> extern void *__alloc_bootmem_node(pg_data_t *pgdat, >>unsigned long size, >>unsigned long align, >> @@ -91,12 +97,14 @@ void *__alloc_bootmem_node_high(pg_data_ >> extern void *__alloc_bootmem_node_nopanic(pg_data_t *pgdat, >>unsigned long size, >>unsigned long align, >> - unsigned long goal); >> + unsigned long goal, >> + u32 flags); >
Re: [PATCH] mm, x86: no zeroing of hugetlbfs pages at boot
On Thu, Apr 04, 2013 at 08:17:08AM +0800, Simon Jeons wrote: On 03/07/2013 05:50 AM, Cliff Wickman wrote: From: Cliff Wickman c...@sgi.com Allocating a large number of 1GB hugetlbfs pages at boot takes a very long time. Large system sites would at times like to allocate a very large amount of memory as 1GB pages. They would put this on the kernel boot line: default_hugepagesz=1G hugepagesz=1G hugepages=4096 [Dynamic allocation of 1G pages is not an option, as zone pages only go up to MAX_ORDER, and MAX_ORDER cannot exceed the section size.] Each page is zeroed as it is allocated, and all allocation is done by cpu 0, as this path is early in boot: How you confirm they are done by cpu 0? just cpu 0 works during boot? Yes, in kernel_init() you see the call to do_pre_smp_initcalls() just before the call to smp_init(). It is smp_init() that starts the other cpus. They don't come out of reset until then. start_kernel kernel_init do_pre_smp_initcalls hugetlb_init hugetlb_init_hstates hugetlb_hstate_alloc_pages Zeroing remote (offnode) memory takes ~1GB/sec (and most memory is offnode on large numa systems). This estimate is approximate (it depends on core frequency number of hops to remote memory) but should be within a factor of 2 on most systems. A benchmark attempting to reserve a TB for 1GB pages would thus require ~1000 seconds of boot time just for this allocating. 32TB would take 8 hours. I propose passing a flag to the early allocator to indicate that no zeroing of a page should be done. 
The 'no zeroing' flag would have to be passed down this code path: hugetlb_hstate_alloc_pages alloc_bootmem_huge_page __alloc_bootmem_node_nopanic NO_ZERO (nobootmem.c) __alloc_memory_core_early NO_ZERO if (!(flags NO_ZERO)) memset(ptr, 0, size); Or this path if CONFIG_NO_BOOTMEM is not set: hugetlb_hstate_alloc_pages alloc_bootmem_huge_page __alloc_bootmem_node_nopanic NO_ZERO (bootmem.c) alloc_bootmem_core NO_ZERO if (!(flags NO_ZERO)) memset(region, 0, size); __alloc_bootmem_nopanic NO_ZERO ___alloc_bootmem_nopanic NO_ZERO alloc_bootmem_core NO_ZERO if (!(flags NO_ZERO)) memset(region, 0, size); Signed-off-by: Cliff Wickman c...@sgi.com --- arch/x86/kernel/setup_percpu.c |4 ++-- include/linux/bootmem.h| 23 --- mm/bootmem.c | 12 +++- mm/hugetlb.c |3 ++- mm/nobootmem.c | 41 +++-- mm/page_cgroup.c |2 +- mm/sparse.c|2 +- 7 files changed, 52 insertions(+), 35 deletions(-) Index: linux/include/linux/bootmem.h === --- linux.orig/include/linux/bootmem.h +++ linux/include/linux/bootmem.h @@ -8,6 +8,11 @@ #include asm/dma.h /* + * allocation flags + */ +#define NO_ZERO 0x0001 + +/* * simple boot-time physical memory area allocator. 
*/ @@ -79,7 +84,8 @@ extern void *__alloc_bootmem(unsigned lo unsigned long goal); extern void *__alloc_bootmem_nopanic(unsigned long size, unsigned long align, - unsigned long goal); + unsigned long goal, + u32 flags); extern void *__alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, unsigned long align, @@ -91,12 +97,14 @@ void *__alloc_bootmem_node_high(pg_data_ extern void *__alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size, unsigned long align, - unsigned long goal); + unsigned long goal, + u32 flags); void *___alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal, - unsigned long limit); + unsigned long limit, + u32 flags); extern void *__alloc_bootmem_low(unsigned long size, unsigned long align, unsigned long goal); @@ -120,19 +128,20 @@ extern void *__alloc_bootmem_low_node(pg #define alloc_bootmem_align(x, align) \ __alloc_bootmem(x, align, BOOTMEM_LOW_LIMIT) #define alloc_bootmem_nopanic(x) \ -__alloc_bootmem_nopanic(x, SMP_CACHE_BYTES
Re: [PATCH] mm, x86: no zeroing of hugetlbfs pages at boot
On Sun, Mar 10, 2013 at 01:55:10PM +0800, Hillf Danton wrote: > On Thu, Mar 7, 2013 at 5:50 AM, Cliff Wickman wrote: > > From: Cliff Wickman > > > > Allocating a large number of 1GB hugetlbfs pages at boot takes a > > very long time. > > > > Large system sites would at times like to allocate a very large amount of > > memory as 1GB pages. They would put this on the kernel boot line: > >default_hugepagesz=1G hugepagesz=1G hugepages=4096 > > [Dynamic allocation of 1G pages is not an option, as zone pages only go > > up to MAX_ORDER, and MAX_ORDER cannot exceed the section size.] > > > > Each page is zeroed as it is allocated, and all allocation is done by > > cpu 0, as this path is early in boot: > > start_kernel > > kernel_init > > do_pre_smp_initcalls > > hugetlb_init > > hugetlb_init_hstates > > hugetlb_hstate_alloc_pages > > > > Zeroing remote (offnode) memory takes ~1GB/sec (and most memory is offnode > > on large numa systems). > > This estimate is approximate (it depends on core frequency & number of hops > > to remote memory) but should be within a factor of 2 on most systems. > > A benchmark attempting to reserve a TB for 1GB pages would thus require > > ~1000 seconds of boot time just for this allocating. 32TB would take 8 > > hours. > > > > I propose passing a flag to the early allocator to indicate that no zeroing > > of a page should be done. The 'no zeroing' flag would have to be passed > > down this code path: > > > > FYI: huge pages are cleared just after allocated, for instance, > clear_huge_page() in hugetlb_no_page() > > Hillf Yes, I should have added that comment to the changelog. And because this is true there is no need to clear a huge page at boot time. 
-Cliff > > hugetlb_hstate_alloc_pages > > alloc_bootmem_huge_page > > __alloc_bootmem_node_nopanic NO_ZERO (nobootmem.c) > > __alloc_memory_core_early NO_ZERO > > if (!(flags & NO_ZERO)) > > memset(ptr, 0, size); > > > > Or this path if CONFIG_NO_BOOTMEM is not set: > > > > hugetlb_hstate_alloc_pages > > alloc_bootmem_huge_page > > __alloc_bootmem_node_nopanic NO_ZERO (bootmem.c) > > alloc_bootmem_core NO_ZERO > > if (!(flags & NO_ZERO)) > > memset(region, 0, size); > > __alloc_bootmem_nopanic NO_ZERO > > ___alloc_bootmem_nopanic NO_ZERO > > alloc_bootmem_core NO_ZERO > > if (!(flags & NO_ZERO)) > > memset(region, 0, size); > > > > Signed-off-by: Cliff Wickman > > > > --- > > arch/x86/kernel/setup_percpu.c |4 ++-- > > include/linux/bootmem.h| 23 --- > > mm/bootmem.c | 12 +++- > > mm/hugetlb.c |3 ++- > > mm/nobootmem.c | 41 > > +++-- > > mm/page_cgroup.c |2 +- > > mm/sparse.c|2 +- > > 7 files changed, 52 insertions(+), 35 deletions(-) > > > > Index: linux/include/linux/bootmem.h > > === > > --- linux.orig/include/linux/bootmem.h > > +++ linux/include/linux/bootmem.h > > @@ -8,6 +8,11 @@ > > #include > > > > /* > > + * allocation flags > > + */ > > +#define NO_ZERO0x0001 > > + > > +/* > > * simple boot-time physical memory area allocator. > > */ > > > > @@ -79,7 +84,8 @@ extern void *__alloc_bootmem(unsigned lo > > unsigned long goal); > > extern void *__alloc_bootmem_nopanic(unsigned long size, > > unsigned long align, > > -unsigned long goal); > > +unsigned long goal, > > +u32 flags); > > extern void *__alloc_bootmem_node(pg_data_t *pgdat, > > unsigned long size, > > unsigned long align, > > @@ -91,12 +97,14 @@ void *__alloc_bootmem_node_high(pg_data_ > > extern void *__alloc_bootmem_node_nopanic(pg_data_t *pgdat, > > unsigned long size, > > unsigned long align, > > -
Re: [PATCH] mm, x86: no zeroing of hugetlbfs pages at boot
On Sun, Mar 10, 2013 at 01:55:10PM +0800, Hillf Danton wrote: On Thu, Mar 7, 2013 at 5:50 AM, Cliff Wickman c...@sgi.com wrote: From: Cliff Wickman c...@sgi.com Allocating a large number of 1GB hugetlbfs pages at boot takes a very long time. Large system sites would at times like to allocate a very large amount of memory as 1GB pages. They would put this on the kernel boot line: default_hugepagesz=1G hugepagesz=1G hugepages=4096 [Dynamic allocation of 1G pages is not an option, as zone pages only go up to MAX_ORDER, and MAX_ORDER cannot exceed the section size.] Each page is zeroed as it is allocated, and all allocation is done by cpu 0, as this path is early in boot: start_kernel kernel_init do_pre_smp_initcalls hugetlb_init hugetlb_init_hstates hugetlb_hstate_alloc_pages Zeroing remote (offnode) memory takes ~1GB/sec (and most memory is offnode on large numa systems). This estimate is approximate (it depends on core frequency number of hops to remote memory) but should be within a factor of 2 on most systems. A benchmark attempting to reserve a TB for 1GB pages would thus require ~1000 seconds of boot time just for this allocating. 32TB would take 8 hours. I propose passing a flag to the early allocator to indicate that no zeroing of a page should be done. The 'no zeroing' flag would have to be passed down this code path: FYI: huge pages are cleared just after allocated, for instance, clear_huge_page() in hugetlb_no_page() Hillf Yes, I should have added that comment to the changelog. And because this is true there is no need to clear a huge page at boot time. 
-Cliff hugetlb_hstate_alloc_pages alloc_bootmem_huge_page __alloc_bootmem_node_nopanic NO_ZERO (nobootmem.c) __alloc_memory_core_early NO_ZERO if (!(flags NO_ZERO)) memset(ptr, 0, size); Or this path if CONFIG_NO_BOOTMEM is not set: hugetlb_hstate_alloc_pages alloc_bootmem_huge_page __alloc_bootmem_node_nopanic NO_ZERO (bootmem.c) alloc_bootmem_core NO_ZERO if (!(flags NO_ZERO)) memset(region, 0, size); __alloc_bootmem_nopanic NO_ZERO ___alloc_bootmem_nopanic NO_ZERO alloc_bootmem_core NO_ZERO if (!(flags NO_ZERO)) memset(region, 0, size); Signed-off-by: Cliff Wickman c...@sgi.com --- arch/x86/kernel/setup_percpu.c |4 ++-- include/linux/bootmem.h| 23 --- mm/bootmem.c | 12 +++- mm/hugetlb.c |3 ++- mm/nobootmem.c | 41 +++-- mm/page_cgroup.c |2 +- mm/sparse.c|2 +- 7 files changed, 52 insertions(+), 35 deletions(-) Index: linux/include/linux/bootmem.h === --- linux.orig/include/linux/bootmem.h +++ linux/include/linux/bootmem.h @@ -8,6 +8,11 @@ #include asm/dma.h /* + * allocation flags + */ +#define NO_ZERO0x0001 + +/* * simple boot-time physical memory area allocator. */ @@ -79,7 +84,8 @@ extern void *__alloc_bootmem(unsigned lo unsigned long goal); extern void *__alloc_bootmem_nopanic(unsigned long size, unsigned long align, -unsigned long goal); +unsigned long goal, +u32 flags); extern void *__alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, unsigned long align, @@ -91,12 +97,14 @@ void *__alloc_bootmem_node_high(pg_data_ extern void *__alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size, unsigned long align, - unsigned long goal); + unsigned long goal, + u32 flags); void *___alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal, - unsigned long limit); + unsigned long limit, + u32 flags); extern void *__alloc_bootmem_low(unsigned long size, unsigned long align, unsigned long goal); @@ -120,19 +128,20 @@ extern void *__alloc_bootmem_low_node(pg #define alloc_bootmem_align(x
[PATCH] mm, x86: no zeroing of hugetlbfs pages at boot
From: Cliff Wickman Allocating a large number of 1GB hugetlbfs pages at boot takes a very long time. Large system sites would at times like to allocate a very large amount of memory as 1GB pages. They would put this on the kernel boot line: default_hugepagesz=1G hugepagesz=1G hugepages=4096 [Dynamic allocation of 1G pages is not an option, as zone pages only go up to MAX_ORDER, and MAX_ORDER cannot exceed the section size.] Each page is zeroed as it is allocated, and all allocation is done by cpu 0, as this path is early in boot: start_kernel kernel_init do_pre_smp_initcalls hugetlb_init hugetlb_init_hstates hugetlb_hstate_alloc_pages Zeroing remote (offnode) memory takes ~1GB/sec (and most memory is offnode on large numa systems). This estimate is approximate (it depends on core frequency & number of hops to remote memory) but should be within a factor of 2 on most systems. A benchmark attempting to reserve a TB for 1GB pages would thus require ~1000 seconds of boot time just for this allocating. 32TB would take 8 hours. I propose passing a flag to the early allocator to indicate that no zeroing of a page should be done. 
The 'no zeroing' flag would have to be passed down this code path: hugetlb_hstate_alloc_pages alloc_bootmem_huge_page __alloc_bootmem_node_nopanic NO_ZERO (nobootmem.c) __alloc_memory_core_early NO_ZERO if (!(flags & NO_ZERO)) memset(ptr, 0, size); Or this path if CONFIG_NO_BOOTMEM is not set: hugetlb_hstate_alloc_pages alloc_bootmem_huge_page __alloc_bootmem_node_nopanic NO_ZERO (bootmem.c) alloc_bootmem_core NO_ZERO if (!(flags & NO_ZERO)) memset(region, 0, size); __alloc_bootmem_nopanic NO_ZERO ___alloc_bootmem_nopanic NO_ZERO alloc_bootmem_core NO_ZERO if (!(flags & NO_ZERO)) memset(region, 0, size); Signed-off-by: Cliff Wickman --- arch/x86/kernel/setup_percpu.c |4 ++-- include/linux/bootmem.h| 23 --- mm/bootmem.c | 12 +++- mm/hugetlb.c |3 ++- mm/nobootmem.c | 41 +++-- mm/page_cgroup.c |2 +- mm/sparse.c|2 +- 7 files changed, 52 insertions(+), 35 deletions(-) Index: linux/include/linux/bootmem.h === --- linux.orig/include/linux/bootmem.h +++ linux/include/linux/bootmem.h @@ -8,6 +8,11 @@ #include /* + * allocation flags + */ +#define NO_ZERO0x0001 + +/* * simple boot-time physical memory area allocator. 
*/ @@ -79,7 +84,8 @@ extern void *__alloc_bootmem(unsigned lo unsigned long goal); extern void *__alloc_bootmem_nopanic(unsigned long size, unsigned long align, -unsigned long goal); +unsigned long goal, +u32 flags); extern void *__alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, unsigned long align, @@ -91,12 +97,14 @@ void *__alloc_bootmem_node_high(pg_data_ extern void *__alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size, unsigned long align, - unsigned long goal); + unsigned long goal, + u32 flags); void *___alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal, - unsigned long limit); + unsigned long limit, + u32 flags); extern void *__alloc_bootmem_low(unsigned long size, unsigned long align, unsigned long goal); @@ -120,19 +128,20 @@ extern void *__alloc_bootmem_low_node(pg #define alloc_bootmem_align(x, align) \ __alloc_bootmem(x, align, BOOTMEM_LOW_LIMIT) #define alloc_bootmem_nopanic(x) \ - __alloc_bootmem_nopanic(x, SMP_CACHE_BYTES, BOOTMEM_LOW_LIMIT) + __alloc_bootmem_nopanic(x, SMP_CACHE_BYTES, BOOTMEM_LOW_LIMIT, 0) #define alloc_bootmem_pages(x) \ __alloc_bootmem(x, PAGE_SIZE, BOOTMEM_LOW_LIMIT) #define alloc_bootmem_pages_nopanic(x) \ - __alloc_bootmem_nopanic(x, PAGE_SIZE, BOOTMEM_LOW_LIMIT) + __alloc_bootmem_nopanic(x, PAGE_SIZE, BOOTMEM_LOW_LIMIT, 0) #define alloc_bootmem_node(pgdat, x) \ __alloc_bootmem_node(pgdat, x, SMP_CACHE_BYTES, B
[PATCH] mm, x86: no zeroing of hugetlbfs pages at boot
From: Cliff Wickman c...@sgi.com Allocating a large number of 1GB hugetlbfs pages at boot takes a very long time. Large system sites would at times like to allocate a very large amount of memory as 1GB pages. They would put this on the kernel boot line: default_hugepagesz=1G hugepagesz=1G hugepages=4096 [Dynamic allocation of 1G pages is not an option, as zone pages only go up to MAX_ORDER, and MAX_ORDER cannot exceed the section size.] Each page is zeroed as it is allocated, and all allocation is done by cpu 0, as this path is early in boot: start_kernel kernel_init do_pre_smp_initcalls hugetlb_init hugetlb_init_hstates hugetlb_hstate_alloc_pages Zeroing remote (offnode) memory takes ~1GB/sec (and most memory is offnode on large numa systems). This estimate is approximate (it depends on core frequency and number of hops to remote memory) but should be within a factor of 2 on most systems. A benchmark attempting to reserve a TB for 1GB pages would thus require ~1000 seconds of boot time just for this allocation. 32TB would take 8 hours. I propose passing a flag to the early allocator to indicate that no zeroing of a page should be done. 
The 'no zeroing' flag would have to be passed down this code path: hugetlb_hstate_alloc_pages alloc_bootmem_huge_page __alloc_bootmem_node_nopanic NO_ZERO (nobootmem.c) __alloc_memory_core_early NO_ZERO if (!(flags NO_ZERO)) memset(ptr, 0, size); Or this path if CONFIG_NO_BOOTMEM is not set: hugetlb_hstate_alloc_pages alloc_bootmem_huge_page __alloc_bootmem_node_nopanic NO_ZERO (bootmem.c) alloc_bootmem_core NO_ZERO if (!(flags NO_ZERO)) memset(region, 0, size); __alloc_bootmem_nopanic NO_ZERO ___alloc_bootmem_nopanic NO_ZERO alloc_bootmem_core NO_ZERO if (!(flags NO_ZERO)) memset(region, 0, size); Signed-off-by: Cliff Wickman c...@sgi.com --- arch/x86/kernel/setup_percpu.c |4 ++-- include/linux/bootmem.h| 23 --- mm/bootmem.c | 12 +++- mm/hugetlb.c |3 ++- mm/nobootmem.c | 41 +++-- mm/page_cgroup.c |2 +- mm/sparse.c|2 +- 7 files changed, 52 insertions(+), 35 deletions(-) Index: linux/include/linux/bootmem.h === --- linux.orig/include/linux/bootmem.h +++ linux/include/linux/bootmem.h @@ -8,6 +8,11 @@ #include asm/dma.h /* + * allocation flags + */ +#define NO_ZERO0x0001 + +/* * simple boot-time physical memory area allocator. 
*/ @@ -79,7 +84,8 @@ extern void *__alloc_bootmem(unsigned lo unsigned long goal); extern void *__alloc_bootmem_nopanic(unsigned long size, unsigned long align, -unsigned long goal); +unsigned long goal, +u32 flags); extern void *__alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, unsigned long align, @@ -91,12 +97,14 @@ void *__alloc_bootmem_node_high(pg_data_ extern void *__alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size, unsigned long align, - unsigned long goal); + unsigned long goal, + u32 flags); void *___alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal, - unsigned long limit); + unsigned long limit, + u32 flags); extern void *__alloc_bootmem_low(unsigned long size, unsigned long align, unsigned long goal); @@ -120,19 +128,20 @@ extern void *__alloc_bootmem_low_node(pg #define alloc_bootmem_align(x, align) \ __alloc_bootmem(x, align, BOOTMEM_LOW_LIMIT) #define alloc_bootmem_nopanic(x) \ - __alloc_bootmem_nopanic(x, SMP_CACHE_BYTES, BOOTMEM_LOW_LIMIT) + __alloc_bootmem_nopanic(x, SMP_CACHE_BYTES, BOOTMEM_LOW_LIMIT, 0) #define alloc_bootmem_pages(x) \ __alloc_bootmem(x, PAGE_SIZE, BOOTMEM_LOW_LIMIT) #define alloc_bootmem_pages_nopanic(x) \ - __alloc_bootmem_nopanic(x, PAGE_SIZE, BOOTMEM_LOW_LIMIT) + __alloc_bootmem_nopanic(x, PAGE_SIZE, BOOTMEM_LOW_LIMIT, 0) #define alloc_bootmem_node(pgdat, x) \ __alloc_bootmem_node(pgdat, x
[PATCH] kdump: do not drop entire e820 in crash kernel
From: Cliff Wickman The crash kernel is not able to find its root device if that device is not on PCI 0. This is because it is booted with the command line option memmap=exactmap which currently clears the e820 table. So ACPI processing does not find reserved i/o spaces. This works for a device on PCI 0 because ACPI falls back to a legacy mode. But the error message " [Firmware Bug]: PCI: MMCONFIG at [mem 0x8000-0x80cf] not reserved in ACPI motherboard resources" is written to the log even in this functioning case. It fails for some devices on UV2, and only for UV2, because SGI seems to be the only manufacturer currently using the extended PCI(>0). The fix is to not drop the entire e820 table on a memmap=exactmap, but to preserve all the non-E820_RAM reservations that the BIOS has made. Signed-off-by: Cliff Wickman --- arch/x86/kernel/e820.c | 14 +- 1 file changed, 13 insertions(+), 1 deletion(-) Index: linus.current/arch/x86/kernel/e820.c === --- linus.current.orig/arch/x86/kernel/e820.c +++ linus.current/arch/x86/kernel/e820.c @@ -839,6 +839,8 @@ static int __init parse_memmap_opt(char { char *oldp; u64 start_at, mem_size; + int i; + struct e820entry *curp, *availp; if (!p) return -EINVAL; @@ -852,7 +854,17 @@ static int __init parse_memmap_opt(char */ saved_max_pfn = e820_end_of_ram_pfn(); #endif - e820.nr_map = 0; + /* keep everything that was reserved by the BIOS */ + for (i = 0, curp = [0], availp = [0]; + i < e820.nr_map; i++, curp++) { + if (curp->type != E820_RAM) { + if (curp != availp) { + *availp = *curp; + availp++; + } + } + } + e820.nr_map = availp - [0]; userdef = 1; return 0; } -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH] kdump: do not drop entire e820 in crash kernel
From: Cliff Wickman c...@sgi.com The crash kernel is not able to find its root device if that device is not on PCI 0. This is because it is booted with the command line option memmap=exactmap which currently clears the e820 table. So ACPI processing does not find reserved i/o spaces. This works for a device on PCI 0 because ACPI falls back to a legacy mode. But the error message [Firmware Bug]: PCI: MMCONFIG at [mem 0x8000-0x80cf] not reserved in ACPI motherboard resources is written to the log even in this functioning case. It fails for some devices on UV2, and only for UV2, because SGI seems to be the only manufacturer currently using the extended PCI(>0). The fix is to not drop the entire e820 table on a memmap=exactmap, but to preserve all the non-E820_RAM reservations that the BIOS has made. Signed-off-by: Cliff Wickman c...@sgi.com --- arch/x86/kernel/e820.c | 14 +- 1 file changed, 13 insertions(+), 1 deletion(-) Index: linus.current/arch/x86/kernel/e820.c === --- linus.current.orig/arch/x86/kernel/e820.c +++ linus.current/arch/x86/kernel/e820.c @@ -839,6 +839,8 @@ static int __init parse_memmap_opt(char { char *oldp; u64 start_at, mem_size; + int i; + struct e820entry *curp, *availp; if (!p) return -EINVAL; @@ -852,7 +854,17 @@ static int __init parse_memmap_opt(char */ saved_max_pfn = e820_end_of_ram_pfn(); #endif - e820.nr_map = 0; + /* keep everything that was reserved by the BIOS */ + for (i = 0, curp = &e820.map[0], availp = &e820.map[0]; + i < e820.nr_map; i++, curp++) { + if (curp->type != E820_RAM) { + if (curp != availp) { + *availp = *curp; + availp++; + } + } + } + e820.nr_map = availp - &e820.map[0]; userdef = 1; return 0; } -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] makedumpfile: request the kernel do page scans
On Thu, Dec 20, 2012 at 12:22:14PM +0900, HATAYAMA Daisuke wrote: > From: Cliff Wickman > Subject: Re: [PATCH] makedumpfile: request the kernel do page scans > Date: Mon, 10 Dec 2012 09:36:14 -0600 > > On Mon, Dec 10, 2012 at 09:59:29AM +0900, HATAYAMA Daisuke wrote: > >> From: Cliff Wickman > >> Subject: Re: [PATCH] makedumpfile: request the kernel do page scans > >> Date: Mon, 19 Nov 2012 12:07:10 -0600 > >> > >> > On Fri, Nov 16, 2012 at 03:39:44PM -0500, Vivek Goyal wrote: > >> >> On Thu, Nov 15, 2012 at 04:52:40PM -0600, Cliff Wickman wrote: > > > > Hi Hatayama, > > > > If ioremap/iounmap is the bottleneck then perhaps you could do what > > my patch does: it consolidates all the ranges of physical addresses > > where the boot kernel's page structures reside (see make_kernel_mmap()) > > and passes them to the kernel, which then does a handfull of ioremaps's to > > cover all of them. Then /proc/vmcore could look up the already-mapped > > virtual address. > > (also note a kludge in get_mm_sparsemem() that verifies that each section > > of the mem_map spans contiguous ranges of page structures. I had > > trouble with some sections when I made that assumption) > > > > I'm attaching 3 patches that might be useful in your testing: > > - 121210.proc_vmcore2 my current patch that applies to the released > > makedumpfile 1.5.1 > > - 121207.vmcore_pagescans.sles applies to a 3.0.13 kernel > > - 121207.vmcore_pagescans.rhel applies to a 2.6.32 kernel > > > > I used the same patch set on the benchmark. > > BTW, I have continuously reservation issue, so I think I cannot use > terabyte memory machine at least in this year. > > Also, your patch set is doing ioremap per a chunk of memory map, > i.e. a number of consequtive pages at the same time. On your terabyte > machines, how large they are? We have memory consumption issue on the > 2nd kernel so we must decrease amount of memory used. But looking into > ioremap code quickly, it looks not using 2MB or 1GB pages to > remap. 
This means more than tera bytes page table is generated. Or > have you probably already investigated this? > > BTW, my idea to solve this issue are two: > > 1) make linear direct mapping for old memory, and acess the old memory > via the linear direct mapping, not by ioremap. > > - adding remap code in vmcore, or passing the regions that need to > be remapped using memmap= kernel option to tell the 2nd kenrel to > map them in addition. Good point. It would take over 30G of memory to map 16TB with 4k pages. I recently tried to dump such a memory and ran out of kernel memory -- no wonder! Do you have a patch for doing a linear direct mapping? Or can you name existing kernel infrastructure to do such mapping? I'm just looking for a jumpstart to enhance the patch. -Cliff > > Or, > > 2) Support 2MB or 1GB pages in ioremap. > > Thanks. > HATAYAMA, Daisuke -- Cliff Wickman SGI c...@sgi.com (651) 683-3824 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] makedumpfile: request the kernel do page scans
On Thu, Dec 20, 2012 at 12:22:14PM +0900, HATAYAMA Daisuke wrote: From: Cliff Wickman c...@sgi.com Subject: Re: [PATCH] makedumpfile: request the kernel do page scans Date: Mon, 10 Dec 2012 09:36:14 -0600 On Mon, Dec 10, 2012 at 09:59:29AM +0900, HATAYAMA Daisuke wrote: From: Cliff Wickman c...@sgi.com Subject: Re: [PATCH] makedumpfile: request the kernel do page scans Date: Mon, 19 Nov 2012 12:07:10 -0600 On Fri, Nov 16, 2012 at 03:39:44PM -0500, Vivek Goyal wrote: On Thu, Nov 15, 2012 at 04:52:40PM -0600, Cliff Wickman wrote: Hi Hatayama, If ioremap/iounmap is the bottleneck then perhaps you could do what my patch does: it consolidates all the ranges of physical addresses where the boot kernel's page structures reside (see make_kernel_mmap()) and passes them to the kernel, which then does a handful of ioremaps to cover all of them. Then /proc/vmcore could look up the already-mapped virtual address. (also note a kludge in get_mm_sparsemem() that verifies that each section of the mem_map spans contiguous ranges of page structures. I had trouble with some sections when I made that assumption) I'm attaching 3 patches that might be useful in your testing: - 121210.proc_vmcore2 my current patch that applies to the released makedumpfile 1.5.1 - 121207.vmcore_pagescans.sles applies to a 3.0.13 kernel - 121207.vmcore_pagescans.rhel applies to a 2.6.32 kernel I used the same patch set on the benchmark. BTW, I have continuously reservation issue, so I think I cannot use terabyte memory machine at least in this year. Also, your patch set is doing ioremap per a chunk of memory map, i.e. a number of consecutive pages at the same time. On your terabyte machines, how large they are? We have memory consumption issue on the 2nd kernel so we must decrease amount of memory used. But looking into ioremap code quickly, it looks not using 2MB or 1GB pages to remap. This means more than tera bytes page table is generated. Or have you probably already investigated this? 
BTW, my idea to solve this issue are two: 1) make linear direct mapping for old memory, and access the old memory via the linear direct mapping, not by ioremap. - adding remap code in vmcore, or passing the regions that need to be remapped using memmap= kernel option to tell the 2nd kernel to map them in addition. Good point. It would take over 30G of memory to map 16TB with 4k pages. I recently tried to dump such a memory and ran out of kernel memory -- no wonder! Do you have a patch for doing a linear direct mapping? Or can you name existing kernel infrastructure to do such mapping? I'm just looking for a jumpstart to enhance the patch. -Cliff Or, 2) Support 2MB or 1GB pages in ioremap. Thanks. HATAYAMA, Daisuke -- Cliff Wickman SGI c...@sgi.com (651) 683-3824 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH] UV: fix incorrect tlb flush all issue
From: Cliff Wickman (this was sent as an ack on 9/13, but with incorrect title and sign-off) Ack. But with the adjustment below. The 'end' argument was not declared long. I tested the patch on a UV. It has the effect of either clearing 1 or all TLBs in a cpu. I added some debugging to test for the cases when clearing all TLBs is overkill, and in practice it happens very seldom. Reported-by: Jan Beulich Signed-off-by: Alex Shi Signed-off-by: Cliff Wickman Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" --- arch/x86/include/asm/uv/uv.h |2 +- arch/x86/platform/uv/tlb_uv.c | 10 +++--- 2 files changed, 8 insertions(+), 4 deletions(-) Index: linux/arch/x86/platform/uv/tlb_uv.c === --- linux.orig/arch/x86/platform/uv/tlb_uv.c +++ linux/arch/x86/platform/uv/tlb_uv.c @@ -1034,7 +1034,8 @@ static int set_distrib_bits(struct cpuma * globally purge translation cache of a virtual address or all TLB's * @cpumask: mask of all cpu's in which the address is to be removed * @mm: mm_struct containing virtual address range - * @va: virtual address to be removed (or TLB_FLUSH_ALL for all TLB's on cpu) + * @start: start virtual address to be removed from TLB + * @end: end virtual address to be remove from TLB * @cpu: the current cpu * * This is the entry point for initiating any UV global TLB shootdown. 
@@ -1056,7 +1057,7 @@ static int set_distrib_bits(struct cpuma */ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, struct mm_struct *mm, unsigned long start, - unsigned end, unsigned int cpu) + unsigned long end, unsigned int cpu) { int locals = 0; int remotes = 0; @@ -1113,7 +1114,10 @@ const struct cpumask *uv_flush_tlb_other record_send_statistics(stat, locals, hubs, remotes, bau_desc); - bau_desc->payload.address = start; + if (!end || (end - start) <= PAGE_SIZE) + bau_desc->payload.address = start; + else + bau_desc->payload.address = TLB_FLUSH_ALL; bau_desc->payload.sending_cpu = cpu; /* * uv_flush_send_and_wait returns 0 if all cpu's were messaged, Index: linux/arch/x86/include/asm/uv/uv.h === --- linux.orig/arch/x86/include/asm/uv/uv.h +++ linux/arch/x86/include/asm/uv/uv.h @@ -16,7 +16,7 @@ extern void uv_system_init(void); extern const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, struct mm_struct *mm, unsigned long start, -unsigned end, +unsigned long end, unsigned int cpu); #else /* X86_UV */ -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH] UV: fix incorrect tlb flush all issue
From: Cliff Wickman c...@sgi.com (this was sent as an ack on 9/13, but with incorrect title and sign-off) Ack. But with the adjustment below. The 'end' argument was not declared long. I tested the patch on a UV. It has the effect of either clearing 1 or all TLBs in a cpu. I added some debugging to test for the cases when clearing all TLBs is overkill, and in practice it happens very seldom. Reported-by: Jan Beulich jbeul...@suse.com Signed-off-by: Alex Shi alex@intel.com Signed-off-by: Cliff Wickman c...@sgi.com Cc: Ingo Molnar mi...@elte.hu Cc: Thomas Gleixner t...@linutronix.de Cc: H. Peter Anvin h...@zytor.com --- arch/x86/include/asm/uv/uv.h |2 +- arch/x86/platform/uv/tlb_uv.c | 10 +++--- 2 files changed, 8 insertions(+), 4 deletions(-) Index: linux/arch/x86/platform/uv/tlb_uv.c === --- linux.orig/arch/x86/platform/uv/tlb_uv.c +++ linux/arch/x86/platform/uv/tlb_uv.c @@ -1034,7 +1034,8 @@ static int set_distrib_bits(struct cpuma * globally purge translation cache of a virtual address or all TLB's * @cpumask: mask of all cpu's in which the address is to be removed * @mm: mm_struct containing virtual address range - * @va: virtual address to be removed (or TLB_FLUSH_ALL for all TLB's on cpu) + * @start: start virtual address to be removed from TLB + * @end: end virtual address to be remove from TLB * @cpu: the current cpu * * This is the entry point for initiating any UV global TLB shootdown. 
@@ -1056,7 +1057,7 @@ static int set_distrib_bits(struct cpuma */ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, struct mm_struct *mm, unsigned long start, - unsigned end, unsigned int cpu) + unsigned long end, unsigned int cpu) { int locals = 0; int remotes = 0; @@ -1113,7 +1114,10 @@ const struct cpumask *uv_flush_tlb_other record_send_statistics(stat, locals, hubs, remotes, bau_desc); - bau_desc->payload.address = start; + if (!end || (end - start) <= PAGE_SIZE) + bau_desc->payload.address = start; + else + bau_desc->payload.address = TLB_FLUSH_ALL; bau_desc->payload.sending_cpu = cpu; /* * uv_flush_send_and_wait returns 0 if all cpu's were messaged, Index: linux/arch/x86/include/asm/uv/uv.h === --- linux.orig/arch/x86/include/asm/uv/uv.h +++ linux/arch/x86/include/asm/uv/uv.h @@ -16,7 +16,7 @@ extern void uv_system_init(void); extern const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, struct mm_struct *mm, unsigned long start, -unsigned end, +unsigned long end, unsigned int cpu); #else /* X86_UV */ -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [alex....@intel.com: Re: [PATCH] UV: fix incorrect tlb flush all issue]
On Thu, Sep 13, 2012 at 05:53:10PM +0200, Ingo Molnar wrote: > > Ack? > > Thanks, > > Ingo Ack. But with the adjustment below. The 'end' argument was not declared long. I tested the patch on a UV. It has the effect of either clearing 1 or all TLBs in a cpu. I added some debugging to test for the cases when clearing all TLBs is overkill, and in practice it happens very seldom. Sorry I didn't participate in this patch earlier. Jack Steiner was copied, I believe. But stei...@sgi.com is no longer active. Jack has retired -- congratulations to him, but a very big loss to us, both professionally and personally. -Cliff Reported-by: Jan Beulich Signed-off-by: Alex Shi Acked-by: Cliff Wickman Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" --- arch/x86/include/asm/uv/uv.h |2 +- arch/x86/platform/uv/tlb_uv.c | 10 +++--- 2 files changed, 8 insertions(+), 4 deletions(-) Index: linux/arch/x86/platform/uv/tlb_uv.c === --- linux.orig/arch/x86/platform/uv/tlb_uv.c +++ linux/arch/x86/platform/uv/tlb_uv.c @@ -1034,7 +1034,8 @@ static int set_distrib_bits(struct cpuma * globally purge translation cache of a virtual address or all TLB's * @cpumask: mask of all cpu's in which the address is to be removed * @mm: mm_struct containing virtual address range - * @va: virtual address to be removed (or TLB_FLUSH_ALL for all TLB's on cpu) + * @start: start virtual address to be removed from TLB + * @end: end virtual address to be remove from TLB * @cpu: the current cpu * * This is the entry point for initiating any UV global TLB shootdown. 
@@ -1056,7 +1057,7 @@ static int set_distrib_bits(struct cpuma */ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, struct mm_struct *mm, unsigned long start, - unsigned end, unsigned int cpu) + unsigned long end, unsigned int cpu) { int locals = 0; int remotes = 0; @@ -1113,7 +1114,10 @@ const struct cpumask *uv_flush_tlb_other record_send_statistics(stat, locals, hubs, remotes, bau_desc); - bau_desc->payload.address = start; + if (!end || (end - start) <= PAGE_SIZE) + bau_desc->payload.address = start; + else + bau_desc->payload.address = TLB_FLUSH_ALL; bau_desc->payload.sending_cpu = cpu; /* * uv_flush_send_and_wait returns 0 if all cpu's were messaged, Index: linux/arch/x86/include/asm/uv/uv.h === --- linux.orig/arch/x86/include/asm/uv/uv.h +++ linux/arch/x86/include/asm/uv/uv.h @@ -16,7 +16,7 @@ extern void uv_system_init(void); extern const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, struct mm_struct *mm, unsigned long start, -unsigned end, +unsigned long end, unsigned int cpu); #else /* X86_UV */ -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [alex....@intel.com: Re: [PATCH] UV: fix incorrect tlb flush all issue]
On Thu, Sep 13, 2012 at 05:53:10PM +0200, Ingo Molnar wrote: Ack? Thanks, Ingo Ack. But with the adjustment below. The 'end' argument was not declared long. I tested the patch on a UV. It has the effect of either clearing 1 or all TLBs in a cpu. I added some debugging to test for the cases when clearing all TLBs is overkill, and in practice it happens very seldom. Sorry I didn't participate in this patch earlier. Jack Steiner was copied, I believe. But stei...@sgi.com is no longer active. Jack has retired -- congratulations to him, but a very big loss to us, both professionally and personally. -Cliff Reported-by: Jan Beulich jbeul...@suse.com Signed-off-by: Alex Shi alex@intel.com Acked-by: Cliff Wickman c...@sgi.com Cc: Ingo Molnar mi...@elte.hu Cc: Thomas Gleixner t...@linutronix.de Cc: H. Peter Anvin h...@zytor.com --- arch/x86/include/asm/uv/uv.h |2 +- arch/x86/platform/uv/tlb_uv.c | 10 +++--- 2 files changed, 8 insertions(+), 4 deletions(-) Index: linux/arch/x86/platform/uv/tlb_uv.c === --- linux.orig/arch/x86/platform/uv/tlb_uv.c +++ linux/arch/x86/platform/uv/tlb_uv.c @@ -1034,7 +1034,8 @@ static int set_distrib_bits(struct cpuma * globally purge translation cache of a virtual address or all TLB's * @cpumask: mask of all cpu's in which the address is to be removed * @mm: mm_struct containing virtual address range - * @va: virtual address to be removed (or TLB_FLUSH_ALL for all TLB's on cpu) + * @start: start virtual address to be removed from TLB + * @end: end virtual address to be remove from TLB * @cpu: the current cpu * * This is the entry point for initiating any UV global TLB shootdown. 
@@ -1056,7 +1057,7 @@ static int set_distrib_bits(struct cpuma */ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, struct mm_struct *mm, unsigned long start, - unsigned end, unsigned int cpu) + unsigned long end, unsigned int cpu) { int locals = 0; int remotes = 0; @@ -1113,7 +1114,10 @@ const struct cpumask *uv_flush_tlb_other record_send_statistics(stat, locals, hubs, remotes, bau_desc); - bau_desc->payload.address = start; + if (!end || (end - start) <= PAGE_SIZE) + bau_desc->payload.address = start; + else + bau_desc->payload.address = TLB_FLUSH_ALL; bau_desc->payload.sending_cpu = cpu; /* * uv_flush_send_and_wait returns 0 if all cpu's were messaged, Index: linux/arch/x86/include/asm/uv/uv.h === --- linux.orig/arch/x86/include/asm/uv/uv.h +++ linux/arch/x86/include/asm/uv/uv.h @@ -16,7 +16,7 @@ extern void uv_system_init(void); extern const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, struct mm_struct *mm, unsigned long start, -unsigned end, +unsigned long end, unsigned int cpu); #else /* X86_UV */ -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 3/4 v2] cpusets: update_cpumask documentation fix
Update cpuset documentation to match the October 2007 "Fix cpusets update_cpumask" changes that now apply changes to a cpusets 'cpus' allowed mask immediately to the cpus_allowed of the tasks in that cpuset. Signed-off-by: Paul Jackson <[EMAIL PROTECTED]> Acked-by: Cliff Wickman <[EMAIL PROTECTED]> --- Documentation/cpusets.txt | 23 --- 1 file changed, 8 insertions(+), 15 deletions(-) Index: linux-2.6/Documentation/cpusets.txt === --- linux-2.6.orig/Documentation/cpusets.txt +++ linux-2.6/Documentation/cpusets.txt @@ -523,21 +523,14 @@ from one cpuset to another, then the ker memory placement, as above, the next time that the kernel attempts to allocate a page of memory for that task. -If a cpuset has its CPUs modified, then each task using that -cpuset does _not_ change its behavior automatically. In order to -minimize the impact on the critical scheduling code in the kernel, -tasks will continue to use their prior CPU placement until they -are rebound to their cpuset, by rewriting their pid to the 'tasks' -file of their cpuset. If a task had been bound to some subset of its -cpuset using the sched_setaffinity() call, and if any of that subset -is still allowed in its new cpuset settings, then the task will be -restricted to the intersection of the CPUs it was allowed on before, -and its new cpuset CPU placement. If, on the other hand, there is -no overlap between a tasks prior placement and its new cpuset CPU -placement, then the task will be allowed to run on any CPU allowed -in its new cpuset. If a task is moved from one cpuset to another, -its CPU placement is updated in the same way as if the tasks pid is -rewritten to the 'tasks' file of its current cpuset. +If a cpuset has its 'cpus' modified, then each task in that cpuset +will have its allowed CPU placement changed immediately. Similarly, +if a tasks pid is written to a cpusets 'tasks' file, in either its +current cpuset or another cpuset, then its allowed CPU placement is +changed immediately. 
If such a task had been bound to some subset +of its cpuset using the sched_setaffinity() call, the task will be +allowed to run on any CPU allowed in its new cpuset, negating the +affect of the prior sched_setaffinity() call. In summary, the memory placement of a task whose cpuset is changed is updated by the kernel, on the next allocation of a page for that task, -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 4/4 v2] hotplug cpu move tasks in empty cpusets - refinements
Narrow the scope of callback_mutex in scan_for_empty_cpusets(). Avoid rewriting the cpus, mems of cpusets except when it is likely that we'll be changing them. Have remove_tasks_in_empty_cpuset() also check for empty mems. Signed-off-by: Paul Jackson <[EMAIL PROTECTED]> Acked-by: Cliff Wickman <[EMAIL PROTECTED]> --- kernel/cpuset.c | 21 + 1 file changed, 13 insertions(+), 8 deletions(-) Index: linux-2.6/kernel/cpuset.c === --- linux-2.6.orig/kernel/cpuset.c +++ linux-2.6/kernel/cpuset.c @@ -1709,7 +1709,8 @@ static void remove_tasks_in_empty_cpuset * has online cpus, so can't be empty). */ parent = cs->parent; - while (cpus_empty(parent->cpus_allowed)) + while (cpus_empty(parent->cpus_allowed) || + nodes_empty(parent->mems_allowed)) parent = parent->parent; move_member_tasks_to_cpuset(cs, parent); @@ -1741,7 +1742,6 @@ static void scan_for_empty_cpusets(const list_add_tail((struct list_head *)>stack_list, ); - mutex_lock(_mutex); while (!list_empty()) { cp = container_of(queue.next, struct cpuset, stack_list); list_del(queue.next); @@ -1750,19 +1750,24 @@ static void scan_for_empty_cpusets(const list_add_tail(>stack_list, ); } cont = cp->css.cgroup; + + /* Continue past cpusets with all cpus, mems online */ + if (cpus_subset(cp->cpus_allowed, cpu_online_map) && + nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY])) + continue; + /* Remove offline cpus and mems from this cpuset. 
*/ + mutex_lock(_mutex); cpus_and(cp->cpus_allowed, cp->cpus_allowed, cpu_online_map); nodes_and(cp->mems_allowed, cp->mems_allowed, node_states[N_HIGH_MEMORY]); + mutex_unlock(_mutex); + + /* Move tasks from the empty cpuset to a parent */ if (cpus_empty(cp->cpus_allowed) || -nodes_empty(cp->mems_allowed)) { - /* Move tasks from the empty cpuset to a parent */ - mutex_unlock(_mutex); +nodes_empty(cp->mems_allowed)) remove_tasks_in_empty_cpuset(cp); - mutex_lock(_mutex); - } } - mutex_unlock(_mutex); } /* -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 2/4 v2] hotplug cpu move tasks in empty cpusets to parent various other fixes
Various minor formatting and comment tweaks to Cliff Wickman's [PATCH_3_of_3]_cpusets__update_cpumask_revision.patch I had had "iff", meaning "if and only if" in a comment. However, except for ancient mathematicians, the abbreviation "iff" was a tad too cryptic. Cliff changed it to "if", presumably figuring that the "iff" was a typo. However, it was the "only if" half of the conjunction that was most interesting. Reword to emphasis the "only if" aspect. The locking comment for remove_tasks_in_empty_cpuset() was wrong; it said callback_mutex had to be held on entry. The opposite is true. Several mentions of attach_task() in comments needed to be changed to cgroup_attach_task(). A comment about notify_on_release was no longer relevant, as the line of code it had commented, namely: set_bit(CS_RELEASED_RESOURCE, >flags); is no longer present in that place in the cpuset.c code. Similarly a comment about notify_on_release before the scan_for_empty_cpusets() routine was no longer relevant. Removed extra parentheses and unnecessary return statement. Renamed attach_task() to cpuset_attach() in various comments. Removed comment about not needing memory migration, as it seems the migration is done anyway, via the cpuset_attach() callback from cgroup_attach_task(). Signed-off-by: Paul Jackson <[EMAIL PROTECTED]> Acked-by: Cliff Wickman <[EMAIL PROTECTED]> --- kernel/cpuset.c | 41 +++-- 1 file changed, 15 insertions(+), 26 deletions(-) Index: linux-2.6/kernel/cpuset.c === --- linux-2.6.orig/kernel/cpuset.c +++ linux-2.6/kernel/cpuset.c @@ -752,7 +752,7 @@ static int update_cpumask(struct cpuset trialcs = *cs; /* -* An empty cpus_allowed is ok if there are no tasks in the cpuset. +* An empty cpus_allowed is ok only if the cpuset has no tasks. * Since cpulist_parse() fails on an empty mask, we special case * that parsing. The validate_change() call ensures that cpusets * with tasks have cpus. 
@@ -809,7 +809,7 @@ static int update_cpumask(struct cpuset *so that the migration code can allocate pages on these nodes. * *Call holding cgroup_mutex, so current's cpuset won't change - *during this call, as cgroup_mutex holds off any attach_task() + *during this call, as manage_mutex holds off any cpuset_attach() *calls. Therefore we don't need to take task_lock around the *call to guarantee_online_mems(), as we know no one is changing *our task's cpuset. @@ -1661,8 +1661,8 @@ void cpuset_do_move_task(struct task_str * @from: cpuset in which the tasks currently reside * @to: cpuset to which the tasks will be moved * - * Called with manage_sem held - * callback_mutex must not be held, as attach_task() will take it. + * Called with cgroup_mutex held + * callback_mutex must not be held, as cpuset_attach() will take it. * * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, * calling callback functions for each. @@ -1689,18 +1689,18 @@ static void move_member_tasks_to_cpuset( * last CPU or node from a cpuset, then move the tasks in the empty * cpuset to its next-highest non-empty parent. * - * The parent cpuset has some superset of the 'mems' nodes that the - * newly empty cpuset held, so no migration of memory is necessary. - * - * Called with both manage_sem and callback_sem held + * Called with cgroup_mutex held + * callback_mutex must not be held, as cpuset_attach() will take it. */ static void remove_tasks_in_empty_cpuset(struct cpuset *cs) { struct cpuset *parent; - /* the cgroup's css_sets list is in use if there are tasks - in the cpuset; the list is empty if there are none; - the cs->css.refcnt seems always 0 */ + /* +* The cgroup's css_sets list is in use if there are tasks +* in the cpuset; the list is empty if there are none; +* the cs->css.refcnt seems always 0. +*/ if (list_empty(>css.cgroup->css_sets)) return; @@ -1709,14 +1709,8 @@ static void remove_tasks_in_empty_cpuset * has online cpus, so can't be empty). 
*/ parent = cs->parent; - while (cpus_empty(parent->cpus_allowed)) { - /* -* this empty cpuset should now be considered to -* have been used, and therefore eligible for -* release when empty (if it is notify_on_release) -*/ + while (cpus_empty(parent->cpus_allowed)) parent = parent->parent; - } move_member_tasks_to_cpuset(cs, parent); } @@ -1725,10 +1719,6 @@ static void remove_tasks_in_empty_cpuset * Walk the specified cpuset subtree and look for empty cpusets. * The tasks of such cpus
[PATCH 1/4 v2] hotplug cpu move tasks in empty cpusets to parent node_online_map fix
As of the October 2007 kernel/cpuset.c patch "Memoryless nodes: Use N_HIGH_MEMORY for cpusets", cpuset nodes are relative to the nodes with (HIGH) memory, not relative to all nodes in node_online_map. Signed-off-by: Paul Jackson <[EMAIL PROTECTED]> Acked-by: Cliff Wickman <[EMAIL PROTECTED]> --- kernel/cpuset.c |7 --- 1 file changed, 4 insertions(+), 3 deletions(-) Index: linux-2.6/kernel/cpuset.c === --- linux-2.6.orig/kernel/cpuset.c +++ linux-2.6/kernel/cpuset.c @@ -1762,7 +1762,8 @@ static void scan_for_empty_cpusets(const cont = cp->css.cgroup; /* Remove offline cpus and mems from this cpuset. */ cpus_and(cp->cpus_allowed, cp->cpus_allowed, cpu_online_map); - nodes_and(cp->mems_allowed, cp->mems_allowed, node_online_map); + nodes_and(cp->mems_allowed, cp->mems_allowed, + node_states[N_HIGH_MEMORY]); if ((cpus_empty(cp->cpus_allowed) || nodes_empty(cp->mems_allowed))) { /* Move tasks from the empty cpuset to a parent */ @@ -1777,8 +1778,8 @@ static void scan_for_empty_cpusets(const /* * The cpus_allowed and mems_allowed nodemasks in the top_cpuset track - * cpu_online_map and node_online_map. Force the top cpuset to track - * whats online after any CPU or memory node hotplug or unplug event. + * cpu_online_map and node_states[N_HIGH_MEMORY]. Force the top cpuset to + * track what's online after any CPU or memory node hotplug or unplug event. * * Since there are two callers of this routine, one for CPU hotplug * events and one for memory node hotplug events, we could have coded -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 3/4 v2] cpusets: update_cpumask documentation fix
Update cpuset documentation to match the October 2007 Fix cpusets update_cpumask changes that now apply changes to a cpusets 'cpus' allowed mask immediately to the cpus_allowed of the tasks in that cpuset. Signed-off-by: Paul Jackson [EMAIL PROTECTED] Acked-by: Cliff Wickman [EMAIL PROTECTED] --- Documentation/cpusets.txt | 23 --- 1 file changed, 8 insertions(+), 15 deletions(-) Index: linux-2.6/Documentation/cpusets.txt === --- linux-2.6.orig/Documentation/cpusets.txt +++ linux-2.6/Documentation/cpusets.txt @@ -523,21 +523,14 @@ from one cpuset to another, then the ker memory placement, as above, the next time that the kernel attempts to allocate a page of memory for that task. -If a cpuset has its CPUs modified, then each task using that -cpuset does _not_ change its behavior automatically. In order to -minimize the impact on the critical scheduling code in the kernel, -tasks will continue to use their prior CPU placement until they -are rebound to their cpuset, by rewriting their pid to the 'tasks' -file of their cpuset. If a task had been bound to some subset of its -cpuset using the sched_setaffinity() call, and if any of that subset -is still allowed in its new cpuset settings, then the task will be -restricted to the intersection of the CPUs it was allowed on before, -and its new cpuset CPU placement. If, on the other hand, there is -no overlap between a tasks prior placement and its new cpuset CPU -placement, then the task will be allowed to run on any CPU allowed -in its new cpuset. If a task is moved from one cpuset to another, -its CPU placement is updated in the same way as if the tasks pid is -rewritten to the 'tasks' file of its current cpuset. +If a cpuset has its 'cpus' modified, then each task in that cpuset +will have its allowed CPU placement changed immediately. Similarly, +if a tasks pid is written to a cpusets 'tasks' file, in either its +current cpuset or another cpuset, then its allowed CPU placement is +changed immediately. 
If such a task had been bound to some subset +of its cpuset using the sched_setaffinity() call, the task will be +allowed to run on any CPU allowed in its new cpuset, negating the +affect of the prior sched_setaffinity() call. In summary, the memory placement of a task whose cpuset is changed is updated by the kernel, on the next allocation of a page for that task, -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 2/4 v2] hotplug cpu move tasks in empty cpusets to parent various other fixes
Various minor formatting and comment tweaks to Cliff Wickman's [PATCH_3_of_3]_cpusets__update_cpumask_revision.patch I had had iff, meaning if and only if in a comment. However, except for ancient mathematicians, the abbreviation iff was a tad too cryptic. Cliff changed it to if, presumably figuring that the iff was a typo. However, it was the only if half of the conjunction that was most interesting. Reword to emphasis the only if aspect. The locking comment for remove_tasks_in_empty_cpuset() was wrong; it said callback_mutex had to be held on entry. The opposite is true. Several mentions of attach_task() in comments needed to be changed to cgroup_attach_task(). A comment about notify_on_release was no longer relevant, as the line of code it had commented, namely: set_bit(CS_RELEASED_RESOURCE, parent-flags); is no longer present in that place in the cpuset.c code. Similarly a comment about notify_on_release before the scan_for_empty_cpusets() routine was no longer relevant. Removed extra parentheses and unnecessary return statement. Renamed attach_task() to cpuset_attach() in various comments. Removed comment about not needing memory migration, as it seems the migration is done anyway, via the cpuset_attach() callback from cgroup_attach_task(). Signed-off-by: Paul Jackson [EMAIL PROTECTED] Acked-by: Cliff Wickman [EMAIL PROTECTED] --- kernel/cpuset.c | 41 +++-- 1 file changed, 15 insertions(+), 26 deletions(-) Index: linux-2.6/kernel/cpuset.c === --- linux-2.6.orig/kernel/cpuset.c +++ linux-2.6/kernel/cpuset.c @@ -752,7 +752,7 @@ static int update_cpumask(struct cpuset trialcs = *cs; /* -* An empty cpus_allowed is ok if there are no tasks in the cpuset. +* An empty cpus_allowed is ok only if the cpuset has no tasks. * Since cpulist_parse() fails on an empty mask, we special case * that parsing. The validate_change() call ensures that cpusets * with tasks have cpus. 
@@ -809,7 +809,7 @@ static int update_cpumask(struct cpuset *so that the migration code can allocate pages on these nodes. * *Call holding cgroup_mutex, so current's cpuset won't change - *during this call, as cgroup_mutex holds off any attach_task() + *during this call, as manage_mutex holds off any cpuset_attach() *calls. Therefore we don't need to take task_lock around the *call to guarantee_online_mems(), as we know no one is changing *our task's cpuset. @@ -1661,8 +1661,8 @@ void cpuset_do_move_task(struct task_str * @from: cpuset in which the tasks currently reside * @to: cpuset to which the tasks will be moved * - * Called with manage_sem held - * callback_mutex must not be held, as attach_task() will take it. + * Called with cgroup_mutex held + * callback_mutex must not be held, as cpuset_attach() will take it. * * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, * calling callback functions for each. @@ -1689,18 +1689,18 @@ static void move_member_tasks_to_cpuset( * last CPU or node from a cpuset, then move the tasks in the empty * cpuset to its next-highest non-empty parent. * - * The parent cpuset has some superset of the 'mems' nodes that the - * newly empty cpuset held, so no migration of memory is necessary. - * - * Called with both manage_sem and callback_sem held + * Called with cgroup_mutex held + * callback_mutex must not be held, as cpuset_attach() will take it. */ static void remove_tasks_in_empty_cpuset(struct cpuset *cs) { struct cpuset *parent; - /* the cgroup's css_sets list is in use if there are tasks - in the cpuset; the list is empty if there are none; - the cs-css.refcnt seems always 0 */ + /* +* The cgroup's css_sets list is in use if there are tasks +* in the cpuset; the list is empty if there are none; +* the cs-css.refcnt seems always 0. +*/ if (list_empty(cs-css.cgroup-css_sets)) return; @@ -1709,14 +1709,8 @@ static void remove_tasks_in_empty_cpuset * has online cpus, so can't be empty). 
*/ parent = cs-parent; - while (cpus_empty(parent-cpus_allowed)) { - /* -* this empty cpuset should now be considered to -* have been used, and therefore eligible for -* release when empty (if it is notify_on_release) -*/ + while (cpus_empty(parent-cpus_allowed)) parent = parent-parent; - } move_member_tasks_to_cpuset(cs, parent); } @@ -1725,10 +1719,6 @@ static void remove_tasks_in_empty_cpuset * Walk the specified cpuset subtree and look for empty cpusets. * The tasks of such cpuset must be moved to a parent cpuset. * - * Note that such a notify_on_release cpuset must have had, at some time, - * member
[PATCH 1/4 v2] hotplug cpu move tasks in empty cpusets to parent node_online_map fix
As of the October 2007 kernel/cpuset.c patch Memoryless nodes: Use N_HIGH_MEMORY for cpusets, cpuset nodes are relative to the nodes with (HIGH) memory, not relative to all nodes in node_online_map. Signed-off-by: Paul Jackson [EMAIL PROTECTED] Acked-by: Cliff Wickman [EMAIL PROTECTED] --- kernel/cpuset.c |7 --- 1 file changed, 4 insertions(+), 3 deletions(-) Index: linux-2.6/kernel/cpuset.c === --- linux-2.6.orig/kernel/cpuset.c +++ linux-2.6/kernel/cpuset.c @@ -1762,7 +1762,8 @@ static void scan_for_empty_cpusets(const cont = cp-css.cgroup; /* Remove offline cpus and mems from this cpuset. */ cpus_and(cp-cpus_allowed, cp-cpus_allowed, cpu_online_map); - nodes_and(cp-mems_allowed, cp-mems_allowed, node_online_map); + nodes_and(cp-mems_allowed, cp-mems_allowed, + node_states[N_HIGH_MEMORY]); if ((cpus_empty(cp-cpus_allowed) || nodes_empty(cp-mems_allowed))) { /* Move tasks from the empty cpuset to a parent */ @@ -1777,8 +1778,8 @@ static void scan_for_empty_cpusets(const /* * The cpus_allowed and mems_allowed nodemasks in the top_cpuset track - * cpu_online_map and node_online_map. Force the top cpuset to track - * whats online after any CPU or memory node hotplug or unplug event. + * cpu_online_map and node_states[N_HIGH_MEMORY]. Force the top cpuset to + * track what's online after any CPU or memory node hotplug or unplug event. * * Since there are two callers of this routine, one for CPU hotplug * events and one for memory node hotplug events, we could have coded -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 4/4 v2] hotplug cpu move tasks in empty cpusets - refinements
Narrow the scope of callback_mutex in scan_for_empty_cpusets(). Avoid rewriting the cpus, mems of cpusets except when it is likely that we'll be changing them. Have remove_tasks_in_empty_cpuset() also check for empty mems. Signed-off-by: Paul Jackson [EMAIL PROTECTED] Acked-by: Cliff Wickman [EMAIL PROTECTED] --- kernel/cpuset.c | 21 + 1 file changed, 13 insertions(+), 8 deletions(-) Index: linux-2.6/kernel/cpuset.c === --- linux-2.6.orig/kernel/cpuset.c +++ linux-2.6/kernel/cpuset.c @@ -1709,7 +1709,8 @@ static void remove_tasks_in_empty_cpuset * has online cpus, so can't be empty). */ parent = cs-parent; - while (cpus_empty(parent-cpus_allowed)) + while (cpus_empty(parent-cpus_allowed) || + nodes_empty(parent-mems_allowed)) parent = parent-parent; move_member_tasks_to_cpuset(cs, parent); @@ -1741,7 +1742,6 @@ static void scan_for_empty_cpusets(const list_add_tail((struct list_head *)root-stack_list, queue); - mutex_lock(callback_mutex); while (!list_empty(queue)) { cp = container_of(queue.next, struct cpuset, stack_list); list_del(queue.next); @@ -1750,19 +1750,24 @@ static void scan_for_empty_cpusets(const list_add_tail(child-stack_list, queue); } cont = cp-css.cgroup; + + /* Continue past cpusets with all cpus, mems online */ + if (cpus_subset(cp-cpus_allowed, cpu_online_map) + nodes_subset(cp-mems_allowed, node_states[N_HIGH_MEMORY])) + continue; + /* Remove offline cpus and mems from this cpuset. 
*/ + mutex_lock(callback_mutex); cpus_and(cp-cpus_allowed, cp-cpus_allowed, cpu_online_map); nodes_and(cp-mems_allowed, cp-mems_allowed, node_states[N_HIGH_MEMORY]); + mutex_unlock(callback_mutex); + + /* Move tasks from the empty cpuset to a parent */ if (cpus_empty(cp-cpus_allowed) || -nodes_empty(cp-mems_allowed)) { - /* Move tasks from the empty cpuset to a parent */ - mutex_unlock(callback_mutex); +nodes_empty(cp-mems_allowed)) remove_tasks_in_empty_cpuset(cp); - mutex_lock(callback_mutex); - } } - mutex_unlock(callback_mutex); } /* -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 4/4] hotplug cpu move tasks in empty cpusets - refinements
Narrow the scope of callback_mutex in scan_for_empty_cpusets(). Avoid rewriting the cpus, mems of cpusets except when it is likely that we'll be changing them. Have remove_tasks_in_empty_cpuset() also check for empty mems. Signed-off-by: Paul Jackson <[EMAIL PROTECTED]> Acked-by: Cliff Wickman <[EMAIL PROTECTED]> --- kernel/cpuset.c | 21 + 1 file changed, 13 insertions(+), 8 deletions(-) Index: linux-2.6/kernel/cpuset.c === --- linux-2.6.orig/kernel/cpuset.c +++ linux-2.6/kernel/cpuset.c @@ -1748,7 +1748,8 @@ static void remove_tasks_in_empty_cpuset * has online cpus, so can't be empty). */ parent = cs->parent; - while (cpus_empty(parent->cpus_allowed)) + while (cpus_empty(parent->cpus_allowed) || + nodes_empty(parent->mems_allowed)) parent = parent->parent; move_member_tasks_to_cpuset(cs, parent); @@ -1780,7 +1781,6 @@ static void scan_for_empty_cpusets(const list_add_tail((struct list_head *)>stack_list, ); - mutex_lock(_mutex); while (!list_empty()) { cp = container_of(queue.next, struct cpuset, stack_list); list_del(queue.next); @@ -1789,19 +1789,24 @@ static void scan_for_empty_cpusets(const list_add_tail(>stack_list, ); } cont = cp->css.cgroup; + + /* Continue past cpusets with all cpus, mems online */ + if (cpus_subset(cp->cpus_allowed, cpu_online_map) && + nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY])) + continue; + /* Remove offline cpus and mems from this cpuset. 
*/ + mutex_lock(&callback_mutex); cpus_and(cp->cpus_allowed, cp->cpus_allowed, cpu_online_map); nodes_and(cp->mems_allowed, cp->mems_allowed, node_states[N_HIGH_MEMORY]); + mutex_unlock(&callback_mutex); + + /* Move tasks from the empty cpuset to a parent */ if (cpus_empty(cp->cpus_allowed) || -nodes_empty(cp->mems_allowed)) { - /* Move tasks from the empty cpuset to a parent */ - mutex_unlock(&callback_mutex); +nodes_empty(cp->mems_allowed)) remove_tasks_in_empty_cpuset(cp); - mutex_lock(&callback_mutex); - } } - mutex_unlock(&callback_mutex); } /* -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 1/4] hotplug cpu move tasks in empty cpusets to parent node_online_map fix
As of the October 2007 kernel/cpuset.c patch "Memoryless nodes: Use N_HIGH_MEMORY for cpusets", cpuset nodes are relative to the nodes with (HIGH) memory, not relative to all nodes in node_online_map. Signed-off-by: Paul Jackson <[EMAIL PROTECTED]> Acked-by: Cliff Wickman <[EMAIL PROTECTED]> --- kernel/cpuset.c |7 --- 1 file changed, 4 insertions(+), 3 deletions(-) Index: linux-2.6/kernel/cpuset.c === --- linux-2.6.orig/kernel/cpuset.c +++ linux-2.6/kernel/cpuset.c @@ -1801,7 +1801,8 @@ static void scan_for_empty_cpusets(const cont = cp->css.cgroup; /* Remove offline cpus and mems from this cpuset. */ cpus_and(cp->cpus_allowed, cp->cpus_allowed, cpu_online_map); - nodes_and(cp->mems_allowed, cp->mems_allowed, node_online_map); + nodes_and(cp->mems_allowed, cp->mems_allowed, + node_states[N_HIGH_MEMORY]); if ((cpus_empty(cp->cpus_allowed) || nodes_empty(cp->mems_allowed))) { /* Move tasks from the empty cpuset to a parent */ @@ -1816,8 +1817,8 @@ static void scan_for_empty_cpusets(const /* * The cpus_allowed and mems_allowed nodemasks in the top_cpuset track - * cpu_online_map and node_online_map. Force the top cpuset to track - * whats online after any CPU or memory node hotplug or unplug event. + * cpu_online_map and node_states[N_HIGH_MEMORY]. Force the top cpuset to + * track what's online after any CPU or memory node hotplug or unplug event. * * Since there are two callers of this routine, one for CPU hotplug * events and one for memory node hotplug events, we could have coded -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 3/4] cpusets: update_cpumask documentation fix
Update cpuset documentation to match the October 2007 "Fix cpusets update_cpumask" changes that now apply changes to a cpusets 'cpus' allowed mask immediately to the cpus_allowed of the tasks in that cpuset. Signed-off-by: Paul Jackson <[EMAIL PROTECTED]> Acked-by: Cliff Wickman <[EMAIL PROTECTED]> --- Documentation/cpusets.txt | 23 --- 1 file changed, 8 insertions(+), 15 deletions(-) Index: linux-2.6/Documentation/cpusets.txt === --- linux-2.6.orig/Documentation/cpusets.txt +++ linux-2.6/Documentation/cpusets.txt @@ -523,21 +523,14 @@ from one cpuset to another, then the ker memory placement, as above, the next time that the kernel attempts to allocate a page of memory for that task. -If a cpuset has its CPUs modified, then each task using that -cpuset does _not_ change its behavior automatically. In order to -minimize the impact on the critical scheduling code in the kernel, -tasks will continue to use their prior CPU placement until they -are rebound to their cpuset, by rewriting their pid to the 'tasks' -file of their cpuset. If a task had been bound to some subset of its -cpuset using the sched_setaffinity() call, and if any of that subset -is still allowed in its new cpuset settings, then the task will be -restricted to the intersection of the CPUs it was allowed on before, -and its new cpuset CPU placement. If, on the other hand, there is -no overlap between a tasks prior placement and its new cpuset CPU -placement, then the task will be allowed to run on any CPU allowed -in its new cpuset. If a task is moved from one cpuset to another, -its CPU placement is updated in the same way as if the tasks pid is -rewritten to the 'tasks' file of its current cpuset. +If a cpuset has its 'cpus' modified, then each task in that cpuset +will have its allowed CPU placement changed immediately. Similarly, +if a tasks pid is written to a cpusets 'tasks' file, in either its +current cpuset or another cpuset, then its allowed CPU placement is +changed immediately. 
If such a task had been bound to some subset +of its cpuset using the sched_setaffinity() call, the task will be +allowed to run on any CPU allowed in its new cpuset, negating the +affect of the prior sched_setaffinity() call. In summary, the memory placement of a task whose cpuset is changed is updated by the kernel, on the next allocation of a page for that task, -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[4/4] hotplug cpu move tasks in empty cpusets - refinements
Narrow the scope of callback_mutex in scan_for_empty_cpusets(). Avoid rewriting the cpus, mems of cpusets except when it is likely that we'll be changing them. Have remove_tasks_in_empty_cpuset() also check for empty mems. Signed-off-by: Paul Jackson <[EMAIL PROTECTED]> Acked-by: Cliff Wickman <[EMAIL PROTECTED]> --- kernel/cpuset.c | 21 + 1 file changed, 13 insertions(+), 8 deletions(-) Index: linux-2.6/kernel/cpuset.c === --- linux-2.6.orig/kernel/cpuset.c +++ linux-2.6/kernel/cpuset.c @@ -1748,7 +1748,8 @@ static void remove_tasks_in_empty_cpuset * has online cpus, so can't be empty). */ parent = cs->parent; - while (cpus_empty(parent->cpus_allowed)) + while (cpus_empty(parent->cpus_allowed) || + nodes_empty(parent->mems_allowed)) parent = parent->parent; move_member_tasks_to_cpuset(cs, parent); @@ -1780,7 +1781,6 @@ static void scan_for_empty_cpusets(const list_add_tail((struct list_head *)>stack_list, ); - mutex_lock(_mutex); while (!list_empty()) { cp = container_of(queue.next, struct cpuset, stack_list); list_del(queue.next); @@ -1789,19 +1789,24 @@ static void scan_for_empty_cpusets(const list_add_tail(>stack_list, ); } cont = cp->css.cgroup; + + /* Continue past cpusets with all cpus, mems online */ + if (cpus_subset(cp->cpus_allowed, cpu_online_map) && + nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY])) + continue; + /* Remove offline cpus and mems from this cpuset. 
*/ + mutex_lock(&callback_mutex); cpus_and(cp->cpus_allowed, cp->cpus_allowed, cpu_online_map); nodes_and(cp->mems_allowed, cp->mems_allowed, node_states[N_HIGH_MEMORY]); + mutex_unlock(&callback_mutex); + + /* Move tasks from the empty cpuset to a parent */ if (cpus_empty(cp->cpus_allowed) || -nodes_empty(cp->mems_allowed)) { - /* Move tasks from the empty cpuset to a parent */ - mutex_unlock(&callback_mutex); +nodes_empty(cp->mems_allowed)) remove_tasks_in_empty_cpuset(cp); - mutex_lock(&callback_mutex); - } } - mutex_unlock(&callback_mutex); } /* -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 2/4] hotplug cpu move tasks in empty cpusets to parent various other fixes
Various minor formatting and comment tweaks to Cliff Wickman's [PATCH_3_of_3]_cpusets__update_cpumask_revision.patch I had had "iff", meaning "if and only if" in a comment. However, except for ancient mathematicians, the abbreviation "iff" was a tad too cryptic. Cliff changed it to "if", presumably figuring that the "iff" was a typo. However, it was the "only if" half of the conjunction that was most interesting. Reword to emphasis the "only if" aspect. The locking comment for remove_tasks_in_empty_cpuset() was wrong; it said callback_mutex had to be held on entry. The opposite is true. Several mentions of attach_task() in comments needed to be changed to cgroup_attach_task(). A comment about notify_on_release was no longer relevant, as the line of code it had commented, namely: set_bit(CS_RELEASED_RESOURCE, >flags); is no longer present in that place in the cpuset.c code. Similarly a comment about notify_on_release before the scan_for_empty_cpusets() routine was no longer relevant. Removed extra parentheses and unnecessary return statement. Renamed attach_task() to cpuset_attach() in various comments. Removed comment about not needing memory migration, as it seems the migration is done anyway, via the cpuset_attach() callback from cgroup_attach_task(). Signed-off-by: Paul Jackson <[EMAIL PROTECTED]> Acked-by: Cliff Wickman <[EMAIL PROTECTED]> --- kernel/cpuset.c | 53 + 1 file changed, 21 insertions(+), 32 deletions(-) Index: linux-2.6/kernel/cpuset.c === --- linux-2.6.orig/kernel/cpuset.c +++ linux-2.6/kernel/cpuset.c @@ -167,7 +167,7 @@ static inline int is_spread_slab(const s * number, and avoid having to lock and reload mems_allowed unless * the cpuset they're using changes generation. * - * A single, global generation is needed because attach_task() could + * A single, global generation is needed because cpuset_attach() could * reattach a task to a different cpuset, which must not have its * generation numbers aliased with those of that tasks previous cpuset. 
* @@ -218,7 +218,7 @@ static struct cpuset top_cpuset = { * Any task can increment and decrement the count field without lock. * So in general, code holding manage_mutex or callback_mutex can't rely * on the count field not changing. However, if the count goes to - * zero, then only attach_task(), which holds both mutexes, can + * zero, then only cpuset_attach(), which holds both mutexes, can * increment it again. Because a count of zero means that no tasks * are currently attached, therefore there is no way a task attached * to that cpuset can fork (the other way to increment the count). @@ -255,18 +255,18 @@ static struct cpuset top_cpuset = { * * The task_lock() exception * - * The need for this exception arises from the action of attach_task(), + * The need for this exception arises from the action of cpuset_attach(), * which overwrites one tasks cpuset pointer with another. It does * so using both mutexes, however there are several performance * critical places that need to reference task->cpuset without the * expense of grabbing a system global mutex. Therefore except as - * noted below, when dereferencing or, as in attach_task(), modifying + * noted below, when dereferencing or, as in cpuset_attach(), modifying * a tasks cpuset pointer we use task_lock(), which acts on a spinlock * (task->alloc_lock) already in the task_struct routinely used for * such matters. * * P.S. One more locking exception. RCU is used to guard the - * update of a tasks cpuset pointer by attach_task() and the + * update of a tasks cpuset pointer by cpuset_attach() and the * access of task->cpuset->mems_generation via that pointer in * the routine cpuset_update_task_memory_state(). 
*/ @@ -368,7 +368,7 @@ static void guarantee_online_mems(const * * Reading current->cpuset->mems_generation doesn't need task_lock * to guard the current->cpuset derefence, because it is guarded - * from concurrent freeing of current->cpuset by attach_task(), + * from concurrent freeing of current->cpuset by cpuset_attach(), * using RCU. * * The rcu_dereference() is technically probably not needed, @@ -790,7 +790,7 @@ static int update_cpumask(struct cpuset trialcs = *cs; /* -* An empty cpus_allowed is ok if there are no tasks in the cpuset. +* An empty cpus_allowed is ok only if the cpuset has no tasks. * Since cpulist_parse() fails on an empty mask, we special case * that parsing. The validate_change() call ensures that cpusets * with tasks have cpus. @@ -847,7 +847,7 @@ static int update_cpumask(struct cpuset *so that the migration code can allocate pages on these nodes. * *Call
[PATCH 2/4] hotplug cpu move tasks in empty cpusets to parent various other fixes
Various minor formatting and comment tweaks to Cliff Wickman's [PATCH_3_of_3]_cpusets__update_cpumask_revision.patch I had had "iff", meaning "if and only if" in a comment. However, except for ancient mathematicians, the abbreviation "iff" was a tad too cryptic. Cliff changed it to "if", presumably figuring that the "iff" was a typo. However, it was the "only if" half of the conjunction that was most interesting. Reword to emphasis the "only if" aspect. The locking comment for remove_tasks_in_empty_cpuset() was wrong; it said callback_mutex had to be held on entry. The opposite is true. Several mentions of attach_task() in comments needed to be changed to cgroup_attach_task(). A comment about notify_on_release was no longer relevant, as the line of code it had commented, namely: set_bit(CS_RELEASED_RESOURCE, >flags); is no longer present in that place in the cpuset.c code. Similarly a comment about notify_on_release before the scan_for_empty_cpusets() routine was no longer relevant. Removed extra parentheses and unnecessary return statement. Renamed attach_task() to cpuset_attach() in various comments. Removed comment about not needing memory migration, as it seems the migration is done anyway, via the cpuset_attach() callback from cgroup_attach_task(). Signed-off-by: Paul Jackson <[EMAIL PROTECTED]> Acked-by: Cliff Wickman <[EMAIL PROTECTED]> --- kernel/cpuset.c | 53 + 1 file changed, 21 insertions(+), 32 deletions(-) Index: linux-2.6/kernel/cpuset.c === --- linux-2.6.orig/kernel/cpuset.c +++ linux-2.6/kernel/cpuset.c @@ -167,7 +167,7 @@ static inline int is_spread_slab(const s * number, and avoid having to lock and reload mems_allowed unless * the cpuset they're using changes generation. * - * A single, global generation is needed because attach_task() could + * A single, global generation is needed because cpuset_attach() could * reattach a task to a different cpuset, which must not have its * generation numbers aliased with those of that tasks previous cpuset. 
* @@ -218,7 +218,7 @@ static struct cpuset top_cpuset = { * Any task can increment and decrement the count field without lock. * So in general, code holding manage_mutex or callback_mutex can't rely * on the count field not changing. However, if the count goes to - * zero, then only attach_task(), which holds both mutexes, can + * zero, then only cpuset_attach(), which holds both mutexes, can * increment it again. Because a count of zero means that no tasks * are currently attached, therefore there is no way a task attached * to that cpuset can fork (the other way to increment the count). @@ -255,18 +255,18 @@ static struct cpuset top_cpuset = { * * The task_lock() exception * - * The need for this exception arises from the action of attach_task(), + * The need for this exception arises from the action of cpuset_attach(), * which overwrites one tasks cpuset pointer with another. It does * so using both mutexes, however there are several performance * critical places that need to reference task->cpuset without the * expense of grabbing a system global mutex. Therefore except as - * noted below, when dereferencing or, as in attach_task(), modifying + * noted below, when dereferencing or, as in cpuset_attach(), modifying * a tasks cpuset pointer we use task_lock(), which acts on a spinlock * (task->alloc_lock) already in the task_struct routinely used for * such matters. * * P.S. One more locking exception. RCU is used to guard the - * update of a tasks cpuset pointer by attach_task() and the + * update of a tasks cpuset pointer by cpuset_attach() and the * access of task->cpuset->mems_generation via that pointer in * the routine cpuset_update_task_memory_state(). 
*/ @@ -368,7 +368,7 @@ static void guarantee_online_mems(const * * Reading current->cpuset->mems_generation doesn't need task_lock * to guard the current->cpuset derefence, because it is guarded - * from concurrent freeing of current->cpuset by attach_task(), + * from concurrent freeing of current->cpuset by cpuset_attach(), * using RCU. * * The rcu_dereference() is technically probably not needed, @@ -790,7 +790,7 @@ static int update_cpumask(struct cpuset trialcs = *cs; /* -* An empty cpus_allowed is ok if there are no tasks in the cpuset. +* An empty cpus_allowed is ok only if the cpuset has no tasks. * Since cpulist_parse() fails on an empty mask, we special case * that parsing. The validate_change() call ensures that cpusets * with tasks have cpus. @@ -847,7 +847,7 @@ static int update_cpumask(struct cpuset *so that the migration code can allocate pages on these nodes. * *Call
[PATCH 1/4] hotplug cpu move tasks in empty cpusets to parent node_online_map fix
As of the October 2007 kernel/cpuset.c patch Memoryless nodes: Use N_HIGH_MEMORY for cpusets, cpuset nodes are relative to the nodes with (HIGH) memory, not relative to all nodes in node_online_map. Signed-off-by: Paul Jackson [EMAIL PROTECTED] Acked-by: Cliff Wickman [EMAIL PROTECTED] --- kernel/cpuset.c |7 --- 1 file changed, 4 insertions(+), 3 deletions(-) Index: linux-2.6/kernel/cpuset.c === --- linux-2.6.orig/kernel/cpuset.c +++ linux-2.6/kernel/cpuset.c @@ -1801,7 +1801,8 @@ static void scan_for_empty_cpusets(const cont = cp-css.cgroup; /* Remove offline cpus and mems from this cpuset. */ cpus_and(cp-cpus_allowed, cp-cpus_allowed, cpu_online_map); - nodes_and(cp-mems_allowed, cp-mems_allowed, node_online_map); + nodes_and(cp-mems_allowed, cp-mems_allowed, + node_states[N_HIGH_MEMORY]); if ((cpus_empty(cp-cpus_allowed) || nodes_empty(cp-mems_allowed))) { /* Move tasks from the empty cpuset to a parent */ @@ -1816,8 +1817,8 @@ static void scan_for_empty_cpusets(const /* * The cpus_allowed and mems_allowed nodemasks in the top_cpuset track - * cpu_online_map and node_online_map. Force the top cpuset to track - * whats online after any CPU or memory node hotplug or unplug event. + * cpu_online_map and node_states[N_HIGH_MEMORY]. Force the top cpuset to + * track what's online after any CPU or memory node hotplug or unplug event. * * Since there are two callers of this routine, one for CPU hotplug * events and one for memory node hotplug events, we could have coded -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 4/4] hotplug cpu move tasks in empty cpusets - refinements
Narrow the scope of callback_mutex in scan_for_empty_cpusets(). Avoid rewriting the cpus, mems of cpusets except when it is likely that we'll be changing them. Have remove_tasks_in_empty_cpuset() also check for empty mems. Signed-off-by: Paul Jackson [EMAIL PROTECTED] Acked-by: Cliff Wickman [EMAIL PROTECTED] --- kernel/cpuset.c | 21 + 1 file changed, 13 insertions(+), 8 deletions(-) Index: linux-2.6/kernel/cpuset.c === --- linux-2.6.orig/kernel/cpuset.c +++ linux-2.6/kernel/cpuset.c @@ -1748,7 +1748,8 @@ static void remove_tasks_in_empty_cpuset * has online cpus, so can't be empty). */ parent = cs-parent; - while (cpus_empty(parent-cpus_allowed)) + while (cpus_empty(parent-cpus_allowed) || + nodes_empty(parent-mems_allowed)) parent = parent-parent; move_member_tasks_to_cpuset(cs, parent); @@ -1780,7 +1781,6 @@ static void scan_for_empty_cpusets(const list_add_tail((struct list_head *)root-stack_list, queue); - mutex_lock(callback_mutex); while (!list_empty(queue)) { cp = container_of(queue.next, struct cpuset, stack_list); list_del(queue.next); @@ -1789,19 +1789,24 @@ static void scan_for_empty_cpusets(const list_add_tail(child-stack_list, queue); } cont = cp-css.cgroup; + + /* Continue past cpusets with all cpus, mems online */ + if (cpus_subset(cp-cpus_allowed, cpu_online_map) + nodes_subset(cp-mems_allowed, node_states[N_HIGH_MEMORY])) + continue; + /* Remove offline cpus and mems from this cpuset. 
*/ + mutex_lock(callback_mutex); cpus_and(cp-cpus_allowed, cp-cpus_allowed, cpu_online_map); nodes_and(cp-mems_allowed, cp-mems_allowed, node_states[N_HIGH_MEMORY]); + mutex_unlock(callback_mutex); + + /* Move tasks from the empty cpuset to a parent */ if (cpus_empty(cp-cpus_allowed) || -nodes_empty(cp-mems_allowed)) { - /* Move tasks from the empty cpuset to a parent */ - mutex_unlock(callback_mutex); +nodes_empty(cp-mems_allowed)) remove_tasks_in_empty_cpuset(cp); - mutex_lock(callback_mutex); - } } - mutex_unlock(callback_mutex); } /* -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 3/4] cpusets: update_cpumask documentation fix
Update cpuset documentation to match the October 2007 Fix cpusets update_cpumask changes that now apply changes to a cpusets 'cpus' allowed mask immediately to the cpus_allowed of the tasks in that cpuset. Signed-off-by: Paul Jackson [EMAIL PROTECTED] Acked-by: Cliff Wickman [EMAIL PROTECTED] --- Documentation/cpusets.txt | 23 --- 1 file changed, 8 insertions(+), 15 deletions(-) Index: linux-2.6/Documentation/cpusets.txt === --- linux-2.6.orig/Documentation/cpusets.txt +++ linux-2.6/Documentation/cpusets.txt @@ -523,21 +523,14 @@ from one cpuset to another, then the ker memory placement, as above, the next time that the kernel attempts to allocate a page of memory for that task. -If a cpuset has its CPUs modified, then each task using that -cpuset does _not_ change its behavior automatically. In order to -minimize the impact on the critical scheduling code in the kernel, -tasks will continue to use their prior CPU placement until they -are rebound to their cpuset, by rewriting their pid to the 'tasks' -file of their cpuset. If a task had been bound to some subset of its -cpuset using the sched_setaffinity() call, and if any of that subset -is still allowed in its new cpuset settings, then the task will be -restricted to the intersection of the CPUs it was allowed on before, -and its new cpuset CPU placement. If, on the other hand, there is -no overlap between a tasks prior placement and its new cpuset CPU -placement, then the task will be allowed to run on any CPU allowed -in its new cpuset. If a task is moved from one cpuset to another, -its CPU placement is updated in the same way as if the tasks pid is -rewritten to the 'tasks' file of its current cpuset. +If a cpuset has its 'cpus' modified, then each task in that cpuset +will have its allowed CPU placement changed immediately. Similarly, +if a tasks pid is written to a cpusets 'tasks' file, in either its +current cpuset or another cpuset, then its allowed CPU placement is +changed immediately. 
If such a task had been bound to some subset +of its cpuset using the sched_setaffinity() call, the task will be +allowed to run on any CPU allowed in its new cpuset, negating the +affect of the prior sched_setaffinity() call. In summary, the memory placement of a task whose cpuset is changed is updated by the kernel, on the next allocation of a page for that task, -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[4/4] hotplug cpu move tasks in empty cpusets - refinements
Narrow the scope of callback_mutex in scan_for_empty_cpusets(). Avoid rewriting the cpus, mems of cpusets except when it is likely that we'll be changing them. Have remove_tasks_in_empty_cpuset() also check for empty mems. Signed-off-by: Paul Jackson [EMAIL PROTECTED] Acked-by: Cliff Wickman [EMAIL PROTECTED] --- kernel/cpuset.c | 21 + 1 file changed, 13 insertions(+), 8 deletions(-) Index: linux-2.6/kernel/cpuset.c === --- linux-2.6.orig/kernel/cpuset.c +++ linux-2.6/kernel/cpuset.c @@ -1748,7 +1748,8 @@ static void remove_tasks_in_empty_cpuset * has online cpus, so can't be empty). */ parent = cs-parent; - while (cpus_empty(parent-cpus_allowed)) + while (cpus_empty(parent-cpus_allowed) || + nodes_empty(parent-mems_allowed)) parent = parent-parent; move_member_tasks_to_cpuset(cs, parent); @@ -1780,7 +1781,6 @@ static void scan_for_empty_cpusets(const list_add_tail((struct list_head *)root-stack_list, queue); - mutex_lock(callback_mutex); while (!list_empty(queue)) { cp = container_of(queue.next, struct cpuset, stack_list); list_del(queue.next); @@ -1789,19 +1789,24 @@ static void scan_for_empty_cpusets(const list_add_tail(child-stack_list, queue); } cont = cp-css.cgroup; + + /* Continue past cpusets with all cpus, mems online */ + if (cpus_subset(cp-cpus_allowed, cpu_online_map) + nodes_subset(cp-mems_allowed, node_states[N_HIGH_MEMORY])) + continue; + /* Remove offline cpus and mems from this cpuset. 
*/ + mutex_lock(callback_mutex); cpus_and(cp-cpus_allowed, cp-cpus_allowed, cpu_online_map); nodes_and(cp-mems_allowed, cp-mems_allowed, node_states[N_HIGH_MEMORY]); + mutex_unlock(callback_mutex); + + /* Move tasks from the empty cpuset to a parent */ if (cpus_empty(cp-cpus_allowed) || -nodes_empty(cp-mems_allowed)) { - /* Move tasks from the empty cpuset to a parent */ - mutex_unlock(callback_mutex); +nodes_empty(cp-mems_allowed)) remove_tasks_in_empty_cpuset(cp); - mutex_lock(callback_mutex); - } } - mutex_unlock(callback_mutex); } /* -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 2/4] hotplug cpu move tasks in empty cpusets to parent various other fixes
Various minor formatting and comment tweaks to Cliff Wickman's [PATCH_3_of_3]_cpusets__update_cpumask_revision.patch I had had iff, meaning if and only if in a comment. However, except for ancient mathematicians, the abbreviation iff was a tad too cryptic. Cliff changed it to if, presumably figuring that the iff was a typo. However, it was the only if half of the conjunction that was most interesting. Reword to emphasis the only if aspect. The locking comment for remove_tasks_in_empty_cpuset() was wrong; it said callback_mutex had to be held on entry. The opposite is true. Several mentions of attach_task() in comments needed to be changed to cgroup_attach_task(). A comment about notify_on_release was no longer relevant, as the line of code it had commented, namely: set_bit(CS_RELEASED_RESOURCE, parent-flags); is no longer present in that place in the cpuset.c code. Similarly a comment about notify_on_release before the scan_for_empty_cpusets() routine was no longer relevant. Removed extra parentheses and unnecessary return statement. Renamed attach_task() to cpuset_attach() in various comments. Removed comment about not needing memory migration, as it seems the migration is done anyway, via the cpuset_attach() callback from cgroup_attach_task(). Signed-off-by: Paul Jackson [EMAIL PROTECTED] Acked-by: Cliff Wickman [EMAIL PROTECTED] --- kernel/cpuset.c | 53 + 1 file changed, 21 insertions(+), 32 deletions(-) Index: linux-2.6/kernel/cpuset.c === --- linux-2.6.orig/kernel/cpuset.c +++ linux-2.6/kernel/cpuset.c @@ -167,7 +167,7 @@ static inline int is_spread_slab(const s * number, and avoid having to lock and reload mems_allowed unless * the cpuset they're using changes generation. * - * A single, global generation is needed because attach_task() could + * A single, global generation is needed because cpuset_attach() could * reattach a task to a different cpuset, which must not have its * generation numbers aliased with those of that tasks previous cpuset. 
* @@ -218,7 +218,7 @@ static struct cpuset top_cpuset = { * Any task can increment and decrement the count field without lock. * So in general, code holding manage_mutex or callback_mutex can't rely * on the count field not changing. However, if the count goes to - * zero, then only attach_task(), which holds both mutexes, can + * zero, then only cpuset_attach(), which holds both mutexes, can * increment it again. Because a count of zero means that no tasks * are currently attached, therefore there is no way a task attached * to that cpuset can fork (the other way to increment the count). @@ -255,18 +255,18 @@ static struct cpuset top_cpuset = { * * The task_lock() exception * - * The need for this exception arises from the action of attach_task(), + * The need for this exception arises from the action of cpuset_attach(), * which overwrites one tasks cpuset pointer with another. It does * so using both mutexes, however there are several performance * critical places that need to reference task-cpuset without the * expense of grabbing a system global mutex. Therefore except as - * noted below, when dereferencing or, as in attach_task(), modifying + * noted below, when dereferencing or, as in cpuset_attach(), modifying * a tasks cpuset pointer we use task_lock(), which acts on a spinlock * (task-alloc_lock) already in the task_struct routinely used for * such matters. * * P.S. One more locking exception. RCU is used to guard the - * update of a tasks cpuset pointer by attach_task() and the + * update of a tasks cpuset pointer by cpuset_attach() and the * access of task-cpuset-mems_generation via that pointer in * the routine cpuset_update_task_memory_state(). 
*/ @@ -368,7 +368,7 @@ static void guarantee_online_mems(const * * Reading current-cpuset-mems_generation doesn't need task_lock * to guard the current-cpuset derefence, because it is guarded - * from concurrent freeing of current-cpuset by attach_task(), + * from concurrent freeing of current-cpuset by cpuset_attach(), * using RCU. * * The rcu_dereference() is technically probably not needed, @@ -790,7 +790,7 @@ static int update_cpumask(struct cpuset trialcs = *cs; /* -* An empty cpus_allowed is ok if there are no tasks in the cpuset. +* An empty cpus_allowed is ok only if the cpuset has no tasks. * Since cpulist_parse() fails on an empty mask, we special case * that parsing. The validate_change() call ensures that cpusets * with tasks have cpus. @@ -847,7 +847,7 @@ static int update_cpumask(struct cpuset *so that the migration code can allocate pages on these nodes. * *Call holding manage_mutex, so our current-cpuset won't change - *during this call, as manage_mutex holds off any attach_task
[PATCH 2/4] hotplug cpu move tasks in empty cpusets to parent various other fixes
Various minor formatting and comment tweaks to Cliff Wickman's [PATCH_3_of_3]_cpusets__update_cpumask_revision.patch I had had iff, meaning if and only if in a comment. However, except for ancient mathematicians, the abbreviation iff was a tad too cryptic. Cliff changed it to if, presumably figuring that the iff was a typo. However, it was the only if half of the conjunction that was most interesting. Reword to emphasis the only if aspect. The locking comment for remove_tasks_in_empty_cpuset() was wrong; it said callback_mutex had to be held on entry. The opposite is true. Several mentions of attach_task() in comments needed to be changed to cgroup_attach_task(). A comment about notify_on_release was no longer relevant, as the line of code it had commented, namely: set_bit(CS_RELEASED_RESOURCE, parent-flags); is no longer present in that place in the cpuset.c code. Similarly a comment about notify_on_release before the scan_for_empty_cpusets() routine was no longer relevant. Removed extra parentheses and unnecessary return statement. Renamed attach_task() to cpuset_attach() in various comments. Removed comment about not needing memory migration, as it seems the migration is done anyway, via the cpuset_attach() callback from cgroup_attach_task(). Signed-off-by: Paul Jackson [EMAIL PROTECTED] Acked-by: Cliff Wickman [EMAIL PROTECTED] --- kernel/cpuset.c | 53 + 1 file changed, 21 insertions(+), 32 deletions(-) Index: linux-2.6/kernel/cpuset.c === --- linux-2.6.orig/kernel/cpuset.c +++ linux-2.6/kernel/cpuset.c @@ -167,7 +167,7 @@ static inline int is_spread_slab(const s * number, and avoid having to lock and reload mems_allowed unless * the cpuset they're using changes generation. * - * A single, global generation is needed because attach_task() could + * A single, global generation is needed because cpuset_attach() could * reattach a task to a different cpuset, which must not have its * generation numbers aliased with those of that tasks previous cpuset. 
* @@ -218,7 +218,7 @@ static struct cpuset top_cpuset = { * Any task can increment and decrement the count field without lock. * So in general, code holding manage_mutex or callback_mutex can't rely * on the count field not changing. However, if the count goes to - * zero, then only attach_task(), which holds both mutexes, can + * zero, then only cpuset_attach(), which holds both mutexes, can * increment it again. Because a count of zero means that no tasks * are currently attached, therefore there is no way a task attached * to that cpuset can fork (the other way to increment the count). @@ -255,18 +255,18 @@ static struct cpuset top_cpuset = { * * The task_lock() exception * - * The need for this exception arises from the action of attach_task(), + * The need for this exception arises from the action of cpuset_attach(), * which overwrites one tasks cpuset pointer with another. It does * so using both mutexes, however there are several performance * critical places that need to reference task-cpuset without the * expense of grabbing a system global mutex. Therefore except as - * noted below, when dereferencing or, as in attach_task(), modifying + * noted below, when dereferencing or, as in cpuset_attach(), modifying * a tasks cpuset pointer we use task_lock(), which acts on a spinlock * (task-alloc_lock) already in the task_struct routinely used for * such matters. * * P.S. One more locking exception. RCU is used to guard the - * update of a tasks cpuset pointer by attach_task() and the + * update of a tasks cpuset pointer by cpuset_attach() and the * access of task-cpuset-mems_generation via that pointer in * the routine cpuset_update_task_memory_state(). 
*/ @@ -368,7 +368,7 @@ static void guarantee_online_mems(const * * Reading current-cpuset-mems_generation doesn't need task_lock * to guard the current-cpuset derefence, because it is guarded - * from concurrent freeing of current-cpuset by attach_task(), + * from concurrent freeing of current-cpuset by cpuset_attach(), * using RCU. * * The rcu_dereference() is technically probably not needed, @@ -790,7 +790,7 @@ static int update_cpumask(struct cpuset trialcs = *cs; /* -* An empty cpus_allowed is ok if there are no tasks in the cpuset. +* An empty cpus_allowed is ok only if the cpuset has no tasks. * Since cpulist_parse() fails on an empty mask, we special case * that parsing. The validate_change() call ensures that cpusets * with tasks have cpus. @@ -847,7 +847,7 @@ static int update_cpumask(struct cpuset *so that the migration code can allocate pages on these nodes. * *Call holding manage_mutex, so our current-cpuset won't change - *during this call, as manage_mutex holds off any attach_task
[RFC] hotplug cpu move tasks in empty cpusets - possible refinements
Hi Paul, > Query for Cliff: > 1) Can we narrow the scope of callback_mutex in scan_for_empty_cpusets()? > 2) Can we avoid rewriting the cpus, mems of cpusets except when it is >likely that we'll be changing them? > 3) Should not remove_tasks_in_empty_cpuset() also check for empty mems? > -pj I agree with all of the above refinements. And I just tested the below patch and find no problem. So this is an ACK from me. -Cliff --- kernel/cpuset.c | 21 + 1 file changed, 13 insertions(+), 8 deletions(-) Index: linux-2.6/kernel/cpuset.c === --- linux-2.6.orig/kernel/cpuset.c +++ linux-2.6/kernel/cpuset.c @@ -1748,7 +1748,8 @@ static void remove_tasks_in_empty_cpuset * has online cpus, so can't be empty). */ parent = cs->parent; - while (cpus_empty(parent->cpus_allowed)) + while (cpus_empty(parent->cpus_allowed) || + nodes_empty(parent->mems_allowed)) parent = parent->parent; move_member_tasks_to_cpuset(cs, parent); @@ -1780,7 +1781,6 @@ static void scan_for_empty_cpusets(const list_add_tail((struct list_head *)>stack_list, ); - mutex_lock(_mutex); while (!list_empty()) { cp = container_of(queue.next, struct cpuset, stack_list); list_del(queue.next); @@ -1789,19 +1789,24 @@ static void scan_for_empty_cpusets(const list_add_tail(>stack_list, ); } cont = cp->css.cgroup; + + /* Continue past cpusets with all cpus, mems online */ + if (cpus_subset(cp->cpus_allowed, cpu_online_map) && + nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY])) + continue; + /* Remove offline cpus and mems from this cpuset. 
*/ + mutex_lock(_mutex); cpus_and(cp->cpus_allowed, cp->cpus_allowed, cpu_online_map); nodes_and(cp->mems_allowed, cp->mems_allowed, node_states[N_HIGH_MEMORY]); + mutex_unlock(_mutex); + + /* Move tasks from the empty cpuset to a parent */ if (cpus_empty(cp->cpus_allowed) || -nodes_empty(cp->mems_allowed)) { - /* Move tasks from the empty cpuset to a parent */ - mutex_unlock(_mutex); +nodes_empty(cp->mems_allowed)) remove_tasks_in_empty_cpuset(cp); - mutex_lock(_mutex); - } } - mutex_unlock(_mutex); } /* -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[RFC] hotplug cpu move tasks in empty cpusets - possible refinements
Hi Paul, Query for Cliff: 1) Can we narrow the scope of callback_mutex in scan_for_empty_cpusets()? 2) Can we avoid rewriting the cpus, mems of cpusets except when it is likely that we'll be changing them? 3) Should not remove_tasks_in_empty_cpuset() also check for empty mems? -pj I agree with all of the above refinements. And I just tested the below patch and find no problem. So this is an ACK from me. -Cliff --- kernel/cpuset.c | 21 + 1 file changed, 13 insertions(+), 8 deletions(-) Index: linux-2.6/kernel/cpuset.c === --- linux-2.6.orig/kernel/cpuset.c +++ linux-2.6/kernel/cpuset.c @@ -1748,7 +1748,8 @@ static void remove_tasks_in_empty_cpuset * has online cpus, so can't be empty). */ parent = cs-parent; - while (cpus_empty(parent-cpus_allowed)) + while (cpus_empty(parent-cpus_allowed) || + nodes_empty(parent-mems_allowed)) parent = parent-parent; move_member_tasks_to_cpuset(cs, parent); @@ -1780,7 +1781,6 @@ static void scan_for_empty_cpusets(const list_add_tail((struct list_head *)root-stack_list, queue); - mutex_lock(callback_mutex); while (!list_empty(queue)) { cp = container_of(queue.next, struct cpuset, stack_list); list_del(queue.next); @@ -1789,19 +1789,24 @@ static void scan_for_empty_cpusets(const list_add_tail(child-stack_list, queue); } cont = cp-css.cgroup; + + /* Continue past cpusets with all cpus, mems online */ + if (cpus_subset(cp-cpus_allowed, cpu_online_map) + nodes_subset(cp-mems_allowed, node_states[N_HIGH_MEMORY])) + continue; + /* Remove offline cpus and mems from this cpuset. 
*/ + mutex_lock(callback_mutex); cpus_and(cp-cpus_allowed, cp-cpus_allowed, cpu_online_map); nodes_and(cp-mems_allowed, cp-mems_allowed, node_states[N_HIGH_MEMORY]); + mutex_unlock(callback_mutex); + + /* Move tasks from the empty cpuset to a parent */ if (cpus_empty(cp-cpus_allowed) || -nodes_empty(cp-mems_allowed)) { - /* Move tasks from the empty cpuset to a parent */ - mutex_unlock(callback_mutex); +nodes_empty(cp-mems_allowed)) remove_tasks_in_empty_cpuset(cp); - mutex_lock(callback_mutex); - } } - mutex_unlock(callback_mutex); } /* -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH][VER 4] mspec: handle shrinking virtual memory areas
Stress testing revealed the need for (yet more) revision. sorry. This is a revision of Andrew's mspec-handle-shrinking-virtual-memory-areas.patch Version 4: clear/release fetchop pages only when vma_data is no longer shared The vma_data structure may be shared by vma's from multiple tasks, with no way of knowing which areas are shared or not shared, so release/clear pages only when the refcount (of vma's) goes to zero. Diffed against 2.6.23-rc7 Signed-off-by: Cliff Wickman <[EMAIL PROTECTED]> --- drivers/char/mspec.c | 26 -- 1 file changed, 8 insertions(+), 18 deletions(-) Index: linus.070920/drivers/char/mspec.c === --- linus.070920.orig/drivers/char/mspec.c +++ linus.070920/drivers/char/mspec.c @@ -155,23 +155,22 @@ mspec_open(struct vm_area_struct *vma) * mspec_close * * Called when unmapping a device mapping. Frees all mspec pages - * belonging to the vma. + * belonging to all the vma's sharing this vma_data structure. */ static void mspec_close(struct vm_area_struct *vma) { struct vma_data *vdata; - int index, last_index, result; + int index, last_index; unsigned long my_page; vdata = vma->vm_private_data; - BUG_ON(vma->vm_start < vdata->vm_start || vma->vm_end > vdata->vm_end); + if (!atomic_dec_and_test(>refcnt)) + return; - spin_lock(>lock); - index = (vma->vm_start - vdata->vm_start) >> PAGE_SHIFT; - last_index = (vma->vm_end - vdata->vm_start) >> PAGE_SHIFT; - for (; index < last_index; index++) { + last_index = (vdata->vm_end - vdata->vm_start) >> PAGE_SHIFT; + for (index=0; index < last_index; index++) { if (vdata->maddr[index] == 0) continue; /* @@ -180,20 +179,12 @@ mspec_close(struct vm_area_struct *vma) */ my_page = vdata->maddr[index]; vdata->maddr[index] = 0; - spin_unlock(>lock); - result = mspec_zero_block(my_page, PAGE_SIZE); - if (!result) + if (!mspec_zero_block(my_page, PAGE_SIZE)) uncached_free_page(my_page); else printk(KERN_WARNING "mspec_close(): " - "failed to zero page %i\n", - result); - spin_lock(>lock); + "failed to zero page 
%ld\n", my_page); } - spin_unlock(>lock); - - if (!atomic_dec_and_test(>refcnt)) - return; if (vdata->flags & VMD_VMALLOCED) vfree(vdata); @@ -201,7 +192,6 @@ mspec_close(struct vm_area_struct *vma) kfree(vdata); } - /* * mspec_nopfn * - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH][VER 4] mspec: handle shrinking virtual memory areas
Stress testing revealed the need for more revision. This is a revision of Andrew's mspec-handle-shrinking-virtual-memory-areas.patch Version 4: clear/release fetchop pages only when vma_data is no longer shared Version 3: single thread the clearing of vma_data maddr[] Version 2: refcount maintained as atomic_t (as before the version 1 patch) The shrinking of a virtual memory area that is mmap(2)'d to a memory special file (device drivers/char/mspec.c) can cause a panic. If the mapped size of the vma (vm_area_struct) is very large, mspec allocates a large vma_data structure with vmalloc(). But such a vma can be shrunk by an munmap(2). The current driver uses the current size of each vma to deduce whether its vma_data structure was allocated by kmalloc() or vmalloc(). So if the vma was shrunk it appears to have been allocated by kmalloc(), and mspec attempts to free it with kfree(). This results in a panic. This patch avoids the panic (by preserving the type of the allocation) and also makes mspec work correctly as the vma is split into pieces by the munmap(2)'s. All vma's derived from such a split vma share the same vma_data structure that represents all the pages mapped into this set of vma's. The mpec driver must be made capable of using the right portion of the structure for each member vma. In other words, it must index into the array of page addresses using the portion of the array that represents the current vma. This is enabled by storing the vma group's vm_start in the vma_data structure. The shared vma_data's are not protected by mm->mmap_sem in the fork() case so the reference count is left as atomic_t. The vma_data structure may be shared by vma's from multiple tasks, with no way of knowing which areas are shared or not shared, so release/clear pages only when the refcount (of vma's) goes to zero. 
Diffed against 2.6.23-rc5 Signed-off-by: Cliff Wickman <[EMAIL PROTECTED]> Acked-by: Jes Sorensen <[EMAIL PROTECTED]> --- drivers/char/mspec.c | 64 --- 1 file changed, 41 insertions(+), 23 deletions(-) Index: linus.070912/drivers/char/mspec.c === --- linus.070912.orig/drivers/char/mspec.c +++ linus.070912/drivers/char/mspec.c @@ -67,7 +67,7 @@ /* * Page types allocated by the device. */ -enum { +enum mspec_page_type { MSPEC_FETCHOP = 1, MSPEC_CACHED, MSPEC_UNCACHED @@ -83,15 +83,25 @@ static int is_sn2; * One of these structures is allocated when an mspec region is mmaped. The * structure is pointed to by the vma->vm_private_data field in the vma struct. * This structure is used to record the addresses of the mspec pages. + * This structure is shared by all vma's that are split off from the + * original vma when split_vma()'s are done. + * + * The refcnt is incremented atomically because mm->mmap_sem does not + * protect in fork case where multiple tasks share the vma_data. */ struct vma_data { atomic_t refcnt;/* Number of vmas sharing the data. */ - spinlock_t lock;/* Serialize access to the vma. */ + spinlock_t lock;/* Serialize access to this structure. */ int count; /* Number of pages allocated. */ - int type; /* Type of pages allocated. */ + enum mspec_page_type type; /* Type of pages allocated. */ + int flags; /* See VMD_xxx below. */ + unsigned long vm_start; /* Original (unsplit) base. */ + unsigned long vm_end; /* Original (unsplit) end. */ unsigned long maddr[0]; /* Array of MSPEC addresses. */ }; +#define VMD_VMALLOCED 0x1 /* vmalloc'd rather than kmalloc'd */ + /* used on shub2 to clear FOP cache in the HUB */ static unsigned long scratch_page[MAX_NUMNODES]; #define SH2_AMO_CACHE_ENTRIES 4 @@ -129,8 +139,8 @@ mspec_zero_block(unsigned long addr, int * mspec_open * * Called when a device mapping is created by a means other than mmap - * (via fork, etc.). Increments the reference count on the underlying - * mspec data so it is not freed prematurely. 
+ * (via fork, munmap, etc.). Increments the reference count on the + * underlying mspec data so it is not freed prematurely. */ static void mspec_open(struct vm_area_struct *vma) @@ -145,40 +155,41 @@ mspec_open(struct vm_area_struct *vma) * mspec_close * * Called when unmapping a device mapping. Frees all mspec pages - * belonging to the vma. + * belonging to all the vma's sharing this vma_data structure. */ static void mspec_close(struct vm_area_struct *vma) { struct vma_data *vdata; - int i, pages, result, vdata_size; + int index, last_index; + unsigned long my_page; vdata = vma->vm_private_data; + if (!atomic_dec_and_test(>refcnt)) return; - pages = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; - vdata
[PATCH][VER 4] mspec: handle shrinking virtual memory areas
Stress testing revealed the need for more revision. This is a revision of Andrew's mspec-handle-shrinking-virtual-memory-areas.patch Version 4: clear/release fetchop pages only when vma_data is no longer shared Version 3: single thread the clearing of vma_data maddr[] Version 2: refcount maintained as atomic_t (as before the version 1 patch) The shrinking of a virtual memory area that is mmap(2)'d to a memory special file (device drivers/char/mspec.c) can cause a panic. If the mapped size of the vma (vm_area_struct) is very large, mspec allocates a large vma_data structure with vmalloc(). But such a vma can be shrunk by an munmap(2). The current driver uses the current size of each vma to deduce whether its vma_data structure was allocated by kmalloc() or vmalloc(). So if the vma was shrunk it appears to have been allocated by kmalloc(), and mspec attempts to free it with kfree(). This results in a panic. This patch avoids the panic (by preserving the type of the allocation) and also makes mspec work correctly as the vma is split into pieces by the munmap(2)'s. All vma's derived from such a split vma share the same vma_data structure that represents all the pages mapped into this set of vma's. The mpec driver must be made capable of using the right portion of the structure for each member vma. In other words, it must index into the array of page addresses using the portion of the array that represents the current vma. This is enabled by storing the vma group's vm_start in the vma_data structure. The shared vma_data's are not protected by mm-mmap_sem in the fork() case so the reference count is left as atomic_t. The vma_data structure may be shared by vma's from multiple tasks, with no way of knowing which areas are shared or not shared, so release/clear pages only when the refcount (of vma's) goes to zero. 
Diffed against 2.6.23-rc5 Signed-off-by: Cliff Wickman [EMAIL PROTECTED] Acked-by: Jes Sorensen [EMAIL PROTECTED] --- drivers/char/mspec.c | 64 --- 1 file changed, 41 insertions(+), 23 deletions(-) Index: linus.070912/drivers/char/mspec.c === --- linus.070912.orig/drivers/char/mspec.c +++ linus.070912/drivers/char/mspec.c @@ -67,7 +67,7 @@ /* * Page types allocated by the device. */ -enum { +enum mspec_page_type { MSPEC_FETCHOP = 1, MSPEC_CACHED, MSPEC_UNCACHED @@ -83,15 +83,25 @@ static int is_sn2; * One of these structures is allocated when an mspec region is mmaped. The * structure is pointed to by the vma-vm_private_data field in the vma struct. * This structure is used to record the addresses of the mspec pages. + * This structure is shared by all vma's that are split off from the + * original vma when split_vma()'s are done. + * + * The refcnt is incremented atomically because mm-mmap_sem does not + * protect in fork case where multiple tasks share the vma_data. */ struct vma_data { atomic_t refcnt;/* Number of vmas sharing the data. */ - spinlock_t lock;/* Serialize access to the vma. */ + spinlock_t lock;/* Serialize access to this structure. */ int count; /* Number of pages allocated. */ - int type; /* Type of pages allocated. */ + enum mspec_page_type type; /* Type of pages allocated. */ + int flags; /* See VMD_xxx below. */ + unsigned long vm_start; /* Original (unsplit) base. */ + unsigned long vm_end; /* Original (unsplit) end. */ unsigned long maddr[0]; /* Array of MSPEC addresses. */ }; +#define VMD_VMALLOCED 0x1 /* vmalloc'd rather than kmalloc'd */ + /* used on shub2 to clear FOP cache in the HUB */ static unsigned long scratch_page[MAX_NUMNODES]; #define SH2_AMO_CACHE_ENTRIES 4 @@ -129,8 +139,8 @@ mspec_zero_block(unsigned long addr, int * mspec_open * * Called when a device mapping is created by a means other than mmap - * (via fork, etc.). Increments the reference count on the underlying - * mspec data so it is not freed prematurely. 
+ * (via fork, munmap, etc.). Increments the reference count on the + * underlying mspec data so it is not freed prematurely. */ static void mspec_open(struct vm_area_struct *vma) @@ -145,40 +155,41 @@ mspec_open(struct vm_area_struct *vma) * mspec_close * * Called when unmapping a device mapping. Frees all mspec pages - * belonging to the vma. + * belonging to all the vma's sharing this vma_data structure. */ static void mspec_close(struct vm_area_struct *vma) { struct vma_data *vdata; - int i, pages, result, vdata_size; + int index, last_index; + unsigned long my_page; vdata = vma-vm_private_data; + if (!atomic_dec_and_test(vdata-refcnt)) return; - pages = (vma-vm_end - vma-vm_start) PAGE_SHIFT; - vdata_size = sizeof(struct vma_data) + pages
[PATCH][VER 4] mspec: handle shrinking virtual memory areas
Stress testing revealed the need for (yet more) revision. sorry. This is a revision of Andrew's mspec-handle-shrinking-virtual-memory-areas.patch Version 4: clear/release fetchop pages only when vma_data is no longer shared The vma_data structure may be shared by vma's from multiple tasks, with no way of knowing which areas are shared or not shared, so release/clear pages only when the refcount (of vma's) goes to zero. Diffed against 2.6.23-rc7 Signed-off-by: Cliff Wickman [EMAIL PROTECTED] --- drivers/char/mspec.c | 26 -- 1 file changed, 8 insertions(+), 18 deletions(-) Index: linus.070920/drivers/char/mspec.c === --- linus.070920.orig/drivers/char/mspec.c +++ linus.070920/drivers/char/mspec.c @@ -155,23 +155,22 @@ mspec_open(struct vm_area_struct *vma) * mspec_close * * Called when unmapping a device mapping. Frees all mspec pages - * belonging to the vma. + * belonging to all the vma's sharing this vma_data structure. */ static void mspec_close(struct vm_area_struct *vma) { struct vma_data *vdata; - int index, last_index, result; + int index, last_index; unsigned long my_page; vdata = vma-vm_private_data; - BUG_ON(vma-vm_start vdata-vm_start || vma-vm_end vdata-vm_end); + if (!atomic_dec_and_test(vdata-refcnt)) + return; - spin_lock(vdata-lock); - index = (vma-vm_start - vdata-vm_start) PAGE_SHIFT; - last_index = (vma-vm_end - vdata-vm_start) PAGE_SHIFT; - for (; index last_index; index++) { + last_index = (vdata-vm_end - vdata-vm_start) PAGE_SHIFT; + for (index=0; index last_index; index++) { if (vdata-maddr[index] == 0) continue; /* @@ -180,20 +179,12 @@ mspec_close(struct vm_area_struct *vma) */ my_page = vdata-maddr[index]; vdata-maddr[index] = 0; - spin_unlock(vdata-lock); - result = mspec_zero_block(my_page, PAGE_SIZE); - if (!result) + if (!mspec_zero_block(my_page, PAGE_SIZE)) uncached_free_page(my_page); else printk(KERN_WARNING mspec_close(): - failed to zero page %i\n, - result); - spin_lock(vdata-lock); + failed to zero page %ld\n, my_page); } - 
spin_unlock(vdata-lock); - - if (!atomic_dec_and_test(vdata-refcnt)) - return; if (vdata-flags VMD_VMALLOCED) vfree(vdata); @@ -201,7 +192,6 @@ mspec_close(struct vm_area_struct *vma) kfree(vdata); } - /* * mspec_nopfn * - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH][VER 3] mspec: handle shrinking virtual memory areas
Stress testing revealed the need for more revision: Version 3: single thread the clearing of vma_data maddr[] Version 2: refcount maintained as atomic_t (as before the version 1 patch) The shrinking of a virtual memory area that is mmap(2)'d to a memory special file (device drivers/char/mspec.c) can cause a panic. If the mapped size of the vma (vm_area_struct) is very large, mspec allocates a large vma_data structure with vmalloc(). But such a vma can be shrunk by an munmap(2). The current driver uses the current size of each vma to deduce whether its vma_data structure was allocated by kmalloc() or vmalloc(). So if the vma was shrunk it appears to have been allocated by kmalloc(), and mspec attempts to free it with kfree(). This results in a panic. This patch avoids the panic (by preserving the type of the allocation) and also makes mspec work correctly as the vma is split into pieces by the munmap(2)'s. All vma's derived from such a split vma share the same vma_data structure that represents all the pages mapped into this set of vma's. The mpec driver must be made capable of using the right portion of the structure for each member vma. In other words, it must index into the array of page addresses using the portion of the array that represents the current vma. This is enabled by storing the vma group's vm_start in the vma_data structure. The shared vma_data's are not protected by mm->mmap_sem in the fork() case so the reference count is left as atomic_t. Each section of the vma_data structure may be shared by multiple tasks (forked from the same parent). So single thread mspec_close() during the zeroing of a vma's section. 
Diffed against 2.6.23-rc5 Signed-off-by: Cliff Wickman <[EMAIL PROTECTED]> Acked-by: Jes Sorensen <[EMAIL PROTECTED]> --- - --- drivers/char/mspec.c | 69 +++-- --- drivers/char/mspec.c | 69 +++ 1 file changed, 48 insertions(+), 21 deletions(-) Index: mspec_community/drivers/char/mspec.c === --- mspec_community.orig/drivers/char/mspec.c +++ mspec_community/drivers/char/mspec.c @@ -67,7 +67,7 @@ /* * Page types allocated by the device. */ -enum { +enum mspec_page_type { MSPEC_FETCHOP = 1, MSPEC_CACHED, MSPEC_UNCACHED @@ -83,15 +83,25 @@ static int is_sn2; * One of these structures is allocated when an mspec region is mmaped. The * structure is pointed to by the vma->vm_private_data field in the vma struct. * This structure is used to record the addresses of the mspec pages. + * This structure is shared by all vma's that are split off from the + * original vma when split_vma()'s are done. + * + * The refcnt is incremented atomically because mm->mmap_sem does not + * protect in fork case where multiple tasks share the vma_data. */ struct vma_data { atomic_t refcnt;/* Number of vmas sharing the data. */ - spinlock_t lock;/* Serialize access to the vma. */ + spinlock_t lock;/* Serialize access to this structure. */ int count; /* Number of pages allocated. */ - int type; /* Type of pages allocated. */ + enum mspec_page_type type; /* Type of pages allocated. */ + int flags; /* See VMD_xxx below. */ + unsigned long vm_start; /* Original (unsplit) base. */ + unsigned long vm_end; /* Original (unsplit) end. */ unsigned long maddr[0]; /* Array of MSPEC addresses. */ }; +#define VMD_VMALLOCED 0x1 /* vmalloc'd rather than kmalloc'd */ + /* used on shub2 to clear FOP cache in the HUB */ static unsigned long scratch_page[MAX_NUMNODES]; #define SH2_AMO_CACHE_ENTRIES 4 @@ -129,8 +139,8 @@ mspec_zero_block(unsigned long addr, int * mspec_open * * Called when a device mapping is created by a means other than mmap - * (via fork, etc.). 
Increments the reference count on the underlying - * mspec data so it is not freed prematurely. + * (via fork, munmap, etc.). Increments the reference count on the + * underlying mspec data so it is not freed prematurely. */ static void mspec_open(struct vm_area_struct *vma) @@ -151,34 +161,44 @@ static void mspec_close(struct vm_area_struct *vma) { struct vma_data *vdata; - int i, pages, result, vdata_size; + int index, last_index, result; + unsigned long my_page; vdata = vma->vm_private_data; - if (!atomic_dec_and_test(>refcnt)) - return; - pages = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; - vdata_size = sizeof(struct vma_data) + pages * sizeof(long); - for (i = 0; i < pages; i++) { - if (vdata->maddr[i] == 0) + BUG_ON(vma->vm_start < vdata->vm_start || vma->vm_end > vdata->vm_end); + + spin_lock(>lock); + index = (vma->
[PATCH][VER 3] mspec: handle shrinking virtual memory areas
Stress testing revealed the need for more revision: Version 3: single thread the clearing of vma_data maddr[] Version 2: refcount maintained as atomic_t (as before the version 1 patch) The shrinking of a virtual memory area that is mmap(2)'d to a memory special file (device drivers/char/mspec.c) can cause a panic. If the mapped size of the vma (vm_area_struct) is very large, mspec allocates a large vma_data structure with vmalloc(). But such a vma can be shrunk by an munmap(2). The current driver uses the current size of each vma to deduce whether its vma_data structure was allocated by kmalloc() or vmalloc(). So if the vma was shrunk it appears to have been allocated by kmalloc(), and mspec attempts to free it with kfree(). This results in a panic. This patch avoids the panic (by preserving the type of the allocation) and also makes mspec work correctly as the vma is split into pieces by the munmap(2)'s. All vma's derived from such a split vma share the same vma_data structure that represents all the pages mapped into this set of vma's. The mspec driver must be made capable of using the right portion of the structure for each member vma. In other words, it must index into the array of page addresses using the portion of the array that represents the current vma. This is enabled by storing the vma group's vm_start in the vma_data structure. The shared vma_data's are not protected by mm->mmap_sem in the fork() case so the reference count is left as atomic_t. Each section of the vma_data structure may be shared by multiple tasks (forked from the same parent). So single thread mspec_close() during the zeroing of a vma's section. 
Diffed against 2.6.23-rc5 Signed-off-by: Cliff Wickman [EMAIL PROTECTED] Acked-by: Jes Sorensen [EMAIL PROTECTED] --- - --- drivers/char/mspec.c | 69 +++-- --- drivers/char/mspec.c | 69 +++ 1 file changed, 48 insertions(+), 21 deletions(-) Index: mspec_community/drivers/char/mspec.c === --- mspec_community.orig/drivers/char/mspec.c +++ mspec_community/drivers/char/mspec.c @@ -67,7 +67,7 @@ /* * Page types allocated by the device. */ -enum { +enum mspec_page_type { MSPEC_FETCHOP = 1, MSPEC_CACHED, MSPEC_UNCACHED @@ -83,15 +83,25 @@ static int is_sn2; * One of these structures is allocated when an mspec region is mmaped. The * structure is pointed to by the vma-vm_private_data field in the vma struct. * This structure is used to record the addresses of the mspec pages. + * This structure is shared by all vma's that are split off from the + * original vma when split_vma()'s are done. + * + * The refcnt is incremented atomically because mm-mmap_sem does not + * protect in fork case where multiple tasks share the vma_data. */ struct vma_data { atomic_t refcnt;/* Number of vmas sharing the data. */ - spinlock_t lock;/* Serialize access to the vma. */ + spinlock_t lock;/* Serialize access to this structure. */ int count; /* Number of pages allocated. */ - int type; /* Type of pages allocated. */ + enum mspec_page_type type; /* Type of pages allocated. */ + int flags; /* See VMD_xxx below. */ + unsigned long vm_start; /* Original (unsplit) base. */ + unsigned long vm_end; /* Original (unsplit) end. */ unsigned long maddr[0]; /* Array of MSPEC addresses. */ }; +#define VMD_VMALLOCED 0x1 /* vmalloc'd rather than kmalloc'd */ + /* used on shub2 to clear FOP cache in the HUB */ static unsigned long scratch_page[MAX_NUMNODES]; #define SH2_AMO_CACHE_ENTRIES 4 @@ -129,8 +139,8 @@ mspec_zero_block(unsigned long addr, int * mspec_open * * Called when a device mapping is created by a means other than mmap - * (via fork, etc.). 
Increments the reference count on the underlying - * mspec data so it is not freed prematurely. + * (via fork, munmap, etc.). Increments the reference count on the + * underlying mspec data so it is not freed prematurely. */ static void mspec_open(struct vm_area_struct *vma) @@ -151,34 +161,44 @@ static void mspec_close(struct vm_area_struct *vma) { struct vma_data *vdata; - int i, pages, result, vdata_size; + int index, last_index, result; + unsigned long my_page; vdata = vma-vm_private_data; - if (!atomic_dec_and_test(vdata-refcnt)) - return; - pages = (vma-vm_end - vma-vm_start) PAGE_SHIFT; - vdata_size = sizeof(struct vma_data) + pages * sizeof(long); - for (i = 0; i pages; i++) { - if (vdata-maddr[i] == 0) + BUG_ON(vma-vm_start vdata-vm_start || vma-vm_end vdata-vm_end); + + spin_lock(vdata-lock); + index = (vma-vm_start - vdata-vm_start) PAGE_SHIFT; + last_index = (vma-vm_end - vdata
[PATCH][REVISED] mspec: handle shrinking virtual memory areas
Version 2: refcount maintained as atomic_t (as before the version 1 patch) (Diffed against 2.6.23-rc5, not "2.6.13-rc5" !) The shrinking of a virtual memory area that is mmap(2)'d to a memory special file (device drivers/char/mspec.c) can cause a panic. If the mapped size of the vma (vm_area_struct) is very large, mspec allocates a large vma_data structure with vmalloc(). But such a vma can be shrunk by an munmap(2). The current driver uses the current size of each vma to deduce whether its vma_data structure was allocated by kmalloc() or vmalloc(). So if the vma was shrunk it appears to have been allocated by kmalloc(), and mspec attempts to free it with kfree(). This results in a panic. This patch avoids the panic (by preserving the type of the allocation) and also makes mspec work correctly as the vma is split into pieces by the munmap(2)'s. All vma's derived from such a split vma share the same vma_data structure that represents all the pages mapped into this set of vma's. The mpec driver must be made capable of using the right portion of the structure for each member vma. In other words, it must index into the array of page addresses using the portion of the array that represents the current vma. This is enabled by storing the vma group's vm_start in the vma_data structure. The shared vma_data's are not protected by mm->mmap_sem in the fork() case so the reference count is left as atomic_t. Diffed against 2.6.23-rc5 Signed-off-by: Cliff Wickman <[EMAIL PROTECTED]> Acked-by: Jes Sorensen <[EMAIL PROTECTED]> - --- drivers/char/mspec.c | 61 ++- 1 file changed, 41 insertions(+), 20 deletions(-) Index: mspec_community/drivers/char/mspec.c === --- mspec_community.orig/drivers/char/mspec.c +++ mspec_community/drivers/char/mspec.c @@ -67,7 +67,7 @@ /* * Page types allocated by the device. 
*/ -enum { +enum mspec_page_type { MSPEC_FETCHOP = 1, MSPEC_CACHED, MSPEC_UNCACHED @@ -83,15 +83,25 @@ static int is_sn2; * One of these structures is allocated when an mspec region is mmaped. The * structure is pointed to by the vma->vm_private_data field in the vma struct. * This structure is used to record the addresses of the mspec pages. + * This structure is shared by all vma's that are split off from the + * original vma when split_vma()'s are done. + * + * The refcnt is incremented atomically because mm->mmap_sem does not + * protect in fork case where multiple tasks share the vma_data. */ struct vma_data { atomic_t refcnt;/* Number of vmas sharing the data. */ spinlock_t lock;/* Serialize access to the vma. */ int count; /* Number of pages allocated. */ - int type; /* Type of pages allocated. */ + enum mspec_page_type type; /* Type of pages allocated. */ + int flags; /* See VMD_xxx below. */ + unsigned long vm_start; /* Original (unsplit) base. */ + unsigned long vm_end; /* Original (unsplit) end. */ unsigned long maddr[0]; /* Array of MSPEC addresses. */ }; +#define VMD_VMALLOCED 0x1 /* vmalloc'd rather than kmalloc'd */ + /* used on shub2 to clear FOP cache in the HUB */ static unsigned long scratch_page[MAX_NUMNODES]; #define SH2_AMO_CACHE_ENTRIES 4 @@ -129,8 +139,8 @@ mspec_zero_block(unsigned long addr, int * mspec_open * * Called when a device mapping is created by a means other than mmap - * (via fork, etc.). Increments the reference count on the underlying - * mspec data so it is not freed prematurely. + * (via fork, munmap, etc.). Increments the reference count on the + * underlying mspec data so it is not freed prematurely. 
*/ static void mspec_open(struct vm_area_struct *vma) @@ -151,34 +161,38 @@ static void mspec_close(struct vm_area_struct *vma) { struct vma_data *vdata; - int i, pages, result, vdata_size; + int index, last_index, result; vdata = vma->vm_private_data; - if (!atomic_dec_and_test(>refcnt)) - return; - pages = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; - vdata_size = sizeof(struct vma_data) + pages * sizeof(long); - for (i = 0; i < pages; i++) { - if (vdata->maddr[i] == 0) + BUG_ON(vma->vm_start < vdata->vm_start || vma->vm_end > vdata->vm_end); + + index = (vma->vm_start - vdata->vm_start) >> PAGE_SHIFT; + last_index = (vma->vm_end - vdata->vm_start) >> PAGE_SHIFT; + for (; index < last_index; index++) { + if (vdata->maddr[index] == 0) continue; /* * Clear the page before sticking it back * into the pool. */ - result = mspec_zero_bloc
[PATCH][REVISED] mspec: handle shrinking virtual memory areas
Version 2: refcount maintained as atomic_t (as before the version 1 patch) The shrinking of a virtual memory area that is mmap(2)'d to a memory special file (device drivers/char/mspec.c) can cause a panic. If the mapped size of the vma (vm_area_struct) is very large, mspec allocates a large vma_data structure with vmalloc(). But such a vma can be shrunk by an munmap(2). The current driver uses the current size of each vma to deduce whether its vma_data structure was allocated by kmalloc() or vmalloc(). So if the vma was shrunk it appears to have been allocated by kmalloc(), and mspec attempts to free it with kfree(). This results in a panic. This patch avoids the panic (by preserving the type of the allocation) and also makes mspec work correctly as the vma is split into pieces by the munmap(2)'s. All vma's derived from such a split vma share the same vma_data structure that represents all the pages mapped into this set of vma's. The mpec driver must be made capable of using the right portion of the structure for each member vma. In other words, it must index into the array of page addresses using the portion of the array that represents the current vma. This is enabled by storing the vma group's vm_start in the vma_data structure. The shared vma_data's are not protected by mm->mmap_sem in the fork() case so the reference count is left as atomic_t. Diffed against 2.6.13-rc5 Signed-off-by: Cliff Wickman <[EMAIL PROTECTED]> Acked-by: Jes Sorensen <[EMAIL PROTECTED]> - --- drivers/char/mspec.c | 61 ++- 1 file changed, 41 insertions(+), 20 deletions(-) Index: mspec_community/drivers/char/mspec.c === --- mspec_community.orig/drivers/char/mspec.c +++ mspec_community/drivers/char/mspec.c @@ -67,7 +67,7 @@ /* * Page types allocated by the device. */ -enum { +enum mspec_page_type { MSPEC_FETCHOP = 1, MSPEC_CACHED, MSPEC_UNCACHED @@ -83,15 +83,25 @@ static int is_sn2; * One of these structures is allocated when an mspec region is mmaped. 
The * structure is pointed to by the vma->vm_private_data field in the vma struct. * This structure is used to record the addresses of the mspec pages. + * This structure is shared by all vma's that are split off from the + * original vma when split_vma()'s are done. + * + * The refcnt is incremented atomically because mm->mmap_sem does not + * protect in fork case where multiple tasks share the vma_data. */ struct vma_data { atomic_t refcnt;/* Number of vmas sharing the data. */ spinlock_t lock;/* Serialize access to the vma. */ int count; /* Number of pages allocated. */ - int type; /* Type of pages allocated. */ + enum mspec_page_type type; /* Type of pages allocated. */ + int flags; /* See VMD_xxx below. */ + unsigned long vm_start; /* Original (unsplit) base. */ + unsigned long vm_end; /* Original (unsplit) end. */ unsigned long maddr[0]; /* Array of MSPEC addresses. */ }; +#define VMD_VMALLOCED 0x1 /* vmalloc'd rather than kmalloc'd */ + /* used on shub2 to clear FOP cache in the HUB */ static unsigned long scratch_page[MAX_NUMNODES]; #define SH2_AMO_CACHE_ENTRIES 4 @@ -129,8 +139,8 @@ mspec_zero_block(unsigned long addr, int * mspec_open * * Called when a device mapping is created by a means other than mmap - * (via fork, etc.). Increments the reference count on the underlying - * mspec data so it is not freed prematurely. + * (via fork, munmap, etc.). Increments the reference count on the + * underlying mspec data so it is not freed prematurely. 
*/ static void mspec_open(struct vm_area_struct *vma) @@ -151,34 +161,38 @@ static void mspec_close(struct vm_area_struct *vma) { struct vma_data *vdata; - int i, pages, result, vdata_size; + int index, last_index, result; vdata = vma->vm_private_data; - if (!atomic_dec_and_test(>refcnt)) - return; - pages = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; - vdata_size = sizeof(struct vma_data) + pages * sizeof(long); - for (i = 0; i < pages; i++) { - if (vdata->maddr[i] == 0) + BUG_ON(vma->vm_start < vdata->vm_start || vma->vm_end > vdata->vm_end); + + index = (vma->vm_start - vdata->vm_start) >> PAGE_SHIFT; + last_index = (vma->vm_end - vdata->vm_start) >> PAGE_SHIFT; + for (; index < last_index; index++) { + if (vdata->maddr[index] == 0) continue; /* * Clear the page before sticking it back * into the pool. */ - result = mspec_zero_block(vdata->maddr[i], PAGE_SIZE); +
[PATCH][REVISED] mspec: handle shrinking virtual memory areas
Version 2: refcount maintained as atomic_t (as before the version 1 patch) The shrinking of a virtual memory area that is mmap(2)'d to a memory special file (device drivers/char/mspec.c) can cause a panic. If the mapped size of the vma (vm_area_struct) is very large, mspec allocates a large vma_data structure with vmalloc(). But such a vma can be shrunk by an munmap(2). The current driver uses the current size of each vma to deduce whether its vma_data structure was allocated by kmalloc() or vmalloc(). So if the vma was shrunk it appears to have been allocated by kmalloc(), and mspec attempts to free it with kfree(). This results in a panic. This patch avoids the panic (by preserving the type of the allocation) and also makes mspec work correctly as the vma is split into pieces by the munmap(2)'s. All vma's derived from such a split vma share the same vma_data structure that represents all the pages mapped into this set of vma's. The mpec driver must be made capable of using the right portion of the structure for each member vma. In other words, it must index into the array of page addresses using the portion of the array that represents the current vma. This is enabled by storing the vma group's vm_start in the vma_data structure. The shared vma_data's are not protected by mm-mmap_sem in the fork() case so the reference count is left as atomic_t. Diffed against 2.6.13-rc5 Signed-off-by: Cliff Wickman [EMAIL PROTECTED] Acked-by: Jes Sorensen [EMAIL PROTECTED] - --- drivers/char/mspec.c | 61 ++- 1 file changed, 41 insertions(+), 20 deletions(-) Index: mspec_community/drivers/char/mspec.c === --- mspec_community.orig/drivers/char/mspec.c +++ mspec_community/drivers/char/mspec.c @@ -67,7 +67,7 @@ /* * Page types allocated by the device. */ -enum { +enum mspec_page_type { MSPEC_FETCHOP = 1, MSPEC_CACHED, MSPEC_UNCACHED @@ -83,15 +83,25 @@ static int is_sn2; * One of these structures is allocated when an mspec region is mmaped. 
The * structure is pointed to by the vma-vm_private_data field in the vma struct. * This structure is used to record the addresses of the mspec pages. + * This structure is shared by all vma's that are split off from the + * original vma when split_vma()'s are done. + * + * The refcnt is incremented atomically because mm-mmap_sem does not + * protect in fork case where multiple tasks share the vma_data. */ struct vma_data { atomic_t refcnt;/* Number of vmas sharing the data. */ spinlock_t lock;/* Serialize access to the vma. */ int count; /* Number of pages allocated. */ - int type; /* Type of pages allocated. */ + enum mspec_page_type type; /* Type of pages allocated. */ + int flags; /* See VMD_xxx below. */ + unsigned long vm_start; /* Original (unsplit) base. */ + unsigned long vm_end; /* Original (unsplit) end. */ unsigned long maddr[0]; /* Array of MSPEC addresses. */ }; +#define VMD_VMALLOCED 0x1 /* vmalloc'd rather than kmalloc'd */ + /* used on shub2 to clear FOP cache in the HUB */ static unsigned long scratch_page[MAX_NUMNODES]; #define SH2_AMO_CACHE_ENTRIES 4 @@ -129,8 +139,8 @@ mspec_zero_block(unsigned long addr, int * mspec_open * * Called when a device mapping is created by a means other than mmap - * (via fork, etc.). Increments the reference count on the underlying - * mspec data so it is not freed prematurely. + * (via fork, munmap, etc.). Increments the reference count on the + * underlying mspec data so it is not freed prematurely. 
*/ static void mspec_open(struct vm_area_struct *vma) @@ -151,34 +161,38 @@ static void mspec_close(struct vm_area_struct *vma) { struct vma_data *vdata; - int i, pages, result, vdata_size; + int index, last_index, result; vdata = vma-vm_private_data; - if (!atomic_dec_and_test(vdata-refcnt)) - return; - pages = (vma-vm_end - vma-vm_start) PAGE_SHIFT; - vdata_size = sizeof(struct vma_data) + pages * sizeof(long); - for (i = 0; i pages; i++) { - if (vdata-maddr[i] == 0) + BUG_ON(vma-vm_start vdata-vm_start || vma-vm_end vdata-vm_end); + + index = (vma-vm_start - vdata-vm_start) PAGE_SHIFT; + last_index = (vma-vm_end - vdata-vm_start) PAGE_SHIFT; + for (; index last_index; index++) { + if (vdata-maddr[index] == 0) continue; /* * Clear the page before sticking it back * into the pool. */ - result = mspec_zero_block(vdata-maddr[i], PAGE_SIZE); + result = mspec_zero_block(vdata-maddr[index], PAGE_SIZE); if (!result) - uncached_free_page(vdata-maddr[i
[PATCH][REVISED] mspec: handle shrinking virtual memory areas
Version 2: refcount maintained as atomic_t (as before the version 1 patch) (Diffed against 2.6.23-rc5, not 2.6.13-rc5 !) The shrinking of a virtual memory area that is mmap(2)'d to a memory special file (device drivers/char/mspec.c) can cause a panic. If the mapped size of the vma (vm_area_struct) is very large, mspec allocates a large vma_data structure with vmalloc(). But such a vma can be shrunk by an munmap(2). The current driver uses the current size of each vma to deduce whether its vma_data structure was allocated by kmalloc() or vmalloc(). So if the vma was shrunk it appears to have been allocated by kmalloc(), and mspec attempts to free it with kfree(). This results in a panic. This patch avoids the panic (by preserving the type of the allocation) and also makes mspec work correctly as the vma is split into pieces by the munmap(2)'s. All vma's derived from such a split vma share the same vma_data structure that represents all the pages mapped into this set of vma's. The mpec driver must be made capable of using the right portion of the structure for each member vma. In other words, it must index into the array of page addresses using the portion of the array that represents the current vma. This is enabled by storing the vma group's vm_start in the vma_data structure. The shared vma_data's are not protected by mm-mmap_sem in the fork() case so the reference count is left as atomic_t. Diffed against 2.6.23-rc5 Signed-off-by: Cliff Wickman [EMAIL PROTECTED] Acked-by: Jes Sorensen [EMAIL PROTECTED] - --- drivers/char/mspec.c | 61 ++- 1 file changed, 41 insertions(+), 20 deletions(-) Index: mspec_community/drivers/char/mspec.c === --- mspec_community.orig/drivers/char/mspec.c +++ mspec_community/drivers/char/mspec.c @@ -67,7 +67,7 @@ /* * Page types allocated by the device. 
*/ -enum { +enum mspec_page_type { MSPEC_FETCHOP = 1, MSPEC_CACHED, MSPEC_UNCACHED @@ -83,15 +83,25 @@ static int is_sn2; * One of these structures is allocated when an mspec region is mmaped. The * structure is pointed to by the vma-vm_private_data field in the vma struct. * This structure is used to record the addresses of the mspec pages. + * This structure is shared by all vma's that are split off from the + * original vma when split_vma()'s are done. + * + * The refcnt is incremented atomically because mm-mmap_sem does not + * protect in fork case where multiple tasks share the vma_data. */ struct vma_data { atomic_t refcnt;/* Number of vmas sharing the data. */ spinlock_t lock;/* Serialize access to the vma. */ int count; /* Number of pages allocated. */ - int type; /* Type of pages allocated. */ + enum mspec_page_type type; /* Type of pages allocated. */ + int flags; /* See VMD_xxx below. */ + unsigned long vm_start; /* Original (unsplit) base. */ + unsigned long vm_end; /* Original (unsplit) end. */ unsigned long maddr[0]; /* Array of MSPEC addresses. */ }; +#define VMD_VMALLOCED 0x1 /* vmalloc'd rather than kmalloc'd */ + /* used on shub2 to clear FOP cache in the HUB */ static unsigned long scratch_page[MAX_NUMNODES]; #define SH2_AMO_CACHE_ENTRIES 4 @@ -129,8 +139,8 @@ mspec_zero_block(unsigned long addr, int * mspec_open * * Called when a device mapping is created by a means other than mmap - * (via fork, etc.). Increments the reference count on the underlying - * mspec data so it is not freed prematurely. + * (via fork, munmap, etc.). Increments the reference count on the + * underlying mspec data so it is not freed prematurely. 
*/ static void mspec_open(struct vm_area_struct *vma) @@ -151,34 +161,38 @@ static void mspec_close(struct vm_area_struct *vma) { struct vma_data *vdata; - int i, pages, result, vdata_size; + int index, last_index, result; vdata = vma-vm_private_data; - if (!atomic_dec_and_test(vdata-refcnt)) - return; - pages = (vma-vm_end - vma-vm_start) PAGE_SHIFT; - vdata_size = sizeof(struct vma_data) + pages * sizeof(long); - for (i = 0; i pages; i++) { - if (vdata-maddr[i] == 0) + BUG_ON(vma-vm_start vdata-vm_start || vma-vm_end vdata-vm_end); + + index = (vma-vm_start - vdata-vm_start) PAGE_SHIFT; + last_index = (vma-vm_end - vdata-vm_start) PAGE_SHIFT; + for (; index last_index; index++) { + if (vdata-maddr[index] == 0) continue; /* * Clear the page before sticking it back * into the pool. */ - result = mspec_zero_block(vdata-maddr[i], PAGE_SIZE); + result = mspec_zero_block(vdata-maddr[index], PAGE_SIZE); if (!result
[PATCH 1] mspec: handle shrinking virtual memory areas
The shrinking of a virtual memory area that is mmap(2)'d to a memory special file (device drivers/char/mspec.c) can cause a panic. If the mapped size of the vma (vm_area_struct) is very large, mspec allocates a large vma_data structure with vmalloc(). But such a vma can be shrunk by an munmap(2). The current driver uses the current size of each vma to deduce whether its vma_data structure was allocated by kmalloc() or vmalloc(). So if the vma was shrunk it appears to have been allocated by kmalloc(), and mspec attempts to free it with kfree(). This results in a panic. This patch avoids the panic (by preserving the type of the allocation) and also makes mspec work correctly as the vma is split into pieces by the munmap(2)'s. All vma's derived from such a split vma share the same vma_data structure that represents all the pages mapped into this set of vma's. The mspec driver must be made capable of using the right portion of the structure for each member vma. In other words, it must index into the array of page addresses using the portion of the array that represents the current vma. This is enabled by storing the vma group's vm_start in the vma_data structure. The vma's are protected by mm->mmap_sem, so the reference count was changed from an atomic_t to an int. Diffed against 2.6.13-rc5 Signed-off-by: Cliff Wickman <[EMAIL PROTECTED]> Acked-by: Jes Sorensen <[EMAIL PROTECTED]> - --- drivers/char/mspec.c | 68 +-- 1 file changed, 45 insertions(+), 23 deletions(-) Index: mspec_community/drivers/char/mspec.c === --- mspec_community.orig/drivers/char/mspec.c +++ mspec_community/drivers/char/mspec.c @@ -67,7 +67,7 @@ /* * Page types allocated by the device. */ -enum { +enum mspec_page_type { MSPEC_FETCHOP = 1, MSPEC_CACHED, MSPEC_UNCACHED @@ -83,15 +83,26 @@ static int is_sn2; * One of these structures is allocated when an mspec region is mmaped. The * structure is pointed to by the vma->vm_private_data field in the vma struct. 
* This structure is used to record the addresses of the mspec pages. + * This structure is shared by all vma's that are split off from the + * original vma when split_vma()'s are done. + * + * The refcnt is incremented non-atomically because all paths leading + * to mspec_open() and mspec_close() are single threaded by the exclusive + * locking of mm->mmap_sem. */ struct vma_data { - atomic_t refcnt;/* Number of vmas sharing the data. */ + int refcnt; /* Number of vmas sharing the data. */ spinlock_t lock;/* Serialize access to the vma. */ int count; /* Number of pages allocated. */ - int type; /* Type of pages allocated. */ + enum mspec_page_type type; /* Type of pages allocated. */ + int flags; /* See VMD_xxx below. */ + unsigned long vm_start; /* Original (unsplit) base. */ + unsigned long vm_end; /* Original (unsplit) end. */ unsigned long maddr[0]; /* Array of MSPEC addresses. */ }; +#define VMD_VMALLOCED 0x1 /* vmalloc'd rather than kmalloc'd */ + /* used on shub2 to clear FOP cache in the HUB */ static unsigned long scratch_page[MAX_NUMNODES]; #define SH2_AMO_CACHE_ENTRIES 4 @@ -129,8 +140,8 @@ mspec_zero_block(unsigned long addr, int * mspec_open * * Called when a device mapping is created by a means other than mmap - * (via fork, etc.). Increments the reference count on the underlying - * mspec data so it is not freed prematurely. + * (via fork, munmap, etc.). Increments the reference count on the + * underlying mspec data so it is not freed prematurely. 
*/ static void mspec_open(struct vm_area_struct *vma) @@ -138,7 +149,7 @@ mspec_open(struct vm_area_struct *vma) struct vma_data *vdata; vdata = vma->vm_private_data; - atomic_inc(>refcnt); + vdata->refcnt++; } /* @@ -151,34 +162,38 @@ static void mspec_close(struct vm_area_struct *vma) { struct vma_data *vdata; - int i, pages, result, vdata_size; + int index, last_index, result; vdata = vma->vm_private_data; - if (!atomic_dec_and_test(>refcnt)) - return; - pages = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; - vdata_size = sizeof(struct vma_data) + pages * sizeof(long); - for (i = 0; i < pages; i++) { - if (vdata->maddr[i] == 0) + BUG_ON(vma->vm_start < vdata->vm_start || vma->vm_end > vdata->vm_end); + + index = (vma->vm_start - vdata->vm_start) >> PAGE_SHIFT; + last_index = (vma->vm_end - vdata->vm_start) >> PAGE_SHIFT; + for (; index < last_index; index++) { + if (vdata->maddr[index] == 0) continue;
[PATCH 1] mspec: handle shrinking virtual memory areas
The shrinking of a virtual memory area that is mmap(2)'d to a memory special file (device drivers/char/mspec.c) can cause a panic. If the mapped size of the vma (vm_area_struct) is very large, mspec allocates a large vma_data structure with vmalloc(). But such a vma can be shrunk by an munmap(2). The current driver uses the current size of each vma to deduce whether its vma_data structure was allocated by kmalloc() or vmalloc(). So if the vma was shrunk it appears to have been allocated by kmalloc(), and mspec attempts to free it with kfree(). This results in a panic. This patch avoids the panic (by preserving the type of the allocation) and also makes mspec work correctly as the vma is split into pieces by the munmap(2)'s. All vma's derived from such a split vma share the same vma_data structure that represents all the pages mapped into this set of vma's. The mspec driver must be made capable of using the right portion of the structure for each member vma. In other words, it must index into the array of page addresses using the portion of the array that represents the current vma. This is enabled by storing the vma group's vm_start in the vma_data structure. The vma's are protected by mm->mmap_sem, so the reference count was changed from an atomic_t to an int. Diffed against 2.6.13-rc5 Signed-off-by: Cliff Wickman [EMAIL PROTECTED] Acked-by: Jes Sorensen [EMAIL PROTECTED] - --- drivers/char/mspec.c | 68 +-- 1 file changed, 45 insertions(+), 23 deletions(-) Index: mspec_community/drivers/char/mspec.c === --- mspec_community.orig/drivers/char/mspec.c +++ mspec_community/drivers/char/mspec.c @@ -67,7 +67,7 @@ /* * Page types allocated by the device. */ -enum { +enum mspec_page_type { MSPEC_FETCHOP = 1, MSPEC_CACHED, MSPEC_UNCACHED @@ -83,15 +83,26 @@ static int is_sn2; * One of these structures is allocated when an mspec region is mmaped. The * structure is pointed to by the vma->vm_private_data field in the vma struct. 
* This structure is used to record the addresses of the mspec pages. + * This structure is shared by all vma's that are split off from the + * original vma when split_vma()'s are done. + * + * The refcnt is incremented non-atomically because all paths leading + * to mspec_open() and mspec_close() are single threaded by the exclusive + * locking of mm-mmap_sem. */ struct vma_data { - atomic_t refcnt;/* Number of vmas sharing the data. */ + int refcnt; /* Number of vmas sharing the data. */ spinlock_t lock;/* Serialize access to the vma. */ int count; /* Number of pages allocated. */ - int type; /* Type of pages allocated. */ + enum mspec_page_type type; /* Type of pages allocated. */ + int flags; /* See VMD_xxx below. */ + unsigned long vm_start; /* Original (unsplit) base. */ + unsigned long vm_end; /* Original (unsplit) end. */ unsigned long maddr[0]; /* Array of MSPEC addresses. */ }; +#define VMD_VMALLOCED 0x1 /* vmalloc'd rather than kmalloc'd */ + /* used on shub2 to clear FOP cache in the HUB */ static unsigned long scratch_page[MAX_NUMNODES]; #define SH2_AMO_CACHE_ENTRIES 4 @@ -129,8 +140,8 @@ mspec_zero_block(unsigned long addr, int * mspec_open * * Called when a device mapping is created by a means other than mmap - * (via fork, etc.). Increments the reference count on the underlying - * mspec data so it is not freed prematurely. + * (via fork, munmap, etc.). Increments the reference count on the + * underlying mspec data so it is not freed prematurely. 
*/ static void mspec_open(struct vm_area_struct *vma) @@ -138,7 +149,7 @@ mspec_open(struct vm_area_struct *vma) struct vma_data *vdata; vdata = vma-vm_private_data; - atomic_inc(vdata-refcnt); + vdata-refcnt++; } /* @@ -151,34 +162,38 @@ static void mspec_close(struct vm_area_struct *vma) { struct vma_data *vdata; - int i, pages, result, vdata_size; + int index, last_index, result; vdata = vma-vm_private_data; - if (!atomic_dec_and_test(vdata-refcnt)) - return; - pages = (vma-vm_end - vma-vm_start) PAGE_SHIFT; - vdata_size = sizeof(struct vma_data) + pages * sizeof(long); - for (i = 0; i pages; i++) { - if (vdata-maddr[i] == 0) + BUG_ON(vma-vm_start vdata-vm_start || vma-vm_end vdata-vm_end); + + index = (vma-vm_start - vdata-vm_start) PAGE_SHIFT; + last_index = (vma-vm_end - vdata-vm_start) PAGE_SHIFT; + for (; index last_index; index++) { + if (vdata-maddr[index] == 0) continue; /* * Clear the page before sticking it back * into the pool
[PATCH 1/1] hotplug cpu: documentation addition to downing a cpu
In answer to Andrew: > How do we communicate this new design/feature to our users? > Documentation/cpusets.txt, perhaps? Documentation/cpu-hotplug.txt? > git-log? ;) Patch "[PATCH 1/1] V4: hotplug cpu: migrate a task within its cpuset" may warrant an addition to the documentation. I would propose this note in cpu-hotplug.txt. Diffed against 2.6.23-rc3 Signed-off-by: Cliff Wickman <[EMAIL PROTECTED]> --- Documentation/cpu-hotplug.txt |4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) Index: linus.070821/Documentation/cpu-hotplug.txt === --- linus.070821.orig/Documentation/cpu-hotplug.txt +++ linus.070821/Documentation/cpu-hotplug.txt @@ -220,7 +220,9 @@ A: The following happen, listed in no pa CPU_DOWN_PREPARE or CPU_DOWN_PREPARE_FROZEN, depending on whether or not the CPU is being offlined while tasks are frozen due to a suspend operation in progress -- All process is migrated away from this outgoing CPU to a new CPU +- All processes are migrated away from this outgoing CPU to new CPUs. + The new CPU is chosen from each process' current cpuset, which may be + a subset of all online CPUs. - All interrupts targeted to this CPU is migrated to a new CPU - timers/bottom half/task lets are also migrated to a new CPU - Once all services are migrated, kernel calls an arch specific routine -- Cliff Wickman Silicon Graphics, Inc. [EMAIL PROTECTED] (651) 683-3824 - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 1/1] V4: hotplug cpu: migrate a task within its cpuset
Version 4 calls cpuset_cpus_allowed_locked() outside of the task_rq_lock to avoid the lock conflict that Oleg noticed. And it consolidates what would have been very similar "if (dest_cpu == NR_CPUS)" paths. Also Oleg's observation. Version 3 added a missing task_rq_lock()/task_rq_unlock() pair. (Oleg found) There was discussion about this patch among: Andrew Morton, Oleg Nesterov, Gautham Shenoy, Rusty Russell regarding other approaches: refusing to offline a cpu with tasks pinned to it, or providing an administrator the ability to assign such tasks to other cpus There is indeed an "assumption" in my patch that the cpuset containing a pinned task's cpu is a better choice than any online cpu. I think that is a reasonable assumption given the typical construction of a cpuset and the reason a task is running in a cpuset. And there will be coming cases, at least on some architectures, where a cpu is offlined as a kernel reaction to a hardware error. In that case would it not be preferable to re-pin such tasks and let them proceed? When a cpu is disabled, move_task_off_dead_cpu() is called for tasks that have been running on that cpu. Currently, such a task is migrated: 1) to any cpu on the same node as the disabled cpu, which is both online and among that task's cpus_allowed 2) to any cpu which is both online and among that task's cpus_allowed It is typical of a multithreaded application running on a large NUMA system to have its tasks confined to a cpuset so as to cluster them near the memory that they share. Furthermore, it is typical to explicitly place such a task on a specific cpu in that cpuset. And in that case the task's cpus_allowed includes only a single cpu. This patch would insert a preference to migrate such a task to some cpu within its cpuset (and set its cpus_allowed to its entire cpuset). 
With this patch, migrate the task to: 1) to any cpu on the same node as the disabled cpu, which is both online and among that task's cpus_allowed 2) to any online cpu within the task's cpuset 3) to any cpu which is both online and among that task's cpus_allowed In order to do this, move_task_off_dead_cpu() must make a call to cpuset_cpus_allowed_locked(), a new subset of cpuset_cpus_allowed(), that will not block. (name change - per Oleg's suggestion) Calls are made to cpuset_lock() and cpuset_unlock() in migration_call() to set the cpuset mutex during the whole migrate_live_tasks() and migrate_dead_tasks() procedure. This patch depends on 2 patches from Oleg Nesterov: [PATCH 1/2] do CPU_DEAD migrating under read_lock(tasklist) instead of write_lock_irq(tasklist) [PATCH 2/2] migration_call(CPU_DEAD): use spin_lock_irq() instead of task_rq_lock() Diffed against 2.6.23-rc3 Signed-off-by: Cliff Wickman <[EMAIL PROTECTED]> --- include/linux/cpuset.h |5 + kernel/cpuset.c| 15 ++- kernel/sched.c | 13 - 3 files changed, 31 insertions(+), 2 deletions(-) Index: linus.070821/kernel/sched.c === --- linus.070821.orig/kernel/sched.c +++ linus.070821/kernel/sched.c @@ -61,6 +61,7 @@ #include #include #include +#include #include @@ -5093,8 +5094,16 @@ restart: /* No more Mr. Nice Guy. */ if (dest_cpu == NR_CPUS) { + cpumask_t cpus_allowed = cpuset_cpus_allowed_locked(); + /* +* Try to stay on the same cpuset, where the current cpuset +* may be a subset of all cpus. +* The cpuset_cpus_allowed_locked() variant of +* cpuset_cpus_allowed() will not block +* It must be called within calls to cpuset_lock/cpuset_unlock. 
+*/ rq = task_rq_lock(p, ); - cpus_setall(p->cpus_allowed); + p->cpus_allowed = cpus_allowed; dest_cpu = any_online_cpu(p->cpus_allowed); task_rq_unlock(rq, ); @@ -5412,6 +5421,7 @@ migration_call(struct notifier_block *nf case CPU_DEAD: case CPU_DEAD_FROZEN: + cpuset_lock(); /* around calls to cpuset_cpus_allowed_lock() */ migrate_live_tasks(cpu); rq = cpu_rq(cpu); kthread_stop(rq->migration_thread); @@ -5425,6 +5435,7 @@ migration_call(struct notifier_block *nf rq->idle->sched_class = _sched_class; migrate_dead_tasks(cpu); spin_unlock_irq(>lock); + cpuset_unlock(); migrate_nr_uninterruptible(rq); BUG_ON(rq->nr_running != 0); Index: linus.070821/include/linux/cpuset.h === --- linus.070821.orig/include/linux/cpuset.h +++ linus.070821/include/linux/cpuset.h @@ -22,6 +22,7 @@ extern void cpuset_init_smp(void); extern void cpuset_for
[PATCH 1/1] V3: hotplug cpu: migrate a task within its cpuset
Version 3 adds a missing task_rq_lock()/task_rq_unlock() pair. (Oleg found) There was discussion about this patch among: Andrew Morton, Oleg Nesterov, Gautham Shenoy, Rusty Russell regarding other approaches: refusing to offline a cpu with tasks pinned to it, or providing an administrator the ability to assign such tasks to other cpus There is indeed an "assumption" in my patch that the cpuset containing a pinned task's cpu is a better choice than any online cpu. I think that is a reasonable assumption given the typical construction of a cpuset and the reason a task is running in a cpuset. And there will be coming cases, at least on some architectures, where a cpu is offlined as a kernel reaction to a hardware error. In that case would it not be preferrable to re-pin such tasks and let them proceed? When a cpu is disabled, move_task_off_dead_cpu() is called for tasks that have been running on that cpu. Currently, such a task is migrated: 1) to any cpu on the same node as the disabled cpu, which is both online and among that task's cpus_allowed 2) to any cpu which is both online and among that task's cpus_allowed It is typical of a multithreaded application running on a large NUMA system to have its tasks confined to a cpuset so as to cluster them near the memory that they share. Furthermore, it is typical to explicitly place such a task on a specific cpu in that cpuset. And in that case the task's cpus_allowed includes only a single cpu. This patch would insert a preference to migrate such a task to some cpu within its cpuset (and set its cpus_allowed to its entire cpuset). 
With this patch, migrate the task to: 1) to any cpu on the same node as the disabled cpu, which is both online and among that task's cpus_allowed 2) to any online cpu within the task's cpuset 3) to any cpu which is both online and among that task's cpus_allowed In order to do this, move_task_off_dead_cpu() must make a call to cpuset_cpus_allowed_locked(), a new subset of cpuset_cpus_allowed(), that will not block. (name change - per Oleg's suggestion) Calls are made to cpuset_lock() and cpuset_unlock() in migration_call() to set the cpuset mutex during the whole migrate_live_tasks() and migrate_dead_tasks() procedure. This patch depends on 2 patches from Oleg Nesterov: [PATCH 1/2] do CPU_DEAD migrating under read_lock(tasklist) instead of write_lock_irq(tasklist) [PATCH 2/2] migration_call(CPU_DEAD): use spin_lock_irq() instead of task_rq_lock() Diffed against 2.6.23-rc3 Signed-off-by: Cliff Wickman <[EMAIL PROTECTED]> --- include/linux/cpuset.h |5 + kernel/cpuset.c| 15 ++- kernel/sched.c | 16 3 files changed, 35 insertions(+), 1 deletion(-) Index: linus.070821/kernel/sched.c === --- linus.070821.orig/kernel/sched.c +++ linus.070821/kernel/sched.c @@ -61,6 +61,7 @@ #include #include #include +#include #include @@ -5091,6 +5092,19 @@ restart: if (dest_cpu == NR_CPUS) dest_cpu = any_online_cpu(p->cpus_allowed); + /* try to stay on the same cpuset */ + if (dest_cpu == NR_CPUS) { + rq = task_rq_lock(p, ); + /* +* The cpuset_cpus_allowed_locked() variant of +* cpuset_cpus_allowed() will not block +* It must be called within calls to cpuset_lock/cpuset_unlock. +*/ + p->cpus_allowed = cpuset_cpus_allowed_locked(p); + dest_cpu = any_online_cpu(p->cpus_allowed); + task_rq_unlock(rq, ); + } + /* No more Mr. Nice Guy. 
*/ if (dest_cpu == NR_CPUS) { rq = task_rq_lock(p, ); @@ -5412,6 +5426,7 @@ migration_call(struct notifier_block *nf case CPU_DEAD: case CPU_DEAD_FROZEN: + cpuset_lock(); /* around calls to cpuset_cpus_allowed_lock() */ migrate_live_tasks(cpu); rq = cpu_rq(cpu); kthread_stop(rq->migration_thread); @@ -5425,6 +5440,7 @@ migration_call(struct notifier_block *nf rq->idle->sched_class = _sched_class; migrate_dead_tasks(cpu); spin_unlock_irq(>lock); + cpuset_unlock(); migrate_nr_uninterruptible(rq); BUG_ON(rq->nr_running != 0); Index: linus.070821/include/linux/cpuset.h === --- linus.070821.orig/include/linux/cpuset.h +++ linus.070821/include/linux/cpuset.h @@ -22,6 +22,7 @@ extern void cpuset_init_smp(void); extern void cpuset_fork(struct task_struct *p); extern void cpuset_exit(struct task_struct *p); extern cpumask_t cpuset_cpus_allowed(struct task_struct *p); +extern cpumask_t cpuset_cpus_allowed_locked(struct task_struct *p); extern nodemask_t cpuset_mems_
[PATCH 1/1] V3: hotplug cpu: migrate a task within its cpuset
Version 3 adds a missing task_rq_lock()/task_rq_unlock() pair. (Oleg found) There was discussion about this patch among: Andrew Morton, Oleg Nesterov, Gautham Shenoy, Rusty Russell regarding other approaches: refusing to offline a cpu with tasks pinned to it, or providing an administrator the ability to assign such tasks to other cpus There is indeed an assumption in my patch that the cpuset containing a pinned task's cpu is a better choice than any online cpu. I think that is a reasonable assumption given the typical construction of a cpuset and the reason a task is running in a cpuset. And there will be coming cases, at least on some architectures, where a cpu is offlined as a kernel reaction to a hardware error. In that case would it not be preferrable to re-pin such tasks and let them proceed? When a cpu is disabled, move_task_off_dead_cpu() is called for tasks that have been running on that cpu. Currently, such a task is migrated: 1) to any cpu on the same node as the disabled cpu, which is both online and among that task's cpus_allowed 2) to any cpu which is both online and among that task's cpus_allowed It is typical of a multithreaded application running on a large NUMA system to have its tasks confined to a cpuset so as to cluster them near the memory that they share. Furthermore, it is typical to explicitly place such a task on a specific cpu in that cpuset. And in that case the task's cpus_allowed includes only a single cpu. This patch would insert a preference to migrate such a task to some cpu within its cpuset (and set its cpus_allowed to its entire cpuset). 
With this patch, migrate the task to: 1) to any cpu on the same node as the disabled cpu, which is both online and among that task's cpus_allowed 2) to any online cpu within the task's cpuset 3) to any cpu which is both online and among that task's cpus_allowed In order to do this, move_task_off_dead_cpu() must make a call to cpuset_cpus_allowed_locked(), a new subset of cpuset_cpus_allowed(), that will not block. (name change - per Oleg's suggestion) Calls are made to cpuset_lock() and cpuset_unlock() in migration_call() to set the cpuset mutex during the whole migrate_live_tasks() and migrate_dead_tasks() procedure. This patch depends on 2 patches from Oleg Nesterov: [PATCH 1/2] do CPU_DEAD migrating under read_lock(tasklist) instead of write_lock_irq(tasklist) [PATCH 2/2] migration_call(CPU_DEAD): use spin_lock_irq() instead of task_rq_lock() Diffed against 2.6.23-rc3 Signed-off-by: Cliff Wickman [EMAIL PROTECTED] --- include/linux/cpuset.h |5 + kernel/cpuset.c| 15 ++- kernel/sched.c | 16 3 files changed, 35 insertions(+), 1 deletion(-) Index: linus.070821/kernel/sched.c === --- linus.070821.orig/kernel/sched.c +++ linus.070821/kernel/sched.c @@ -61,6 +61,7 @@ #include linux/delayacct.h #include linux/reciprocal_div.h #include linux/unistd.h +#include linux/cpuset.h #include asm/tlb.h @@ -5091,6 +5092,19 @@ restart: if (dest_cpu == NR_CPUS) dest_cpu = any_online_cpu(p-cpus_allowed); + /* try to stay on the same cpuset */ + if (dest_cpu == NR_CPUS) { + rq = task_rq_lock(p, flags); + /* +* The cpuset_cpus_allowed_locked() variant of +* cpuset_cpus_allowed() will not block +* It must be called within calls to cpuset_lock/cpuset_unlock. +*/ + p-cpus_allowed = cpuset_cpus_allowed_locked(p); + dest_cpu = any_online_cpu(p-cpus_allowed); + task_rq_unlock(rq, flags); + } + /* No more Mr. Nice Guy. 
*/ if (dest_cpu == NR_CPUS) { rq = task_rq_lock(p, flags); @@ -5412,6 +5426,7 @@ migration_call(struct notifier_block *nf case CPU_DEAD: case CPU_DEAD_FROZEN: + cpuset_lock(); /* around calls to cpuset_cpus_allowed_lock() */ migrate_live_tasks(cpu); rq = cpu_rq(cpu); kthread_stop(rq-migration_thread); @@ -5425,6 +5440,7 @@ migration_call(struct notifier_block *nf rq-idle-sched_class = idle_sched_class; migrate_dead_tasks(cpu); spin_unlock_irq(rq-lock); + cpuset_unlock(); migrate_nr_uninterruptible(rq); BUG_ON(rq-nr_running != 0); Index: linus.070821/include/linux/cpuset.h === --- linus.070821.orig/include/linux/cpuset.h +++ linus.070821/include/linux/cpuset.h @@ -22,6 +22,7 @@ extern void cpuset_init_smp(void); extern void cpuset_fork(struct task_struct *p); extern void cpuset_exit(struct task_struct *p); extern cpumask_t cpuset_cpus_allowed(struct task_struct *p); +extern cpumask_t cpuset_cpus_allowed_locked(struct task_struct *p
[PATCH 1/1] V4: hotplug cpu: migrate a task within its cpuset
Version 4 calls cpuset_cpus_allowed_locked() outside of the task_rq_lock to avoid the lock conflict that Oleg noticed. And it consolidates what would have been very similar if (dest_cpu == NR_CPUS) paths. Also Oleg's observation. Version 3 added a missing task_rq_lock()/task_rq_unlock() pair. (Oleg found) There was discussion about this patch among: Andrew Morton, Oleg Nesterov, Gautham Shenoy, Rusty Russell regarding other approaches: refusing to offline a cpu with tasks pinned to it, or providing an administrator the ability to assign such tasks to other cpus There is indeed an assumption in my patch that the cpuset containing a pinned task's cpu is a better choice than any online cpu. I think that is a reasonable assumption given the typical construction of a cpuset and the reason a task is running in a cpuset. And there will be coming cases, at least on some architectures, where a cpu is offlined as a kernel reaction to a hardware error. In that case would it not be preferrable to re-pin such tasks and let them proceed? When a cpu is disabled, move_task_off_dead_cpu() is called for tasks that have been running on that cpu. Currently, such a task is migrated: 1) to any cpu on the same node as the disabled cpu, which is both online and among that task's cpus_allowed 2) to any cpu which is both online and among that task's cpus_allowed It is typical of a multithreaded application running on a large NUMA system to have its tasks confined to a cpuset so as to cluster them near the memory that they share. Furthermore, it is typical to explicitly place such a task on a specific cpu in that cpuset. And in that case the task's cpus_allowed includes only a single cpu. This patch would insert a preference to migrate such a task to some cpu within its cpuset (and set its cpus_allowed to its entire cpuset). 
With this patch, migrate the task to: 1) to any cpu on the same node as the disabled cpu, which is both online and among that task's cpus_allowed 2) to any online cpu within the task's cpuset 3) to any cpu which is both online and among that task's cpus_allowed In order to do this, move_task_off_dead_cpu() must make a call to cpuset_cpus_allowed_locked(), a new subset of cpuset_cpus_allowed(), that will not block. (name change - per Oleg's suggestion) Calls are made to cpuset_lock() and cpuset_unlock() in migration_call() to set the cpuset mutex during the whole migrate_live_tasks() and migrate_dead_tasks() procedure. This patch depends on 2 patches from Oleg Nesterov: [PATCH 1/2] do CPU_DEAD migrating under read_lock(tasklist) instead of write_lock_irq(tasklist) [PATCH 2/2] migration_call(CPU_DEAD): use spin_lock_irq() instead of task_rq_lock() Diffed against 2.6.23-rc3 Signed-off-by: Cliff Wickman [EMAIL PROTECTED] --- include/linux/cpuset.h |5 + kernel/cpuset.c| 15 ++- kernel/sched.c | 13 - 3 files changed, 31 insertions(+), 2 deletions(-) Index: linus.070821/kernel/sched.c === --- linus.070821.orig/kernel/sched.c +++ linus.070821/kernel/sched.c @@ -61,6 +61,7 @@ #include linux/delayacct.h #include linux/reciprocal_div.h #include linux/unistd.h +#include linux/cpuset.h #include asm/tlb.h @@ -5093,8 +5094,16 @@ restart: /* No more Mr. Nice Guy. */ if (dest_cpu == NR_CPUS) { + cpumask_t cpus_allowed = cpuset_cpus_allowed_locked(); + /* +* Try to stay on the same cpuset, where the current cpuset +* may be a subset of all cpus. +* The cpuset_cpus_allowed_locked() variant of +* cpuset_cpus_allowed() will not block +* It must be called within calls to cpuset_lock/cpuset_unlock. 
+*/ rq = task_rq_lock(p, flags); - cpus_setall(p-cpus_allowed); + p-cpus_allowed = cpus_allowed; dest_cpu = any_online_cpu(p-cpus_allowed); task_rq_unlock(rq, flags); @@ -5412,6 +5421,7 @@ migration_call(struct notifier_block *nf case CPU_DEAD: case CPU_DEAD_FROZEN: + cpuset_lock(); /* around calls to cpuset_cpus_allowed_lock() */ migrate_live_tasks(cpu); rq = cpu_rq(cpu); kthread_stop(rq-migration_thread); @@ -5425,6 +5435,7 @@ migration_call(struct notifier_block *nf rq-idle-sched_class = idle_sched_class; migrate_dead_tasks(cpu); spin_unlock_irq(rq-lock); + cpuset_unlock(); migrate_nr_uninterruptible(rq); BUG_ON(rq-nr_running != 0); Index: linus.070821/include/linux/cpuset.h === --- linus.070821.orig/include/linux/cpuset.h +++ linus.070821/include/linux/cpuset.h @@ -22,6 +22,7 @@ extern void cpuset_init_smp(void
[PATCH 1/1] hotplug cpu: documentation addition to downing a cpu
In answer to Andrew: How do we communicate this new design/feature to our users? Documentation/cpusets.txt, perhaps? Documentation/cpu-hotplug.txt? git-log? ;) Patch [PATCH 1/1] V4: hotplug cpu: migrate a task within its cpuset may warrant an addition to the documentation. I would propose this note in cpu-hotplug.txt. Diffed against 2.6.23-rc3 Signed-off-by: Cliff Wickman [EMAIL PROTECTED] --- Documentation/cpu-hotplug.txt |4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) Index: linus.070821/Documentation/cpu-hotplug.txt === --- linus.070821.orig/Documentation/cpu-hotplug.txt +++ linus.070821/Documentation/cpu-hotplug.txt @@ -220,7 +220,9 @@ A: The following happen, listed in no pa CPU_DOWN_PREPARE or CPU_DOWN_PREPARE_FROZEN, depending on whether or not the CPU is being offlined while tasks are frozen due to a suspend operation in progress -- All process is migrated away from this outgoing CPU to a new CPU +- All processes are migrated away from this outgoing CPU to new CPUs. + The new CPU is chosen from each process' current cpuset, which may be + a subset of all online CPUs. - All interrupts targeted to this CPU is migrated to a new CPU - timers/bottom half/task lets are also migrated to a new CPU - Once all services are migrated, kernel calls an arch specific routine -- Cliff Wickman Silicon Graphics, Inc. [EMAIL PROTECTED] (651) 683-3824 - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 1/1] hotplug cpu: migrate a task within its cpuset
When a cpu is disabled, move_task_off_dead_cpu() is called for tasks that have been running on that cpu. Currently, such a task is migrated: 1) to any cpu on the same node as the disabled cpu, which is both online and among that task's cpus_allowed 2) to any cpu which is both online and among that task's cpus_allowed It is typical of a multithreaded application running on a large NUMA system to have its tasks confined to a cpuset so as to cluster them near the memory that they share. Furthermore, it is typical to explicitly place such a task on a specific cpu in that cpuset. And in that case the task's cpus_allowed includes only a single cpu. This patch would insert a preference to migrate such a task to some cpu within its cpuset (and set its cpus_allowed to its entire cpuset). With this patch, migrate the task to: 1) to any cpu on the same node as the disabled cpu, which is both online and among that task's cpus_allowed 2) to any online cpu within the task's cpuset 3) to any cpu which is both online and among that task's cpus_allowed In order to do this, move_task_off_dead_cpu() must make a call to cpuset_cpus_allowed_lock(), a new variant of cpuset_cpus_allowed() that will not block. Calls are made to cpuset_lock() and cpuset_unlock() in migration_call() to set the cpuset mutex during the whole migrate_live_tasks() and migrate_dead_tasks() procedure. 
This patch depends on 2 patches from Oleg Nesterov: [PATCH 1/2] do CPU_DEAD migrating under read_lock(tasklist) instead of write_lock_irq(tasklist) [PATCH 2/2] migration_call(CPU_DEAD): use spin_lock_irq() instead of task_rq_lock() Diffed against 2.6.23-rc3 Signed-off-by: Cliff Wickman <[EMAIL PROTECTED]> --- include/linux/cpuset.h |5 + kernel/cpuset.c| 19 +++ kernel/sched.c | 14 ++ 3 files changed, 38 insertions(+) Index: linus.070821/kernel/sched.c === --- linus.070821.orig/kernel/sched.c +++ linus.070821/kernel/sched.c @@ -61,6 +61,7 @@ #include #include #include +#include #include @@ -5091,6 +5092,17 @@ restart: if (dest_cpu == NR_CPUS) dest_cpu = any_online_cpu(p->cpus_allowed); + /* try to stay on the same cpuset */ + if (dest_cpu == NR_CPUS) { + /* +* The cpuset_cpus_allowed_lock() variant of +* cpuset_cpus_allowed() will not block +* It must be called within calls to cpuset_lock/cpuset_unlock. +*/ + p->cpus_allowed = cpuset_cpus_allowed_lock(p); + dest_cpu = any_online_cpu(p->cpus_allowed); + } + /* No more Mr. Nice Guy. 
*/ if (dest_cpu == NR_CPUS) { rq = task_rq_lock(p, ); @@ -5412,6 +5424,7 @@ migration_call(struct notifier_block *nf case CPU_DEAD: case CPU_DEAD_FROZEN: + cpuset_lock(); /* around calls to cpuset_cpus_allowed_lock() */ migrate_live_tasks(cpu); rq = cpu_rq(cpu); kthread_stop(rq->migration_thread); @@ -5425,6 +5438,7 @@ migration_call(struct notifier_block *nf rq->idle->sched_class = _sched_class; migrate_dead_tasks(cpu); spin_unlock_irq(>lock); + cpuset_unlock(); migrate_nr_uninterruptible(rq); BUG_ON(rq->nr_running != 0); Index: linus.070821/include/linux/cpuset.h === --- linus.070821.orig/include/linux/cpuset.h +++ linus.070821/include/linux/cpuset.h @@ -22,6 +22,7 @@ extern void cpuset_init_smp(void); extern void cpuset_fork(struct task_struct *p); extern void cpuset_exit(struct task_struct *p); extern cpumask_t cpuset_cpus_allowed(struct task_struct *p); +extern cpumask_t cpuset_cpus_allowed_lock(struct task_struct *p); extern nodemask_t cpuset_mems_allowed(struct task_struct *p); #define cpuset_current_mems_allowed (current->mems_allowed) void cpuset_init_current_mems_allowed(void); @@ -87,6 +88,10 @@ static inline cpumask_t cpuset_cpus_allo { return cpu_possible_map; } +static inline cpumask_t cpuset_cpus_allowed_lock(struct task_struct *p) +{ + return cpu_possible_map; +} static inline nodemask_t cpuset_mems_allowed(struct task_struct *p) { Index: linus.070821/kernel/cpuset.c === --- linus.070821.orig/kernel/cpuset.c +++ linus.070821/kernel/cpuset.c @@ -2333,6 +2333,25 @@ cpumask_t cpuset_cpus_allowed(struct tas return mask; } +/** + * cpuset_cpus_allowed_lock - return cpus_allowed mask from a tasks cpuset. + * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed. + * + * Description: Same as cpuset_cpus_allowed, but called with callback_mutex + * already held. + **/ + +cpumask_t cpuset_cpus_
[PATCH 1/1] hotplug cpu: migrate a task within its cpuset
When a cpu is disabled, move_task_off_dead_cpu() is called for tasks that have been running on that cpu. Currently, such a task is migrated: 1) to any cpu on the same node as the disabled cpu, which is both online and among that task's cpus_allowed 2) to any cpu which is both online and among that task's cpus_allowed It is typical of a multithreaded application running on a large NUMA system to have its tasks confined to a cpuset so as to cluster them near the memory that they share. Furthermore, it is typical to explicitly place such a task on a specific cpu in that cpuset. And in that case the task's cpus_allowed includes only a single cpu. This patch would insert a preference to migrate such a task to some cpu within its cpuset (and set its cpus_allowed to its entire cpuset). With this patch, migrate the task to: 1) to any cpu on the same node as the disabled cpu, which is both online and among that task's cpus_allowed 2) to any online cpu within the task's cpuset 3) to any cpu which is both online and among that task's cpus_allowed In order to do this, move_task_off_dead_cpu() must make a call to cpuset_cpus_allowed_lock(), a new variant of cpuset_cpus_allowed() that will not block. Calls are made to cpuset_lock() and cpuset_unlock() in migration_call() to set the cpuset mutex during the whole migrate_live_tasks() and migrate_dead_tasks() procedure. 
This patch depends on 2 patches from Oleg Nesterov: [PATCH 1/2] do CPU_DEAD migrating under read_lock(tasklist) instead of write_lock_irq(tasklist) [PATCH 2/2] migration_call(CPU_DEAD): use spin_lock_irq() instead of task_rq_lock() Diffed against 2.6.23-rc3 Signed-off-by: Cliff Wickman [EMAIL PROTECTED] --- include/linux/cpuset.h |5 + kernel/cpuset.c| 19 +++ kernel/sched.c | 14 ++ 3 files changed, 38 insertions(+) Index: linus.070821/kernel/sched.c === --- linus.070821.orig/kernel/sched.c +++ linus.070821/kernel/sched.c @@ -61,6 +61,7 @@ #include linux/delayacct.h #include linux/reciprocal_div.h #include linux/unistd.h +#include linux/cpuset.h #include asm/tlb.h @@ -5091,6 +5092,17 @@ restart: if (dest_cpu == NR_CPUS) dest_cpu = any_online_cpu(p-cpus_allowed); + /* try to stay on the same cpuset */ + if (dest_cpu == NR_CPUS) { + /* +* The cpuset_cpus_allowed_lock() variant of +* cpuset_cpus_allowed() will not block +* It must be called within calls to cpuset_lock/cpuset_unlock. +*/ + p-cpus_allowed = cpuset_cpus_allowed_lock(p); + dest_cpu = any_online_cpu(p-cpus_allowed); + } + /* No more Mr. Nice Guy. 
*/ if (dest_cpu == NR_CPUS) { rq = task_rq_lock(p, flags); @@ -5412,6 +5424,7 @@ migration_call(struct notifier_block *nf case CPU_DEAD: case CPU_DEAD_FROZEN: + cpuset_lock(); /* around calls to cpuset_cpus_allowed_lock() */ migrate_live_tasks(cpu); rq = cpu_rq(cpu); kthread_stop(rq-migration_thread); @@ -5425,6 +5438,7 @@ migration_call(struct notifier_block *nf rq-idle-sched_class = idle_sched_class; migrate_dead_tasks(cpu); spin_unlock_irq(rq-lock); + cpuset_unlock(); migrate_nr_uninterruptible(rq); BUG_ON(rq-nr_running != 0); Index: linus.070821/include/linux/cpuset.h === --- linus.070821.orig/include/linux/cpuset.h +++ linus.070821/include/linux/cpuset.h @@ -22,6 +22,7 @@ extern void cpuset_init_smp(void); extern void cpuset_fork(struct task_struct *p); extern void cpuset_exit(struct task_struct *p); extern cpumask_t cpuset_cpus_allowed(struct task_struct *p); +extern cpumask_t cpuset_cpus_allowed_lock(struct task_struct *p); extern nodemask_t cpuset_mems_allowed(struct task_struct *p); #define cpuset_current_mems_allowed (current-mems_allowed) void cpuset_init_current_mems_allowed(void); @@ -87,6 +88,10 @@ static inline cpumask_t cpuset_cpus_allo { return cpu_possible_map; } +static inline cpumask_t cpuset_cpus_allowed_lock(struct task_struct *p) +{ + return cpu_possible_map; +} static inline nodemask_t cpuset_mems_allowed(struct task_struct *p) { Index: linus.070821/kernel/cpuset.c === --- linus.070821.orig/kernel/cpuset.c +++ linus.070821/kernel/cpuset.c @@ -2333,6 +2333,25 @@ cpumask_t cpuset_cpus_allowed(struct tas return mask; } +/** + * cpuset_cpus_allowed_lock - return cpus_allowed mask from a tasks cpuset. + * @tsk: pointer to task_struct from which to obtain cpuset-cpus_allowed. + * + * Description: Same as cpuset_cpus_allowed, but called with callback_mutex
ATA scsi driver misbehavior under kdump capture kernel
I've run into a problem with the ATA SCSI disk driver when running in a kdump dump-capture kernel. I'm running on 2-processor x86_64 box. It has 2 scsi disks, /dev/sda and /dev/sdb My kernel is 2.6.22, and built to be a dump capturing kernel loaded by kexec. When I boot this kernel by itself, it finds both sda and sdb. But when it is loaded by kexec and booted on a panic it only finds sda. Any ideas from those familiar with the ATA driver? -Cliff Wickman SGI I put some printk's into it and get this: Standalone: [nv_adma_error_handler] cpw: ata_host_register probe port 1 (error_handler:81348625) cpw: ata_host_register call ata_port_probe cpw: ata_host_register call ata_port_schedule cpw: ata_host_register call ata_port_wait_eh cpw: ata_port_wait_eh entered cpw: ata_port_wait_eh, preparing to wait ata2: SATA link up 1.5 Gbps (SStatus 113 SControl 300) cpw: ata_dev_configure entered cpw: ata_dev_configure testing class cpw: ata_dev_configure class is ATA_DEV_ATA ata2.00: ATA-6: ST3200822AS, 3.01, max UDMA/133 ata2.00: 390721968 sectors, multi 16: LBA48 cpw: ata_dev_configure exiting cpw: ata_dev_configure entered cpw: ata_dev_configure testing class cpw: ata_dev_configure class is ATA_DEV_ATA cpw: ata_dev_configure exiting cpw: ata_dev_set_mode printing: ata2.00: configured for UDMA/133 cpw: ata_port_wait_eh, finished wait cpw: ata_port_wait_eh exiting cpw: ata_host_register done with probe port 1 When loaded with kexec and booted on a panic: cpw: ata_host_register probe port 1 (error_handler:81348625) cpw: ata_host_register call ata_port_probe cpw: ata_host_register call ata_port_schedule cpw: ata_host_register call ata_port_wait_eh cpw: ata_port_wait_eh entered cpw: ata_port_wait_eh, preparing to wait ata2: SATA link up 1.5 Gbps (SStatus 113 SControl 300) cpw: ata_port_wait_eh, finished wait cpw: ata_port_wait_eh exiting cpw: ata_host_register done with probe port 1 - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to 
majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
ATA scsi driver misbehavior under kdump capture kernel
I've run into a problem with the ATA SCSI disk driver when running in a kdump dump-capture kernel. I'm running on 2-processor x86_64 box. It has 2 scsi disks, /dev/sda and /dev/sdb My kernel is 2.6.22, and built to be a dump capturing kernel loaded by kexec. When I boot this kernel by itself, it finds both sda and sdb. But when it is loaded by kexec and booted on a panic it only finds sda. Any ideas from those familiar with the ATA driver? -Cliff Wickman SGI I put some printk's into it and get this: Standalone: [nv_adma_error_handler] cpw: ata_host_register probe port 1 (error_handler:81348625) cpw: ata_host_register call ata_port_probe cpw: ata_host_register call ata_port_schedule cpw: ata_host_register call ata_port_wait_eh cpw: ata_port_wait_eh entered cpw: ata_port_wait_eh, preparing to wait ata2: SATA link up 1.5 Gbps (SStatus 113 SControl 300) cpw: ata_dev_configure entered cpw: ata_dev_configure testing class cpw: ata_dev_configure class is ATA_DEV_ATA ata2.00: ATA-6: ST3200822AS, 3.01, max UDMA/133 ata2.00: 390721968 sectors, multi 16: LBA48 cpw: ata_dev_configure exiting cpw: ata_dev_configure entered cpw: ata_dev_configure testing class cpw: ata_dev_configure class is ATA_DEV_ATA cpw: ata_dev_configure exiting cpw: ata_dev_set_mode printing: ata2.00: configured for UDMA/133 cpw: ata_port_wait_eh, finished wait cpw: ata_port_wait_eh exiting cpw: ata_host_register done with probe port 1 When loaded with kexec and booted on a panic: cpw: ata_host_register probe port 1 (error_handler:81348625) cpw: ata_host_register call ata_port_probe cpw: ata_host_register call ata_port_schedule cpw: ata_host_register call ata_port_wait_eh cpw: ata_port_wait_eh entered cpw: ata_port_wait_eh, preparing to wait ata2: SATA link up 1.5 Gbps (SStatus 113 SControl 300) cpw: ata_port_wait_eh, finished wait cpw: ata_port_wait_eh exiting cpw: ata_host_register done with probe port 1 - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to 
majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 1/1] hotplug cpu: migrate a task within its cpuset
On Thu, May 24, 2007 at 01:29:02AM +0400, Oleg Nesterov wrote: > Cliff Wickman wrote: > > > > In order to do this, move_task_off_dead_cpu() must make a call to > > cpuset_cpus_allowed(), which may block. > > > > move_task_off_dead_cpu() has been within a critical region when called > > from migrate_live_tasks(). So this patch also changes migrate_live_tasks() > > to enable interrupts before calling move_task_off_dead_cpu(). > > Since the tasklist_lock is dropped, the list scan must be restarted from > > the top. > > > > [... snip ...] > > > > - * NOTE: interrupts should be disabled by the caller > > + * NOTE: interrupts are not disabled by the caller > > */ > > static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) > > { > > @@ -5008,6 +5008,17 @@ restart: > > if (dest_cpu == NR_CPUS) > > dest_cpu = any_online_cpu(p->cpus_allowed); > > > > + /* try to stay on the same cpuset */ > > + if (dest_cpu == NR_CPUS) { > > + /* > > +* Call to cpuset_cpus_allowed may sleep, so we depend > > +* on move_task_off_dead_cpu() being called in a non-critical > > +* region. > > +*/ > > + p->cpus_allowed = cpuset_cpus_allowed(p); > > + dest_cpu = any_online_cpu(p->cpus_allowed); > > + } > > I know nothing about cpuset.c, a _very_ naive question. Paul Jackson is the cpuset guru. > Do we really need task_lock() (used by cpuset_cpus_allowed) here ? According to Paul's comment in kernel/cpuset.c * It is ok to first take manage_sem, then nest callback_sem. We also * require taking task_lock() when dereferencing a tasks cpuset pointer. So I'm afraid it is not safe to call guarantee_online_cpus(tsk->cpuset, ); without it. Could the task not be exiting? > If not, probably we can make this simpler. CPU_DEAD takes cpuset_lock(), > move_task_off_dead_cpu() uses guarantee_online_cpus() which doesn't sleep, > so we don't need other changes. > > Possible? > > If not, this patch should also change migrate_dead(), it still calls > move_task_off_dead_cpu() with irqs disabled, no? 
Right, the lock is released but I indeed didn't reenable irqs. How would you suggest doing that? The irq state was saved in local variable "flags" back in migration_call(). > > Oleg. -- Cliff Wickman Silicon Graphics, Inc. [EMAIL PROTECTED] (651) 683-3824 - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 1/1] hotplug cpu: migrate a task within its cpuset
On Thu, May 24, 2007 at 01:29:02AM +0400, Oleg Nesterov wrote: Cliff Wickman wrote: In order to do this, move_task_off_dead_cpu() must make a call to cpuset_cpus_allowed(), which may block. move_task_off_dead_cpu() has been within a critical region when called from migrate_live_tasks(). So this patch also changes migrate_live_tasks() to enable interrupts before calling move_task_off_dead_cpu(). Since the tasklist_lock is dropped, the list scan must be restarted from the top. [... snip ...] - * NOTE: interrupts should be disabled by the caller + * NOTE: interrupts are not disabled by the caller */ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) { @@ -5008,6 +5008,17 @@ restart: if (dest_cpu == NR_CPUS) dest_cpu = any_online_cpu(p-cpus_allowed); + /* try to stay on the same cpuset */ + if (dest_cpu == NR_CPUS) { + /* +* Call to cpuset_cpus_allowed may sleep, so we depend +* on move_task_off_dead_cpu() being called in a non-critical +* region. +*/ + p-cpus_allowed = cpuset_cpus_allowed(p); + dest_cpu = any_online_cpu(p-cpus_allowed); + } I know nothing about cpuset.c, a _very_ naive question. Paul Jackson is the cpuset guru. Do we really need task_lock() (used by cpuset_cpus_allowed) here ? According to Paul's comment in kernel/cpuset.c * It is ok to first take manage_sem, then nest callback_sem. We also * require taking task_lock() when dereferencing a tasks cpuset pointer. So I'm afraid it is not safe to call guarantee_online_cpus(tsk-cpuset, mask); without it. Could the task not be exiting? If not, probably we can make this simpler. CPU_DEAD takes cpuset_lock(), move_task_off_dead_cpu() uses guarantee_online_cpus() which doesn't sleep, so we don't need other changes. Possible? If not, this patch should also change migrate_dead(), it still calls move_task_off_dead_cpu() with irqs disabled, no? Right, the lock is released but I indeed didn't reenable irqs. How would you suggest doing that? 
The irq state was saved in local variable "flags" back in migration_call(). Oleg. -- Cliff Wickman Silicon Graphics, Inc. cpw@sgi.com (651) 683-3824 - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 1/1] hotplug cpu: migrate a task within its cpuset
(this is a third submission -- corrects a locking/blocking issue pointed out by Nathan Lynch) When a cpu is disabled, move_task_off_dead_cpu() is called for tasks that have been running on that cpu. Currently, such a task is migrated: 1) to any cpu on the same node as the disabled cpu, which is both online and among that task's cpus_allowed 2) to any cpu which is both online and among that task's cpus_allowed It is typical of a multithreaded application running on a large NUMA system to have its tasks confined to a cpuset so as to cluster them near the memory that they share. Furthermore, it is typical to explicitly place such a task on a specific cpu in that cpuset. And in that case the task's cpus_allowed includes only a single cpu. This patch inserts a preference to migrate such a task to some cpu within its cpuset (and set its cpus_allowed to its entire cpuset). With this patch, migrate the task to: 1) to any cpu on the same node as the disabled cpu, which is both online and among that task's cpus_allowed 2) to any online cpu within the task's cpuset 3) to any cpu which is both online and among that task's cpus_allowed In order to do this, move_task_off_dead_cpu() must make a call to cpuset_cpus_allowed(), which may block. move_task_off_dead_cpu() has been within a critical region when called from migrate_live_tasks(). So this patch also changes migrate_live_tasks() to enable interrupts before calling move_task_off_dead_cpu(). Since the tasklist_lock is dropped, the list scan must be restarted from the top. It locks the migrating task by bumping its usage count. It disables interrupts in move_task_off_dead_cpu() before the call to __migrate_task(). 
This is the outline of the locking surrounding calls to move_task_off_dead_cpu(), after applying this patch: migration_call() | case CPU_DEAD | migrate_live_tasks(cpu) | | recheck: | | write_lock_irq(_lock) | | do_each_thread(t, p) { | | if (task_cpu(p) == src_cpu) | | get_task_struct(p) | | write_unlock_irq(_lock) | | move_task_off_dead_cpu(src_cpu, p) <<<< noncritical | | put_task_struct(p); | | goto recheck | | } while_each_thread(t, p) | | write_unlock_irq(_lock) | | rq = task_rq_lock(rq->idle, ) | | migrate_dead_tasks(cpu) | | for (arr = 0; arr < 2; arr++) { | | for (i = 0; i < MAX_PRIO; i++) { | | while (!list_empty(list)) | | migrate_dead(dead_cpu | | get_task_struct(p) | | spin_unlock_irq(>lock) | | move_task_off_dead_cpu(dead_cpu, p)<<<< noncritcal | | spin_lock_irq(>lock) | | put_task_struct(p) | | task_rq_unlock(rq, ) [Side note: a task may be migrated off of its cpuset, but is still attached to that cpuset (by pointer and reference count). The cpuset will not be released. This patch does not change that.] Diffed against 2.6.21 Signed-off-by: Cliff Wickman <[EMAIL PROTECTED]> kernel/sched.c | 31 --- 1 file changed, 28 insertions(+), 3 deletions(-) Index: linus.070504/kernel/sched.c === --- linus.070504.orig/kernel/sched.c +++ linus.070504/kernel/sched.c @@ -4989,7 +4989,7 @@ wait_to_die: #ifdef CONFIG_HOTPLUG_CPU /* * Figure out where task on dead CPU should go, use force if neccessary. - * NOTE: interrupts should be disabled by the caller + * NOTE: interrupts are not disabled by the caller */ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) { @@ -5008,6 +5008,17 @@ restart: if (dest_cpu == NR_CPUS) dest_cpu = any_online_cpu(p->cpus_allowed); + /* try to stay on the same cpuset */ + if (dest_cpu == NR_CPUS) { + /* +* Call to cpuset_cpus_allowed may sleep, so we depend +* on move_task_off_dead_cpu() being called in a non-critical +* region. 
+*/ + p->cpus_allowed = cpuset_cpus_allowed(p); + dest_cpu = any_online_cpu(p->cpus_allowed); + } + /* No more Mr. Nice Guy. */ if (dest_cpu == NR_CPUS) { rq = task_rq_lock(p, ); @@ -5025,8 +5036,16 @@ restart: "longer affine to cpu%d\n", p->pid, p->comm, dead_cpu); } - if (!__migrate_task(p, dead_cpu, dest_cpu)) + /* +* __migrate_task() requires interrupts to be disabled +*/ + local_irq_disable(); + if (!__migrate_task(p, dead_cpu, dest_cpu)) { + local_irq_enable(); goto restart; + } + local_irq_enable(); + return; } /* @@ -5054,14 +5073,20 @@ static void migrate_live_tasks(int src_c {
[PATCH 1/1] hotplug cpu: move tasks in empty cpusets to parent
This patch corrects a situation that occurs when one disables all the cpus in a cpuset. At that point, any tasks in that cpuset were incorrectly moved. (Disabling all cpus in a cpuset caused it to inherit the cpus of its parent, which may overlap its exclusive sibling.) Such tasks should be moved to the parent of their current cpuset. Or if the parent cpuset has no cpus, to its parent, etc. And the empty cpuset should be removed (if it is flagged notify_on_release). This patch uses a workqueue thread to call the function that deletes the cpuset. That way we avoid the complexity of the cpuset locks. (I've been working with Paul Jackson on this patch, and there is still a little functional subtlety to work out. Can be tweaked later.) Diffed against 2.6.21 Signed-off-by: Cliff Wickman <[EMAIL PROTECTED]> --- kernel/cpuset.c | 221 1 file changed, 191 insertions(+), 30 deletions(-) Index: linus.070504/kernel/cpuset.c === --- linus.070504.orig/kernel/cpuset.c +++ linus.070504/kernel/cpuset.c @@ -54,6 +54,7 @@ #include #include #include +#include #define CPUSET_SUPER_MAGIC 0x27e0eb @@ -111,6 +112,7 @@ typedef enum { CS_NOTIFY_ON_RELEASE, CS_SPREAD_PAGE, CS_SPREAD_SLAB, + CS_RELEASED_RESOURCE, } cpuset_flagbits_t; /* convenient tests for these bits */ @@ -149,6 +151,11 @@ static inline int is_spread_slab(const s return test_bit(CS_SPREAD_SLAB, >flags); } +static inline int has_released_a_resource(const struct cpuset *cs) +{ + return test_bit(CS_RELEASED_RESOURCE, >flags); +} + /* * Increment this integer everytime any cpuset changes its * mems_allowed value. 
Users of cpusets can track this generation @@ -543,7 +550,7 @@ static void cpuset_release_agent(const c static void check_for_release(struct cpuset *cs, char **ppathbuf) { if (notify_on_release(cs) && atomic_read(>count) == 0 && - list_empty(>children)) { + list_empty(>children)) { char *buf; buf = kmalloc(PAGE_SIZE, GFP_KERNEL); @@ -835,7 +842,8 @@ update_cpu_domains_tree(struct cpuset *r while (__kfifo_get(queue, (unsigned char *), sizeof(cp))) { list_for_each_entry(child, >children, sibling) - __kfifo_put(queue,(unsigned char *),sizeof(child)); + __kfifo_put(queue, (unsigned char *), + sizeof(child)); update_cpu_domains(cp); } @@ -1101,7 +1109,7 @@ static int update_flag(cpuset_flagbits_t mutex_unlock(_mutex); if (cpu_exclusive_changed) -update_cpu_domains_tree(cs); + update_cpu_domains_tree(cs); return 0; } @@ -1279,6 +1287,7 @@ static int attach_task(struct cpuset *cs from = oldcs->mems_allowed; to = cs->mems_allowed; + set_bit(CS_RELEASED_RESOURCE, >flags); mutex_unlock(_mutex); @@ -1361,6 +1370,10 @@ static ssize_t cpuset_common_file_write( retval = update_flag(CS_MEM_EXCLUSIVE, cs, buffer); break; case FILE_NOTIFY_ON_RELEASE: + /* Even if the cpuset had been emptied in the past + it must not be considered for release until it has + become non-empty again. */ + clear_bit(CS_RELEASED_RESOURCE, >flags); retval = update_flag(CS_NOTIFY_ON_RELEASE, cs, buffer); break; case FILE_MEMORY_MIGRATE: @@ -2014,6 +2027,7 @@ static int cpuset_rmdir(struct inode *un cpuset_d_remove_dir(d); dput(d); number_of_cpusets--; + set_bit(CS_RELEASED_RESOURCE, >flags); mutex_unlock(_mutex); if (list_empty(>children)) check_for_release(parent, ); @@ -2081,50 +2095,188 @@ out: } /* + * Move every task that is a member of cpuset "from" to cpuset "to". 
+ * + * Called with both manage_sem and callback_sem held + */ +static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to) +{ + int moved=0; + struct task_struct *g, *tsk; + + read_lock(_lock); + do_each_thread(g, tsk) { + if (tsk->cpuset == from) { + moved++; + task_lock(tsk); + tsk->cpuset = to; + task_unlock(tsk); + } + } while_each_thread(g, tsk); + read_unlock(_lock); + atomic_add(moved, >count); + atomic_set(>count, 0); +} + +/* * If common_cpu_mem_hotplug_unplug(), below, unplugs any CPUs * or memory nodes, we need to walk over the cpuset hierarchy, * removing that CPU or node from all cpuset
[PATCH 1/1] hotplug cpu: cpusets/sched_domain reconciliation
This patch reconciles cpusets and sched_domains that get out of sync due to hotplug disabling and re-enabling of cpu's. Here is an example of how the problem can occur: system of cpu's 0-31 create cpuset /x 16-31 create cpuset /x/y 16-23 all cpu_exclusive disable cpu 17 x is now16,18-31 x/y is now 16,18-23 enable cpu 17 x and x/y are unchanged to restore the cpusets: echo 16-31 > /dev/cpuset/x echo 16-23 > /dev/cpuset/x/y At the first echo, update_cpu_domains() is called for cpuset x/. The system is partitioned between: its parent, the root cpuset of 0-31, minus its children (x/ is 16-31): 0-15 and x/ (16-31), minus its children (x/y/ 16,18-23): 17,24-31 The sched_domain's for parent 0-15 are updated. The sched_domain's for current 17,24-31 are updated. But 16 has been untouched. As a result, 17's SD points to sched_group_phys[17] which is the only sched_group_phys on 17's list. It points to itself. But 16's SD points to sched_group_phys[16], which still points to sched_group_phys[17]. When cpu 16 executes find_busiest_group() it will hang on the non- circular sched_group list. This solution is to update the sched_domain's for the cpuset whose cpu's were changed and, in addition, all its children. Instead of calling update_cpu_domains(), call update_cpu_domains_tree(), which calls update_cpu_domains() for every node from the one specified down to all its children. The extra sched_domain reconstruction is overhead, but only at the frequency of administrative change to the cpuset. There seems to be no administrative procedural work-around. In the example above one could not reverse the two echo's and set x/y before x/. It is not logical, so not allowed (Permission denied). Thus the patch to cpuset.c makes the sched_domain's correct. This patch also includes checks in find_busiest_group() and find_idlest_group() that break from their loops on a sched_group that points to itself. 
This is needed because cpu's are going through load balancing before all sched_domains have been reconstructed (see the example above). Thus the patch to sched.c prevents the hangs that would otherwise occur until the sched_domain's are made correct. Diffed against 2.6.21 Signed-off-by: Cliff Wickman <[EMAIL PROTECTED]> --- kernel/cpuset.c | 43 +++ kernel/sched.c | 18 ++ 2 files changed, 53 insertions(+), 8 deletions(-) Index: linus.070504/kernel/sched.c === --- linus.070504.orig/kernel/sched.c +++ linus.070504/kernel/sched.c @@ -1211,11 +1211,14 @@ static inline unsigned long cpu_avg_load static struct sched_group * find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) { - struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups; + struct sched_group *idlest = NULL, *this = sd->groups, *group = sd->groups; + struct sched_group *self, *prev; unsigned long min_load = ULONG_MAX, this_load = 0; int load_idx = sd->forkexec_idx; int imbalance = 100 + (sd->imbalance_pct-100)/2; + prev = group; + self = group; do { unsigned long load, avg_load; int local_group; @@ -1251,8 +1254,10 @@ find_idlest_group(struct sched_domain *s idlest = group; } nextgroup: + prev = self; + self = group; group = group->next; - } while (group != sd->groups); + } while (group != sd->groups && group != self && group != prev); if (!idlest || 100*this_load < imbalance*min_load) return NULL; @@ -2276,7 +2281,8 @@ find_busiest_group(struct sched_domain * unsigned long *imbalance, enum idle_type idle, int *sd_idle, cpumask_t *cpus, int *balance) { - struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; + struct sched_group *busiest = NULL, *this = sd->groups, *group = sd->groups; + struct sched_group *self, *prev; unsigned long max_load, avg_load, total_load, this_load, total_pwr; unsigned long max_pull; unsigned long busiest_load_per_task, busiest_nr_running; @@ -2299,6 +2305,8 @@ find_busiest_group(struct sched_domain * else load_idx = 
sd->idle_idx; + prev = group; + self = group; do { unsigned long load, group_capacity; int local_group; @@ -2427,8 +2435,10 @@ find_busiest_group(struct sched_domain * } group_next: #endif + prev = self; + self = group; group = group->next; - } while (group != sd->groups
hotplug cpu: PATCHes for 3 issues
In the 2.6.21 kernel there are still 3 hotplug issues that are cpuset- related, and that I find to still be problems. And for which I offer patches. These have been submitted before, and subsequently cleaned up per comments received. I'm resubmitting all 3 for consideration and further comment. 1) [PATCH 1/1] hotplug cpu: cpusets/sched_domain reconciliation 2) [PATCH 1/1] hotplug cpu: move tasks in empty cpusets to parent 3) [PATCH 1/1] hotplug cpu: migrate a task within its cpuset 1) Reconciles cpusets and sched_domains that get out of sync due to hotplug disabling and re-enabling of cpu's. Tasks can get into infinite hangs without this fix. kernel/cpuset.c kernel/sched.c 2) When a cpuset is emptied by disabling its cpus, move tasks to a parent cpuset. This is a correction of the current procedure, which moves such tasks to the wrong cpuset. kernel/cpuset.c 3) Causes a task running on a disabled cpu to migrate to a cpu within its cpuset. This behavior is particularly important for a NUMA system on which tasks have been explicitly placed. kernel/sched.c - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
hotplug cpu: PATCHes for 3 issues
In the 2.6.21 kernel there are still 3 hotplug issues that are cpuset- related, and that I find to still be problems. And for which I offer patches. These have been submitted before, and subsequently cleaned up per comments received. I'm resubmitting all 3 for consideration and further comment. 1) [PATCH 1/1] hotplug cpu: cpusets/sched_domain reconciliation 2) [PATCH 1/1] hotplug cpu: move tasks in empty cpusets to parent 3) [PATCH 1/1] hotplug cpu: migrate a task within its cpuset 1) Reconciles cpusets and sched_domains that get out of sync due to hotplug disabling and re-enabling of cpu's. Tasks can get into infinite hangs without this fix. kernel/cpuset.c kernel/sched.c 2) When a cpuset is emptied by disabling its cpus, move tasks to a parent cpuset. This is a correction of the current procedure, which moves such tasks to the wrong cpuset. kernel/cpuset.c 3) Causes a task running on a disabled cpu to migrate to a cpu within its cpuset. This behavior is particularly important for a NUMA system on which tasks have been explicitly placed. kernel/sched.c - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 1/1] hotplug cpu: cpusets/sched_domain reconciliation
This patch reconciles cpusets and sched_domains that get out of sync due to hotplug disabling and re-enabling of cpu's. Here is an example of how the problem can occur: system of cpu's 0-31 create cpuset /x 16-31 create cpuset /x/y 16-23 all cpu_exclusive disable cpu 17 x is now16,18-31 x/y is now 16,18-23 enable cpu 17 x and x/y are unchanged to restore the cpusets: echo 16-31 /dev/cpuset/x echo 16-23 /dev/cpuset/x/y At the first echo, update_cpu_domains() is called for cpuset x/. The system is partitioned between: its parent, the root cpuset of 0-31, minus its children (x/ is 16-31): 0-15 and x/ (16-31), minus its children (x/y/ 16,18-23): 17,24-31 The sched_domain's for parent 0-15 are updated. The sched_domain's for current 17,24-31 are updated. But 16 has been untouched. As a result, 17's SD points to sched_group_phys[17] which is the only sched_group_phys on 17's list. It points to itself. But 16's SD points to sched_group_phys[16], which still points to sched_group_phys[17]. When cpu 16 executes find_busiest_group() it will hang on the non- circular sched_group list. This solution is to update the sched_domain's for the cpuset whose cpu's were changed and, in addition, all its children. Instead of calling update_cpu_domains(), call update_cpu_domains_tree(), which calls update_cpu_domains() for every node from the one specified down to all its children. The extra sched_domain reconstruction is overhead, but only at the frequency of administrative change to the cpuset. There seems to be no administrative procedural work-around. In the example above one could not reverse the two echo's and set x/y before x/. It is not logical, so not allowed (Permission denied). Thus the patch to cpuset.c makes the sched_domain's correct. This patch also includes checks in find_busiest_group() and find_idlest_group() that break from their loops on a sched_group that points to itself. 
This is needed because cpu's are going through load balancing before all sched_domains have been reconstructed (see the example above). Thus the patch to sched.c prevents the hangs that would otherwise occur until the sched_domain's are made correct. Diffed against 2.6.21 Signed-off-by: Cliff Wickman [EMAIL PROTECTED] --- kernel/cpuset.c | 43 +++ kernel/sched.c | 18 ++ 2 files changed, 53 insertions(+), 8 deletions(-) Index: linus.070504/kernel/sched.c === --- linus.070504.orig/kernel/sched.c +++ linus.070504/kernel/sched.c @@ -1211,11 +1211,14 @@ static inline unsigned long cpu_avg_load static struct sched_group * find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) { - struct sched_group *idlest = NULL, *this = NULL, *group = sd-groups; + struct sched_group *idlest = NULL, *this = sd-groups, *group = sd-groups; + struct sched_group *self, *prev; unsigned long min_load = ULONG_MAX, this_load = 0; int load_idx = sd-forkexec_idx; int imbalance = 100 + (sd-imbalance_pct-100)/2; + prev = group; + self = group; do { unsigned long load, avg_load; int local_group; @@ -1251,8 +1254,10 @@ find_idlest_group(struct sched_domain *s idlest = group; } nextgroup: + prev = self; + self = group; group = group-next; - } while (group != sd-groups); + } while (group != sd-groups group != self group != prev); if (!idlest || 100*this_load imbalance*min_load) return NULL; @@ -2276,7 +2281,8 @@ find_busiest_group(struct sched_domain * unsigned long *imbalance, enum idle_type idle, int *sd_idle, cpumask_t *cpus, int *balance) { - struct sched_group *busiest = NULL, *this = NULL, *group = sd-groups; + struct sched_group *busiest = NULL, *this = sd-groups, *group = sd-groups; + struct sched_group *self, *prev; unsigned long max_load, avg_load, total_load, this_load, total_pwr; unsigned long max_pull; unsigned long busiest_load_per_task, busiest_nr_running; @@ -2299,6 +2305,8 @@ find_busiest_group(struct sched_domain * else load_idx = sd-idle_idx; + prev = group; + 
self = group; do { unsigned long load, group_capacity; int local_group; @@ -2427,8 +2435,10 @@ find_busiest_group(struct sched_domain * } group_next: #endif + prev = self; + self = group; group = group-next; - } while (group != sd-groups); + } while (group != sd-groups group != self group != prev); if (!busiest
[PATCH 1/1] hotplug cpu: move tasks in empty cpusets to parent
This patch corrects a situation that occurs when one disables all the cpus in a cpuset. At that point, any tasks in that cpuset were incorrectly moved. (Disabling all cpus in a cpuset caused it to inherit the cpus of its parent, which may overlap its exclusive sibling.) Such tasks should be moved to the parent of their current cpuset. Or if the parent cpuset has no cpus, to its parent, etc. And the empty cpuset should be removed (if it is flagged notify_on_release). This patch uses a workqueue thread to call the function that deletes the cpuset. That way we avoid the complexity of the cpuset locks. (I've been working with Paul Jackson on this patch, and there is still a little functional subtlety to work out. Can be tweaked later.) Diffed against 2.6.21 Signed-off-by: Cliff Wickman [EMAIL PROTECTED] --- kernel/cpuset.c | 221 1 file changed, 191 insertions(+), 30 deletions(-) Index: linus.070504/kernel/cpuset.c === --- linus.070504.orig/kernel/cpuset.c +++ linus.070504/kernel/cpuset.c @@ -54,6 +54,7 @@ #include asm/atomic.h #include linux/mutex.h #include linux/kfifo.h +#include linux/workqueue.h #define CPUSET_SUPER_MAGIC 0x27e0eb @@ -111,6 +112,7 @@ typedef enum { CS_NOTIFY_ON_RELEASE, CS_SPREAD_PAGE, CS_SPREAD_SLAB, + CS_RELEASED_RESOURCE, } cpuset_flagbits_t; /* convenient tests for these bits */ @@ -149,6 +151,11 @@ static inline int is_spread_slab(const s return test_bit(CS_SPREAD_SLAB, cs-flags); } +static inline int has_released_a_resource(const struct cpuset *cs) +{ + return test_bit(CS_RELEASED_RESOURCE, cs-flags); +} + /* * Increment this integer everytime any cpuset changes its * mems_allowed value. 
Users of cpusets can track this generation @@ -543,7 +550,7 @@ static void cpuset_release_agent(const c static void check_for_release(struct cpuset *cs, char **ppathbuf) { if (notify_on_release(cs) atomic_read(cs-count) == 0 - list_empty(cs-children)) { + list_empty(cs-children)) { char *buf; buf = kmalloc(PAGE_SIZE, GFP_KERNEL); @@ -835,7 +842,8 @@ update_cpu_domains_tree(struct cpuset *r while (__kfifo_get(queue, (unsigned char *)cp, sizeof(cp))) { list_for_each_entry(child, cp-children, sibling) - __kfifo_put(queue,(unsigned char *)child,sizeof(child)); + __kfifo_put(queue, (unsigned char *)child, + sizeof(child)); update_cpu_domains(cp); } @@ -1101,7 +1109,7 @@ static int update_flag(cpuset_flagbits_t mutex_unlock(callback_mutex); if (cpu_exclusive_changed) -update_cpu_domains_tree(cs); + update_cpu_domains_tree(cs); return 0; } @@ -1279,6 +1287,7 @@ static int attach_task(struct cpuset *cs from = oldcs-mems_allowed; to = cs-mems_allowed; + set_bit(CS_RELEASED_RESOURCE, oldcs-flags); mutex_unlock(callback_mutex); @@ -1361,6 +1370,10 @@ static ssize_t cpuset_common_file_write( retval = update_flag(CS_MEM_EXCLUSIVE, cs, buffer); break; case FILE_NOTIFY_ON_RELEASE: + /* Even if the cpuset had been emptied in the past + it must not be considered for release until it has + become non-empty again. */ + clear_bit(CS_RELEASED_RESOURCE, cs-flags); retval = update_flag(CS_NOTIFY_ON_RELEASE, cs, buffer); break; case FILE_MEMORY_MIGRATE: @@ -2014,6 +2027,7 @@ static int cpuset_rmdir(struct inode *un cpuset_d_remove_dir(d); dput(d); number_of_cpusets--; + set_bit(CS_RELEASED_RESOURCE, parent-flags); mutex_unlock(callback_mutex); if (list_empty(parent-children)) check_for_release(parent, pathbuf); @@ -2081,50 +2095,188 @@ out: } /* + * Move every task that is a member of cpuset from to cpuset to. 
+ * + * Called with both manage_sem and callback_sem held + */ +static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to) +{ + int moved=0; + struct task_struct *g, *tsk; + + read_lock(tasklist_lock); + do_each_thread(g, tsk) { + if (tsk-cpuset == from) { + moved++; + task_lock(tsk); + tsk-cpuset = to; + task_unlock(tsk); + } + } while_each_thread(g, tsk); + read_unlock(tasklist_lock); + atomic_add(moved, to-count); + atomic_set(from-count, 0); +} + +/* * If common_cpu_mem_hotplug_unplug(), below, unplugs any CPUs * or memory nodes, we need to walk over the cpuset hierarchy, * removing
[PATCH 1/1] hotplug cpu: migrate a task within its cpuset
(this is a third submission -- corrects a locking/blocking issue pointed out by Nathan Lynch) When a cpu is disabled, move_task_off_dead_cpu() is called for tasks that have been running on that cpu. Currently, such a task is migrated: 1) to any cpu on the same node as the disabled cpu, which is both online and among that task's cpus_allowed 2) to any cpu which is both online and among that task's cpus_allowed It is typical of a multithreaded application running on a large NUMA system to have its tasks confined to a cpuset so as to cluster them near the memory that they share. Furthermore, it is typical to explicitly place such a task on a specific cpu in that cpuset. And in that case the task's cpus_allowed includes only a single cpu. This patch inserts a preference to migrate such a task to some cpu within its cpuset (and set its cpus_allowed to its entire cpuset). With this patch, migrate the task to: 1) to any cpu on the same node as the disabled cpu, which is both online and among that task's cpus_allowed 2) to any online cpu within the task's cpuset 3) to any cpu which is both online and among that task's cpus_allowed In order to do this, move_task_off_dead_cpu() must make a call to cpuset_cpus_allowed(), which may block. move_task_off_dead_cpu() has been within a critical region when called from migrate_live_tasks(). So this patch also changes migrate_live_tasks() to enable interrupts before calling move_task_off_dead_cpu(). Since the tasklist_lock is dropped, the list scan must be restarted from the top. It locks the migrating task by bumping its usage count. It disables interrupts in move_task_off_dead_cpu() before the call to __migrate_task(). 
This is the outline of the locking surrounding calls to move_task_off_dead_cpu(), after applying this patch: migration_call() | case CPU_DEAD | migrate_live_tasks(cpu) | | recheck: | | write_lock_irq(tasklist_lock) | | do_each_thread(t, p) { | | if (task_cpu(p) == src_cpu) | | get_task_struct(p) | | write_unlock_irq(tasklist_lock) | | move_task_off_dead_cpu(src_cpu, p) noncritical | | put_task_struct(p); | | goto recheck | | } while_each_thread(t, p) | | write_unlock_irq(tasklist_lock) | | rq = task_rq_lock(rq-idle, flags) | | migrate_dead_tasks(cpu) | | for (arr = 0; arr 2; arr++) { | | for (i = 0; i MAX_PRIO; i++) { | | while (!list_empty(list)) | | migrate_dead(dead_cpu | | get_task_struct(p) | | spin_unlock_irq(rq-lock) | | move_task_off_dead_cpu(dead_cpu, p) noncritcal | | spin_lock_irq(rq-lock) | | put_task_struct(p) | | task_rq_unlock(rq, flags) [Side note: a task may be migrated off of its cpuset, but is still attached to that cpuset (by pointer and reference count). The cpuset will not be released. This patch does not change that.] Diffed against 2.6.21 Signed-off-by: Cliff Wickman [EMAIL PROTECTED] kernel/sched.c | 31 --- 1 file changed, 28 insertions(+), 3 deletions(-) Index: linus.070504/kernel/sched.c === --- linus.070504.orig/kernel/sched.c +++ linus.070504/kernel/sched.c @@ -4989,7 +4989,7 @@ wait_to_die: #ifdef CONFIG_HOTPLUG_CPU /* * Figure out where task on dead CPU should go, use force if neccessary. - * NOTE: interrupts should be disabled by the caller + * NOTE: interrupts are not disabled by the caller */ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) { @@ -5008,6 +5008,17 @@ restart: if (dest_cpu == NR_CPUS) dest_cpu = any_online_cpu(p-cpus_allowed); + /* try to stay on the same cpuset */ + if (dest_cpu == NR_CPUS) { + /* +* Call to cpuset_cpus_allowed may sleep, so we depend +* on move_task_off_dead_cpu() being called in a non-critical +* region. 
+*/ + p-cpus_allowed = cpuset_cpus_allowed(p); + dest_cpu = any_online_cpu(p-cpus_allowed); + } + /* No more Mr. Nice Guy. */ if (dest_cpu == NR_CPUS) { rq = task_rq_lock(p, flags); @@ -5025,8 +5036,16 @@ restart: longer affine to cpu%d\n, p-pid, p-comm, dead_cpu); } - if (!__migrate_task(p, dead_cpu, dest_cpu)) + /* +* __migrate_task() requires interrupts to be disabled +*/ + local_irq_disable(); + if (!__migrate_task(p, dead_cpu, dest_cpu)) { + local_irq_enable(); goto restart; + } + local_irq_enable(); + return; } /* @@ -5054,14 +5073,20 @@ static void migrate_live_tasks(int src_c { struct task_struct *p, *t; +restartlist
Re: getting processor numbers
On Wed, Apr 04, 2007 at 02:47:32AM -0400, Jakub Jelinek wrote: > On Tue, Apr 03, 2007 at 07:04:58PM -0700, Paul Jackson wrote: > > There are really at least four "number of CPUs" answers here, and we > > should be aware of which we are providing. There are, in order of > > decreasing size: > > 1) the size of the kernels cpumask_t (NR_CPUS), > > 2) the maximum number of CPUs that might ever be hotplugged into a > > booted system, > > 3) the current number of CPUs online in that system, and > > 4) the number of CPUs that the current task is allowed to use. > > sysconf(_SC_NPROCESSORS_CONF) should IMHO return (2) (this currently > scans /proc/cpuinfo on alpha and sparc{,64} for ((ncpus|CPUs) probed|cpus > detected) > and for the rest just returns sysconf(_SC_NPROCESSORS_ONLN)). > Neither of the sysconf returned values should be affected by affinity. I'm looking at an ia64 system, and when a cpu is hot-unplugged it is removed from /proc/cpuinfo. Wouldn't /sys/devices/system/cpu/ be a better source for 2) ? -- Cliff Wickman Silicon Graphics, Inc. [EMAIL PROTECTED] (651) 683-3824 - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: getting processor numbers
On Wed, Apr 04, 2007 at 02:47:32AM -0400, Jakub Jelinek wrote: > On Tue, Apr 03, 2007 at 07:04:58PM -0700, Paul Jackson wrote: > > There are really at least four "number of CPUs" answers here, and we > > should be aware of which we are providing. There are, in order of > > decreasing size: > > 1) the size of the kernels cpumask_t (NR_CPUS), > > 2) the maximum number of CPUs that might ever be hotplugged into a > > booted system, > > 3) the current number of CPUs online in that system, and > > 4) the number of CPUs that the current task is allowed to use. > > sysconf(_SC_NPROCESSORS_CONF) should IMHO return (2) (this currently > scans /proc/cpuinfo on alpha and sparc{,64} for ((ncpus|CPUs) probed|cpus > detected) > and for the rest just returns sysconf(_SC_NPROCESSORS_ONLN)). > Neither of the sysconf returned values should be affected by affinity. I'm looking at an ia64 system, and when a cpu is hot-unplugged it is removed from /proc/cpuinfo. Wouldn't /sys/devices/system/cpu/ be a better source for 2) ? -- Cliff Wickman Silicon Graphics, Inc. [EMAIL PROTECTED] (651) 683-3824 - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 1/1] cpusets/sched_domain reconciliation
Hello Andrew, On Thu, Mar 22, 2007 at 02:21:52PM -0700, Andrew Morton wrote: > On Tue, 20 Mar 2007 13:14:35 -0600 > [EMAIL PROTECTED] (Cliff Wickman) wrote: > > > This patch reconciles cpusets and sched_domains that get out of sync > > due to disabling and re-enabling of cpu's. > > I get three-out-of-three rejects in cpuset.c. I could fix them, but I > wouldn't be very confident that the result works at runtime. 2.6.20-rc6 was > a long time ago - please, always raise patches against the latest mainline > kernel (the daily git snapshot suffices). Will do. > Recursion is a big no-no in kernel. Is there any way in which it can be > avoided? Is Dinakar's implementation also recursive? I was a little reluctant to use recursion, but this use parallels another, existing such use in cpuset.c The depth of the recursion is only the depth of the cpuset hierarchy, which is set up by an administrator, and which is logically limited by the number of cpus in the system. e.g. it would be hard to even deliberately organize 16 cpus into a hierarchy greater than 16 layers deep, even if you wanted cpusets of single cpus. We've not run into such a problem on systems of hundreds of cpus. I would think it's safe. What do you think? Dinakar's solution is not written yet, as far as I know. I'll copy him for his status. -- Cliff Wickman Silicon Graphics, Inc. [EMAIL PROTECTED] (651) 683-3824 - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 1/1] cpusets/sched_domain reconciliation
Hello Andrew, On Thu, Mar 22, 2007 at 02:21:52PM -0700, Andrew Morton wrote: > On Tue, 20 Mar 2007 13:14:35 -0600 > [EMAIL PROTECTED] (Cliff Wickman) wrote: > > > This patch reconciles cpusets and sched_domains that get out of sync > > due to disabling and re-enabling of cpu's. > > I get three-out-of-three rejects in cpuset.c. I could fix them, but I > wouldn't be very confident that the result works at runtime. 2.6.20-rc6 was > a long time ago - please, always raise patches against the latest mainline > kernel (the daily git snapshot suffices). Will do. > Recursion is a big no-no in kernel. Is there any way in which it can be > avoided? Is Dinakar's implementation also recursive? I was a little reluctant to use recursion, but this use parallels another, existing such use in cpuset.c The depth of the recursion is only the depth of the cpuset hierarchy, which is set up by an administrator, and which is logically limited by the number of cpus in the system. e.g. it would be hard to even deliberately organize 16 cpus into a hierarchy greater than 16 layers deep, even if you wanted cpusets of single cpus. We've not run into such a problem on systems of hundreds of cpus. I would think it's safe. What do you think? Dinakar's solution is not written yet, as far as I know. I'll copy him for his status. -- Cliff Wickman Silicon Graphics, Inc. [EMAIL PROTECTED] (651) 683-3824 - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 1/1] hotplug cpu: move tasks in empty cpusets to parent
Submission #2: after style changes recommended by Randy Dunlap This patch corrects a situation that occurs when one disables all the cpus in a cpuset. At that point, any tasks in that cpuset are incorrectly moved. (Disabling all cpus in a cpuset caused it to inherit the cpus of its parent, which may overlap its exclusive sibling.) Such tasks should be moved to the parent of their current cpuset. Or if the parent cpuset has no cpus, to its parent, etc. And the empty cpuset should be removed (if it is flagged notify_on_release). This patch contains the added complexity of taking care not to do memory allocation while holding the cpusets callback_mutex. And it makes use of the "cpuset_release_agent" to do the cpuset removals. It might be simpler to use a separate thread or workqueue. But such code has not yet been written. Diffed against 2.6.20-rc6 Signed-off-by: Cliff Wickman <[EMAIL PROTECTED]> --- kernel/cpuset.c | 200 ++-- 1 file changed, 180 insertions(+), 20 deletions(-) Index: morton.070205/kernel/cpuset.c === --- morton.070205.orig/kernel/cpuset.c +++ morton.070205/kernel/cpuset.c @@ -112,6 +112,12 @@ typedef enum { CS_SPREAD_SLAB, } cpuset_flagbits_t; +struct path_list_element { + struct list_head list; + struct cpuset *cs; + char *path; +}; + /* convenient tests for these bits */ static inline int is_cpu_exclusive(const struct cpuset *cs) { @@ -498,7 +504,7 @@ static int cpuset_path(const struct cpus * the time manage_mutex is held. 
*/ -static void cpuset_release_agent(const char *pathbuf) +static void cpuset_release_agent(const char *pathbuf, int releasepath) { char *argv[3], *envp[3]; int i; @@ -518,7 +524,8 @@ static void cpuset_release_agent(const c envp[i] = NULL; call_usermodehelper(argv[0], argv, envp, 0); - kfree(pathbuf); + if (releasepath) + kfree(pathbuf); } /* @@ -1364,7 +1371,7 @@ static ssize_t cpuset_common_file_write( retval = nbytes; out2: mutex_unlock(_mutex); - cpuset_release_agent(pathbuf); + cpuset_release_agent(pathbuf, 1); out1: kfree(buffer); return retval; @@ -1990,7 +1997,7 @@ static int cpuset_rmdir(struct inode *un if (list_empty(>children)) check_for_release(parent, ); mutex_unlock(_mutex); - cpuset_release_agent(pathbuf); + cpuset_release_agent(pathbuf, 1); return 0; } @@ -2053,13 +2060,33 @@ out: } /* + * move every task that is a member of cpuset "from" to cpuset "to" + */ +static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to) +{ + int moved=0; + struct task_struct *g, *tsk; + + read_lock(_lock); + do_each_thread(g, tsk) { + if (tsk->cpuset == from) { + moved++; + task_lock(tsk); + tsk->cpuset = to; + task_unlock(tsk); + } + } while_each_thread(g, tsk); + read_unlock(_lock); + atomic_add(moved, >count); + atomic_set(>count, 0); +} + +/* * If common_cpu_mem_hotplug_unplug(), below, unplugs any CPUs * or memory nodes, we need to walk over the cpuset hierarchy, * removing that CPU or node from all cpusets. If this removes the - * last CPU or node from a cpuset, then the guarantee_online_cpus() - * or guarantee_online_mems() code will use that emptied cpusets - * parent online CPUs or nodes. Cpusets that were already empty of - * CPUs or nodes are left empty. + * last CPU or node from a cpuset, then move the tasks in the empty cpuset + * to its next-highest non-empty parent. And remove the empty cpuset. * * This routine is intentionally inefficient in a couple of regards. 
* It will check all cpusets in a subtree even if the top cpuset of @@ -2070,20 +2097,104 @@ out: * * Call with both manage_mutex and callback_mutex held. * + * Takes tasklist_lock, and task_lock() for cpuset members that are + * moved to another cpuset. + * + * Recursive, on depth of cpuset subtree. + */ + +static void remove_tasks_in_empty_cpusets_in_subtree( + const struct cpuset *cur, + struct list_head *empty_list, + struct path_list_element **ple_array, + int *ple_availp, int ple_count) +{ + int npids, ple_used=0; + struct cpuset *c, *parent; + struct path_list_element *ple; + + /* If a cpuset's mems or cpus are empty, move its tasks to its parent */ + list_for_each_entry(c, >children, sibling) { + remove_tasks_in_empty_cpusets_in_subtree(c, empty_list, + ple_array, ple_availp, ple_count); +
[PATCH 1/1] hotplug cpu: move tasks in empty cpusets to parent
Submission #2: after style changes recommended by Randy Dunlap This patch corrects a situation that occurs when one disables all the cpus in a cpuset. At that point, any tasks in that cpuset are incorrectly moved. (Disabling all cpus in a cpuset caused it to inherit the cpus of its parent, which may overlap its exclusive sibling.) Such tasks should be moved to the parent of their current cpuset. Or if the parent cpuset has no cpus, to its parent, etc. And the empty cpuset should be removed (if it is flagged notify_on_release). This patch contains the added complexity of taking care not to do memory allocation while holding the cpusets callback_mutex. And it makes use of the cpuset_release_agent to do the cpuset removals. It might be simpler to use a separate thread or workqueue. But such code has not yet been written. Diffed against 2.6.20-rc6 Signed-off-by: Cliff Wickman [EMAIL PROTECTED] --- kernel/cpuset.c | 200 ++-- 1 file changed, 180 insertions(+), 20 deletions(-) Index: morton.070205/kernel/cpuset.c === --- morton.070205.orig/kernel/cpuset.c +++ morton.070205/kernel/cpuset.c @@ -112,6 +112,12 @@ typedef enum { CS_SPREAD_SLAB, } cpuset_flagbits_t; +struct path_list_element { + struct list_head list; + struct cpuset *cs; + char *path; +}; + /* convenient tests for these bits */ static inline int is_cpu_exclusive(const struct cpuset *cs) { @@ -498,7 +504,7 @@ static int cpuset_path(const struct cpus * the time manage_mutex is held. 
*/ -static void cpuset_release_agent(const char *pathbuf) +static void cpuset_release_agent(const char *pathbuf, int releasepath) { char *argv[3], *envp[3]; int i; @@ -518,7 +524,8 @@ static void cpuset_release_agent(const c envp[i] = NULL; call_usermodehelper(argv[0], argv, envp, 0); - kfree(pathbuf); + if (releasepath) + kfree(pathbuf); } /* @@ -1364,7 +1371,7 @@ static ssize_t cpuset_common_file_write( retval = nbytes; out2: mutex_unlock(manage_mutex); - cpuset_release_agent(pathbuf); + cpuset_release_agent(pathbuf, 1); out1: kfree(buffer); return retval; @@ -1990,7 +1997,7 @@ static int cpuset_rmdir(struct inode *un if (list_empty(parent-children)) check_for_release(parent, pathbuf); mutex_unlock(manage_mutex); - cpuset_release_agent(pathbuf); + cpuset_release_agent(pathbuf, 1); return 0; } @@ -2053,13 +2060,33 @@ out: } /* + * move every task that is a member of cpuset from to cpuset to + */ +static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to) +{ + int moved=0; + struct task_struct *g, *tsk; + + read_lock(tasklist_lock); + do_each_thread(g, tsk) { + if (tsk-cpuset == from) { + moved++; + task_lock(tsk); + tsk-cpuset = to; + task_unlock(tsk); + } + } while_each_thread(g, tsk); + read_unlock(tasklist_lock); + atomic_add(moved, to-count); + atomic_set(from-count, 0); +} + +/* * If common_cpu_mem_hotplug_unplug(), below, unplugs any CPUs * or memory nodes, we need to walk over the cpuset hierarchy, * removing that CPU or node from all cpusets. If this removes the - * last CPU or node from a cpuset, then the guarantee_online_cpus() - * or guarantee_online_mems() code will use that emptied cpusets - * parent online CPUs or nodes. Cpusets that were already empty of - * CPUs or nodes are left empty. + * last CPU or node from a cpuset, then move the tasks in the empty cpuset + * to its next-highest non-empty parent. And remove the empty cpuset. * * This routine is intentionally inefficient in a couple of regards. 
* It will check all cpusets in a subtree even if the top cpuset of @@ -2070,20 +2097,104 @@ out: * * Call with both manage_mutex and callback_mutex held. * + * Takes tasklist_lock, and task_lock() for cpuset members that are + * moved to another cpuset. + * + * Recursive, on depth of cpuset subtree. + */ + +static void remove_tasks_in_empty_cpusets_in_subtree( + const struct cpuset *cur, + struct list_head *empty_list, + struct path_list_element **ple_array, + int *ple_availp, int ple_count) +{ + int npids, ple_used=0; + struct cpuset *c, *parent; + struct path_list_element *ple; + + /* If a cpuset's mems or cpus are empty, move its tasks to its parent */ + list_for_each_entry(c, cur-children, sibling) { + remove_tasks_in_empty_cpusets_in_subtree(c, empty_list, + ple_array, ple_availp, ple_count
[PATCH 1/1] hotplug cpu: move tasks in empty cpusets to parent
From: Cliff Wickman <[EMAIL PROTECTED]> This patch corrects a situation that occurs when one disables all the cpus in a cpuset. At that point, any tasks in that cpuset are incorrectly moved (as I recall, they were move to a sibling cpuset). Such tasks should be move the parent of their current cpuset. Or if the parent cpuset has no cpus, to its parent, etc. And the empty cpuset should be removed (if it is flagged notify_on_release). This patch contains the added complexity of taking care not to do memory allocation while holding the cpusets callback_mutex. And it makes use of the "cpuset_release_agent" to do the cpuset removals. It might be simpler to use a separate thread or workqueue. But such code has not yet been written. Diffed against 2.6.20-rc6 Signed-off-by: Cliff Wickman <[EMAIL PROTECTED]> --- kernel/cpuset.c | 200 ++-- 1 file changed, 180 insertions(+), 20 deletions(-) Index: morton.070205/kernel/cpuset.c === --- morton.070205.orig/kernel/cpuset.c +++ morton.070205/kernel/cpuset.c @@ -112,6 +112,12 @@ typedef enum { CS_SPREAD_SLAB, } cpuset_flagbits_t; +struct path_list_element { + struct list_head list; + struct cpuset *cs; + char *path; +}; + /* convenient tests for these bits */ static inline int is_cpu_exclusive(const struct cpuset *cs) { @@ -498,7 +504,7 @@ static int cpuset_path(const struct cpus * the time manage_mutex is held. 
*/ -static void cpuset_release_agent(const char *pathbuf) +static void cpuset_release_agent(const char *pathbuf, int releasepath) { char *argv[3], *envp[3]; int i; @@ -518,7 +524,8 @@ static void cpuset_release_agent(const c envp[i] = NULL; call_usermodehelper(argv[0], argv, envp, 0); - kfree(pathbuf); + if (releasepath) + kfree(pathbuf); } /* @@ -1364,7 +1371,7 @@ static ssize_t cpuset_common_file_write( retval = nbytes; out2: mutex_unlock(_mutex); - cpuset_release_agent(pathbuf); + cpuset_release_agent(pathbuf, 1); out1: kfree(buffer); return retval; @@ -1990,7 +1997,7 @@ static int cpuset_rmdir(struct inode *un if (list_empty(>children)) check_for_release(parent, ); mutex_unlock(_mutex); - cpuset_release_agent(pathbuf); + cpuset_release_agent(pathbuf, 1); return 0; } @@ -2053,13 +2060,33 @@ out: } /* + * move every task that is a member of cpuset "from" to cpuset "to" + */ +static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to) +{ + int moved=0; + struct task_struct *g, *tsk; + + read_lock(_lock); + do_each_thread(g, tsk) { + if (tsk->cpuset == from) { + moved++; + task_lock(tsk); + tsk->cpuset = to; + task_unlock(tsk); + } + } while_each_thread(g, tsk); + read_unlock(_lock); + atomic_add(moved, >count); + atomic_set(>count, 0); +} + +/* * If common_cpu_mem_hotplug_unplug(), below, unplugs any CPUs * or memory nodes, we need to walk over the cpuset hierarchy, * removing that CPU or node from all cpusets. If this removes the - * last CPU or node from a cpuset, then the guarantee_online_cpus() - * or guarantee_online_mems() code will use that emptied cpusets - * parent online CPUs or nodes. Cpusets that were already empty of - * CPUs or nodes are left empty. + * last CPU or node from a cpuset, then move the tasks in the empty cpuset + * to its next-highest non-empty parent. And remove the empty cpuset. * * This routine is intentionally inefficient in a couple of regards. 
* It will check all cpusets in a subtree even if the top cpuset of @@ -2070,20 +2097,100 @@ out: * * Call with both manage_mutex and callback_mutex held. * + * Takes tasklist_lock, and task_lock() for cpuset members that are + * moved to another cpuset. + * * Recursive, on depth of cpuset subtree. */ -static void guarantee_online_cpus_mems_in_subtree(const struct cpuset *cur) +static void remove_tasks_in_empty_cpusets_in_subtree(const struct cpuset *cur, struct list_head *empty_list, struct path_list_element **ple_array, int *ple_availp, int ple_count) +{ + int npids, ple_used=0; + struct cpuset *c, *parent; + struct path_list_element *ple; + + /* If a cpuset's mems or cpus are empty, move its tasks to its parent */ + list_for_each_entry(c, >children, sibling) { + remove_tasks_in_empty_cpusets_in_subtree(c, empty_list, + ple_array, ple_availp, ple_count); + /* +* If it has no online cpus or no online mems, move its tasks +* to its next-highest non-empty paren
[PATCH 1/1] cpusets/sched_domain reconciliation
From: Cliff Wickman <[EMAIL PROTECTED]> This patch reconciles cpusets and sched_domains that get out of sync due to disabling and re-enabling of cpu's. Dinakar Guniguntala (IBM) is working on his own version of fixing this. But as of this date that fix doesn't seem to be ready. Here is an example of how the problem can occur: system of cpu's 0-31 create cpuset /x 16-31 create cpuset /x/y 16-23 all cpu_exclusive disable cpu 17 x is now16,18-31 x/y is now 16,18-23 enable cpu 17 x and x/y are unchanged to restore the cpusets: echo 16-31 > /dev/cpuset/x echo 16-23 > /dev/cpuset/x/y At the first echo, update_cpu_domains() is called for cpuset x/. The system is partitioned between: its parent, the root cpuset of 0-31, minus its children (x/ is 16-31): 0-15 and x/ (16-31), minus its children (x/y/ 16,18-23): 17,24-31 The sched_domain's for parent 0-15 are updated. The sched_domain's for current 17,24-31 are updated. But 16 has been untouched. As a result, 17's SD points to sched_group_phys[17] which is the only sched_group_phys on 17's list. It points to itself. But 16's SD points to sched_group_phys[16], which still points to sched_group_phys[17]. When cpu 16 executes find_busiest_group() it will hang on the non- circular sched_group list. This solution is to update the sched_domain's for the cpuset whose cpu's were changed and, in addition, all its children. The update_cpu_domains() will end with a (recursive) call to itself for each child. The extra sched_domain reconstruction is overhead, but only at the frequency of administrative change to the cpusets. This patch also includes checks in find_busiest_group() and find_idlest_group() that break from their loops on a sched_group that points to itself. This is needed because other cpu's are going through load balancing while the sched_domains are being reconstructed. There seems to be no administrative procedural work-around. In the example above one could not reverse the two echo's and set x/y before x/. 
It is not logical, so not allowed (Permission denied). Diffed against 2.6.20-rc6 Signed-off-by: Cliff Wickman <[EMAIL PROTECTED]> --- kernel/cpuset.c | 11 +-- kernel/sched.c | 19 +++ 2 files changed, 24 insertions(+), 6 deletions(-) Index: morton.070205/kernel/sched.c === --- morton.070205.orig/kernel/sched.c +++ morton.070205/kernel/sched.c @@ -1201,11 +1201,14 @@ static inline unsigned long cpu_avg_load static struct sched_group * find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) { - struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups; + struct sched_group *idlest = NULL, *this = sd->groups, *group = sd->groups; + struct sched_group *self, *prev; unsigned long min_load = ULONG_MAX, this_load = 0; int load_idx = sd->forkexec_idx; int imbalance = 100 + (sd->imbalance_pct-100)/2; + prev = group; + self = group; do { unsigned long load, avg_load; int local_group; @@ -1241,8 +1244,10 @@ find_idlest_group(struct sched_domain *s idlest = group; } nextgroup: + prev = self; + self = group; group = group->next; - } while (group != sd->groups); + } while (group != sd->groups && group != self && group != prev); if (!idlest || 100*this_load < imbalance*min_load) return NULL; @@ -2259,7 +2264,8 @@ find_busiest_group(struct sched_domain * unsigned long *imbalance, enum idle_type idle, int *sd_idle, cpumask_t *cpus, int *balance) { - struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; + struct sched_group *busiest = NULL, *this = sd->groups, *group = sd->groups; + struct sched_group *self, *prev; unsigned long max_load, avg_load, total_load, this_load, total_pwr; unsigned long max_pull; unsigned long busiest_load_per_task, busiest_nr_running; @@ -2282,6 +2288,8 @@ find_busiest_group(struct sched_domain * else load_idx = sd->idle_idx; + prev = group; + self = group; do { unsigned long load, group_capacity; int local_group; @@ -2410,8 +2418,11 @@ find_busiest_group(struct sched_domain * } group_next: #endif + prev = 
self; + self = group; group = group->next; - } while (group != sd->groups); + /* careful, a printk here can cause a spinlock hang */ + } while (group != sd->groups && group != self &am