* Christoph Lameter <[EMAIL PROTECTED]> wrote:

> This patchset allows the use of x86_32 percpu ops on x86_64 while 
> maintaining %gs pointing to the pda. It does that by moving the x86_64 
> pda into the percpu area (thereby pointing %gs at the per cpu area) 
> and then relocating the x86_64 per cpu variables to start at 0.
> 
> Patch applies on top of the per cpu cleanup patches V2. See 
> http://marc.info/?l=linux-kernel&m=119628478316525&w=2
> 
> Ultimately I think we can make the per cpu accessors arch independent 
> (see the RFC at 
> http://marc.info/?l=linux-kernel&m=119552126330405&w=2). There is a 
> performance benefit from using these in core code.
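For reference, the scheme described above can be mocked up in userspace
(a sketch only; every name below is illustrative, none comes from the
patch). With the variables linked at VMA 0, the "address" of a percpu
variable is just its offset into the area, so per_cpu() degenerates to
base pointer plus offset; on x86_64 the base lives in %gs, making the
access a single %gs-relative instruction:

#include <assert.h>
#include <stdlib.h>
#include <string.h>

#define NCPUS		4
#define AREA_SIZE	4096	/* stands in for __per_cpu_size */
#define VAR_OFFSET	64	/* what &per_cpu__foo becomes at VMA 0 */

static char load_image[AREA_SIZE];	/* stands in for __per_cpu_load */
static char *area[NCPUS];		/* stands in for cpu_pda(i)->data_offset */

static long *foo_on(int cpu)
{
	/* per_cpu(foo, cpu): per cpu base plus zero-based offset */
	return (long *)(area[cpu] + VAR_OFFSET);
}

int main(void)
{
	int i;

	for (i = 0; i < NCPUS; i++) {
		area[i] = malloc(AREA_SIZE);
		memcpy(area[i], load_image, AREA_SIZE);	/* the boot-time copy */
		*foo_on(i) = i;
	}
	for (i = 0; i < NCPUS; i++)
		assert(*foo_on(i) == i);
	return 0;
}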

i've picked up your cleanup series and this relocation series as well, 
but it crashed on x86 32-bit UP, with:

 [   78.692936] Freeing unused kernel memory: 656k freed
 [   78.697750] BUG: spinlock wrong owner on CPU#0, /0

(no other messages, hard lockup)

find below my manual merge against x86.git, maybe i made some merging 
mistake. x86.git can be picked up via:

 git-pull git://git.kernel.org/pub/scm/linux/kernel/git/x86/linux-2.6-x86.git mm

        Ingo

-------------------->
Index: linux-x86.q/arch/ia64/Kconfig
===================================================================
--- linux-x86.q.orig/arch/ia64/Kconfig
+++ linux-x86.q/arch/ia64/Kconfig
@@ -75,6 +75,9 @@ config GENERIC_TIME_VSYSCALL
        bool
        default y
 
+config ARCH_SETS_UP_PER_CPU_AREA
+       def_bool y
+
 config DMI
        bool
        default y
Index: linux-x86.q/arch/powerpc/Kconfig
===================================================================
--- linux-x86.q.orig/arch/powerpc/Kconfig
+++ linux-x86.q/arch/powerpc/Kconfig
@@ -42,6 +42,9 @@ config GENERIC_HARDIRQS
        bool
        default y
 
+config ARCH_SETS_UP_PER_CPU_AREA
+       def_bool PPC64
+
 config IRQ_PER_CPU
        bool
        default y
Index: linux-x86.q/arch/sparc64/Kconfig
===================================================================
--- linux-x86.q.orig/arch/sparc64/Kconfig
+++ linux-x86.q/arch/sparc64/Kconfig
@@ -66,6 +66,9 @@ config AUDIT_ARCH
        bool
        default y
 
+config ARCH_SETS_UP_PER_CPU_AREA
+       def_bool y
+
 config ARCH_NO_VIRT_TO_BUS
        def_bool y
 
Index: linux-x86.q/arch/sparc64/mm/init.c
===================================================================
--- linux-x86.q.orig/arch/sparc64/mm/init.c
+++ linux-x86.q/arch/sparc64/mm/init.c
@@ -1323,6 +1323,11 @@ pgd_t swapper_pg_dir[2048];
 static void sun4u_pgprot_init(void);
 static void sun4v_pgprot_init(void);
 
+/* Dummy function */
+void __init setup_per_cpu_areas(void)
+{
+}
+
 void __init paging_init(void)
 {
        unsigned long end_pfn, pages_avail, shift, phys_base;
Index: linux-x86.q/arch/x86/Kconfig
===================================================================
--- linux-x86.q.orig/arch/x86/Kconfig
+++ linux-x86.q/arch/x86/Kconfig
@@ -116,9 +116,13 @@ config GENERIC_TIME_VSYSCALL
        bool
        default X86_64
 
+config ARCH_SETS_UP_PER_CPU_AREA
+       def_bool X86_64
 
-
-
+config PERCPU_ZERO_BASED
+       bool
+       depends on X86_64 && SMP
+       default y
 
 config ZONE_DMA32
        bool
Index: linux-x86.q/arch/x86/kernel/head64.c
===================================================================
--- linux-x86.q.orig/arch/x86/kernel/head64.c
+++ linux-x86.q/arch/x86/kernel/head64.c
@@ -22,6 +22,12 @@
 #include <asm/sections.h>
 #include <asm/kdebug.h>
 
+/*
+ * Only used before the per cpu areas are set up. The entries for the
+ * non-possible cpus remain in use after boot.
+ */
+static struct x8664_pda boot_cpu_pda[NR_CPUS] __cacheline_aligned;
+
 static void __init zap_identity_mappings(void)
 {
        pgd_t *pgd = pgd_offset_k(0UL);
Index: linux-x86.q/arch/x86/kernel/setup64.c
===================================================================
--- linux-x86.q.orig/arch/x86/kernel/setup64.c
+++ linux-x86.q/arch/x86/kernel/setup64.c
@@ -30,7 +30,9 @@ cpumask_t cpu_initialized __cpuinitdata 
 
 struct x8664_pda *_cpu_pda[NR_CPUS] __read_mostly;
 EXPORT_SYMBOL(_cpu_pda);
-struct x8664_pda boot_cpu_pda[NR_CPUS] __cacheline_aligned;
+
+DEFINE_PER_CPU_FIRST(struct x8664_pda, pda);
+EXPORT_PER_CPU_SYMBOL(pda);
 
 struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table };
 
@@ -109,10 +111,15 @@ void __init setup_per_cpu_areas(void)
                }
                if (!ptr)
                        panic("Cannot allocate cpu data for CPU %d\n", i);
-               cpu_pda(i)->data_offset = ptr - __per_cpu_start;
-               memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
+               memcpy(ptr, __per_cpu_load, __per_cpu_size);
+               /* Relocate the pda */
+               memcpy(ptr, cpu_pda(i), sizeof(struct x8664_pda));
+               cpu_pda(i) = (struct x8664_pda *)ptr;
+               cpu_pda(i)->data_offset = (unsigned long)ptr;
        }
-} 
+       /* Fix up pda for this processor .... */
+       pda_init(0);
+}
 
 void pda_init(int cpu)
 { 
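(Note on the hunk above, with a hypothetical macro name: because the
variables are now linked at VMA 0, &per_cpu__foo is already the offset
of foo within the area, so data_offset can hold the raw pointer ptr
rather than the old ptr - __per_cpu_start bias. The accessor then
effectively computes:

/* sketch only, not part of the patch */
#define per_cpu_sketch(var, cpu)					\
	(*(__typeof__(per_cpu__##var) *)				\
		((char *)cpu_pda(cpu)->data_offset +			\
		 (unsigned long)&per_cpu__##var))

which is what the SHIFT_PTR() arithmetic in the generic header boils
down to on this configuration.)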
Index: linux-x86.q/arch/x86/kernel/smpboot_64.c
===================================================================
--- linux-x86.q.orig/arch/x86/kernel/smpboot_64.c
+++ linux-x86.q/arch/x86/kernel/smpboot_64.c
@@ -556,22 +556,6 @@ static int __cpuinit do_boot_cpu(int cpu
                return -1;
        }
 
-       /* Allocate node local memory for AP pdas */
-       if (cpu_pda(cpu) == &boot_cpu_pda[cpu]) {
-               struct x8664_pda *newpda, *pda;
-               int node = cpu_to_node(cpu);
-               pda = cpu_pda(cpu);
-               newpda = kmalloc_node(sizeof (struct x8664_pda), GFP_ATOMIC,
-                                     node);
-               if (newpda) {
-                       memcpy(newpda, pda, sizeof (struct x8664_pda));
-                       cpu_pda(cpu) = newpda;
-               } else
-                       printk(KERN_ERR
-               "Could not allocate node local PDA for CPU %d on node %d\n",
-                               cpu, node);
-       }
-
        alternatives_smp_switch(1);
 
        c_idle.idle = get_idle_for_cpu(cpu);
Index: linux-x86.q/arch/x86/kernel/vmlinux_64.lds.S
===================================================================
--- linux-x86.q.orig/arch/x86/kernel/vmlinux_64.lds.S
+++ linux-x86.q/arch/x86/kernel/vmlinux_64.lds.S
@@ -16,6 +16,7 @@ jiffies_64 = jiffies;
 _proxy_pda = 1;
 PHDRS {
        text PT_LOAD FLAGS(5);  /* R_E */
+       percpu PT_LOAD FLAGS(4);        /* R__ */
        data PT_LOAD FLAGS(7);  /* RWE */
        user PT_LOAD FLAGS(7);  /* RWE */
        data.init PT_LOAD FLAGS(7);     /* RWE */
Index: linux-x86.q/include/asm-generic/percpu.h
===================================================================
--- linux-x86.q.orig/include/asm-generic/percpu.h
+++ linux-x86.q/include/asm-generic/percpu.h
@@ -3,28 +3,65 @@
 #include <linux/compiler.h>
 #include <linux/threads.h>
 
-#define __GENERIC_PER_CPU
+/*
+ * Determine the real variable name from the name visible in the
+ * kernel sources.
+ */
+#define per_cpu_var(var) per_cpu__##var
+
 #ifdef CONFIG_SMP
 
+/*
+ * per_cpu_offset() is the offset that has to be added to a
+ * percpu variable to get to the instance for a certain processor.
+ *
+ * Most arches use the __per_cpu_offset array for those offsets but
+ * some arches have their own ways of determining the offset (x86_64, s390).
+ */
+#ifndef __per_cpu_offset
 extern unsigned long __per_cpu_offset[NR_CPUS];
-
 #define per_cpu_offset(x) (__per_cpu_offset[x])
+#endif
 
-/* Separate out the type, so (int[3], foo) works. */
-#define DEFINE_PER_CPU(type, name) \
-    __attribute__((__section__(".data.percpu"))) __typeof__(type) per_cpu__##name
-
-#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name)              \
-    __attribute__((__section__(".data.percpu.shared_aligned"))) \
-    __typeof__(type) per_cpu__##name                           \
-    ____cacheline_aligned_in_smp
-
-/* var is in discarded region: offset to particular copy we want */
-#define per_cpu(var, cpu) (*({                         \
-       extern int simple_identifier_##var(void);       \
-       RELOC_HIDE(&per_cpu__##var, __per_cpu_offset[cpu]); }))
-#define __get_cpu_var(var) per_cpu(var, smp_processor_id())
-#define __raw_get_cpu_var(var) per_cpu(var, raw_smp_processor_id())
+/*
+ * Determine the offset for the currently active processor.
+ * An arch may define __my_cpu_offset to provide a more effective
+ * means of obtaining the offset to the per cpu variables of the
+ * current processor.
+ */
+#ifndef __my_cpu_offset
+#define __my_cpu_offset per_cpu_offset(raw_smp_processor_id())
+#define my_cpu_offset per_cpu_offset(smp_processor_id())
+#else
+#define my_cpu_offset __my_cpu_offset
+#endif
+
+/*
+ * Add an offset to a pointer but keep the pointer as is.
+ *
+ * Only S390 provides its own means of moving the pointer.
+ */
+#ifndef SHIFT_PTR
+#ifdef CONFIG_PERCPU_ZERO_BASED
+#define SHIFT_PTR(__p, __offset) \
+       ((__typeof(__p))(((void *)(__p)) + (__offset)))
+#else
+#define SHIFT_PTR(__p, __offset)       RELOC_HIDE((__p), (__offset))
+#endif /* CONFIG_PERCPU_ZERO_BASED */
+#endif /* SHIFT_PTR */
+
+/*
+ * A percpu variable may point to a discarded region. The following are
+ * established ways to produce a usable pointer from the percpu variable
+ * offset.
+ */
+#define per_cpu(var, cpu) (*SHIFT_PTR(&per_cpu_var(var), per_cpu_offset(cpu)))
+#define __get_cpu_var(var) (*SHIFT_PTR(&per_cpu_var(var), my_cpu_offset))
+#define __raw_get_cpu_var(var) (*SHIFT_PTR(&per_cpu_var(var), __my_cpu_offset))
+
+#ifdef CONFIG_ARCH_SETS_UP_PER_CPU_AREA
+extern void setup_per_cpu_areas(void);
+#endif
 
 /* A macro to avoid #include hell... */
 #define percpu_modcopy(pcpudst, src, size)                     \
@@ -36,21 +73,17 @@ do {                                                        \
 } while (0)
 #else /* ! SMP */
 
-#define DEFINE_PER_CPU(type, name) \
-    __typeof__(type) per_cpu__##name
-
-#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name)      \
-    DEFINE_PER_CPU(type, name)
-
-#define per_cpu(var, cpu)                      (*((void)(cpu), &per_cpu__##var))
-#define __get_cpu_var(var)                     per_cpu__##var
-#define __raw_get_cpu_var(var)                 per_cpu__##var
+#define per_cpu(var, cpu)                      (*((void)(cpu), &per_cpu_var(var)))
+#define __get_cpu_var(var)                     per_cpu_var(var)
+#define __raw_get_cpu_var(var)                 per_cpu_var(var)
 
 #endif /* SMP */
 
-#define DECLARE_PER_CPU(type, name) extern __typeof__(type) per_cpu__##name
+#ifndef PER_CPU_ATTRIBUTES
+#define PER_CPU_ATTRIBUTES
+#endif
 
-#define EXPORT_PER_CPU_SYMBOL(var) EXPORT_SYMBOL(per_cpu__##var)
-#define EXPORT_PER_CPU_SYMBOL_GPL(var) EXPORT_SYMBOL_GPL(per_cpu__##var)
+#define DECLARE_PER_CPU(type, name) extern PER_CPU_ATTRIBUTES \
+                                       __typeof__(type) per_cpu_var(name)
 
 #endif /* _ASM_GENERIC_PERCPU_H_ */
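(A minimal usage sketch of the consolidated accessors above; the "hits"
variable is illustrative, not from the patch:

DEFINE_PER_CPU(int, hits);

static void bump_hits(void)		/* caller must not be preemptible */
{
	__get_cpu_var(hits)++;		/* SHIFT_PTR(&per_cpu_var(hits), my_cpu_offset) */
}

static int hits_on(int cpu)
{
	return per_cpu(hits, cpu);	/* SHIFT_PTR(..., per_cpu_offset(cpu)) */
}

Same source on every arch; only the offset plumbing differs.)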
Index: linux-x86.q/include/asm-generic/sections.h
===================================================================
--- linux-x86.q.orig/include/asm-generic/sections.h
+++ linux-x86.q/include/asm-generic/sections.h
@@ -11,7 +11,17 @@ extern char _sinittext[], _einittext[];
 extern char _sextratext[] __attribute__((weak));
 extern char _eextratext[] __attribute__((weak));
 extern char _end[];
+#ifdef CONFIG_PERCPU_ZERO_BASED
+extern char __per_cpu_load[];
+extern char ____per_cpu_size[];
+#define __per_cpu_size ((unsigned long)&____per_cpu_size)
+#define __per_cpu_start ((char *)0)
+#define __per_cpu_end ((char *)__per_cpu_size)
+#else
 extern char __per_cpu_start[], __per_cpu_end[];
+#define __per_cpu_load __per_cpu_start
+#define __per_cpu_size (__per_cpu_end - __per_cpu_start)
+#endif
 extern char __kprobes_text_start[], __kprobes_text_end[];
 extern char __initdata_begin[], __initdata_end[];
 extern char __start_rodata[], __end_rodata[];
Index: linux-x86.q/include/asm-generic/vmlinux.lds.h
===================================================================
--- linux-x86.q.orig/include/asm-generic/vmlinux.lds.h
+++ linux-x86.q/include/asm-generic/vmlinux.lds.h
@@ -255,11 +255,27 @@
        *(.initcall7.init)                                              \
        *(.initcall7s.init)
 
+#ifdef CONFIG_PERCPU_ZERO_BASED
+#define PERCPU(align)                                                  \
+       . = ALIGN(align);                                               \
+       percpu : { } :percpu                                            \
+       __per_cpu_load = .;                                             \
+       .data.percpu 0 : AT(__per_cpu_load - LOAD_OFFSET) {             \
+               *(.data.percpu.first)                                   \
+               *(.data.percpu)                                         \
+               *(.data.percpu.shared_aligned)                          \
+               ____per_cpu_size = .;                                   \
+       }                                                               \
+       . = __per_cpu_load + ____per_cpu_size;                          \
+       data : { } :data
+#else
 #define PERCPU(align)                                                  \
        . = ALIGN(align);                                               \
        __per_cpu_start = .;                                            \
        .data.percpu  : AT(ADDR(.data.percpu) - LOAD_OFFSET) {          \
+               *(.data.percpu.first)                                   \
                *(.data.percpu)                                         \
                *(.data.percpu.shared_aligned)                          \
        }                                                               \
        __per_cpu_end = .;
+#endif
Index: linux-x86.q/include/asm-ia64/percpu.h
===================================================================
--- linux-x86.q.orig/include/asm-ia64/percpu.h
+++ linux-x86.q/include/asm-ia64/percpu.h
@@ -12,31 +12,10 @@
 # define THIS_CPU(var) (per_cpu__##var)  /* use this to mark accesses to per-CPU variables... */
 #else /* !__ASSEMBLY__ */
 
-
 #include <linux/threads.h>
 
 #ifdef HAVE_MODEL_SMALL_ATTRIBUTE
-# define __SMALL_ADDR_AREA     __attribute__((__model__ (__small__)))
-#else
-# define __SMALL_ADDR_AREA
-#endif
-
-#define DECLARE_PER_CPU(type, name)                            \
-       extern __SMALL_ADDR_AREA __typeof__(type) per_cpu__##name
-
-/* Separate out the type, so (int[3], foo) works. */
-#define DEFINE_PER_CPU(type, name)                             \
-       __attribute__((__section__(".data.percpu")))            \
-       __SMALL_ADDR_AREA __typeof__(type) per_cpu__##name
-
-#ifdef CONFIG_SMP
-#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name)                      \
-       __attribute__((__section__(".data.percpu.shared_aligned")))     \
-       __SMALL_ADDR_AREA __typeof__(type) per_cpu__##name              \
-       ____cacheline_aligned_in_smp
-#else
-#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name)      \
-       DEFINE_PER_CPU(type, name)
+# define PER_CPU_ATTRIBUTES    __attribute__((__model__ (__small__)))
 #endif
 
 /*
@@ -45,39 +24,29 @@
  */
 #ifdef CONFIG_SMP
 
-extern unsigned long __per_cpu_offset[NR_CPUS];
-#define per_cpu_offset(x) (__per_cpu_offset[x])
-
-/* Equal to __per_cpu_offset[smp_processor_id()], but faster to access: */
-DECLARE_PER_CPU(unsigned long, local_per_cpu_offset);
-
-#define per_cpu(var, cpu)  (*RELOC_HIDE(&per_cpu__##var, __per_cpu_offset[cpu]))
-#define __get_cpu_var(var) (*RELOC_HIDE(&per_cpu__##var, __ia64_per_cpu_var(local_per_cpu_offset)))
-#define __raw_get_cpu_var(var) (*RELOC_HIDE(&per_cpu__##var, __ia64_per_cpu_var(local_per_cpu_offset)))
+#define __my_cpu_offset        __ia64_per_cpu_var(local_per_cpu_offset)
 
 extern void percpu_modcopy(void *pcpudst, const void *src, unsigned long size);
-extern void setup_per_cpu_areas (void);
 extern void *per_cpu_init(void);
 
 #else /* ! SMP */
 
-#define per_cpu(var, cpu)                      (*((void)(cpu), &per_cpu__##var))
-#define __get_cpu_var(var)                     per_cpu__##var
-#define __raw_get_cpu_var(var)                 per_cpu__##var
 #define per_cpu_init()                         (__phys_per_cpu_start)
 
 #endif /* SMP */
 
-#define EXPORT_PER_CPU_SYMBOL(var)             EXPORT_SYMBOL(per_cpu__##var)
-#define EXPORT_PER_CPU_SYMBOL_GPL(var)         EXPORT_SYMBOL_GPL(per_cpu__##var)
-
 /*
  * Be extremely careful when taking the address of this variable!  Due to virtual
  * remapping, it is different from the canonical address returned by __get_cpu_var(var)!
  * On the positive side, using __ia64_per_cpu_var() instead of __get_cpu_var() is slightly
  * more efficient.
  */
-#define __ia64_per_cpu_var(var)        (per_cpu__##var)
+#define __ia64_per_cpu_var(var)        per_cpu__##var
+
+#include <asm-generic/percpu.h>
+
+/* Equal to __per_cpu_offset[smp_processor_id()], but faster to access: */
+DECLARE_PER_CPU(unsigned long, local_per_cpu_offset);
 
 #endif /* !__ASSEMBLY__ */
 
Index: linux-x86.q/include/asm-powerpc/percpu.h
===================================================================
--- linux-x86.q.orig/include/asm-powerpc/percpu.h
+++ linux-x86.q/include/asm-powerpc/percpu.h
@@ -16,20 +16,6 @@
 #define __my_cpu_offset() get_paca()->data_offset
 #define per_cpu_offset(x) (__per_cpu_offset(x))
 
-/* Separate out the type, so (int[3], foo) works. */
-#define DEFINE_PER_CPU(type, name) \
-    __attribute__((__section__(".data.percpu"))) __typeof__(type) per_cpu__##name
-
-#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name)              \
-    __attribute__((__section__(".data.percpu.shared_aligned"))) \
-    __typeof__(type) per_cpu__##name                           \
-    ____cacheline_aligned_in_smp
-
-/* var is in discarded region: offset to particular copy we want */
-#define per_cpu(var, cpu) (*RELOC_HIDE(&per_cpu__##var, __per_cpu_offset(cpu)))
-#define __get_cpu_var(var) (*RELOC_HIDE(&per_cpu__##var, __my_cpu_offset()))
-#define __raw_get_cpu_var(var) (*RELOC_HIDE(&per_cpu__##var, local_paca->data_offset))
-
 /* A macro to avoid #include hell... */
 #define percpu_modcopy(pcpudst, src, size)                     \
 do {                                                           \
@@ -39,28 +25,7 @@ do {                                                         \
                       (src), (size));                          \
 } while (0)
 
-extern void setup_per_cpu_areas(void);
-
-#else /* ! SMP */
-
-#define DEFINE_PER_CPU(type, name) \
-    __typeof__(type) per_cpu__##name
-#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name)      \
-    DEFINE_PER_CPU(type, name)
-
-#define per_cpu(var, cpu)                      (*((void)(cpu), &per_cpu__##var))
-#define __get_cpu_var(var)                     per_cpu__##var
-#define __raw_get_cpu_var(var)                 per_cpu__##var
-
 #endif /* SMP */
-
-#define DECLARE_PER_CPU(type, name) extern __typeof__(type) per_cpu__##name
-
-#define EXPORT_PER_CPU_SYMBOL(var) EXPORT_SYMBOL(per_cpu__##var)
-#define EXPORT_PER_CPU_SYMBOL_GPL(var) EXPORT_SYMBOL_GPL(per_cpu__##var)
-
-#else
-#include <asm-generic/percpu.h>
 #endif
-
+#include <asm-generic/percpu.h>
 #endif /* _ASM_POWERPC_PERCPU_H_ */
Index: linux-x86.q/include/asm-s390/percpu.h
===================================================================
--- linux-x86.q.orig/include/asm-s390/percpu.h
+++ linux-x86.q/include/asm-s390/percpu.h
@@ -4,8 +4,6 @@
 #include <linux/compiler.h>
 #include <asm/lowcore.h>
 
-#define __GENERIC_PER_CPU
-
 /*
  * s390 uses its own implementation for per cpu data, the offset of
  * the cpu local data area is cached in the cpu's lowcore memory.
@@ -15,41 +13,24 @@
  */
 #if defined(__s390x__) && defined(MODULE)
 
-#define __reloc_hide(var,offset) (*({                  \
+#define SHIFT_PTR(ptr,offset) (({                      \
        extern int simple_identifier_##var(void);       \
        unsigned long *__ptr;                           \
-       asm ( "larl %0,per_cpu__"#var"@GOTENT"          \
-           : "=a" (__ptr) : "X" (per_cpu__##var) );    \
-       (typeof(&per_cpu__##var))((*__ptr) + (offset)); }))
+       asm ( "larl %0, [EMAIL PROTECTED]"              \
+           : "=a" (__ptr) : "X" (ptr) );               \
+       (typeof(ptr))((*__ptr) + (offset));     }))
 
 #else
 
-#define __reloc_hide(var, offset) (*({                         \
+#define SHIFT_PTR(ptr, offset) (({                             \
        extern int simple_identifier_##var(void);               \
        unsigned long __ptr;                                    \
-       asm ( "" : "=a" (__ptr) : "0" (&per_cpu__##var) );      \
-       (typeof(&per_cpu__##var)) (__ptr + (offset)); }))
+       asm ( "" : "=a" (__ptr) : "0" (ptr) );                  \
+       (typeof(ptr)) (__ptr + (offset)); }))
 
 #endif
 
-#ifdef CONFIG_SMP
-
-extern unsigned long __per_cpu_offset[NR_CPUS];
-
-/* Separate out the type, so (int[3], foo) works. */
-#define DEFINE_PER_CPU(type, name) \
-    __attribute__((__section__(".data.percpu"))) \
-    __typeof__(type) per_cpu__##name
-
-#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name)              \
-    __attribute__((__section__(".data.percpu.shared_aligned"))) \
-    __typeof__(type) per_cpu__##name                           \
-    ____cacheline_aligned_in_smp
-
-#define __get_cpu_var(var) __reloc_hide(var,S390_lowcore.percpu_offset)
-#define __raw_get_cpu_var(var) __reloc_hide(var,S390_lowcore.percpu_offset)
-#define per_cpu(var,cpu) __reloc_hide(var,__per_cpu_offset[cpu])
-#define per_cpu_offset(x) (__per_cpu_offset[x])
+#define __my_cpu_offset S390_lowcore.percpu_offset
 
 /* A macro to avoid #include hell... */
 #define percpu_modcopy(pcpudst, src, size)                     \
@@ -60,22 +41,6 @@ do {                                                         \
                       (src), (size));                          \
 } while (0)
 
-#else /* ! SMP */
-
-#define DEFINE_PER_CPU(type, name) \
-    __typeof__(type) per_cpu__##name
-#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name)      \
-    DEFINE_PER_CPU(type, name)
-
-#define __get_cpu_var(var) __reloc_hide(var,0)
-#define __raw_get_cpu_var(var) __reloc_hide(var,0)
-#define per_cpu(var,cpu) __reloc_hide(var,0)
-
-#endif /* SMP */
-
-#define DECLARE_PER_CPU(type, name) extern __typeof__(type) per_cpu__##name
-
-#define EXPORT_PER_CPU_SYMBOL(var) EXPORT_SYMBOL(per_cpu__##var)
-#define EXPORT_PER_CPU_SYMBOL_GPL(var) EXPORT_SYMBOL_GPL(per_cpu__##var)
+#include <asm-generic/percpu.h>
 
 #endif /* __ARCH_S390_PERCPU__ */
Index: linux-x86.q/include/asm-sparc64/percpu.h
===================================================================
--- linux-x86.q.orig/include/asm-sparc64/percpu.h
+++ linux-x86.q/include/asm-sparc64/percpu.h
@@ -7,7 +7,6 @@ register unsigned long __local_per_cpu_o
 
 #ifdef CONFIG_SMP
 
-#define setup_per_cpu_areas()                  do { } while (0)
 extern void real_setup_per_cpu_areas(void);
 
 extern unsigned long __per_cpu_base;
@@ -16,15 +15,6 @@ extern unsigned long __per_cpu_shift;
        (__per_cpu_base + ((unsigned long)(__cpu) << __per_cpu_shift))
 #define per_cpu_offset(x) (__per_cpu_offset(x))
 
-/* Separate out the type, so (int[3], foo) works. */
-#define DEFINE_PER_CPU(type, name) \
-    __attribute__((__section__(".data.percpu"))) __typeof__(type) per_cpu__##name
-
-#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name)              \
-    __attribute__((__section__(".data.percpu.shared_aligned"))) \
-    __typeof__(type) per_cpu__##name                           \
-    ____cacheline_aligned_in_smp
-
 /* var is in discarded region: offset to particular copy we want */
 #define per_cpu(var, cpu) (*RELOC_HIDE(&per_cpu__##var, __per_cpu_offset(cpu)))
 #define __get_cpu_var(var) (*RELOC_HIDE(&per_cpu__##var, __local_per_cpu_offset))
@@ -41,10 +31,6 @@ do {                                                         \
 #else /* ! SMP */
 
 #define real_setup_per_cpu_areas()             do { } while (0)
-#define DEFINE_PER_CPU(type, name) \
-    __typeof__(type) per_cpu__##name
-#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name)      \
-    DEFINE_PER_CPU(type, name)
 
 #define per_cpu(var, cpu)                      (*((void)cpu, &per_cpu__##var))
 #define __get_cpu_var(var)                     per_cpu__##var
@@ -54,7 +40,4 @@ do {                                                          \
 
 #define DECLARE_PER_CPU(type, name) extern __typeof__(type) per_cpu__##name
 
-#define EXPORT_PER_CPU_SYMBOL(var) EXPORT_SYMBOL(per_cpu__##var)
-#define EXPORT_PER_CPU_SYMBOL_GPL(var) EXPORT_SYMBOL_GPL(per_cpu__##var)
-
 #endif /* __ARCH_SPARC64_PERCPU__ */
Index: linux-x86.q/include/asm-x86/pda.h
===================================================================
--- linux-x86.q.orig/include/asm-x86/pda.h
+++ linux-x86.q/include/asm-x86/pda.h
@@ -39,7 +39,6 @@ struct x8664_pda {
 } ____cacheline_aligned_in_smp;
 
 extern struct x8664_pda *_cpu_pda[];
-extern struct x8664_pda boot_cpu_pda[];
 extern void pda_init(int);
 
 #define cpu_pda(i) (_cpu_pda[i])
Index: linux-x86.q/include/asm-x86/percpu.h
===================================================================
--- linux-x86.q.orig/include/asm-x86/percpu.h
+++ linux-x86.q/include/asm-x86/percpu.h
@@ -1,5 +1,152 @@
-#ifdef CONFIG_X86_32
-# include "percpu_32.h"
+#ifndef _ASM_X86_PERCPU_H_
+#define _ASM_X86_PERCPU_H_
+
+#ifdef CONFIG_X86_64
+#include <linux/compiler.h>
+
+/* Same as asm-generic/percpu.h, except that we store the per cpu offset
+   in the PDA. Longer term the PDA and every per cpu variable
+   should be just put into a single section and referenced directly
+   from %gs */
+
+#ifdef CONFIG_SMP
+#include <asm/pda.h>
+
+#define __per_cpu_offset(cpu) (cpu_pda(cpu)->data_offset)
+#define __my_cpu_offset read_pda(data_offset)
+
+#define per_cpu_offset(x) (__per_cpu_offset(x))
+
+#define __percpu_seg "%%gs:"
+
 #else
-# include "percpu_64.h"
+
+#define __percpu_seg ""
+
 #endif
+#include <asm-generic/percpu.h>
+
+DECLARE_PER_CPU(struct x8664_pda, pda);
+
+#else /* CONFIG_X86_64 */
+
+#ifdef __ASSEMBLY__
+
+/*
+ * PER_CPU finds an address of a per-cpu variable.
+ *
+ * Args:
+ *    var - variable name
+ *    reg - 32bit register
+ *
+ * The resulting address is stored in the "reg" argument.
+ *
+ * Example:
+ *    PER_CPU(cpu_gdt_descr, %ebx)
+ */
+#ifdef CONFIG_SMP
+#define PER_CPU(var, reg)                              \
+       movl %fs:per_cpu__##this_cpu_off, reg;          \
+       lea per_cpu__##var(reg), reg
+#define PER_CPU_VAR(var)       %fs:per_cpu__##var
+#else /* ! SMP */
+#define PER_CPU(var, reg)                      \
+       movl $per_cpu__##var, reg
+#define PER_CPU_VAR(var)       per_cpu__##var
+#endif /* SMP */
+
+#else /* ...!ASSEMBLY */
+
+/*
+ * PER_CPU finds an address of a per-cpu variable.
+ *
+ * Args:
+ *    var - variable name
+ *    cpu - 32bit register containing the current CPU number
+ *
+ * The resulting address is stored in the "cpu" argument.
+ *
+ * Example:
+ *    PER_CPU(cpu_gdt_descr, %ebx)
+ */
+#ifdef CONFIG_SMP
+
+#define __my_cpu_offset x86_read_percpu(this_cpu_off)
+
+/* fs segment starts at (positive) offset == __per_cpu_offset[cpu] */
+#define __percpu_seg "%%fs:"
+
+#else  /* !SMP */
+
+#define __percpu_seg ""
+
+#endif /* SMP */
+
+#include <asm-generic/percpu.h>
+
+/* We can use this directly for local CPU (faster). */
+DECLARE_PER_CPU(unsigned long, this_cpu_off);
+
+#endif /* __ASSEMBLY__ */
+#endif /* !CONFIG_X86_64 */
+
+#ifndef __ASSEMBLY__
+
+/* For arch-specific code, we can use direct single-insn ops (they
+ * don't give an lvalue though). */
+extern void __bad_percpu_size(void);
+
+#define percpu_to_op(op,var,val)                               \
+       do {                                                    \
+               typedef typeof(var) T__;                        \
+               if (0) { T__ tmp__; tmp__ = (val); }            \
+               switch (sizeof(var)) {                          \
+               case 1:                                         \
+                       asm(op "b %1,"__percpu_seg"%0"          \
+                           : "+m" (var)                        \
+                           :"ri" ((T__)val));                  \
+                       break;                                  \
+               case 2:                                         \
+                       asm(op "w %1,"__percpu_seg"%0"          \
+                           : "+m" (var)                        \
+                           :"ri" ((T__)val));                  \
+                       break;                                  \
+               case 4:                                         \
+                       asm(op "l %1,"__percpu_seg"%0"          \
+                           : "+m" (var)                        \
+                           :"ri" ((T__)val));                  \
+                       break;                                  \
+               default: __bad_percpu_size();                   \
+               }                                               \
+       } while (0)
+
+#define percpu_from_op(op,var)                                 \
+       ({                                                      \
+               typeof(var) ret__;                              \
+               switch (sizeof(var)) {                          \
+               case 1:                                         \
+                       asm(op "b "__percpu_seg"%1,%0"          \
+                           : "=r" (ret__)                      \
+                           : "m" (var));                       \
+                       break;                                  \
+               case 2:                                         \
+                       asm(op "w "__percpu_seg"%1,%0"          \
+                           : "=r" (ret__)                      \
+                           : "m" (var));                       \
+                       break;                                  \
+               case 4:                                         \
+                       asm(op "l "__percpu_seg"%1,%0"          \
+                           : "=r" (ret__)                      \
+                           : "m" (var));                       \
+                       break;                                  \
+               default: __bad_percpu_size();                   \
+               }                                               \
+               ret__; })
+
+#define x86_read_percpu(var) percpu_from_op("mov", per_cpu__##var)
+#define x86_write_percpu(var,val) percpu_to_op("mov", per_cpu__##var, val)
+#define x86_add_percpu(var,val) percpu_to_op("add", per_cpu__##var, val)
+#define x86_sub_percpu(var,val) percpu_to_op("sub", per_cpu__##var, val)
+#define x86_or_percpu(var,val) percpu_to_op("or", per_cpu__##var, val)
+#endif /* !__ASSEMBLY__ */
+#endif /* _ASM_X86_PERCPU_H_ */
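(Usage sketch for the single-instruction ops above; "nmi_count" is an
illustrative variable, while this_cpu_off is the real 32-bit offset
variable. These ops compile to one segment-relative instruction but do
not yield lvalues:

DEFINE_PER_CPU(unsigned int, nmi_count);

static void count_event(void)
{
	/* one "addl $1,%fs:per_cpu__nmi_count" on 32-bit SMP */
	x86_add_percpu(nmi_count, 1);
}

static unsigned long offset_of_this_cpu(void)
{
	/* how the 32-bit __my_cpu_offset above is implemented */
	return x86_read_percpu(this_cpu_off);
}
)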
Index: linux-x86.q/include/asm-x86/percpu_32.h
===================================================================
--- linux-x86.q.orig/include/asm-x86/percpu_32.h
+++ /dev/null
@@ -1,154 +0,0 @@
-#ifndef __ARCH_I386_PERCPU__
-#define __ARCH_I386_PERCPU__
-
-#ifdef __ASSEMBLY__
-
-/*
- * PER_CPU finds an address of a per-cpu variable.
- *
- * Args:
- *    var - variable name
- *    reg - 32bit register
- *
- * The resulting address is stored in the "reg" argument.
- *
- * Example:
- *    PER_CPU(cpu_gdt_descr, %ebx)
- */
-#ifdef CONFIG_SMP
-#define PER_CPU(var, reg)                              \
-       movl %fs:per_cpu__##this_cpu_off, reg;          \
-       lea per_cpu__##var(reg), reg
-#define PER_CPU_VAR(var)       %fs:per_cpu__##var
-#else /* ! SMP */
-#define PER_CPU(var, reg)                      \
-       movl $per_cpu__##var, reg
-#define PER_CPU_VAR(var)       per_cpu__##var
-#endif /* SMP */
-
-#else /* ...!ASSEMBLY */
-
-/*
- * PER_CPU finds an address of a per-cpu variable.
- *
- * Args:
- *    var - variable name
- *    cpu - 32bit register containing the current CPU number
- *
- * The resulting address is stored in the "cpu" argument.
- *
- * Example:
- *    PER_CPU(cpu_gdt_descr, %ebx)
- */
-#ifdef CONFIG_SMP
-/* Same as generic implementation except for optimized local access. */
-#define __GENERIC_PER_CPU
-
-/* This is used for other cpus to find our section. */
-extern unsigned long __per_cpu_offset[];
-
-#define per_cpu_offset(x) (__per_cpu_offset[x])
-
-/* Separate out the type, so (int[3], foo) works. */
-#define DECLARE_PER_CPU(type, name) extern __typeof__(type) per_cpu__##name
-#define DEFINE_PER_CPU(type, name) \
-    __attribute__((__section__(".data.percpu"))) __typeof__(type) per_cpu__##name
-
-#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name)              \
-    __attribute__((__section__(".data.percpu.shared_aligned"))) \
-    __typeof__(type) per_cpu__##name                           \
-    ____cacheline_aligned_in_smp
-
-/* We can use this directly for local CPU (faster). */
-DECLARE_PER_CPU(unsigned long, this_cpu_off);
-
-/* var is in discarded region: offset to particular copy we want */
-#define per_cpu(var, cpu) (*({                         \
-       extern int simple_indentifier_##var(void);      \
-       RELOC_HIDE(&per_cpu__##var, __per_cpu_offset[cpu]); }))
-
-#define __raw_get_cpu_var(var) (*({                                    \
-       extern int simple_indentifier_##var(void);                      \
-       RELOC_HIDE(&per_cpu__##var, x86_read_percpu(this_cpu_off));     \
-}))
-
-#define __get_cpu_var(var) __raw_get_cpu_var(var)
-
-/* A macro to avoid #include hell... */
-#define percpu_modcopy(pcpudst, src, size)                     \
-do {                                                           \
-       unsigned int __i;                                       \
-       for_each_possible_cpu(__i)                              \
-               memcpy((pcpudst)+__per_cpu_offset[__i],         \
-                      (src), (size));                          \
-} while (0)
-
-#define EXPORT_PER_CPU_SYMBOL(var) EXPORT_SYMBOL(per_cpu__##var)
-#define EXPORT_PER_CPU_SYMBOL_GPL(var) EXPORT_SYMBOL_GPL(per_cpu__##var)
-
-/* fs segment starts at (positive) offset == __per_cpu_offset[cpu] */
-#define __percpu_seg "%%fs:"
-#else  /* !SMP */
-#include <asm-generic/percpu.h>
-#define __percpu_seg ""
-#endif /* SMP */
-
-/* For arch-specific code, we can use direct single-insn ops (they
- * don't give an lvalue though). */
-extern void __bad_percpu_size(void);
-
-#define percpu_to_op(op,var,val)                               \
-       do {                                                    \
-               typedef typeof(var) T__;                        \
-               if (0) { T__ tmp__; tmp__ = (val); }            \
-               switch (sizeof(var)) {                          \
-               case 1:                                         \
-                       asm(op "b %1,"__percpu_seg"%0"          \
-                           : "+m" (var)                        \
-                           :"ri" ((T__)val));                  \
-                       break;                                  \
-               case 2:                                         \
-                       asm(op "w %1,"__percpu_seg"%0"          \
-                           : "+m" (var)                        \
-                           :"ri" ((T__)val));                  \
-                       break;                                  \
-               case 4:                                         \
-                       asm(op "l %1,"__percpu_seg"%0"          \
-                           : "+m" (var)                        \
-                           :"ri" ((T__)val));                  \
-                       break;                                  \
-               default: __bad_percpu_size();                   \
-               }                                               \
-       } while (0)
-
-#define percpu_from_op(op,var)                                 \
-       ({                                                      \
-               typeof(var) ret__;                              \
-               switch (sizeof(var)) {                          \
-               case 1:                                         \
-                       asm(op "b "__percpu_seg"%1,%0"          \
-                           : "=r" (ret__)                      \
-                           : "m" (var));                       \
-                       break;                                  \
-               case 2:                                         \
-                       asm(op "w "__percpu_seg"%1,%0"          \
-                           : "=r" (ret__)                      \
-                           : "m" (var));                       \
-                       break;                                  \
-               case 4:                                         \
-                       asm(op "l "__percpu_seg"%1,%0"          \
-                           : "=r" (ret__)                      \
-                           : "m" (var));                       \
-                       break;                                  \
-               default: __bad_percpu_size();                   \
-               }                                               \
-               ret__; })
-
-#define x86_read_percpu(var) percpu_from_op("mov", per_cpu__##var)
-#define x86_write_percpu(var,val) percpu_to_op("mov", per_cpu__##var, val)
-#define x86_add_percpu(var,val) percpu_to_op("add", per_cpu__##var, val)
-#define x86_sub_percpu(var,val) percpu_to_op("sub", per_cpu__##var, val)
-#define x86_or_percpu(var,val) percpu_to_op("or", per_cpu__##var, val)
-#endif /* !__ASSEMBLY__ */
-
-#endif /* __ARCH_I386_PERCPU__ */
Index: linux-x86.q/include/asm-x86/percpu_64.h
===================================================================
--- linux-x86.q.orig/include/asm-x86/percpu_64.h
+++ /dev/null
@@ -1,68 +0,0 @@
-#ifndef _ASM_X8664_PERCPU_H_
-#define _ASM_X8664_PERCPU_H_
-#include <linux/compiler.h>
-
-/* Same as asm-generic/percpu.h, except that we store the per cpu offset
-   in the PDA. Longer term the PDA and every per cpu variable
-   should be just put into a single section and referenced directly
-   from %gs */
-
-#ifdef CONFIG_SMP
-
-#include <asm/pda.h>
-
-#define __per_cpu_offset(cpu) (cpu_pda(cpu)->data_offset)
-#define __my_cpu_offset() read_pda(data_offset)
-
-#define per_cpu_offset(x) (__per_cpu_offset(x))
-
-/* Separate out the type, so (int[3], foo) works. */
-#define DEFINE_PER_CPU(type, name) \
-    __attribute__((__section__(".data.percpu"))) __typeof__(type) per_cpu__##name
-
-#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name)              \
-    __attribute__((__section__(".data.percpu.shared_aligned"))) \
-    __typeof__(type) per_cpu__##name                           \
-    ____cacheline_internodealigned_in_smp
-
-/* var is in discarded region: offset to particular copy we want */
-#define per_cpu(var, cpu) (*({                         \
-       extern int simple_identifier_##var(void);       \
-       RELOC_HIDE(&per_cpu__##var, __per_cpu_offset(cpu)); }))
-#define __get_cpu_var(var) (*({                                \
-       extern int simple_identifier_##var(void);       \
-       RELOC_HIDE(&per_cpu__##var, __my_cpu_offset()); }))
-#define __raw_get_cpu_var(var) (*({                    \
-       extern int simple_identifier_##var(void);       \
-       RELOC_HIDE(&per_cpu__##var, __my_cpu_offset()); }))
-
-/* A macro to avoid #include hell... */
-#define percpu_modcopy(pcpudst, src, size)                     \
-do {                                                           \
-       unsigned int __i;                                       \
-       for_each_possible_cpu(__i)                              \
-               memcpy((pcpudst)+__per_cpu_offset(__i),         \
-                      (src), (size));                          \
-} while (0)
-
-extern void setup_per_cpu_areas(void);
-
-#else /* ! SMP */
-
-#define DEFINE_PER_CPU(type, name) \
-    __typeof__(type) per_cpu__##name
-#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name)      \
-    DEFINE_PER_CPU(type, name)
-
-#define per_cpu(var, cpu)                      (*((void)(cpu), &per_cpu__##var))
-#define __get_cpu_var(var)                     per_cpu__##var
-#define __raw_get_cpu_var(var)                 per_cpu__##var
-
-#endif /* SMP */
-
-#define DECLARE_PER_CPU(type, name) extern __typeof__(type) per_cpu__##name
-
-#define EXPORT_PER_CPU_SYMBOL(var) EXPORT_SYMBOL(per_cpu__##var)
-#define EXPORT_PER_CPU_SYMBOL_GPL(var) EXPORT_SYMBOL_GPL(per_cpu__##var)
-
-#endif /* _ASM_X8664_PERCPU_H_ */
Index: linux-x86.q/include/linux/percpu.h
===================================================================
--- linux-x86.q.orig/include/linux/percpu.h
+++ linux-x86.q/include/linux/percpu.h
@@ -9,6 +9,27 @@
 
 #include <asm/percpu.h>
 
+#define DEFINE_PER_CPU(type, name)                                     \
+       __attribute__((__section__(".data.percpu")))                    \
+       PER_CPU_ATTRIBUTES __typeof__(type) per_cpu__##name
+
+#ifdef CONFIG_SMP
+#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name)                      \
+       __attribute__((__section__(".data.percpu.shared_aligned")))     \
+       PER_CPU_ATTRIBUTES __typeof__(type) per_cpu__##name             \
+       ____cacheline_aligned_in_smp
+#else
+#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name)                    \
+       DEFINE_PER_CPU(type, name)
+#endif
+
+#define DEFINE_PER_CPU_FIRST(type, name)                               \
+       __attribute__((__section__(".data.percpu.first")))              \
+       PER_CPU_ATTRIBUTES __typeof__(type) per_cpu__##name
+
+#define EXPORT_PER_CPU_SYMBOL(var) EXPORT_SYMBOL(per_cpu__##var)
+#define EXPORT_PER_CPU_SYMBOL_GPL(var) EXPORT_SYMBOL_GPL(per_cpu__##var)
+
 /* Enough to cover all DEFINE_PER_CPUs in kernel, including modules. */
 #ifndef PERCPU_ENOUGH_ROOM
 #ifdef CONFIG_MODULES
Index: linux-x86.q/init/main.c
===================================================================
--- linux-x86.q.orig/init/main.c
+++ linux-x86.q/init/main.c
@@ -363,28 +363,29 @@ static inline void smp_prepare_cpus(unsi
 
 #else
 
-#ifdef __GENERIC_PER_CPU
+#ifndef CONFIG_ARCH_SETS_UP_PER_CPU_AREA
 unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
 
 EXPORT_SYMBOL(__per_cpu_offset);
 
 static void __init setup_per_cpu_areas(void)
 {
-       unsigned long size, i;
-       char *ptr;
-       unsigned long nr_possible_cpus = num_possible_cpus();
+       unsigned long size;
+       int cpu;
 
        /* Copy section for each CPU (we discard the original) */
        size = ALIGN(PERCPU_ENOUGH_ROOM, PAGE_SIZE);
-       ptr = alloc_bootmem_pages(size * nr_possible_cpus);
 
-       for_each_possible_cpu(i) {
-               __per_cpu_offset[i] = ptr - __per_cpu_start;
-               memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
-               ptr += size;
+       for_each_possible_cpu(cpu) {
+               char *ptr;
+
+               ptr = alloc_bootmem_pages_node(NODE_DATA(cpu_to_node(cpu)),
+                                                               size);
+               __per_cpu_offset[cpu] = ptr - __per_cpu_start;
+               memcpy(ptr, __per_cpu_load, __per_cpu_size);
        }
 }
-#endif /* !__GENERIC_PER_CPU */
+#endif /* CONFIG_ARCH_SETS_UP_PER_CPU_AREA */
 
 /* Called by boot processor to activate the rest. */
 static void __init smp_init(void)
Index: linux-x86.q/kernel/lockdep.c
===================================================================
--- linux-x86.q.orig/kernel/lockdep.c
+++ linux-x86.q/kernel/lockdep.c
@@ -613,8 +613,8 @@ static int static_obj(void *obj)
         * percpu var?
         */
        for_each_possible_cpu(i) {
-               start = (unsigned long) &__per_cpu_start + per_cpu_offset(i);
-               end   = (unsigned long) &__per_cpu_start + PERCPU_ENOUGH_ROOM
+               start = (unsigned long) __per_cpu_start + per_cpu_offset(i);
+               end   = (unsigned long) __per_cpu_start + PERCPU_ENOUGH_ROOM
                                        + per_cpu_offset(i);
 
                if ((addr >= start) && (addr < end))
-
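(The lockdep change is needed because with CONFIG_PERCPU_ZERO_BASED,
__per_cpu_start is now the macro ((char *)0): taking the address of that
cast expression would not compile, while for the traditional array
declaration the '&' was redundant anyway. A tiny illustration with
hypothetical names:

extern char legacy_start[];		/* traditional linker-provided symbol */
#define zero_based_start ((char *)0)	/* CONFIG_PERCPU_ZERO_BASED style */

static unsigned long base_legacy(void)
{
	/* for an array, legacy_start and &legacy_start are the same address */
	return (unsigned long)legacy_start;
}

static unsigned long base_zero(void)
{
	/* &zero_based_start would be an error: address of a cast expression */
	return (unsigned long)zero_based_start;
}
)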