The current_task is supposed to be constant in each thread and therefore
does not need to be reread. There is already an attempt to cache it
using inline assembly, using this_cpu_read_stable(), which hides the
dependency on the read memory address.

However, this caching is not working very well. For example,
sync_mm_rss() still reads current_task twice for no reason.

Allow more aggressive caching by aliasing current_task into a constant
const_current_task and reading from the constant copy.
Doing so requires the compiler to support x86 segment qualifiers.
Hide const_current_task in a different compilation unit to prevent the
compiler from assuming that the value is constant during compilation.

Signed-off-by: Nadav Amit <na...@vmware.com>
---
 arch/x86/include/asm/current.h | 30 ++++++++++++++++++++++++++++++
 arch/x86/kernel/cpu/Makefile   |  1 +
 arch/x86/kernel/cpu/common.c   |  7 +------
 arch/x86/kernel/cpu/current.c  | 16 ++++++++++++++++
 include/linux/compiler.h       |  2 +-
 5 files changed, 49 insertions(+), 7 deletions(-)
 create mode 100644 arch/x86/kernel/cpu/current.c

diff --git a/arch/x86/include/asm/current.h b/arch/x86/include/asm/current.h
index 3e204e6140b5..7f093e81a647 100644
--- a/arch/x86/include/asm/current.h
+++ b/arch/x86/include/asm/current.h
@@ -10,11 +10,41 @@ struct task_struct;
 
 DECLARE_PER_CPU(struct task_struct *, current_task);
 
+#if USE_X86_SEG_SUPPORT
+
+/*
+ * Hold a constant alias for current_task, which would allow to avoid caching of
+ * current task.
+ *
+ * We must mark const_current_task with the segment qualifiers, as otherwise gcc
+ * would do redundant reads of const_current_task.
+ */
+DECLARE_PER_CPU(struct task_struct * const __percpu_seg_override, const_current_task);
+
+static __always_inline struct task_struct *get_current(void)
+{
+
+       /*
+        * GCC is missing functionality of removing segment qualifiers, which
+        * messes with per-cpu infrastructure that holds local copies. Use
+        * __raw_cpu_read to avoid holding any copy.
+        */
+       return __raw_cpu_read(, const_current_task);
+}
+
+#else /* USE_X86_SEG_SUPPORT */
+
+/*
+ * Without segment qualifier support, the per-cpu infrastructure is not
+ * suitable for reading constants, so use this_cpu_read_stable() in this case.
+ */
 static __always_inline struct task_struct *get_current(void)
 {
        return this_cpu_read_stable(current_task);
 }
 
+#endif /* USE_X86_SEG_SUPPORT */
+
 #define current get_current()
 
 #endif /* __ASSEMBLY__ */
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index d7a1e5a9331c..d816f03a37d7 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -19,6 +19,7 @@ CFLAGS_common.o               := $(nostackp)
 
 obj-y                  := cacheinfo.o scattered.o topology.o
 obj-y                  += common.o
+obj-y                  += current.o
 obj-y                  += rdrand.o
 obj-y                  += match.o
 obj-y                  += bugs.o
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index e0489d2860d3..33a6b51e8059 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1607,13 +1607,8 @@ DEFINE_PER_CPU_FIRST(struct fixed_percpu_data,
 EXPORT_PER_CPU_SYMBOL_GPL(fixed_percpu_data);
 
 /*
- * The following percpu variables are hot.  Align current_task to
- * cacheline size such that they fall in the same cacheline.
+ * The following percpu variables are hot.
  */
-DEFINE_PER_CPU(struct task_struct *, current_task) ____cacheline_aligned =
-       &init_task;
-EXPORT_PER_CPU_SYMBOL(current_task);
-
 DEFINE_PER_CPU(struct irq_stack *, hardirq_stack_ptr);
 DEFINE_PER_CPU(unsigned int, irq_count) __visible = -1;
 
diff --git a/arch/x86/kernel/cpu/current.c b/arch/x86/kernel/cpu/current.c
new file mode 100644
index 000000000000..3238c6e34984
--- /dev/null
+++ b/arch/x86/kernel/cpu/current.c
@@ -0,0 +1,16 @@
+#include <linux/sched/task.h>
+#include <asm/current.h>
+
+/*
+ * Align current_task to cacheline size such that they fall in the same
+ * cacheline.
+ */
+DEFINE_PER_CPU(struct task_struct *, current_task) ____cacheline_aligned =
+       &init_task;
+EXPORT_PER_CPU_SYMBOL(current_task);
+
+#if USE_X86_SEG_SUPPORT
+DECLARE_PER_CPU(struct task_struct * const __percpu_seg_override, const_current_task)
+       __attribute__((alias("current_task")));
+EXPORT_PER_CPU_SYMBOL(const_current_task);
+#endif
diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index f0fd5636fddb..1b6ee9ab6373 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -299,7 +299,7 @@ unsigned long read_word_at_a_time(const void *addr)
  */
 #define __ADDRESSABLE(sym) \
        static void * __section(".discard.addressable") __used \
-               __PASTE(__addressable_##sym, __LINE__) = (void *)&sym;
               __PASTE(__addressable_##sym, __LINE__) = (void *)(uintptr_t)&sym;
 
 /**
  * offset_to_ptr - convert a relative memory offset to an absolute pointer
-- 
2.17.1

Reply via email to