This patch ports the x86-specific atomic overflow handling from PaX's
PAX_REFCOUNT to the upstream refcount_t API. This is an updated version
from PaX that eliminates the saturation race condition by resetting the
atomic counter back to the INT_MAX saturation value on both overflow and
underflow. To win a race, a system would have to have INT_MAX threads
simultaneously overflow before the saturation handler runs.

With this, the commonly used inc/dec_and_test usage patterns present
in performance-sensitive areas of the kernel (mm, net, block) will
use the regular inline atomic operations with only a single overflow
test instruction added to the fast path.

Signed-off-by: Kees Cook <keesc...@chromium.org>
---
 arch/Kconfig                       |  19 ++++++
 arch/x86/Kconfig                   |   1 +
 arch/x86/entry/entry_32.S          |   9 +++
 arch/x86/entry/entry_64.S          |   3 +
 arch/x86/include/asm/irq_vectors.h |   3 +
 arch/x86/include/asm/refcount.h    | 123 +++++++++++++++++++++++++++++++++++++
 arch/x86/include/asm/traps.h       |   5 ++
 arch/x86/kernel/traps.c            |  38 ++++++++++++
 drivers/misc/lkdtm_bugs.c          |  19 ++++--
 include/asm-generic/sections.h     |   4 ++
 include/asm-generic/vmlinux.lds.h  |   9 +++
 include/linux/kernel.h             |   2 +
 include/linux/refcount.h           |   4 ++
 kernel/panic.c                     |  23 +++++++
 lib/refcount.c                     |   6 +-
 15 files changed, 263 insertions(+), 5 deletions(-)
 create mode 100644 arch/x86/include/asm/refcount.h

diff --git a/arch/Kconfig b/arch/Kconfig
index cd211a14a88f..2cd150f03175 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -847,4 +847,23 @@ config STRICT_MODULE_RWX
 config ARCH_WANT_RELAX_ORDER
        bool
 
+config ARCH_HAS_FAST_REFCOUNT
+       bool
+       help
+         An architecture selects this when it has implemented refcount_t
+         using primitives that provide a faster runtime at the expense
+         of some refcount state checks. The refcount overflow condition,
+         however, must be retained. Catching overflows is the primary
+         security concern for protecting against bugs in reference counts.
+
+config FAST_REFCOUNT
+       bool "Speed up reference counting at the expense of full validation"
+       depends on ARCH_HAS_FAST_REFCOUNT
+       help
+         The regular reference counting infrastructure in the kernel checks
+         many error conditions. If this option is selected, refcounting
+         is made faster using architecture-specific implementations that may
+         only check for reference count overflows (which is the primary
+         way reference counting bugs are turned into security exploits).
+
 source "kernel/gcov/Kconfig"
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index cc98d5a294ee..a13db97e0d71 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -50,6 +50,7 @@ config X86
        select ARCH_HAS_DEVMEM_IS_ALLOWED
        select ARCH_HAS_ELF_RANDOMIZE
        select ARCH_HAS_FAST_MULTIPLIER
+       select ARCH_HAS_FAST_REFCOUNT
        select ARCH_HAS_GCOV_PROFILE_ALL
        select ARCH_HAS_KCOV                    if X86_64
        select ARCH_HAS_MMIO_FLUSH
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index 57f7ec35216e..9e8d9e2d70bf 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -792,6 +792,15 @@ ENTRY(spurious_interrupt_bug)
        jmp     common_exception
 END(spurious_interrupt_bug)
 
+#ifdef CONFIG_FAST_REFCOUNT
+ENTRY(refcount_error)
+       ASM_CLAC
+       pushl   $0
+       pushl   $do_refcount_error
+       jmp     error_code
+ENDPROC(refcount_error)
+#endif
+
 #ifdef CONFIG_XEN
 ENTRY(xen_hypervisor_callback)
        pushl   $-1                             /* orig_ax = -1 => not a system 
call */
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 044d18ebc43c..a736b882ec76 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -858,6 +858,9 @@ idtentry coprocessor_error          do_coprocessor_error    
        has_error_code=0
 idtentry alignment_check               do_alignment_check              
has_error_code=1
 idtentry simd_coprocessor_error                do_simd_coprocessor_error       
has_error_code=0
 
+#ifdef CONFIG_FAST_REFCOUNT
+idtentry refcount_error                        do_refcount_error               
has_error_code=0
+#endif
 
        /*
         * Reload gs selector with exception handling
diff --git a/arch/x86/include/asm/irq_vectors.h 
b/arch/x86/include/asm/irq_vectors.h
index 6ca9fd6234e1..64ca4dcc29ec 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -48,6 +48,9 @@
 
 #define IA32_SYSCALL_VECTOR            0x80
 
+/* Refcount Overflow or Underflow Exception. */
+#define X86_REFCOUNT_VECTOR            0x81
+
 /*
  * Vectors 0x30-0x3f are used for ISA interrupts.
  *   round up to the next 16-vector boundary
diff --git a/arch/x86/include/asm/refcount.h b/arch/x86/include/asm/refcount.h
new file mode 100644
index 000000000000..79e35981e42f
--- /dev/null
+++ b/arch/x86/include/asm/refcount.h
@@ -0,0 +1,123 @@
+#ifndef __ASM_X86_REFCOUNT_H
+#define __ASM_X86_REFCOUNT_H
+/*
+ * x86-specific implementation of refcount_t. Ported from PAX_REFCOUNT in
+ * PaX/grsecurity.
+ */
+#include <linux/refcount.h>
+#include <asm/irq_vectors.h>
+
+#define __REFCOUNT_CHECK(size)                         \
+       "jo 111f\n"                                     \
+       ".if "__stringify(size)" == 4\n\t"              \
+       ".pushsection .text.refcount_overflow\n"        \
+       ".elseif "__stringify(size)" == -4\n\t"         \
+       ".pushsection .text.refcount_underflow\n"       \
+       ".else\n"                                       \
+       ".error \"invalid size\"\n"                     \
+       ".endif\n"                                      \
+       "111:\tlea %[counter],%%"_ASM_CX"\n\t"          \
+       "int $"__stringify(X86_REFCOUNT_VECTOR)"\n"     \
+       "222:\n\t"                                      \
+       ".popsection\n"                                 \
+       "333:\n"                                        \
+       _ASM_EXTABLE(222b, 333b)
+
+#define REFCOUNT_CHECK_OVERFLOW(size)  __REFCOUNT_CHECK(size)
+#define REFCOUNT_CHECK_UNDERFLOW(size) __REFCOUNT_CHECK(-(size))
+
+#if !defined(__GCC_ASM_FLAG_OUTPUTS__) && defined(CC_HAVE_ASM_GOTO)
+/* Use asm goto */
+#define __GEN_CHECKED_RMWcc(fullop, var, size, cc, ...)                        
\
+do {                                                                   \
+       asm_volatile_goto(fullop                                        \
+                       "\n\t"__REFCOUNT_CHECK(size)                    \
+                       ";j" #cc " %l[cc_label]"                        \
+                       : : [counter] "m" (var), ## __VA_ARGS__         \
+                       : "memory", "cc", "cx" : cc_label);             \
+       return 0;                                                       \
+cc_label:                                                              \
+       return 1;                                                       \
+} while (0)
+
+#define GEN_BINARY_CHECKED_RMWcc(op, var, size, vcon, val, arg0, cc)   \
+       __GEN_CHECKED_RMWcc(op " %1, " arg0, var, size, cc, vcon (val))
+
+#else /* defined(__GCC_ASM_FLAG_OUTPUTS__) || !defined(CC_HAVE_ASM_GOTO) */
+
+#define __GEN_CHECKED_RMWcc(fullop, var, size, cc, ...)                        
\
+do {                                                                   \
+       bool c;                                                         \
+       asm volatile (fullop                                            \
+                       "\n\t"__REFCOUNT_CHECK(size)                    \
+                       ";" CC_SET(cc)                                  \
+                       : [counter] "+m" (var), CC_OUT(cc) (c)          \
+                       : __VA_ARGS__ : "memory", "cc", "cx");          \
+       return c != 0;                                                  \
+} while (0)
+
+#define GEN_BINARY_CHECKED_RMWcc(op, var, size, vcon, val, arg0, cc)   \
+       __GEN_CHECKED_RMWcc(op " %2, " arg0, var, size, cc, vcon (val))
+
+#endif /* defined(__GCC_ASM_FLAG_OUTPUTS__) || !defined(CC_HAVE_ASM_GOTO) */
+
+#define GEN_UNARY_CHECKED_RMWcc(op, var, size, arg0, cc)               \
+       __GEN_CHECKED_RMWcc(op " " arg0, var, size, cc)
+
+static __always_inline void refcount_add(unsigned int i, refcount_t *r)
+{
+       asm volatile(LOCK_PREFIX "addl %1,%0\n\t"
+               REFCOUNT_CHECK_OVERFLOW(4)
+               : [counter] "+m" (r->refs.counter)
+               : "ir" (i)
+               : "cc", "cx");
+}
+
+static __always_inline void refcount_inc(refcount_t *r)
+{
+       asm volatile(LOCK_PREFIX "incl %0\n\t"
+               REFCOUNT_CHECK_OVERFLOW(4)
+               : [counter] "+m" (r->refs.counter)
+               : : "cc", "cx");
+}
+
+static __always_inline void refcount_dec(refcount_t *r)
+{
+       asm volatile(LOCK_PREFIX "decl %0\n\t"
+               REFCOUNT_CHECK_UNDERFLOW(4)
+               : [counter] "+m" (r->refs.counter)
+               : : "cc", "cx");
+}
+
+static __always_inline __must_check
+bool refcount_sub_and_test(unsigned int i, refcount_t *r)
+{
+       GEN_BINARY_CHECKED_RMWcc(LOCK_PREFIX "subl", r->refs.counter,
+                               -4, "er", i, "%0", e);
+}
+
+static __always_inline __must_check bool refcount_dec_and_test(refcount_t *r)
+{
+       GEN_UNARY_CHECKED_RMWcc(LOCK_PREFIX "decl", r->refs.counter,
+                               -4, "%0", e);
+}
+
+static __always_inline __must_check bool refcount_inc_not_zero(refcount_t *r)
+{
+       const int a = 1;
+       const int u = 0;
+       int c, old;
+
+       c = atomic_read(&(r->refs));
+       for (;;) {
+               if (unlikely(c == (u)))
+                       break;
+               old = atomic_cmpxchg(&(r->refs), c, c + (a));
+               if (likely(old == c))
+                       break;
+               c = old;
+       }
+       return c != u;
+}
+
+#endif
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index 01fd0a7f48cd..e4d8db75d85e 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -38,6 +38,10 @@ asmlinkage void machine_check(void);
 #endif /* CONFIG_X86_MCE */
 asmlinkage void simd_coprocessor_error(void);
 
+#ifdef CONFIG_FAST_REFCOUNT
+asmlinkage void refcount_error(void);
+#endif
+
 #ifdef CONFIG_TRACING
 asmlinkage void trace_page_fault(void);
 #define trace_stack_segment stack_segment
@@ -54,6 +58,7 @@ asmlinkage void trace_page_fault(void);
 #define trace_alignment_check alignment_check
 #define trace_simd_coprocessor_error simd_coprocessor_error
 #define trace_async_page_fault async_page_fault
+#define trace_refcount_error refcount_error
 #endif
 
 dotraplinkage void do_divide_error(struct pt_regs *, long);
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 4e496379a871..999d324119c0 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -192,6 +192,13 @@ do_trap_no_signal(struct task_struct *tsk, int trapnr, 
char *str,
                        tsk->thread.trap_nr = trapnr;
                        die(str, regs, error_code);
                }
+
+#ifdef CONFIG_FAST_REFCOUNT
+               if (trapnr == X86_REFCOUNT_VECTOR) {
+                       regs->ip -= 2;  /* sizeof(int $xx) */
+                       refcount_error_report(regs, str);
+               }
+#endif
                return 0;
        }
 
@@ -308,6 +315,32 @@ __visible void __noreturn handle_stack_overflow(const char 
*message,
 }
 #endif
 
+#ifdef CONFIG_FAST_REFCOUNT
+
+dotraplinkage void do_refcount_error(struct pt_regs *regs, long error_code)
+{
+       const char *str = NULL;
+
+       BUG_ON(!(regs->flags & X86_EFLAGS_OF));
+
+#define range_check(size, direction, type, value) \
+       if ((unsigned long)__##size##_##direction##_start <= regs->ip && \
+           regs->ip < (unsigned long)__##size##_##direction##_end) { \
+               *(type *)regs->cx = value; \
+               str = #size " " #direction; \
+       }
+
+       range_check(refcount,   overflow,  int, INT_MAX)
+       range_check(refcount,   underflow, int, INT_MIN)
+
+#undef range_check
+
+       BUG_ON(!str);
+       do_error_trap(regs, error_code, (char *)str, X86_REFCOUNT_VECTOR,
+                     SIGILL);
+}
+#endif
+
 #ifdef CONFIG_X86_64
 /* Runs on IST stack */
 dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
@@ -983,6 +1016,11 @@ void __init trap_init(void)
        set_bit(IA32_SYSCALL_VECTOR, used_vectors);
 #endif
 
+#ifdef CONFIG_FAST_REFCOUNT
+       set_intr_gate(X86_REFCOUNT_VECTOR, refcount_error);
+       set_bit(X86_REFCOUNT_VECTOR, used_vectors);
+#endif
+
        /*
         * Set the IDT descriptor to a fixed read-only location, so that the
         * "sidt" instruction will not leak the location of the kernel, and
diff --git a/drivers/misc/lkdtm_bugs.c b/drivers/misc/lkdtm_bugs.c
index e3f4cd8876b5..1bdafb29b802 100644
--- a/drivers/misc/lkdtm_bugs.c
+++ b/drivers/misc/lkdtm_bugs.c
@@ -135,9 +135,15 @@ void lkdtm_HUNG_TASK(void)
        schedule();
 }
 
+#ifdef CONFIG_FAST_REFCOUNT
+#define REFCOUNT_MAX   INT_MAX
+#else
+#define REFCOUNT_MAX   UINT_MAX
+#endif
+
 void lkdtm_REFCOUNT_SATURATE_INC(void)
 {
-       refcount_t over = REFCOUNT_INIT(UINT_MAX - 1);
+       refcount_t over = REFCOUNT_INIT(REFCOUNT_MAX - 1);
 
        pr_info("attempting good refcount decrement\n");
        refcount_dec(&over);
@@ -146,7 +152,7 @@ void lkdtm_REFCOUNT_SATURATE_INC(void)
        pr_info("attempting bad refcount inc overflow\n");
        refcount_inc(&over);
        refcount_inc(&over);
-       if (refcount_read(&over) == UINT_MAX)
+       if (refcount_read(&over) == REFCOUNT_MAX)
                pr_err("Correctly stayed saturated, but no BUG?!\n");
        else
                pr_err("Fail: refcount wrapped\n");
@@ -154,7 +160,7 @@ void lkdtm_REFCOUNT_SATURATE_INC(void)
 
 void lkdtm_REFCOUNT_SATURATE_ADD(void)
 {
-       refcount_t over = REFCOUNT_INIT(UINT_MAX - 1);
+       refcount_t over = REFCOUNT_INIT(REFCOUNT_MAX - 1);
 
        pr_info("attempting good refcount decrement\n");
        refcount_dec(&over);
@@ -162,7 +168,7 @@ void lkdtm_REFCOUNT_SATURATE_ADD(void)
 
        pr_info("attempting bad refcount add overflow\n");
        refcount_add(2, &over);
-       if (refcount_read(&over) == UINT_MAX)
+       if (refcount_read(&over) == REFCOUNT_MAX)
                pr_err("Correctly stayed saturated, but no BUG?!\n");
        else
                pr_err("Fail: refcount wrapped\n");
@@ -178,6 +184,11 @@ void lkdtm_REFCOUNT_ZERO_DEC(void)
                pr_err("Stayed at zero, but no BUG?!\n");
        else
                pr_err("Fail: refcount went crazy\n");
+
+       pr_info("attempting bad refcount decrement past INT_MIN\n");
+       atomic_set(&zero.refs, INT_MIN);
+       refcount_dec(&zero);
+       pr_err("Fail: wrap not detected\n");
 }
 
 void lkdtm_REFCOUNT_ZERO_SUB(void)
diff --git a/include/asm-generic/sections.h b/include/asm-generic/sections.h
index 532372c6cf15..0590f384f234 100644
--- a/include/asm-generic/sections.h
+++ b/include/asm-generic/sections.h
@@ -20,6 +20,8 @@
  *                   may be out of this range on some architectures.
  * [_sinittext, _einittext]: contains .init.text.* sections
  * [__bss_start, __bss_stop]: contains BSS sections
+ * [__refcount_overflow/underflow_start, ..._end]: contains .text sections
+ *                  for refcount error handling.
  *
  * Following global variables are optional and may be unavailable on some
  * architectures and/or kernel configurations.
@@ -39,6 +41,8 @@ extern char __per_cpu_load[], __per_cpu_start[], 
__per_cpu_end[];
 extern char __kprobes_text_start[], __kprobes_text_end[];
 extern char __entry_text_start[], __entry_text_end[];
 extern char __start_rodata[], __end_rodata[];
+extern char __refcount_overflow_start[], __refcount_overflow_end[];
+extern char __refcount_underflow_start[], __refcount_underflow_end[];
 
 /* Start and end of .ctors section - used for constructor calls. */
 extern char __ctors_start[], __ctors_end[];
diff --git a/include/asm-generic/vmlinux.lds.h 
b/include/asm-generic/vmlinux.lds.h
index 143db9c523e2..a04aae39e820 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -448,9 +448,18 @@
                ALIGN_FUNCTION();                                       \
                *(.text.hot .text .text.fixup .text.unlikely)           \
                *(.ref.text)                                            \
+               REFCOUNT_TEXT                                           \
        MEM_KEEP(init.text)                                             \
        MEM_KEEP(exit.text)                                             \
 
+#define __REFCOUNT_TEXT(section)                                       \
+               VMLINUX_SYMBOL(__##section##_start) = .;                \
+               *(.text.##section)                                      \
+               VMLINUX_SYMBOL(__##section##_end) = .;
+
+#define REFCOUNT_TEXT                                                  \
+       __REFCOUNT_TEXT(refcount_overflow)                              \
+       __REFCOUNT_TEXT(refcount_underflow)
 
 /* sched.text is aling to function alignment to secure we have same
  * address even at second ld pass when generating System.map */
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 4c26dc3a8295..bc15822b24eb 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -275,6 +275,8 @@ extern int oops_may_print(void);
 void do_exit(long error_code) __noreturn;
 void complete_and_exit(struct completion *, long) __noreturn;
 
+void refcount_error_report(struct pt_regs *regs, const char *kind);
+
 /* Internal, do not use. */
 int __must_check _kstrtoul(const char *s, unsigned int base, unsigned long 
*res);
 int __must_check _kstrtol(const char *s, unsigned int base, long *res);
diff --git a/include/linux/refcount.h b/include/linux/refcount.h
index 0023fee4bbbc..fdb82bcaf975 100644
--- a/include/linux/refcount.h
+++ b/include/linux/refcount.h
@@ -22,6 +22,9 @@ static inline unsigned int refcount_read(const refcount_t *r)
        return atomic_read(&r->refs);
 }
 
+#ifdef CONFIG_FAST_REFCOUNT
+#include <asm/refcount.h>
+#else
 extern __must_check bool refcount_add_not_zero(unsigned int i, refcount_t *r);
 extern void refcount_add(unsigned int i, refcount_t *r);
 
@@ -33,6 +36,7 @@ extern void refcount_sub(unsigned int i, refcount_t *r);
 
 extern __must_check bool refcount_dec_and_test(refcount_t *r);
 extern void refcount_dec(refcount_t *r);
+#endif
 
 extern __must_check bool refcount_dec_if_one(refcount_t *r);
 extern __must_check bool refcount_dec_not_one(refcount_t *r);
diff --git a/kernel/panic.c b/kernel/panic.c
index a58932b41700..a1745b60cc36 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -26,6 +26,7 @@
 #include <linux/nmi.h>
 #include <linux/console.h>
 #include <linux/bug.h>
+#include <linux/ratelimit.h>
 
 #define PANIC_TIMER_STEP 100
 #define PANIC_BLINK_SPD 18
@@ -601,6 +602,28 @@ EXPORT_SYMBOL(__stack_chk_fail);
 
 #endif
 
+#ifdef CONFIG_FAST_REFCOUNT
+static DEFINE_RATELIMIT_STATE(refcount_ratelimit, 15 * HZ, 3);
+
+void refcount_error_report(struct pt_regs *regs, const char *kind)
+{
+       do_send_sig_info(SIGKILL, SEND_SIG_FORCED, current, true);
+
+       if (!__ratelimit(&refcount_ratelimit))
+               return;
+
+       pr_emerg("%s detected in: %s:%d, uid/euid: %u/%u\n",
+               kind ? kind : "refcount error",
+               current->comm, task_pid_nr(current),
+               from_kuid_munged(&init_user_ns, current_uid()),
+               from_kuid_munged(&init_user_ns, current_euid()));
+       print_symbol(KERN_EMERG "refcount error occurred at: %s\n",
+               instruction_pointer(regs));
+       BUG();
+}
+EXPORT_SYMBOL(refcount_error_report);
+#endif
+
 core_param(panic, panic_timeout, int, 0644);
 core_param(pause_on_oops, pause_on_oops, int, 0644);
 core_param(panic_on_warn, panic_on_warn, int, 0644);
diff --git a/lib/refcount.c b/lib/refcount.c
index aa09ad3c30b0..903a59557893 100644
--- a/lib/refcount.c
+++ b/lib/refcount.c
@@ -37,6 +37,9 @@
 #include <linux/refcount.h>
 #include <linux/bug.h>
 
+/* Leave out architecture-specific implementations. */
+#ifndef CONFIG_FAST_REFCOUNT
+
 bool refcount_add_not_zero(unsigned int i, refcount_t *r)
 {
        unsigned int old, new, val = atomic_read(&r->refs);
@@ -168,6 +171,8 @@ void refcount_dec(refcount_t *r)
 }
 EXPORT_SYMBOL_GPL(refcount_dec);
 
+#endif /* CONFIG_FAST_REFCOUNT */
+
 /*
  * No atomic_t counterpart, it attempts a 1 -> 0 transition and returns the
  * success thereof.
@@ -264,4 +269,3 @@ bool refcount_dec_and_lock(refcount_t *r, spinlock_t *lock)
        return true;
 }
 EXPORT_SYMBOL_GPL(refcount_dec_and_lock);
-
-- 
2.7.4


-- 
Kees Cook
Pixel Security

Reply via email to