Going through return thunks is an unnecessary cost on hardware unaffected
by ITS. Furthermore, even on affected hardware a RET is fine to have in
the upper half of a cacheline. To focus on the machinery here, start
with merely amending RET, as used in assembly files.
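
A minimal sketch (the helper name is made up; this is not part of the
patch, and it leaves aside the extra virtualisation check the patch
applies) of when a RET currently going through the thunk can be put back
inline under these constraints:

    #include <stdbool.h>

    /* Illustration only: may "jmp __x86_return_thunk" at addr become a RET? */
    static bool ret_site_reinlinable(unsigned long addr, bool its_affected)
    {
        /* On unaffected hardware an inline RET is always fine. */
        if ( !its_affected )
            return true;

        /*
         * On affected hardware the RET has to end up in the upper half of
         * its 64-byte cacheline, either directly or after pre-padding
         * within the 5 bytes available (see re_inline_ret() below).
         */
        return (addr & 0x3f) >= 0x1c;
    }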

memcpy() once again is special. For one, one of the RETs there is in an
alternative, and hence its address (within .altinstr_replacement) isn't
what matters when determining whether re-inlining is safe on ITS-affected
hardware. Furthermore, we can't use stringification there anymore, as
that converts double quotes to backslash-escaped forms. These backslashes
are left intact when the assembler parses macro operands; there, a
sequence of two double quotes is the escape for what should result in a
single remaining quote. Fall back to the macro approach used elsewhere.
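
To illustrate the stringification pitfall (sketch only; STR() stands for
the usual two-level stringification helper, and the quoted expansion is
abridged):

    #define STR_(x) #x
    #define STR(x)  STR_(x)
    /*
     * With RET now expanding to
     *
     *   1: jmp __x86_return_thunk; .pushsection .ret_thunk_sites, "a", @progbits; ...
     *
     * the text produced by STR(rep movsb; RET) contains \"a\": the #
     * operator backslash-escapes embedded double quotes.  gas keeps those
     * backslashes when parsing the macro operand (only "" acts as a quote
     * escape there), leaving the .pushsection directive malformed.
     */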

memset() and clear_page_*() are also slightly special, in that we want
to avoid a RET being overwritten by alternatives patching.

Signed-off-by: Jan Beulich <jbeul...@suse.com>
---
NOTE: While technically separate, this touches quite a few of the
      places the ERMS series (v7 just sent) also touches. Therefore
      this patch is assumed to go on top of that series.
---
Is conditionally re-inlining (based on address) really a good idea? This
way we'll end up with RSB imbalances more often than not.

CS prefixes are used for pre-padding, following what apply_alt_calls()
does. Since "REP RET" is a known idiom, maybe we'd better use REP instead?

Using prefixes there is based on the assumption that the last byte of
the insn is what matters. If that were wrong, a NOP would need inserting
instead.
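
For concreteness, a worked example (illustration only; the array names
are made up) of the bytes re_inline_ret() writes, depending on where the
JMP starts within its 64-byte cacheline:

    #include <stdint.h>

    /* Offset 0x20 or above: plain RET plus INT3 filler. */
    static const uint8_t at_0x20[5] = { 0xc3, 0xcc, 0xcc, 0xcc, 0xcc };
    /* Offset 0x1f: one CS prefix shifts the RET byte to offset 0x20. */
    static const uint8_t at_0x1f[5] = { 0x2e, 0xc3, 0xcc, 0xcc, 0xcc };
    /* Offset 0x1c: four CS prefixes; the RET byte again lands at 0x20. */
    static const uint8_t at_0x1c[5] = { 0x2e, 0x2e, 0x2e, 0x2e, 0xc3 };
    /*
     * Offsets 0x00-0x1b are skipped on affected hardware: four bytes of
     * padding can't push the RET past offset 0x1f.
     */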

Not being able to re-inline some instance isn't really a fatal issue.
Would re_inline_ret() be better off returning void? Or should we at least
not panic when it fails? (In either case the present error return(s)
there would need to become continue(s).)

Emitting the "re-inlined ... out of ... RETs" message multiple times
isn't nice, but the options I could think of didn't look very nice either.

The name of the helper macro in memset.S isn't great. Suggestions
welcome.

How to hook up the livepatch function isn't quite clear to me: other
architectures may not have any need to do re-inlining, yet using a
CONFIG_X86 conditional in common/livepatch.c doesn't look very attractive
either.

The changes to _apply_alternatives() could in principle be split out
into a subsequent patch (alternatives could simply retain their JMPs to
the return thunk until then). It seemed better to me, though, to keep
everything together.

--- a/xen/arch/x86/alternative.c
+++ b/xen/arch/x86/alternative.c
@@ -249,6 +249,76 @@ static void __init seal_endbr64(void)
     printk("altcall: Optimised away %u endbr64 instructions\n", clobbered);
 }
 
+#define RE_INLINE_PTR(r) ((void *)(r) + *(r))
+
+static int init_or_livepatch re_inline_ret(
+    const int32_t *start, const int32_t *end)
+{
+    const int32_t *r;
+    unsigned int done = 0;
+    bool its_no = boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
+                  cpu_has_its_no;
+
+    if ( !IS_ENABLED(CONFIG_RETURN_THUNK) )
+    {
+        ASSERT(start == end);
+        return 0;
+    }
+
+    /*
+     * If we're virtualised and can't see ITS_NO, do nothing.  We are or may
+     * migrate somewhere unsafe.
+     */
+    if ( cpu_has_hypervisor && !its_no )
+        return 0;
+
+    for ( r = start; r < end; r++ )
+    {
+        uint8_t *orig = RE_INLINE_PTR(r);
+        unsigned long offs;
+        uint8_t buf[5];
+
+        if ( *orig != 0xe9 ||
+             (long)orig + 5 + *(int32_t *)(orig + 1) !=
+             (long)__x86_return_thunk )
+            return -EILSEQ;
+
+        /* RET in .altinstr_replacement is handled elsewhere */
+        if ( orig >= (const uint8_t *)_einittext )
+            continue;
+
+        if ( its_no || ((unsigned long)orig & 0x20) )
+        {
+            /*
+             * Patching independent of address is easy: Put a RET there,
+             * followed by 4 INT3s.
+             */
+            buf[0] = 0xc3;
+            memset(buf + 1, 0xcc, 4);
+        }
+        else if ( ((unsigned long)orig & 0x3f) < 0x1c )
+            continue;
+        else
+        {
+            /*
+             * In the 5-byte area we have, shift the RET enough to be in the
+             * 2nd half of the cacheline.  Pad with CS prefixes.
+             */
+            offs = 0x20 - ((unsigned long)orig & 0x3f);
+            memset(buf, 0x2e, offs);
+            buf[offs] = 0xc3;
+            memset(buf + offs + 1, 0xcc, 4 - offs);
+        }
+
+        text_poke(orig, buf, 5);
+        ++done;
+    }
+
+    printk(XENLOG_INFO "re-inlined %u out of %zu RETs\n", done, end - start);
+
+    return 0;
+}
+
 /*
  * Replace instructions with better alternatives for this CPU type.
  * This runs before SMP is initialized to avoid SMP problems with
@@ -279,6 +349,11 @@ static int init_or_livepatch _apply_alte
         unsigned int total_len = a->orig_len + a->pad_len;
         unsigned int feat = a->cpuid & ~ALT_FLAG_NOT;
         bool inv = a->cpuid & ALT_FLAG_NOT, replace;
+        /*
+         * A RET to be re-inlined within alternative code needs handling here.
+         * This variable is set to the respective JMP's new location, if found.
+         */
+        const void *where = NULL;
 
         if ( a->repl_len > total_len )
         {
@@ -350,18 +425,37 @@ static int init_or_livepatch _apply_alte
 
         /* 0xe8/0xe9 are relative branches; fix the offset. */
         if ( a->repl_len >= 5 && (*buf & 0xfe) == 0xe8 )
+        {
             *(int32_t *)(buf + 1) += repl - orig;
+            if ( IS_ENABLED(CONFIG_RETURN_THUNK) &&
+                 *buf == 0xe9 &&
+                 ((long)repl + 5 + *(int32_t *)(buf + 1) ==
+                  (long)__x86_return_thunk) )
+                where = orig;
+        }
         else if ( IS_ENABLED(CONFIG_RETURN_THUNK) &&
                   a->repl_len > 5 && buf[a->repl_len - 5] == 0xe9 &&
                   ((long)repl + a->repl_len +
                    *(int32_t *)(buf + a->repl_len - 4) ==
                    (long)__x86_return_thunk) )
+        {
             *(int32_t *)(buf + a->repl_len - 4) += repl - orig;
+            where = orig + a->repl_len - 5;
+        }
 
         a->priv = 1;
 
         add_nops(buf + a->repl_len, total_len - a->repl_len);
         text_poke(orig, buf, total_len);
+
+        if ( where )
+        {
+            int32_t rel = where - (void *)&rel;
+            int rc = re_inline_ret(&rel, &rel + 1);
+
+            if ( rc )
+                return rc;
+        }
     }
 
     return 0;
@@ -459,6 +553,11 @@ int apply_alternatives(struct alt_instr
     return _apply_alternatives(start, end);
 }
 
+int livepatch_re_inline(const int32_t *ret_start, const int32_t *ret_end)
+{
+    return re_inline_ret(ret_start, ret_end);
+}
+
 int livepatch_apply_alt_calls(const struct alt_call *start,
                               const struct alt_call *end)
 {
@@ -473,6 +572,7 @@ static unsigned int __initdata alt_done;
 
 extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
 extern struct alt_call __alt_call_sites_start[], __alt_call_sites_end[];
+extern const int32_t __ret_thunk_sites_start[], __ret_thunk_sites_end[];
 
 /*
  * At boot time, we patch alternatives in NMI context.  This means that the
@@ -508,6 +608,10 @@ static int __init cf_check nmi_apply_alt
                                      __alt_instructions_end);
             if ( rc )
                 panic("Unable to apply alternatives: %d\n", rc);
+
+            rc = re_inline_ret(__ret_thunk_sites_start, __ret_thunk_sites_end);
+            if ( rc )
+                panic("Unable to re-inline RETs: %d\n", rc);
         }
 
         if ( alt_todo & ALT_CALLS )
--- a/xen/arch/x86/clear_page.S
+++ b/xen/arch/x86/clear_page.S
@@ -16,9 +16,6 @@
         add     $32, %rdi
         sub     $1, %ecx
         jnz     0b
-
-        sfence
-        RET
         .endm
 
         .macro clear_page_clzero
@@ -33,13 +30,23 @@ clear_page_clzero_post_count:
 clear_page_clzero_post_neg_size:
         sub     $1, %ecx
         jnz     0b
-
+        /*
+         * Have a (seemingly redundant) RET (and the accompanying SFENCE) here
+         * to avoid executing a longer (sequence of) NOP(s).
+         */
         sfence
         RET
         .endm
 
 FUNC(clear_page_cold)
         ALTERNATIVE clear_page_sse2, clear_page_clzero, X86_FEATURE_CLZERO
+        /*
+         * Having the RET (and the accompanying SFENCE) here instead of in the
+         * clear_page_sse2 macro above makes sure it won't be overwritten with
+         * a NOP by alternatives patching.
+         */
+        sfence
+        RET
 END(clear_page_cold)
 
         .macro clear_page_stosb
--- a/xen/arch/x86/include/asm/asm-defns.h
+++ b/xen/arch/x86/include/asm/asm-defns.h
@@ -41,7 +41,10 @@
 .endm
 
 #ifdef CONFIG_RETURN_THUNK
-# define RET jmp __x86_return_thunk
+# define RET 1: jmp __x86_return_thunk;                     \
+             .pushsection .ret_thunk_sites, "a", @progbits; \
+             .long 1b - .;                                  \
+             .popsection
 #else
 # define RET ret
 #endif
--- a/xen/arch/x86/memcpy.S
+++ b/xen/arch/x86/memcpy.S
@@ -1,5 +1,10 @@
 #include <asm/asm_defns.h>
 
+        .macro rep_movsb_ret
+        rep movsb
+        RET
+        .endm
+
 FUNC(memcpy)
         mov     %rdx, %rcx
         mov     %rdi, %rax
@@ -10,7 +15,7 @@ FUNC(memcpy)
          * precautions were taken).
          */
         ALTERNATIVE "and $7, %edx; shr $3, %rcx", \
-                    STR(rep movsb; RET), X86_FEATURE_ERMS
+                    rep_movsb_ret, X86_FEATURE_ERMS
         rep movsq
         or      %edx, %ecx
         jz      1f
--- a/xen/arch/x86/memset.S
+++ b/xen/arch/x86/memset.S
@@ -13,7 +13,6 @@
         rep stosb
 0:
         mov     %r8, %rax
-        RET
 .endm
 
 .macro memset_erms
@@ -21,10 +20,19 @@
         mov     %rdi, %r8
         rep stosb
         mov     %r8, %rax
+        /*
+         * Have a (seemingly redundant) RET here to avoid executing
+         * a longer (sequence of) NOP(s).
+         */
         RET
 .endm
 
 FUNC(memset)
         mov     %rdx, %rcx
         ALTERNATIVE memset, memset_erms, X86_FEATURE_ERMS
+        /*
+         * Having the RET here instead of in the memset macro above makes
+         * sure it won't be overwritten with a NOP by alternatives patching.
+         */
+        RET
 END(memset)
--- a/xen/arch/x86/xen.lds.S
+++ b/xen/arch/x86/xen.lds.S
@@ -267,6 +267,11 @@ SECTIONS
         *(.alt_call_sites)
         __alt_call_sites_end = .;
 
+        . = ALIGN(4);
+        __ret_thunk_sites_start = .;
+        *(.ret_thunk_sites)
+        __ret_thunk_sites_end = .;
+
        LOCK_PROFILE_DATA
 
        . = ALIGN(8);
