Script 'mail_helper' called by obssrc

Hello community,

here is the log from the commit of package xen for openSUSE:Factory checked in at 2023-09-22 21:47:14

++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Comparing /work/SRC/openSUSE:Factory/xen (Old)
 and      /work/SRC/openSUSE:Factory/.xen.new.1770 (New)
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Package is "xen" Fri Sep 22 21:47:14 2023 rev:336 rq:1112599 version:4.17.2_04 Changes: -------- --- /work/SRC/openSUSE:Factory/xen/xen.changes 2023-08-11 15:55:25.499724005 +0200 +++ /work/SRC/openSUSE:Factory/.xen.new.1770/xen.changes 2023-09-22 21:48:15.474698284 +0200 @@ -1,0 +2,30 @@ +Mon Sep 18 11:36:39 MDT 2023 - carn...@suse.com + +- bsc#1215474 - VUL-0: CVE-2023-20588: xen: AMD CPU transitional + execution leak via division by zero (XSA-439) + xsa439-00.patch + xsa439-01.patch + xsa439-02.patch + xsa439-03.patch + xsa439-04.patch + xsa439-05.patch + xsa439-06.patch + xsa439-07.patch + xsa439-08.patch + xsa439-09.patch + +------------------------------------------------------------------- +Fri Sep 8 10:10:18 MDT 2023 - carn...@suse.com + +- bsc#1215145 - VUL-0: CVE-2023-34322: xen: top-level shadow + reference dropped too early for 64-bit PV guests (XSA-438) + xsa438.patch + +------------------------------------------------------------------- +Sun Aug 13 13:13:13 UTC 2023 - oher...@suse.de + +- Handle potential unaligned access to bitmap in + libxc-sr-restore-hvm-legacy-superpage.patch + If setting BITS_PER_LONG at once, the initial bit must be aligned + +------------------------------------------------------------------- New: ---- xsa438.patch xsa439-00.patch xsa439-01.patch xsa439-02.patch xsa439-03.patch xsa439-04.patch xsa439-05.patch xsa439-06.patch xsa439-07.patch xsa439-08.patch xsa439-09.patch ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Other differences: ------------------ ++++++ xen.spec ++++++ --- /var/tmp/diff_new_pack.sat1Rp/_old 2023-09-22 21:48:22.502953429 +0200 +++ /var/tmp/diff_new_pack.sat1Rp/_new 2023-09-22 21:48:22.506953574 +0200 @@ -119,7 +119,7 @@ %endif Provides: installhint(reboot-needed) -Version: 4.17.2_02 +Version: 4.17.2_04 Release: 0 Summary: Xen Virtualization: Hypervisor (aka VMM aka Microkernel) License: GPL-2.0-only @@ -160,6 +160,17 @@ Patch3: 643e387f-xen-update-CONFIG_DEBUG_INFO-help-text.patch Patch4: 6447a8fd-x86-EFI-permit-crash-dump-analysis.patch Patch5: 64d33a57-libxenstat-Linux-nul-terminate-string.patch +Patch9: xsa438.patch +Patch10: xsa439-00.patch +Patch11: xsa439-01.patch +Patch12: xsa439-02.patch +Patch13: xsa439-03.patch +Patch14: xsa439-04.patch +Patch15: xsa439-05.patch +Patch16: xsa439-06.patch +Patch17: xsa439-07.patch +Patch18: xsa439-08.patch +Patch19: xsa439-09.patch # EMBARGOED security fixes # libxc Patch301: libxc-bitmap-long.patch ++++++ libxc-sr-restore-hvm-legacy-superpage.patch ++++++ --- /var/tmp/diff_new_pack.sat1Rp/_old 2023-09-22 21:48:22.678959819 +0200 +++ /var/tmp/diff_new_pack.sat1Rp/_new 2023-09-22 21:48:22.678959819 +0200 @@ -438,7 +438,7 @@ + return -1; + + do { -+ if ( sp.count >= BITS_PER_LONG ) { ++ if ( sp.count >= BITS_PER_LONG && (sp.count % BITS_PER_LONG) == 0 ) { + sp.count -= BITS_PER_LONG; + ctx->restore.tot_pages += BITS_PER_LONG; + pfn_set_long_allocated(ctx, sp.base_pfn + sp.count); ++++++ xsa438.patch ++++++ From: Jan Beulich <jbeul...@suse.com> Subject: x86/shadow: defer releasing of PV's top-level shadow reference sh_set_toplevel_shadow() re-pinning the top-level shadow we may be running on is not enough (and at the same time unnecessary when the shadow isn't what we're running on): That shadow becomes eligible for blowing away (from e.g. shadow_prealloc()) immediately after the paging lock was dropped. Yet it needs to remain valid until the actual page table switch occurred. 
Propagate up the call chain the shadow entry that needs releasing eventually, and carry out the release immediately after switching page tables. Handle update_cr3() failures by switching to idle pagetables. Note that various further uses of update_cr3() are HVM-only or only act on paused vCPU-s, in which case sh_set_toplevel_shadow() will not defer releasing of the reference. While changing the update_cr3() hook, also convert the "do_locking" parameter to boolean. This is CVE-2023-34322 / XSA-438. Signed-off-by: Jan Beulich <jbeul...@suse.com> Reviewed-by: George Dunlap <george.dun...@cloud.com> --- a/xen/arch/x86/include/asm/mm.h +++ b/xen/arch/x86/include/asm/mm.h @@ -552,7 +552,7 @@ void audit_domains(void); #endif void make_cr3(struct vcpu *v, mfn_t mfn); -void update_cr3(struct vcpu *v); +pagetable_t update_cr3(struct vcpu *v); int vcpu_destroy_pagetables(struct vcpu *); void *do_page_walk(struct vcpu *v, unsigned long addr); --- a/xen/arch/x86/include/asm/paging.h +++ b/xen/arch/x86/include/asm/paging.h @@ -138,7 +138,7 @@ struct paging_mode { paddr_t ga, uint32_t *pfec, unsigned int *page_order); #endif - void (*update_cr3 )(struct vcpu *v, int do_locking, + pagetable_t (*update_cr3 )(struct vcpu *v, bool do_locking, bool noflush); void (*update_paging_modes )(struct vcpu *v); bool (*flush_tlb )(const unsigned long *vcpu_bitmap); @@ -310,9 +310,9 @@ static inline unsigned long paging_ga_to /* Update all the things that are derived from the guest's CR3. * Called when the guest changes CR3; the caller can then use v->arch.cr3 * as the value to load into the host CR3 to schedule this vcpu */ -static inline void paging_update_cr3(struct vcpu *v, bool noflush) +static inline pagetable_t paging_update_cr3(struct vcpu *v, bool noflush) { - paging_get_hostmode(v)->update_cr3(v, 1, noflush); + return paging_get_hostmode(v)->update_cr3(v, 1, noflush); } /* Update all the things that are derived from the guest's CR0/CR3/CR4. --- a/xen/arch/x86/include/asm/shadow.h +++ b/xen/arch/x86/include/asm/shadow.h @@ -99,6 +99,9 @@ int shadow_set_allocation(struct domain int shadow_get_allocation_bytes(struct domain *d, uint64_t *size); +/* Helper to invoke for deferred releasing of a top-level shadow's reference. 
*/ +void shadow_put_top_level(struct domain *d, pagetable_t old); + #else /* !CONFIG_SHADOW_PAGING */ #define shadow_vcpu_teardown(v) ASSERT(is_pv_vcpu(v)) @@ -121,6 +124,11 @@ static inline void shadow_prepare_page_t static inline void shadow_blow_tables_per_domain(struct domain *d) {} +static inline void shadow_put_top_level(struct domain *d, pagetable_t old) +{ + ASSERT_UNREACHABLE(); +} + static inline int shadow_domctl(struct domain *d, struct xen_domctl_shadow_op *sc, XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl) --- a/xen/arch/x86/mm/hap/hap.c +++ b/xen/arch/x86/mm/hap/hap.c @@ -739,11 +739,13 @@ static bool cf_check hap_invlpg(struct v return 1; } -static void cf_check hap_update_cr3( - struct vcpu *v, int do_locking, bool noflush) +static pagetable_t cf_check hap_update_cr3( + struct vcpu *v, bool do_locking, bool noflush) { v->arch.hvm.hw_cr[3] = v->arch.hvm.guest_cr[3]; hvm_update_guest_cr3(v, noflush); + + return pagetable_null(); } static bool flush_vcpu(const struct vcpu *v, const unsigned long *vcpu_bitmap) --- a/xen/arch/x86/mm/shadow/common.c +++ b/xen/arch/x86/mm/shadow/common.c @@ -2590,13 +2590,13 @@ void cf_check shadow_update_paging_modes } /* Set up the top-level shadow and install it in slot 'slot' of shadow_table */ -void sh_set_toplevel_shadow(struct vcpu *v, - unsigned int slot, - mfn_t gmfn, - unsigned int root_type, - mfn_t (*make_shadow)(struct vcpu *v, - mfn_t gmfn, - uint32_t shadow_type)) +pagetable_t sh_set_toplevel_shadow(struct vcpu *v, + unsigned int slot, + mfn_t gmfn, + unsigned int root_type, + mfn_t (*make_shadow)(struct vcpu *v, + mfn_t gmfn, + uint32_t shadow_type)) { mfn_t smfn; pagetable_t old_entry, new_entry; @@ -2653,20 +2653,37 @@ void sh_set_toplevel_shadow(struct vcpu mfn_x(gmfn), mfn_x(pagetable_get_mfn(new_entry))); v->arch.paging.shadow.shadow_table[slot] = new_entry; - /* Decrement the refcount of the old contents of this slot */ - if ( !pagetable_is_null(old_entry) ) + /* + * Decrement the refcount of the old contents of this slot, unless + * we're still running on that shadow - in that case it'll need holding + * on to until the actual page table switch did occur. + */ + if ( !pagetable_is_null(old_entry) && (v != current || !is_pv_domain(d)) ) { - mfn_t old_smfn = pagetable_get_mfn(old_entry); - /* Need to repin the old toplevel shadow if it's been unpinned - * by shadow_prealloc(): in PV mode we're still running on this - * shadow and it's not safe to free it yet. */ - if ( !mfn_to_page(old_smfn)->u.sh.pinned && !sh_pin(d, old_smfn) ) - { - printk(XENLOG_G_ERR "can't re-pin %"PRI_mfn"\n", mfn_x(old_smfn)); - domain_crash(d); - } - sh_put_ref(d, old_smfn, 0); + sh_put_ref(d, pagetable_get_mfn(old_entry), 0); + old_entry = pagetable_null(); } + + /* + * 2- and 3-level shadow mode is used for HVM only. Therefore we never run + * on such a shadow, so only call sites requesting an L4 shadow need to pay + * attention to the returned value. + */ + ASSERT(pagetable_is_null(old_entry) || root_type == SH_type_l4_64_shadow); + + return old_entry; +} + +/* + * Helper invoked when releasing of a top-level shadow's reference was + * deferred in sh_set_toplevel_shadow() above. 
+ */ +void shadow_put_top_level(struct domain *d, pagetable_t old_entry) +{ + ASSERT(!pagetable_is_null(old_entry)); + paging_lock(d); + sh_put_ref(d, pagetable_get_mfn(old_entry), 0); + paging_unlock(d); } /**************************************************************************/ --- a/xen/arch/x86/mm/shadow/multi.c +++ b/xen/arch/x86/mm/shadow/multi.c @@ -3224,7 +3224,8 @@ static void cf_check sh_detach_old_table } } -static void cf_check sh_update_cr3(struct vcpu *v, int do_locking, bool noflush) +static pagetable_t cf_check sh_update_cr3(struct vcpu *v, bool do_locking, + bool noflush) /* Updates vcpu->arch.cr3 after the guest has changed CR3. * Paravirtual guests should set v->arch.guest_table (and guest_table_user, * if appropriate). @@ -3238,6 +3239,7 @@ static void cf_check sh_update_cr3(struc { struct domain *d = v->domain; mfn_t gmfn; + pagetable_t old_entry = pagetable_null(); #if GUEST_PAGING_LEVELS == 3 const guest_l3e_t *gl3e; unsigned int i, guest_idx; @@ -3247,7 +3249,7 @@ static void cf_check sh_update_cr3(struc if ( !is_hvm_domain(d) && !v->is_initialised ) { ASSERT(v->arch.cr3 == 0); - return; + return old_entry; } if ( do_locking ) paging_lock(v->domain); @@ -3320,11 +3322,12 @@ static void cf_check sh_update_cr3(struc #if GUEST_PAGING_LEVELS == 4 if ( sh_remove_write_access(d, gmfn, 4, 0) != 0 ) guest_flush_tlb_mask(d, d->dirty_cpumask); - sh_set_toplevel_shadow(v, 0, gmfn, SH_type_l4_shadow, sh_make_shadow); + old_entry = sh_set_toplevel_shadow(v, 0, gmfn, SH_type_l4_shadow, + sh_make_shadow); if ( unlikely(pagetable_is_null(v->arch.paging.shadow.shadow_table[0])) ) { ASSERT(d->is_dying || d->is_shutting_down); - return; + return old_entry; } if ( !shadow_mode_external(d) && !is_pv_32bit_domain(d) ) { @@ -3368,24 +3371,30 @@ static void cf_check sh_update_cr3(struc gl2gfn = guest_l3e_get_gfn(gl3e[i]); gl2mfn = get_gfn_query_unlocked(d, gfn_x(gl2gfn), &p2mt); if ( p2m_is_ram(p2mt) ) - sh_set_toplevel_shadow(v, i, gl2mfn, SH_type_l2_shadow, - sh_make_shadow); + old_entry = sh_set_toplevel_shadow(v, i, gl2mfn, + SH_type_l2_shadow, + sh_make_shadow); else - sh_set_toplevel_shadow(v, i, INVALID_MFN, 0, - sh_make_shadow); + old_entry = sh_set_toplevel_shadow(v, i, INVALID_MFN, 0, + sh_make_shadow); } else - sh_set_toplevel_shadow(v, i, INVALID_MFN, 0, sh_make_shadow); + old_entry = sh_set_toplevel_shadow(v, i, INVALID_MFN, 0, + sh_make_shadow); + + ASSERT(pagetable_is_null(old_entry)); } } #elif GUEST_PAGING_LEVELS == 2 if ( sh_remove_write_access(d, gmfn, 2, 0) != 0 ) guest_flush_tlb_mask(d, d->dirty_cpumask); - sh_set_toplevel_shadow(v, 0, gmfn, SH_type_l2_shadow, sh_make_shadow); + old_entry = sh_set_toplevel_shadow(v, 0, gmfn, SH_type_l2_shadow, + sh_make_shadow); + ASSERT(pagetable_is_null(old_entry)); if ( unlikely(pagetable_is_null(v->arch.paging.shadow.shadow_table[0])) ) { ASSERT(d->is_dying || d->is_shutting_down); - return; + return old_entry; } #else #error This should never happen @@ -3473,6 +3482,8 @@ static void cf_check sh_update_cr3(struc /* Release the lock, if we took it (otherwise it's the caller's problem) */ if ( do_locking ) paging_unlock(v->domain); + + return old_entry; } --- a/xen/arch/x86/mm/shadow/none.c +++ b/xen/arch/x86/mm/shadow/none.c @@ -52,9 +52,11 @@ static unsigned long cf_check _gva_to_gf } #endif -static void cf_check _update_cr3(struct vcpu *v, int do_locking, bool noflush) +static pagetable_t cf_check _update_cr3(struct vcpu *v, bool do_locking, + bool noflush) { ASSERT_UNREACHABLE(); + return pagetable_null(); } static void 
cf_check _update_paging_modes(struct vcpu *v) --- a/xen/arch/x86/mm/shadow/private.h +++ b/xen/arch/x86/mm/shadow/private.h @@ -391,13 +391,13 @@ mfn_t shadow_alloc(struct domain *d, void shadow_free(struct domain *d, mfn_t smfn); /* Set up the top-level shadow and install it in slot 'slot' of shadow_table */ -void sh_set_toplevel_shadow(struct vcpu *v, - unsigned int slot, - mfn_t gmfn, - unsigned int root_type, - mfn_t (*make_shadow)(struct vcpu *v, - mfn_t gmfn, - uint32_t shadow_type)); +pagetable_t sh_set_toplevel_shadow(struct vcpu *v, + unsigned int slot, + mfn_t gmfn, + unsigned int root_type, + mfn_t (*make_shadow)(struct vcpu *v, + mfn_t gmfn, + uint32_t shadow_type)); /* Update the shadows in response to a pagetable write from Xen */ int sh_validate_guest_entry(struct vcpu *v, mfn_t gmfn, void *entry, u32 size); --- a/xen/arch/x86/mm.c +++ b/xen/arch/x86/mm.c @@ -567,15 +567,12 @@ void write_ptbase(struct vcpu *v) * * Update ref counts to shadow tables appropriately. */ -void update_cr3(struct vcpu *v) +pagetable_t update_cr3(struct vcpu *v) { mfn_t cr3_mfn; if ( paging_mode_enabled(v->domain) ) - { - paging_update_cr3(v, false); - return; - } + return paging_update_cr3(v, false); if ( !(v->arch.flags & TF_kernel_mode) ) cr3_mfn = pagetable_get_mfn(v->arch.guest_table_user); @@ -583,6 +580,8 @@ void update_cr3(struct vcpu *v) cr3_mfn = pagetable_get_mfn(v->arch.guest_table); make_cr3(v, cr3_mfn); + + return pagetable_null(); } static inline void set_tlbflush_timestamp(struct page_info *page) @@ -3285,6 +3284,7 @@ int new_guest_cr3(mfn_t mfn) struct domain *d = curr->domain; int rc; mfn_t old_base_mfn; + pagetable_t old_shadow; if ( is_pv_32bit_domain(d) ) { @@ -3352,9 +3352,22 @@ int new_guest_cr3(mfn_t mfn) if ( !VM_ASSIST(d, m2p_strict) ) fill_ro_mpt(mfn); curr->arch.guest_table = pagetable_from_mfn(mfn); - update_cr3(curr); + old_shadow = update_cr3(curr); + + /* + * In shadow mode update_cr3() can fail, in which case here we're still + * running on the prior top-level shadow (which we're about to release). + * Switch to the idle page tables in such an event; the guest will have + * been crashed already. + */ + if ( likely(!mfn_eq(pagetable_get_mfn(old_shadow), + maddr_to_mfn(curr->arch.cr3 & ~X86_CR3_NOFLUSH))) ) + write_ptbase(curr); + else + write_ptbase(idle_vcpu[curr->processor]); - write_ptbase(curr); + if ( !pagetable_is_null(old_shadow) ) + shadow_put_top_level(d, old_shadow); if ( likely(mfn_x(old_base_mfn) != 0) ) { --- a/xen/arch/x86/pv/domain.c +++ b/xen/arch/x86/pv/domain.c @@ -424,10 +424,13 @@ bool __init xpti_pcid_enabled(void) static void _toggle_guest_pt(struct vcpu *v) { + bool guest_update; + pagetable_t old_shadow; unsigned long cr3; v->arch.flags ^= TF_kernel_mode; - update_cr3(v); + guest_update = v->arch.flags & TF_kernel_mode; + old_shadow = update_cr3(v); /* * Don't flush user global mappings from the TLB. Don't tick TLB clock. @@ -436,13 +439,31 @@ static void _toggle_guest_pt(struct vcpu * TLB flush (for just the incoming PCID), as the top level page table may * have changed behind our backs. To be on the safe side, suppress the * no-flush unconditionally in this case. + * + * Furthermore in shadow mode update_cr3() can fail, in which case here + * we're still running on the prior top-level shadow (which we're about + * to release). Switch to the idle page tables in such an event; the + * guest will have been crashed already. 
*/ cr3 = v->arch.cr3; if ( shadow_mode_enabled(v->domain) ) + { cr3 &= ~X86_CR3_NOFLUSH; + + if ( unlikely(mfn_eq(pagetable_get_mfn(old_shadow), + maddr_to_mfn(cr3))) ) + { + cr3 = idle_vcpu[v->processor]->arch.cr3; + /* Also suppress runstate/time area updates below. */ + guest_update = false; + } + } write_cr3(cr3); - if ( !(v->arch.flags & TF_kernel_mode) ) + if ( !pagetable_is_null(old_shadow) ) + shadow_put_top_level(v->domain, old_shadow); + + if ( !guest_update ) return; if ( v->arch.pv.need_update_runstate_area && update_runstate_area(v) ) ++++++ xsa439-00.patch ++++++ Subject: x86/AMD: extend Zenbleed check to models "good" ucode isn't known for From: Jan Beulich jbeul...@suse.com Wed Aug 23 09:26:36 2023 +0200 Date: Wed Aug 23 09:26:36 2023 +0200: Git: 145a69c0944ac70cfcf9d247c85dee9e99d9d302 Reportedly the AMD Custom APU 0405 found on SteamDeck, models 0x90 and 0x91, (quoting the respective Linux commit) is similarly affected. Put another instance of our Zen1 vs Zen2 distinction checks in amd_check_zenbleed(), forcing use of the chickenbit irrespective of ucode version (building upon real hardware never surfacing a version of 0xffffffff). Signed-off-by: Jan Beulich <jbeul...@suse.com> Reviewed-by: Andrew Cooper <andrew.coop...@citrix.com> --- a/xen/arch/x86/cpu/amd.c +++ b/xen/arch/x86/cpu/amd.c @@ -909,10 +909,17 @@ void amd_check_zenbleed(void) case 0xa0 ... 0xaf: good_rev = 0x08a00008; break; default: /* - * With the Fam17h check above, parts getting here are Zen1. - * They're not affected. + * With the Fam17h check above, most parts getting here are + * Zen1. They're not affected. Assume Zen2 ones making it + * here are affected regardless of microcode version. + * + * Zen1 vs Zen2 isn't a simple model number comparison, so use + * STIBP as a heuristic to distinguish. */ - return; + if (!boot_cpu_has(X86_FEATURE_AMD_STIBP)) + return; + good_rev = ~0U; + break; } rdmsrl(MSR_AMD64_DE_CFG, val); ++++++ xsa439-01.patch ++++++ Subject: x86/spec-ctrl: Fix confusion between SPEC_CTRL_EXIT_TO_XEN{,_IST} From: Andrew Cooper andrew.coop...@citrix.com Tue Sep 12 15:06:49 2023 +0100 Date: Mon Sep 18 16:43:01 2023 +0100: Git: 1c18d73774533a55ba9d1cbee8bdace03efdb5e7 c/s 3fffaf9c13e9 ("x86/entry: Avoid using alternatives in NMI/#MC paths") dropped the only user, leaving behind the (incorrect) implication that Xen had split exit paths. Delete the unused SPEC_CTRL_EXIT_TO_XEN and rename SPEC_CTRL_EXIT_TO_XEN_IST to SPEC_CTRL_EXIT_TO_XEN for consistency. No functional change. Signed-off-by: Andrew Cooper <andrew.coop...@citrix.com> Reviewed-by: Jan Beulich <jbeul...@suse.com> --- a/xen/arch/x86/include/asm/spec_ctrl_asm.h +++ b/xen/arch/x86/include/asm/spec_ctrl_asm.h @@ -79,7 +79,6 @@ * - SPEC_CTRL_ENTRY_FROM_PV * - SPEC_CTRL_ENTRY_FROM_INTR * - SPEC_CTRL_ENTRY_FROM_INTR_IST - * - SPEC_CTRL_EXIT_TO_XEN_IST * - SPEC_CTRL_EXIT_TO_XEN * - SPEC_CTRL_EXIT_TO_PV * @@ -268,11 +267,6 @@ ALTERNATIVE "", __stringify(DO_SPEC_CTRL_ENTRY maybexen=1), \ X86_FEATURE_SC_MSR_PV -/* Use when exiting to Xen context. */ -#define SPEC_CTRL_EXIT_TO_XEN \ - ALTERNATIVE "", \ - DO_SPEC_CTRL_EXIT_TO_XEN, X86_FEATURE_SC_MSR_PV - /* Use when exiting to PV guest context. */ #define SPEC_CTRL_EXIT_TO_PV \ ALTERNATIVE "", \ @@ -339,8 +333,8 @@ UNLIKELY_DISPATCH_LABEL(\@_serialise): UNLIKELY_END(\@_serialise) .endm -/* Use when exiting to Xen in IST context. */ -.macro SPEC_CTRL_EXIT_TO_XEN_IST +/* Use when exiting to Xen context. 
*/ +.macro SPEC_CTRL_EXIT_TO_XEN /* * Requires %rbx=stack_end * Clobbers %rax, %rcx, %rdx --- a/xen/arch/x86/x86_64/entry.S +++ b/xen/arch/x86/x86_64/entry.S @@ -673,7 +673,7 @@ UNLIKELY_START(ne, exit_cr3) UNLIKELY_END(exit_cr3) /* WARNING! `ret`, `call *`, `jmp *` not safe beyond this point. */ - SPEC_CTRL_EXIT_TO_XEN_IST /* Req: %rbx=end, Clob: acd */ + SPEC_CTRL_EXIT_TO_XEN /* Req: %rbx=end, Clob: acd */ RESTORE_ALL adj=8 iretq ++++++ xsa439-02.patch ++++++ Subject: x86/spec-ctrl: Fold DO_SPEC_CTRL_EXIT_TO_XEN into it's single user From: Andrew Cooper andrew.coop...@citrix.com Tue Sep 12 17:03:16 2023 +0100 Date: Mon Sep 18 16:43:01 2023 +0100: Git: 694bb0f280fd08a4377e36e32b84b5062def4de2 With the SPEC_CTRL_EXIT_TO_XEN{,_IST} confusion fixed, it's now obvious that there's only a single EXIT_TO_XEN path. Fold DO_SPEC_CTRL_EXIT_TO_XEN into SPEC_CTRL_EXIT_TO_XEN to simplify further fixes. When merging labels, switch the name to .L\@_skip_sc_msr as "skip" on its own is going to be too generic shortly. No functional change. Signed-off-by: Andrew Cooper <andrew.coop...@citrix.com> Reviewed-by: Jan Beulich <jbeul...@suse.com> --- a/xen/arch/x86/include/asm/spec_ctrl_asm.h +++ b/xen/arch/x86/include/asm/spec_ctrl_asm.h @@ -211,27 +211,6 @@ wrmsr .endm -.macro DO_SPEC_CTRL_EXIT_TO_XEN -/* - * Requires %rbx=stack_end - * Clobbers %rax, %rcx, %rdx - * - * When returning to Xen context, look to see whether SPEC_CTRL shadowing is - * in effect, and reload the shadow value. This covers race conditions which - * exist with an NMI/MCE/etc hitting late in the return-to-guest path. - */ - xor %edx, %edx - - testb $SCF_use_shadow, STACK_CPUINFO_FIELD(spec_ctrl_flags)(%rbx) - jz .L\@_skip - - mov STACK_CPUINFO_FIELD(shadow_spec_ctrl)(%rbx), %eax - mov $MSR_SPEC_CTRL, %ecx - wrmsr - -.L\@_skip: -.endm - .macro DO_SPEC_CTRL_EXIT_TO_GUEST /* * Requires %eax=spec_ctrl, %rsp=regs/cpuinfo @@ -340,11 +319,24 @@ UNLIKELY_DISPATCH_LABEL(\@_serialise): * Clobbers %rax, %rcx, %rdx */ testb $SCF_ist_sc_msr, STACK_CPUINFO_FIELD(spec_ctrl_flags)(%rbx) - jz .L\@_skip + jz .L\@_skip_sc_msr + + /* + * When returning to Xen context, look to see whether SPEC_CTRL shadowing + * is in effect, and reload the shadow value. This covers race conditions + * which exist with an NMI/MCE/etc hitting late in the return-to-guest + * path. + */ + xor %edx, %edx + + testb $SCF_use_shadow, STACK_CPUINFO_FIELD(spec_ctrl_flags)(%rbx) + jz .L\@_skip_sc_msr - DO_SPEC_CTRL_EXIT_TO_XEN + mov STACK_CPUINFO_FIELD(shadow_spec_ctrl)(%rbx), %eax + mov $MSR_SPEC_CTRL, %ecx + wrmsr -.L\@_skip: +.L\@_skip_sc_msr: .endm #endif /* __ASSEMBLY__ */ ++++++ xsa439-03.patch ++++++ Subject: x86/spec-ctrl: Turn the remaining SPEC_CTRL_{ENTRY,EXIT}_* into asm macros From: Andrew Cooper andrew.coop...@citrix.com Fri Sep 1 11:38:44 2023 +0100 Date: Mon Sep 18 16:43:01 2023 +0100: Git: 7125429aafb9e3c9c88fc93001fc2300e0ac2cc8 These have grown more complex over time, with some already having been converted. Provide full Requires/Clobbers comments, otherwise missing at this level of indirection. No functional change. Signed-off-by: Andrew Cooper <andrew.coop...@citrix.com> Reviewed-by: Jan Beulich <jbeul...@suse.com> --- a/xen/arch/x86/include/asm/spec_ctrl_asm.h +++ b/xen/arch/x86/include/asm/spec_ctrl_asm.h @@ -231,26 +231,45 @@ .endm /* Use after an entry from PV context (syscall/sysenter/int80/int82/etc). 
*/ -#define SPEC_CTRL_ENTRY_FROM_PV \ +.macro SPEC_CTRL_ENTRY_FROM_PV +/* + * Requires %rsp=regs/cpuinfo, %rdx=0 + * Clobbers %rax, %rcx, %rdx + */ ALTERNATIVE "", __stringify(DO_SPEC_CTRL_COND_IBPB maybexen=0), \ - X86_FEATURE_IBPB_ENTRY_PV; \ - ALTERNATIVE "", DO_OVERWRITE_RSB, X86_FEATURE_SC_RSB_PV; \ + X86_FEATURE_IBPB_ENTRY_PV + + ALTERNATIVE "", DO_OVERWRITE_RSB, X86_FEATURE_SC_RSB_PV + ALTERNATIVE "", __stringify(DO_SPEC_CTRL_ENTRY maybexen=0), \ X86_FEATURE_SC_MSR_PV +.endm /* Use in interrupt/exception context. May interrupt Xen or PV context. */ -#define SPEC_CTRL_ENTRY_FROM_INTR \ +.macro SPEC_CTRL_ENTRY_FROM_INTR +/* + * Requires %rsp=regs, %r14=stack_end, %rdx=0 + * Clobbers %rax, %rcx, %rdx + */ ALTERNATIVE "", __stringify(DO_SPEC_CTRL_COND_IBPB maybexen=1), \ - X86_FEATURE_IBPB_ENTRY_PV; \ - ALTERNATIVE "", DO_OVERWRITE_RSB, X86_FEATURE_SC_RSB_PV; \ + X86_FEATURE_IBPB_ENTRY_PV + + ALTERNATIVE "", DO_OVERWRITE_RSB, X86_FEATURE_SC_RSB_PV + ALTERNATIVE "", __stringify(DO_SPEC_CTRL_ENTRY maybexen=1), \ X86_FEATURE_SC_MSR_PV +.endm /* Use when exiting to PV guest context. */ -#define SPEC_CTRL_EXIT_TO_PV \ - ALTERNATIVE "", \ - DO_SPEC_CTRL_EXIT_TO_GUEST, X86_FEATURE_SC_MSR_PV; \ +.macro SPEC_CTRL_EXIT_TO_PV +/* + * Requires %rax=spec_ctrl, %rsp=regs/info + * Clobbers %rcx, %rdx + */ + ALTERNATIVE "", DO_SPEC_CTRL_EXIT_TO_GUEST, X86_FEATURE_SC_MSR_PV + DO_SPEC_CTRL_COND_VERW +.endm /* * Use in IST interrupt/exception context. May interrupt Xen or PV context. ++++++ xsa439-04.patch ++++++ Subject: x86/spec-ctrl: Improve all SPEC_CTRL_{ENTER,EXIT}_* comments From: Andrew Cooper andrew.coop...@citrix.com Wed Aug 30 20:11:50 2023 +0100 Date: Mon Sep 18 16:43:01 2023 +0100: Git: 45f00557350dc7d0756551069803fc49c29184ca ... to better explain how they're used. Doing so highlights that SPEC_CTRL_EXIT_TO_XEN is missing a VERW flush for the corner case when e.g. an NMI hits late in an exit-to-guest path. Leave a TODO, which will be addressed in subsequent patches which arrange for VERW flushing to be safe within SPEC_CTRL_EXIT_TO_XEN. Signed-off-by: Andrew Cooper <andrew.coop...@citrix.com> Reviewed-by: Jan Beulich <jbeul...@suse.com> --- a/xen/arch/x86/include/asm/spec_ctrl_asm.h +++ b/xen/arch/x86/include/asm/spec_ctrl_asm.h @@ -230,7 +230,10 @@ wrmsr .endm -/* Use after an entry from PV context (syscall/sysenter/int80/int82/etc). */ +/* + * Used after an entry from PV context: SYSCALL, SYSENTER, INT, + * etc. There is always a guest speculation state in context. + */ .macro SPEC_CTRL_ENTRY_FROM_PV /* * Requires %rsp=regs/cpuinfo, %rdx=0 @@ -245,7 +248,11 @@ X86_FEATURE_SC_MSR_PV .endm -/* Use in interrupt/exception context. May interrupt Xen or PV context. */ +/* + * Used after an exception or maskable interrupt, hitting Xen or PV context. + * There will either be a guest speculation context, or (barring fatal + * exceptions) a well-formed Xen speculation context. + */ .macro SPEC_CTRL_ENTRY_FROM_INTR /* * Requires %rsp=regs, %r14=stack_end, %rdx=0 @@ -260,7 +267,10 @@ X86_FEATURE_SC_MSR_PV .endm -/* Use when exiting to PV guest context. */ +/* + * Used when exiting from any entry context, back to PV context. This + * includes from an IST entry which moved onto the primary stack. + */ .macro SPEC_CTRL_EXIT_TO_PV /* * Requires %rax=spec_ctrl, %rsp=regs/info @@ -272,7 +282,13 @@ .endm /* - * Use in IST interrupt/exception context. May interrupt Xen or PV context. + * Used after an IST entry hitting Xen or PV context. 
Special care is needed, + * because when hitting Xen context, there may not be a well-formed + * speculation context. (i.e. it can hit in the middle of + * SPEC_CTRL_{ENTRY,EXIT}_* regions.) + * + * An IST entry which hits PV context moves onto the primary stack and leaves + * via SPEC_CTRL_EXIT_TO_PV, *not* SPEC_CTRL_EXIT_TO_XEN. */ .macro SPEC_CTRL_ENTRY_FROM_INTR_IST /* @@ -331,7 +347,14 @@ UNLIKELY_DISPATCH_LABEL(\@_serialise): UNLIKELY_END(\@_serialise) .endm -/* Use when exiting to Xen context. */ +/* + * Use when exiting from any entry context, back to Xen context. This + * includes returning to other SPEC_CTRL_{ENTRY,EXIT}_* regions with an + * incomplete speculation context. + * + * Because we might have interrupted Xen beyond SPEC_CTRL_EXIT_TO_$GUEST, we + * need to treat this as if it were an EXIT_TO_$GUEST case too. + */ .macro SPEC_CTRL_EXIT_TO_XEN /* * Requires %rbx=stack_end @@ -356,6 +379,9 @@ UNLIKELY_DISPATCH_LABEL(\@_serialise): wrmsr .L\@_skip_sc_msr: + + /* TODO VERW */ + .endm #endif /* __ASSEMBLY__ */ ++++++ xsa439-05.patch ++++++ Subject: x86/entry: Adjust restore_all_xen to hold stack_end in %r14 From: Andrew Cooper andrew.coop...@citrix.com Wed Sep 13 13:48:16 2023 +0100 Date: Mon Sep 18 16:43:01 2023 +0100: Git: 7aa28849a1155d856e214e9a80a7e65fffdc3e58 All other SPEC_CTRL_{ENTRY,EXIT}_* helpers hold stack_end in %r14. Adjust it for consistency. Signed-off-by: Andrew Cooper <andrew.coop...@citrix.com> Reviewed-by: Jan Beulich <jbeul...@suse.com> --- a/xen/arch/x86/include/asm/spec_ctrl_asm.h +++ b/xen/arch/x86/include/asm/spec_ctrl_asm.h @@ -357,10 +357,10 @@ UNLIKELY_DISPATCH_LABEL(\@_serialise): */ .macro SPEC_CTRL_EXIT_TO_XEN /* - * Requires %rbx=stack_end + * Requires %r14=stack_end * Clobbers %rax, %rcx, %rdx */ - testb $SCF_ist_sc_msr, STACK_CPUINFO_FIELD(spec_ctrl_flags)(%rbx) + testb $SCF_ist_sc_msr, STACK_CPUINFO_FIELD(spec_ctrl_flags)(%r14) jz .L\@_skip_sc_msr /* @@ -371,10 +371,10 @@ UNLIKELY_DISPATCH_LABEL(\@_serialise): */ xor %edx, %edx - testb $SCF_use_shadow, STACK_CPUINFO_FIELD(spec_ctrl_flags)(%rbx) + testb $SCF_use_shadow, STACK_CPUINFO_FIELD(spec_ctrl_flags)(%r14) jz .L\@_skip_sc_msr - mov STACK_CPUINFO_FIELD(shadow_spec_ctrl)(%rbx), %eax + mov STACK_CPUINFO_FIELD(shadow_spec_ctrl)(%r14), %eax mov $MSR_SPEC_CTRL, %ecx wrmsr --- a/xen/arch/x86/x86_64/entry.S +++ b/xen/arch/x86/x86_64/entry.S @@ -665,15 +665,15 @@ restore_all_xen: * Check whether we need to switch to the per-CPU page tables, in * case we return to late PV exit code (from an NMI or #MC). */ - GET_STACK_END(bx) - cmpb $0, STACK_CPUINFO_FIELD(use_pv_cr3)(%rbx) + GET_STACK_END(14) + cmpb $0, STACK_CPUINFO_FIELD(use_pv_cr3)(%r14) UNLIKELY_START(ne, exit_cr3) - mov STACK_CPUINFO_FIELD(pv_cr3)(%rbx), %rax + mov STACK_CPUINFO_FIELD(pv_cr3)(%r14), %rax mov %rax, %cr3 UNLIKELY_END(exit_cr3) /* WARNING! `ret`, `call *`, `jmp *` not safe beyond this point. */ - SPEC_CTRL_EXIT_TO_XEN /* Req: %rbx=end, Clob: acd */ + SPEC_CTRL_EXIT_TO_XEN /* Req: %r14=end, Clob: acd */ RESTORE_ALL adj=8 iretq ++++++ xsa439-06.patch ++++++ Subject: x86/entry: Track the IST-ness of an entry for the exit paths From: Andrew Cooper andrew.coop...@citrix.com Wed Sep 13 12:20:12 2023 +0100 Date: Mon Sep 18 16:43:01 2023 +0100: Git: 21bdc25b05a0f8ab6bc73520a9ca01327360732c Use %r12 to hold an ist_exit boolean. This register is zero elsewhere in the entry/exit asm, so it only needs setting in the IST path. 
As this is subtle and fragile, add check_ist_exit() to be used in debugging builds to cross-check that the ist_exit boolean matches the entry vector. Write check_ist_exit() it in C, because it's debug only and the logic more complicated than I care to maintain in asm. For now, we only need to use this signal in the exit-to-Xen path, but some exit-to-guest paths happen in IST context too. Check the correctness in all exit paths to avoid the logic bit-rotting. Signed-off-by: Andrew Cooper <andrew.coop...@citrix.com> Reviewed-by: Jan Beulich <jbeul...@suse.com> --- a/xen/arch/x86/traps.c +++ b/xen/arch/x86/traps.c @@ -2315,6 +2315,19 @@ void asm_domain_crash_synchronous(unsign do_softirq(); } +#ifdef CONFIG_DEBUG +void check_ist_exit(const struct cpu_user_regs *regs, bool ist_exit) +{ + const unsigned int ist_mask = + (1U << X86_EXC_NMI) | (1U << X86_EXC_DB) | + (1U << X86_EXC_DF) | (1U << X86_EXC_MC); + uint8_t ev = regs->entry_vector; + bool is_ist = (ev < TRAP_nr) && ((1U << ev) & ist_mask); + + ASSERT(is_ist == ist_exit); +} +#endif + /* * Local variables: * mode: C --- a/xen/arch/x86/x86_64/entry.S +++ b/xen/arch/x86/x86_64/entry.S @@ -659,8 +659,15 @@ ENTRY(early_page_fault) .section .text.entry, "ax", @progbits ALIGN -/* No special register assumptions. */ +/* %r12=ist_exit */ restore_all_xen: + +#ifdef CONFIG_DEBUG + mov %rsp, %rdi + mov %r12, %rsi + call check_ist_exit +#endif + /* * Check whether we need to switch to the per-CPU page tables, in * case we return to late PV exit code (from an NMI or #MC). @@ -1091,6 +1098,10 @@ handle_ist_exception: .L_ist_dispatch_done: mov %r15, STACK_CPUINFO_FIELD(xen_cr3)(%r14) mov %bl, STACK_CPUINFO_FIELD(use_pv_cr3)(%r14) + + /* This is an IST exit */ + mov $1, %r12d + cmpb $TRAP_nmi,UREGS_entry_vector(%rsp) jne ret_from_intr ++++++ xsa439-07.patch ++++++ Subject: x86/spec-ctrl: Issue VERW during IST exit to Xen From: Andrew Cooper andrew.coop...@citrix.com Wed Sep 13 13:53:33 2023 +0100 Date: Mon Sep 18 16:43:01 2023 +0100: Git: 3ee6066bcd737756b0990d417d94eddc0b0d2585 There is a corner case where e.g. an NMI hitting an exit-to-guest path after SPEC_CTRL_EXIT_TO_* would have run the entire NMI handler *after* the VERW flush to scrub potentially sensitive data from uarch buffers. In order to compensate, issue VERW when exiting to Xen from an IST entry. SPEC_CTRL_EXIT_TO_XEN already has two reads of spec_ctrl_flags off the stack, and we're about to add a third. Load the field into %ebx, and list the register as clobbered. %r12 has been arranged to be the ist_exit signal, so add this as an input dependency and use it to identify when to issue a VERW. 
Signed-off-by: Andrew Cooper <andrew.coop...@citrix.com> Reviewed-by: Jan Beulich <jbeul...@suse.com> --- a/xen/arch/x86/include/asm/spec_ctrl_asm.h +++ b/xen/arch/x86/include/asm/spec_ctrl_asm.h @@ -357,10 +357,12 @@ UNLIKELY_DISPATCH_LABEL(\@_serialise): */ .macro SPEC_CTRL_EXIT_TO_XEN /* - * Requires %r14=stack_end - * Clobbers %rax, %rcx, %rdx + * Requires %r12=ist_exit, %r14=stack_end + * Clobbers %rax, %rbx, %rcx, %rdx */ - testb $SCF_ist_sc_msr, STACK_CPUINFO_FIELD(spec_ctrl_flags)(%r14) + movzbl STACK_CPUINFO_FIELD(spec_ctrl_flags)(%r14), %ebx + + testb $SCF_ist_sc_msr, %bl jz .L\@_skip_sc_msr /* @@ -371,7 +373,7 @@ UNLIKELY_DISPATCH_LABEL(\@_serialise): */ xor %edx, %edx - testb $SCF_use_shadow, STACK_CPUINFO_FIELD(spec_ctrl_flags)(%r14) + testb $SCF_use_shadow, %bl jz .L\@_skip_sc_msr mov STACK_CPUINFO_FIELD(shadow_spec_ctrl)(%r14), %eax @@ -380,8 +382,16 @@ UNLIKELY_DISPATCH_LABEL(\@_serialise): .L\@_skip_sc_msr: - /* TODO VERW */ + test %r12, %r12 + jz .L\@_skip_ist_exit + + /* Logically DO_SPEC_CTRL_COND_VERW but without the %rsp=cpuinfo dependency */ + testb $SCF_verw, %bl + jz .L\@_skip_verw + verw STACK_CPUINFO_FIELD(verw_sel)(%r14) +.L\@_skip_verw: +.L\@_skip_ist_exit: .endm #endif /* __ASSEMBLY__ */ --- a/xen/arch/x86/x86_64/entry.S +++ b/xen/arch/x86/x86_64/entry.S @@ -680,7 +680,7 @@ UNLIKELY_START(ne, exit_cr3) UNLIKELY_END(exit_cr3) /* WARNING! `ret`, `call *`, `jmp *` not safe beyond this point. */ - SPEC_CTRL_EXIT_TO_XEN /* Req: %r14=end, Clob: acd */ + SPEC_CTRL_EXIT_TO_XEN /* Req: %r12=ist_exit %r14=end, Clob: abcd */ RESTORE_ALL adj=8 iretq ++++++ xsa439-08.patch ++++++ Subject: x86/amd: Introduce is_zen{1,2}_uarch() predicates From: Andrew Cooper andrew.coop...@citrix.com Fri Sep 15 12:13:51 2023 +0100 Date: Mon Sep 18 16:43:01 2023 +0100: Git: de1d265001397f308c5c3c5d3ffc30e7ef8c0705 We already have 3 cases using STIBP as a Zen1/2 heuristic, and are about to introduce a 4th. Wrap the heuristic into a pair of predicates rather than opencoding it, and the explanation of the heuristic, at each usage site. Signed-off-by: Andrew Cooper <andrew.coop...@citrix.com> Reviewed-by: Jan Beulich <jbeul...@suse.com> --- a/xen/arch/x86/cpu/amd.c +++ b/xen/arch/x86/cpu/amd.c @@ -855,15 +855,13 @@ void amd_set_legacy_ssbd(bool enable) * non-branch instructions to be ignored. It is to be set unilaterally in * newer microcode. * - * This chickenbit is something unrelated on Zen1, and Zen1 vs Zen2 isn't a - * simple model number comparison, so use STIBP as a heuristic to separate the - * two uarches in Fam17h(AMD)/18h(Hygon). + * This chickenbit is something unrelated on Zen1. */ void amd_init_spectral_chicken(void) { uint64_t val, chickenbit = 1 << 1; - if (cpu_has_hypervisor || !boot_cpu_has(X86_FEATURE_AMD_STIBP)) + if (cpu_has_hypervisor || !is_zen2_uarch()) return; if (rdmsr_safe(MSR_AMD64_DE_CFG2, val) == 0 && !(val & chickenbit)) @@ -912,11 +910,8 @@ void amd_check_zenbleed(void) * With the Fam17h check above, most parts getting here are * Zen1. They're not affected. Assume Zen2 ones making it * here are affected regardless of microcode version. - * - * Zen1 vs Zen2 isn't a simple model number comparison, so use - * STIBP as a heuristic to distinguish. */ - if (!boot_cpu_has(X86_FEATURE_AMD_STIBP)) + if (is_zen1_uarch()) return; good_rev = ~0U; break; @@ -1277,12 +1272,7 @@ static int __init cf_check zen2_c6_errat */ s_time_t delta; - /* - * Zen1 vs Zen2 isn't a simple model number comparison, so use STIBP as - * a heuristic to separate the two uarches in Fam17h. 
- */ - if (cpu_has_hypervisor || boot_cpu_data.x86 != 0x17 || - !boot_cpu_has(X86_FEATURE_AMD_STIBP)) + if (cpu_has_hypervisor || boot_cpu_data.x86 != 0x17 || !is_zen2_uarch()) return 0; /* --- a/xen/arch/x86/include/asm/amd.h +++ b/xen/arch/x86/include/asm/amd.h @@ -140,6 +140,17 @@ AMD_MODEL_RANGE(0x11, 0x0, 0x0, 0xff, 0xf), \ AMD_MODEL_RANGE(0x12, 0x0, 0x0, 0xff, 0xf)) +/* + * The Zen1 and Zen2 microarchitectures are implemented by AMD (Fam17h) and + * Hygon (Fam18h) but without simple model number rules. Instead, use STIBP + * as a heuristic that distinguishes the two. + * + * The caller is required to perform the appropriate vendor/family checks + * first. + */ +#define is_zen1_uarch() (!boot_cpu_has(X86_FEATURE_AMD_STIBP)) +#define is_zen2_uarch() boot_cpu_has(X86_FEATURE_AMD_STIBP) + struct cpuinfo_x86; int cpu_has_amd_erratum(const struct cpuinfo_x86 *, int, ...); ++++++ xsa439-09.patch ++++++ Subject: x86/spec-ctrl: Mitigate the Zen1 DIV leakage From: Andrew Cooper andrew.coop...@citrix.com Wed Aug 30 20:24:25 2023 +0100 Date: Mon Sep 18 16:43:01 2023 +0100: Git: b5926c6ecf05c28ee99c6248c42d691ccbf0c315 In the Zen1 microarchitecure, there is one divider in the pipeline which services uops from both threads. In the case of #DE, the latched result from the previous DIV to execute will be forwarded speculatively. This is an interesting covert channel that allows two threads to communicate without any system calls. In also allows userspace to obtain the result of the most recent DIV instruction executed (even speculatively) in the core, which can be from a higher privilege context. Scrub the result from the divider by executing a non-faulting divide. This needs performing on the exit-to-guest paths, and ist_exit-to-Xen. Alternatives in IST context is believed safe now that it's done in NMI context. This is XSA-439 / CVE-2023-20588. Signed-off-by: Andrew Cooper <andrew.coop...@citrix.com> Reviewed-by: Jan Beulich <jbeul...@suse.com> --- a/docs/misc/xen-command-line.pandoc +++ b/docs/misc/xen-command-line.pandoc @@ -2315,7 +2315,7 @@ By default SSBD will be mitigated at run > {msr-sc,rsb,md-clear,ibpb-entry}=<bool>|{pv,hvm}=<bool>, > bti-thunk=retpoline|lfence|jmp, {ibrs,ibpb,ssbd,psfd, > eager-fpu,l1d-flush,branch-harden,srb-lock, -> unpriv-mmio,gds-mit}=<bool> ]` +> unpriv-mmio,gds-mit,div-scrub}=<bool> ]` Controls for speculative execution sidechannel mitigations. By default, Xen will pick the most appropriate mitigations based on compiled in support, @@ -2437,6 +2437,10 @@ has elected not to lock the configuratio GDS with. Otherwise, Xen will mitigate by disabling AVX, which blocks the use of the AVX2 Gather instructions. +On all hardware, the `div-scrub=` option can be used to force or prevent Xen +from mitigating the DIV-leakage vulnerability. By default, Xen will mitigate +DIV-leakage on hardware believed to be vulnerable. + ### sync_console > `= <boolean>` --- a/xen/arch/x86/hvm/svm/entry.S +++ b/xen/arch/x86/hvm/svm/entry.S @@ -74,6 +74,7 @@ __UNLIKELY_END(nsvm_hap) 1: /* No Spectre v1 concerns. Execution will hit VMRUN imminently. 
*/ .endm ALTERNATIVE "", svm_vmentry_spec_ctrl, X86_FEATURE_SC_MSR_HVM + ALTERNATIVE "", DO_SPEC_CTRL_DIV, X86_FEATURE_SC_DIV pop %r15 pop %r14 --- a/xen/arch/x86/include/asm/cpufeatures.h +++ b/xen/arch/x86/include/asm/cpufeatures.h @@ -35,7 +35,7 @@ XEN_CPUFEATURE(SC_RSB_HVM, X86_SY XEN_CPUFEATURE(XEN_SELFSNOOP, X86_SYNTH(20)) /* SELFSNOOP gets used by Xen itself */ XEN_CPUFEATURE(SC_MSR_IDLE, X86_SYNTH(21)) /* Clear MSR_SPEC_CTRL on idle */ XEN_CPUFEATURE(XEN_LBR, X86_SYNTH(22)) /* Xen uses MSR_DEBUGCTL.LBR */ -/* Bits 23 unused. */ +XEN_CPUFEATURE(SC_DIV, X86_SYNTH(23)) /* DIV scrub needed */ XEN_CPUFEATURE(SC_RSB_IDLE, X86_SYNTH(24)) /* RSB overwrite needed for idle. */ XEN_CPUFEATURE(SC_VERW_IDLE, X86_SYNTH(25)) /* VERW used by Xen for idle */ XEN_CPUFEATURE(XEN_SHSTK, X86_SYNTH(26)) /* Xen uses CET Shadow Stacks */ --- a/xen/arch/x86/include/asm/spec_ctrl_asm.h +++ b/xen/arch/x86/include/asm/spec_ctrl_asm.h @@ -177,6 +177,19 @@ .L\@_verw_skip: .endm +.macro DO_SPEC_CTRL_DIV +/* + * Requires nothing + * Clobbers %rax + * + * Issue a DIV for its flushing side effect (Zen1 uarch specific). Any + * non-faulting DIV will do; a byte DIV has least latency, and doesn't clobber + * %rdx. + */ + mov $1, %eax + div %al +.endm + .macro DO_SPEC_CTRL_ENTRY maybexen:req /* * Requires %rsp=regs (also cpuinfo if !maybexen) @@ -279,6 +292,8 @@ ALTERNATIVE "", DO_SPEC_CTRL_EXIT_TO_GUEST, X86_FEATURE_SC_MSR_PV DO_SPEC_CTRL_COND_VERW + + ALTERNATIVE "", DO_SPEC_CTRL_DIV, X86_FEATURE_SC_DIV .endm /* @@ -391,6 +406,8 @@ UNLIKELY_DISPATCH_LABEL(\@_serialise): verw STACK_CPUINFO_FIELD(verw_sel)(%r14) .L\@_skip_verw: + ALTERNATIVE "", DO_SPEC_CTRL_DIV, X86_FEATURE_SC_DIV + .L\@_skip_ist_exit: .endm --- a/xen/arch/x86/spec_ctrl.c +++ b/xen/arch/x86/spec_ctrl.c @@ -79,6 +79,7 @@ static int8_t __initdata opt_srb_lock = static bool __initdata opt_unpriv_mmio; static bool __ro_after_init opt_fb_clear_mmio; static int8_t __initdata opt_gds_mit = -1; +static int8_t __initdata opt_div_scrub = -1; static int __init cf_check parse_spec_ctrl(const char *s) { @@ -133,6 +134,7 @@ static int __init cf_check parse_spec_ct opt_srb_lock = 0; opt_unpriv_mmio = false; opt_gds_mit = 0; + opt_div_scrub = 0; } else if ( val > 0 ) rc = -EINVAL; @@ -285,6 +287,8 @@ static int __init cf_check parse_spec_ct opt_unpriv_mmio = val; else if ( (val = parse_boolean("gds-mit", s, ss)) >= 0 ) opt_gds_mit = val; + else if ( (val = parse_boolean("div-scrub", s, ss)) >= 0 ) + opt_div_scrub = val; else rc = -EINVAL; @@ -485,7 +489,7 @@ static void __init print_details(enum in "\n"); /* Settings for Xen's protection, irrespective of guests. */ - printk(" Xen settings: BTI-Thunk %s, SPEC_CTRL: %s%s%s%s%s, Other:%s%s%s%s%s\n", + printk(" Xen settings: BTI-Thunk %s, SPEC_CTRL: %s%s%s%s%s, Other:%s%s%s%s%s%s\n", thunk == THUNK_NONE ? "N/A" : thunk == THUNK_RETPOLINE ? "RETPOLINE" : thunk == THUNK_LFENCE ? "LFENCE" : @@ -510,6 +514,7 @@ static void __init print_details(enum in opt_l1d_flush ? " L1D_FLUSH" : "", opt_md_clear_pv || opt_md_clear_hvm || opt_fb_clear_mmio ? " VERW" : "", + opt_div_scrub ? " DIV" : "", opt_branch_harden ? " BRANCH_HARDEN" : ""); /* L1TF diagnostics, printed if vulnerable or PV shadowing is in use. */ @@ -967,6 +972,45 @@ static void __init srso_calculations(boo setup_force_cpu_cap(X86_FEATURE_SRSO_NO); } +/* + * The Div leakage issue is specific to the AMD Zen1 microarchitecure. 
+ * + * However, there's no $FOO_NO bit defined, so if we're virtualised we have no + * hope of spotting the case where we might move to vulnerable hardware. We + * also can't make any useful conclusion about SMT-ness. + * + * Don't check the hypervisor bit, so at least we do the safe thing when + * booting on something that looks like a Zen1 CPU. + */ +static bool __init has_div_vuln(void) +{ + if ( !(boot_cpu_data.x86_vendor & + (X86_VENDOR_AMD | X86_VENDOR_HYGON)) ) + return false; + + if ( boot_cpu_data.x86 != 0x17 && boot_cpu_data.x86 != 0x18 ) + return false; + + return is_zen1_uarch(); +} + +static void __init div_calculations(bool hw_smt_enabled) +{ + bool cpu_bug_div = has_div_vuln(); + + if ( opt_div_scrub == -1 ) + opt_div_scrub = cpu_bug_div; + + if ( opt_div_scrub ) + setup_force_cpu_cap(X86_FEATURE_SC_DIV); + + if ( opt_smt == -1 && !cpu_has_hypervisor && cpu_bug_div && hw_smt_enabled ) + warning_add( + "Booted on leaky-DIV hardware with SMT/Hyperthreading\n" + "enabled. Please assess your configuration and choose an\n" + "explicit 'smt=<bool>' setting. See XSA-439.\n"); +} + static void __init ibpb_calculations(void) { bool def_ibpb_entry = false; @@ -1726,6 +1770,8 @@ void __init init_speculation_mitigations ibpb_calculations(); + div_calculations(hw_smt_enabled); + /* Check whether Eager FPU should be enabled by default. */ if ( opt_eager_fpu == -1 ) opt_eager_fpu = should_use_eager_fpu();
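[Editor's note, not part of the submitted package] For readers skimming the xsa439-09 hunks above: the core of the XSA-439 mitigation is the DO_SPEC_CTRL_DIV gadget, which executes a cheap, non-faulting byte DIV on the exit paths so that the divider's latched result can no longer be forwarded to a later speculative consumer. The following is a minimal, illustrative user-space sketch of that gadget only; it is not Xen code, and it assumes an x86-64 build with GCC/Clang extended inline asm:

/* div_scrub_sketch.c - illustrative only; build with: gcc -O2 div_scrub_sketch.c */
static inline void scrub_div_state(void)
{
    /*
     * Mirrors DO_SPEC_CTRL_DIV from xsa439-09.patch: load AX = 1 and divide
     * by AL (= 1). A byte DIV is the cheapest non-faulting divide and
     * overwrites the divider's latched result; only %eax and the flags are
     * clobbered, which matches the "Clobbers %rax" contract in the macro.
     */
    __asm__ volatile ( "mov $1, %%eax\n\t"
                       "div %%al"
                       ::: "eax", "cc" );
}

int main(void)
{
    scrub_div_state();
    return 0;
}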