Script 'mail_helper' called by obssrc
Hello community,

here is the log from the commit of package xen for openSUSE:Factory checked in 
at 2024-06-25 23:06:43
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Comparing /work/SRC/openSUSE:Factory/xen (Old)
 and      /work/SRC/openSUSE:Factory/.xen.new.18349 (New)
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

Package is "xen"

Tue Jun 25 23:06:43 2024 rev:346 rq:1183065 version:4.18.2_06

Changes:
--------
--- /work/SRC/openSUSE:Factory/xen/xen.changes  2024-06-06 12:31:17.085556322 +0200
+++ /work/SRC/openSUSE:Factory/.xen.new.18349/xen.changes       2024-06-25 23:07:01.473315130 +0200
@@ -1,0 +2,22 @@
+Mon Jun 24 16:20:00 CEST 2024 - jbeul...@suse.com
+
+- bsc#1214718 - The system hangs intermittently when Power Control
+  Mode is set to Minimum Power on SLES15SP5 Xen
+  6666ba52-x86-irq-remove-offline-CPUs-from-old-CPU-mask-when.patch
+  666994ab-x86-SMP-no-shorthand-IPI-in-hotplug.patch
+  666994f0-x86-IRQ-limit-interrupt-movement-in-fixup_irqs.patch
+  66718849-x86-IRQ-old_cpu_mask-in-fixup_irqs.patch
+  6671885e-x86-IRQ-handle-moving-in-_assign_irq_vector.patch
+  6673ffdc-x86-IRQ-forward-pending-to-new-dest-in-fixup_irqs.patch
+- Upstream bug fixes (bsc#1027519)
+  66450626-sched-set-all-sched_resource-data-inside-locked.patch
+  66450627-x86-respect-mapcache_domain_init-failing.patch
+  6646031f-x86-ucode-further-identify-already-up-to-date.patch
+  666b07ee-x86-EPT-special-page-in-epte_get_entry_emt.patch
+  666b0819-x86-EPT-avoid-marking-np-ents-for-reconfig.patch
+  666b085a-x86-EPT-drop-questionable-mfn_valid-from-.patch
+  667187cc-x86-Intel-unlock-CPUID-earlier.patch
+  6672c846-x86-xstate-initialisation-of-XSS-cache.patch
+  6672c847-x86-CPUID-XSAVE-dynamic-leaves.patch
+
+-------------------------------------------------------------------

New:
----
  66450626-sched-set-all-sched_resource-data-inside-locked.patch
  66450627-x86-respect-mapcache_domain_init-failing.patch
  6646031f-x86-ucode-further-identify-already-up-to-date.patch
  6666ba52-x86-irq-remove-offline-CPUs-from-old-CPU-mask-when.patch
  666994ab-x86-SMP-no-shorthand-IPI-in-hotplug.patch
  666994f0-x86-IRQ-limit-interrupt-movement-in-fixup_irqs.patch
  666b07ee-x86-EPT-special-page-in-epte_get_entry_emt.patch
  666b0819-x86-EPT-avoid-marking-np-ents-for-reconfig.patch
  666b085a-x86-EPT-drop-questionable-mfn_valid-from-.patch
  667187cc-x86-Intel-unlock-CPUID-earlier.patch
  66718849-x86-IRQ-old_cpu_mask-in-fixup_irqs.patch
  6671885e-x86-IRQ-handle-moving-in-_assign_irq_vector.patch
  6672c846-x86-xstate-initialisation-of-XSS-cache.patch
  6672c847-x86-CPUID-XSAVE-dynamic-leaves.patch
  6673ffdc-x86-IRQ-forward-pending-to-new-dest-in-fixup_irqs.patch

++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

Other differences:
------------------
++++++ xen.spec ++++++
--- /var/tmp/diff_new_pack.W2wIIm/_old  2024-06-25 23:07:05.849474652 +0200
+++ /var/tmp/diff_new_pack.W2wIIm/_new  2024-06-25 23:07:05.853474798 +0200
@@ -119,7 +119,7 @@
 %endif
 Provides:       installhint(reboot-needed)
 
-Version:        4.18.2_05
+Version:        4.18.2_06
 Release:        0
 Summary:        Xen Virtualization: Hypervisor (aka VMM aka Microkernel)
 License:        GPL-2.0-only
@@ -164,6 +164,21 @@
 Patch8:         663a4f3e-x86-cpu-policy-migration-IceLake-to-CascadeLake.patch
 Patch9:         663d05b5-x86-ucode-distinguish-up-to-date.patch
 Patch10:        663eaa27-libxl-XenStore-error-handling-in-device-creation.patch
+Patch11:        66450626-sched-set-all-sched_resource-data-inside-locked.patch
+Patch12:        66450627-x86-respect-mapcache_domain_init-failing.patch
+Patch13:        6646031f-x86-ucode-further-identify-already-up-to-date.patch
+Patch14:        6666ba52-x86-irq-remove-offline-CPUs-from-old-CPU-mask-when.patch
+Patch15:        666994ab-x86-SMP-no-shorthand-IPI-in-hotplug.patch
+Patch16:        666994f0-x86-IRQ-limit-interrupt-movement-in-fixup_irqs.patch
+Patch17:        666b07ee-x86-EPT-special-page-in-epte_get_entry_emt.patch
+Patch18:        666b0819-x86-EPT-avoid-marking-np-ents-for-reconfig.patch
+Patch19:        666b085a-x86-EPT-drop-questionable-mfn_valid-from-.patch
+Patch20:        667187cc-x86-Intel-unlock-CPUID-earlier.patch
+Patch21:        66718849-x86-IRQ-old_cpu_mask-in-fixup_irqs.patch
+Patch22:        6671885e-x86-IRQ-handle-moving-in-_assign_irq_vector.patch
+Patch23:        6672c846-x86-xstate-initialisation-of-XSS-cache.patch
+Patch24:        6672c847-x86-CPUID-XSAVE-dynamic-leaves.patch
+Patch25:        6673ffdc-x86-IRQ-forward-pending-to-new-dest-in-fixup_irqs.patch
 # EMBARGOED security fixes
 # libxc
 Patch301:       libxc-bitmap-long.patch

++++++ 66450626-sched-set-all-sched_resource-data-inside-locked.patch ++++++
# Commit d104a07524ffc92ae7a70dfe192c291de2a563cc
# Date 2024-05-15 19:59:52 +0100
# Author Juergen Gross <jgr...@suse.com>
# Committer Andrew Cooper <andrew.coop...@citrix.com>
xen/sched: set all sched_resource data inside locked region for new cpu

When adding a cpu to a scheduler, set all data items of struct
sched_resource inside the locked region, as otherwise a race might
happen (e.g. when trying to access the cpupool of the cpu):

  (XEN) ----[ Xen-4.19.0-1-d  x86_64  debug=y  Tainted:     H  ]----
  (XEN) CPU:    45
  (XEN) RIP:    e008:[<ffff82d040244cbf>] common/sched/credit.c#csched_load_balance+0x41/0x877
  (XEN) RFLAGS: 0000000000010092   CONTEXT: hypervisor
  (XEN) rax: ffff82d040981618   rbx: ffff82d040981618   rcx: 0000000000000000
  (XEN) rdx: 0000003ff68cd000   rsi: 000000000000002d   rdi: ffff83103723d450
  (XEN) rbp: ffff83207caa7d48   rsp: ffff83207caa7b98   r8:  0000000000000000
  (XEN) r9:  ffff831037253cf0   r10: ffff83103767c3f0   r11: 0000000000000009
  (XEN) r12: ffff831037237990   r13: ffff831037237990   r14: ffff831037253720
  (XEN) r15: 0000000000000000   cr0: 000000008005003b   cr4: 0000000000f526e0
  (XEN) cr3: 000000005bc2f000   cr2: 0000000000000010
  (XEN) fsb: 0000000000000000   gsb: 0000000000000000   gss: 0000000000000000
  (XEN) ds: 0000   es: 0000   fs: 0000   gs: 0000   ss: 0000   cs: e008
  (XEN) Xen code around <ffff82d040244cbf> (common/sched/credit.c#csched_load_balance+0x41/0x877):
  (XEN)  48 8b 0c 10 48 8b 49 08 <48> 8b 79 10 48 89 bd b8 fe ff ff 49 8b 4e 28 48
  <snip>
  (XEN) Xen call trace:
  (XEN)    [<ffff82d040244cbf>] R common/sched/credit.c#csched_load_balance+0x41/0x877
  (XEN)    [<ffff82d040245a18>] F common/sched/credit.c#csched_schedule+0x36a/0x69f
  (XEN)    [<ffff82d040252644>] F common/sched/core.c#do_schedule+0xe8/0x433
  (XEN)    [<ffff82d0402572dd>] F common/sched/core.c#schedule+0x2e5/0x2f9
  (XEN)    [<ffff82d040232f35>] F common/softirq.c#__do_softirq+0x94/0xbe
  (XEN)    [<ffff82d040232fc8>] F do_softirq+0x13/0x15
  (XEN)    [<ffff82d0403075ef>] F arch/x86/domain.c#idle_loop+0x92/0xe6
  (XEN)
  (XEN) Pagetable walk from 0000000000000010:
  (XEN)  L4[0x000] = 000000103ff61063 ffffffffffffffff
  (XEN)  L3[0x000] = 000000103ff60063 ffffffffffffffff
  (XEN)  L2[0x000] = 0000001033dff063 ffffffffffffffff
  (XEN)  L1[0x000] = 0000000000000000 ffffffffffffffff
  (XEN)
  (XEN) ****************************************
  (XEN) Panic on CPU 45:
  (XEN) FATAL PAGE FAULT
  (XEN) [error_code=0000]
  (XEN) Faulting linear address: 0000000000000010
  (XEN) ****************************************

Reported-by: Andrew Cooper <andrew.coop...@citrix.com>
Fixes: a8c6c623192e ("sched: clarify use cases of schedule_cpu_switch()")
Signed-off-by: Juergen Gross <jgr...@suse.com>
Acked-by: Andrew Cooper <andrew.coop...@citrix.com>
Tested-by: Andrew Cooper <andrew.coop...@citrix.com>

--- a/xen/common/sched/core.c
+++ b/xen/common/sched/core.c
@@ -3179,6 +3179,8 @@ int schedule_cpu_add(unsigned int cpu, s
 
     sr->scheduler = new_ops;
     sr->sched_priv = ppriv;
+    sr->granularity = cpupool_get_granularity(c);
+    sr->cpupool = c;
 
     /*
      * Reroute the lock to the per pCPU lock as /last/ thing. In fact,
@@ -3191,8 +3193,6 @@ int schedule_cpu_add(unsigned int cpu, s
     /* _Not_ pcpu_schedule_unlock(): schedule_lock has changed! */
     spin_unlock_irqrestore(old_lock, flags);
 
-    sr->granularity = cpupool_get_granularity(c);
-    sr->cpupool = c;
     /* The  cpu is added to a pool, trigger it to go pick up some work */
     cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ);
 

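As a generic illustration of the ordering rule this fix enforces (a hypothetical sketch using POSIX threads, not Xen code): every field that other CPUs may read once the resource becomes reachable has to be assigned inside the locked region, not after the unlock.

#include <pthread.h>
#include <stddef.h>

struct resource {
    pthread_mutex_t lock;
    void *pool;        /* read by other threads once the resource is visible */
    unsigned int granularity;
};

void attach(struct resource *r, void *pool, unsigned int granularity)
{
    pthread_mutex_lock(&r->lock);
    /* Assign everything concurrent readers rely on while still locked... */
    r->pool = pool;
    r->granularity = granularity;
    pthread_mutex_unlock(&r->lock);
    /* ...not here, where a reader could already observe r->pool == NULL. */
}
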
++++++ 66450627-x86-respect-mapcache_domain_init-failing.patch ++++++
# Commit 7270fdc7a0028d4b7b26fd1b36c6b9e97abcf3da
# Date 2024-05-15 19:59:52 +0100
# Author Jan Beulich <jbeul...@suse.com>
# Committer Andrew Cooper <andrew.coop...@citrix.com>
x86: respect mapcache_domain_init() failing

The function itself properly handles and hands onwards failure from
create_perdomain_mapping(). Therefore its caller should respect possible
failure, too.

Fixes: 4b28bf6ae90b ("x86: re-introduce map_domain_page() et al")
Signed-off-by: Jan Beulich <jbeul...@suse.com>
Acked-by: Roger Pau Monné <roger....@citrix.com>

--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -850,7 +850,8 @@ int arch_domain_create(struct domain *d,
     }
     else if ( is_pv_domain(d) )
     {
-        mapcache_domain_init(d);
+        if ( (rc = mapcache_domain_init(d)) != 0 )
+            goto fail;
 
         if ( (rc = pv_domain_initialise(d)) != 0 )
             goto fail;

++++++ 6646031f-x86-ucode-further-identify-already-up-to-date.patch ++++++
# Commit 977d98e67c2e929c62aa1f495fc4c6341c45abb5
# Date 2024-05-16 13:59:11 +0100
# Author Andrew Cooper <andrew.coop...@citrix.com>
# Committer Andrew Cooper <andrew.coop...@citrix.com>
x86/ucode: Further fixes to identify "ucode already up to date"

When the revision in hardware is newer than anything Xen has to hand,
'microcode_cache' isn't set up.  Then, `xen-ucode` initiates the update
because it doesn't know whether the revisions across the system are symmetric
or not.  This involves the patch getting all the way into the
apply_microcode() hooks before being found to be too old.

This is all a giant mess and needs an overhaul, but in the short term simply
adjust the apply_microcode() hooks to return -EEXIST.

Also, unconditionally print the preexisting microcode revision on boot.  It's
relevant information which is otherwise unavailable if Xen doesn't find new
microcode to use.

Fixes: 648db37a155a ("x86/ucode: Distinguish "ucode already up to date"")
Signed-off-by: Andrew Cooper <andrew.coop...@citrix.com>
Reviewed-by: Jan Beulich <jbeul...@suse.com>
Acked-by: Roger Pau Monné <roger....@citrix.com>

--- a/xen/arch/x86/cpu/microcode/amd.c
+++ b/xen/arch/x86/cpu/microcode/amd.c
@@ -222,12 +222,15 @@ static int cf_check apply_microcode(cons
     uint32_t rev, old_rev = sig->rev;
     enum microcode_match_result result = microcode_fits(patch);
 
+    if ( result == MIS_UCODE )
+        return -EINVAL;
+
     /*
      * Allow application of the same revision to pick up SMT-specific changes
      * even if the revision of the other SMT thread is already up-to-date.
      */
-    if ( result != NEW_UCODE && result != SAME_UCODE )
-        return -EINVAL;
+    if ( result == OLD_UCODE )
+        return -EEXIST;
 
     if ( check_final_patch_levels(sig) )
     {
--- a/xen/arch/x86/cpu/microcode/core.c
+++ b/xen/arch/x86/cpu/microcode/core.c
@@ -887,6 +887,8 @@ int __init early_microcode_init(unsigned
 
     ucode_ops.collect_cpu_info();
 
+    printk(XENLOG_INFO "BSP microcode revision: 0x%08x\n", this_cpu(cpu_sig).rev);
+
     /*
      * Some hypervisors deliberately report a microcode revision of -1 to
      * mean that they will not accept microcode updates.
--- a/xen/arch/x86/cpu/microcode/intel.c
+++ b/xen/arch/x86/cpu/microcode/intel.c
@@ -294,10 +294,13 @@ static int cf_check apply_microcode(cons
 
     result = microcode_update_match(patch);
 
-    if ( result != NEW_UCODE &&
-         !(opt_ucode_allow_same && result == SAME_UCODE) )
+    if ( result == MIS_UCODE )
         return -EINVAL;
 
+    if ( result == OLD_UCODE ||
+         (result == SAME_UCODE && !opt_ucode_allow_same) )
+        return -EEXIST;
+
     wbinvd();
 
     wrmsrl(MSR_IA32_UCODE_WRITE, (unsigned long)patch->data);

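A caller-side sketch of the return convention established above, mirroring the Intel hunk (the enum names come from the patch; the function itself is a hypothetical stand-in, not the real apply_microcode() hook):

#include <errno.h>

enum microcode_match_result { MIS_UCODE, OLD_UCODE, SAME_UCODE, NEW_UCODE };

/* Hypothetical stand-in: -EINVAL means "not applicable to this CPU",
 * -EEXIST means "hardware already up to date", 0 means "go ahead and load". */
static int classify(enum microcode_match_result result, int allow_same)
{
    if (result == MIS_UCODE)
        return -EINVAL;
    if (result == OLD_UCODE || (result == SAME_UCODE && !allow_same))
        return -EEXIST;
    return 0;
}
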
++++++ 6666ba52-x86-irq-remove-offline-CPUs-from-old-CPU-mask-when.patch ++++++

References: bsc#1214718

# Commit e63209d3ba2fd1b2f232babd14c9c679ffa7b09a
# Date 2024-06-10 10:33:22 +0200
# Author Roger Pau Monné <roger....@citrix.com>
# Committer Jan Beulich <jbeul...@suse.com>
x86/irq: remove offline CPUs from old CPU mask when adjusting move_cleanup_count

When adjusting move_cleanup_count to account for CPUs that are offline also
adjust old_cpu_mask, otherwise further calls to fixup_irqs() could subtract
those again and create an imbalance in move_cleanup_count.

Fixes: 472e0b74c5c4 ('x86/IRQ: deal with move cleanup count state in fixup_irqs()')
Signed-off-by: Roger Pau Monné <roger....@citrix.com>
Reviewed-by: Jan Beulich <jbeul...@suse.com>

--- a/xen/arch/x86/irq.c
+++ b/xen/arch/x86/irq.c
@@ -2576,6 +2576,14 @@ void fixup_irqs(const cpumask_t *mask, b
             desc->arch.move_cleanup_count -= cpumask_weight(affinity);
             if ( !desc->arch.move_cleanup_count )
                 release_old_vec(desc);
+            else
+                /*
+                 * Adjust old_cpu_mask to account for the offline CPUs,
+                 * otherwise further calls to fixup_irqs() could subtract those
+                 * again and possibly underflow the counter.
+                 */
+                cpumask_andnot(desc->arch.old_cpu_mask, desc->arch.old_cpu_mask,
+                               affinity);
         }
 
         if ( !desc->action || cpumask_subset(desc->affinity, mask) )

++++++ 666994ab-x86-SMP-no-shorthand-IPI-in-hotplug.patch ++++++

References: bsc#1214718

# Commit 171c52fba5d94e050d704770480dcb983490d0ad
# Date 2024-06-12 14:29:31 +0200
# Author Roger Pau Monné <roger....@citrix.com>
# Committer Jan Beulich <jbeul...@suse.com>
x86/smp: do not use shorthand IPI destinations in CPU hot{,un}plug contexts

Due to the current rwlock logic, if the CPU calling get_cpu_maps() does
so from a cpu_hotplug_{begin,done}() region the function will still
return success, because a CPU taking the rwlock in read mode after
having taken it in write mode is allowed.  Such a corner case makes using
get_cpu_maps() alone not enough to prevent using the shorthand in CPU
hotplug regions.

Introduce a new helper to detect whether the current caller is between a
cpu_hotplug_{begin,done}() region and use it in send_IPI_mask() to restrict
shorthand usage.

Fixes: 5500d265a2a8 ('x86/smp: use APIC ALLBUT destination shorthand when possible')
Signed-off-by: Roger Pau Monné <roger....@citrix.com>
Reviewed-by: Jan Beulich <jbeul...@suse.com>

--- a/xen/arch/x86/smp.c
+++ b/xen/arch/x86/smp.c
@@ -88,7 +88,7 @@ void send_IPI_mask(const cpumask_t *mask
      * the system have been accounted for.
      */
     if ( system_state > SYS_STATE_smp_boot &&
-         !unaccounted_cpus && !disabled_cpus &&
+         !unaccounted_cpus && !disabled_cpus && !cpu_in_hotplug_context() &&
          /* NB: get_cpu_maps lock requires enabled interrupts. */
          local_irq_is_enabled() && (cpus_locked = get_cpu_maps()) &&
          (park_offline_cpus ||
--- a/xen/common/cpu.c
+++ b/xen/common/cpu.c
@@ -68,6 +68,11 @@ void cpu_hotplug_done(void)
     write_unlock(&cpu_add_remove_lock);
 }
 
+bool cpu_in_hotplug_context(void)
+{
+    return rw_is_write_locked_by_me(&cpu_add_remove_lock);
+}
+
 static NOTIFIER_HEAD(cpu_chain);
 
 void __init register_cpu_notifier(struct notifier_block *nb)
--- a/xen/include/xen/cpu.h
+++ b/xen/include/xen/cpu.h
@@ -13,6 +13,16 @@ void put_cpu_maps(void);
 void cpu_hotplug_begin(void);
 void cpu_hotplug_done(void);
 
+/*
+ * Returns true when the caller CPU is between a cpu_hotplug_{begin,done}()
+ * region.
+ *
+ * This is required to safely identify hotplug contexts, as get_cpu_maps()
+ * would otherwise succeed because a caller holding the lock in write mode is
+ * allowed to acquire the same lock in read mode.
+ */
+bool cpu_in_hotplug_context(void);
+
 /* Receive notification of CPU hotplug events. */
 void register_cpu_notifier(struct notifier_block *nb);
 
--- a/xen/include/xen/rwlock.h
+++ b/xen/include/xen/rwlock.h
@@ -309,6 +309,8 @@ static always_inline void write_lock_irq
 
 #define rw_is_locked(l)               _rw_is_locked(l)
 #define rw_is_write_locked(l)         _rw_is_write_locked(l)
+#define rw_is_write_locked_by_me(l) \
+    lock_evaluate_nospec(_is_write_locked_by_me(atomic_read(&(l)->cnts)))
 
 
 typedef struct percpu_rwlock percpu_rwlock_t;

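A minimal sketch of the rwlock property the commit message describes (a hypothetical owner-tracking lock, not Xen's rwlock): a read trylock succeeds for the thread that already holds the lock in write mode, which is why an explicit "write-locked by me" query is needed to detect hotplug context.

#include <pthread.h>
#include <stdbool.h>

struct recursive_rwlock {
    pthread_t writer;      /* valid only while write_held is true */
    bool      write_held;  /* other readers/writers omitted for brevity */
};

static bool read_trylock(const struct recursive_rwlock *l)
{
    /* The write-lock owner may take the lock in read mode again, so
     * success here does not prove that no hotplug section is active. */
    return !l->write_held || pthread_equal(l->writer, pthread_self());
}

static bool write_locked_by_me(const struct recursive_rwlock *l)
{
    /* The kind of query cpu_in_hotplug_context() adds on top of the lock. */
    return l->write_held && pthread_equal(l->writer, pthread_self());
}
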
++++++ 666994f0-x86-IRQ-limit-interrupt-movement-in-fixup_irqs.patch ++++++

References: bsc#1214718

# Commit c7564d7366d865cc407e3d64bca816d07edee174
# Date 2024-06-12 14:30:40 +0200
# Author Roger Pau Monné <roger....@citrix.com>
# Committer Jan Beulich <jbeul...@suse.com>
x86/irq: limit interrupt movement done by fixup_irqs()

The current check used in fixup_irqs() to decide whether to move around
interrupts is based on the affinity mask, but such mask can have all bits set,
and hence is unlikely to be a subset of the input mask.  For example if an
interrupt has an affinity mask of all 1s, any input to fixup_irqs() that's not
an all set CPU mask would cause that interrupt to be shuffled around
unconditionally.

What fixup_irqs() cares about is evacuating interrupts from CPUs not set in the
input CPU mask, and for that purpose it should check whether the interrupt is
assigned to a CPU not present in the input mask.  Assume that ->arch.cpu_mask
is a subset of the ->affinity mask, and keep the current logic that resets the
->affinity mask if the interrupt has to be shuffled around.

Doing the affinity movement based on ->arch.cpu_mask requires removing the
special handling to ->arch.cpu_mask done for high priority vectors, otherwise
the adjustment done to cpu_mask makes them always skip the CPU interrupt
movement.

While there also adjust the comment as to the purpose of fixup_irqs().

Signed-off-by: Roger Pau Monné <roger....@citrix.com>
Reviewed-by: Jan Beulich <jbeul...@suse.com>

--- a/xen/arch/x86/include/asm/irq.h
+++ b/xen/arch/x86/include/asm/irq.h
@@ -132,7 +132,7 @@ void free_domain_pirqs(struct domain *d)
 int map_domain_emuirq_pirq(struct domain *d, int pirq, int emuirq);
 int unmap_domain_pirq_emuirq(struct domain *d, int pirq);
 
-/* Reset irq affinities to match the given CPU mask. */
+/* Evacuate interrupts assigned to CPUs not present in the input CPU mask. */
 void fixup_irqs(const cpumask_t *mask, bool verbose);
 void fixup_eoi(void);
 
--- a/xen/arch/x86/irq.c
+++ b/xen/arch/x86/irq.c
@@ -2529,7 +2529,7 @@ static int __init cf_check setup_dump_ir
 }
 __initcall(setup_dump_irqs);
 
-/* Reset irq affinities to match the given CPU mask. */
+/* Evacuate interrupts assigned to CPUs not present in the input CPU mask. */
 void fixup_irqs(const cpumask_t *mask, bool verbose)
 {
     unsigned int irq;
@@ -2553,19 +2553,15 @@ void fixup_irqs(const cpumask_t *mask, b
 
         vector = irq_to_vector(irq);
         if ( vector >= FIRST_HIPRIORITY_VECTOR &&
-             vector <= LAST_HIPRIORITY_VECTOR )
+             vector <= LAST_HIPRIORITY_VECTOR &&
+             desc->handler == &no_irq_type )
         {
-            cpumask_and(desc->arch.cpu_mask, desc->arch.cpu_mask, mask);
-
             /*
              * This can in particular happen when parking secondary threads
              * during boot and when the serial console wants to use a PCI IRQ.
              */
-            if ( desc->handler == &no_irq_type )
-            {
-                spin_unlock(&desc->lock);
-                continue;
-            }
+            spin_unlock(&desc->lock);
+            continue;
         }
 
         if ( desc->arch.move_cleanup_count )
@@ -2586,7 +2582,12 @@ void fixup_irqs(const cpumask_t *mask, b
                                affinity);
         }
 
-        if ( !desc->action || cpumask_subset(desc->affinity, mask) )
+        /*
+         * Avoid shuffling the interrupt around as long as current target CPUs
+         * are a subset of the input mask.  What fixup_irqs() cares about is
+         * evacuating interrupts from CPUs not in the input mask.
+         */
+        if ( !desc->action || cpumask_subset(desc->arch.cpu_mask, mask) )
         {
             spin_unlock(&desc->lock);
             continue;

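A generic stand-in for the subset test in the new condition (illustration only, using plain 64-bit masks rather than Xen's cpumask type): the interrupt is left alone only if every CPU it currently targets is also present in the input mask.

#include <stdbool.h>
#include <stdint.h>

/* True if no bit of the current target mask falls outside the input mask,
 * i.e. the interrupt does not need to be evacuated. */
static bool targets_subset_of(uint64_t cur_targets, uint64_t input_mask)
{
    return (cur_targets & ~input_mask) == 0;
}
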
++++++ 666b07ee-x86-EPT-special-page-in-epte_get_entry_emt.patch ++++++
# Commit 5540b94e8191059eb9cbbe98ac316232a42208f6
# Date 2024-06-13 16:53:34 +0200
# Author Jan Beulich <jbeul...@suse.com>
# Committer Jan Beulich <jbeul...@suse.com>
x86/EPT: correct special page checking in epte_get_entry_emt()

mfn_valid() granularity is (currently) 256Mb. Therefore the start of a
1Gb page passing the test doesn't necessarily mean all parts of such a
range would also pass. Yet using the result of mfn_to_page() on an MFN
which doesn't pass mfn_valid() checking is liable to result in a crash
(the invocation of mfn_to_page() alone is presumably "just" UB in such a
case).

Fixes: ca24b2ffdbd9 ("x86/hvm: set 'ipat' in EPT for special pages")
Signed-off-by: Jan Beulich <jbeul...@suse.com>
Reviewed-by: Roger Pau Monné <roger....@citrix.com>

--- a/xen/arch/x86/mm/p2m-ept.c
+++ b/xen/arch/x86/mm/p2m-ept.c
@@ -518,8 +518,12 @@ int epte_get_entry_emt(struct domain *d,
     }
 
     for ( special_pgs = i = 0; i < (1ul << order); i++ )
-        if ( is_special_page(mfn_to_page(mfn_add(mfn, i))) )
+    {
+        mfn_t cur = mfn_add(mfn, i);
+
+        if ( mfn_valid(cur) && is_special_page(mfn_to_page(cur)) )
             special_pgs++;
+    }
 
     if ( special_pgs )
     {

++++++ 666b0819-x86-EPT-avoid-marking-np-ents-for-reconfig.patch ++++++
# Commit 777c71d31325bc55ba1cc3f317d4155fe519ab0b
# Date 2024-06-13 16:54:17 +0200
# Author Jan Beulich <jbeul...@suse.com>
# Committer Jan Beulich <jbeul...@suse.com>
x86/EPT: avoid marking non-present entries for re-configuring

For non-present entries EMT, like most other fields, is meaningless to
hardware. Make the logic in ept_set_entry() setting the field (and iPAT)
conditional upon dealing with a present entry, leaving the value at 0
otherwise. This has two effects for epte_get_entry_emt() which we'll
want to leverage subsequently:
1) The call moved here now won't be issued with INVALID_MFN anymore (a
   respective BUG_ON() is being added).
2) Neither of the other two calls could now be issued with a truncated
   form of INVALID_MFN anymore (as long as there's no bug anywhere
   marking an entry present when that was populated using INVALID_MFN).

Signed-off-by: Jan Beulich <jbeul...@suse.com>
Reviewed-by: Roger Pau Monné <roger....@citrix.com>

--- a/xen/arch/x86/mm/p2m-ept.c
+++ b/xen/arch/x86/mm/p2m-ept.c
@@ -649,6 +649,8 @@ static int cf_check resolve_misconfig(st
             if ( e.emt != MTRR_NUM_TYPES )
                 break;
 
+            ASSERT(is_epte_present(&e));
+
             if ( level == 0 )
             {
                 for ( gfn -= i, i = 0; i < EPT_PAGETABLE_ENTRIES; ++i )
@@ -914,17 +916,6 @@ ept_set_entry(struct p2m_domain *p2m, gf
 
     if ( mfn_valid(mfn) || p2m_allows_invalid_mfn(p2mt) )
     {
-        bool ipat;
-        int emt = epte_get_entry_emt(p2m->domain, _gfn(gfn), mfn,
-                                     i * EPT_TABLE_ORDER, &ipat,
-                                     p2mt);
-
-        if ( emt >= 0 )
-            new_entry.emt = emt;
-        else /* ept_handle_misconfig() will need to take care of this. */
-            new_entry.emt = MTRR_NUM_TYPES;
-
-        new_entry.ipat = ipat;
         new_entry.sp = !!i;
         new_entry.sa_p2mt = p2mt;
         new_entry.access = p2ma;
@@ -940,6 +931,22 @@ ept_set_entry(struct p2m_domain *p2m, gf
             need_modify_vtd_table = 0;
 
         ept_p2m_type_to_flags(p2m, &new_entry);
+
+        if ( is_epte_present(&new_entry) )
+        {
+            bool ipat;
+            int emt = epte_get_entry_emt(p2m->domain, _gfn(gfn), mfn,
+                                         i * EPT_TABLE_ORDER, &ipat,
+                                         p2mt);
+
+            BUG_ON(mfn_eq(mfn, INVALID_MFN));
+
+            if ( emt >= 0 )
+                new_entry.emt = emt;
+            else /* ept_handle_misconfig() will need to take care of this. */
+                new_entry.emt = MTRR_NUM_TYPES;
+            new_entry.ipat = ipat;
+        }
     }
 
     if ( sve != -1 )

++++++ 666b085a-x86-EPT-drop-questionable-mfn_valid-from-.patch ++++++
# Commit 4fdd8d75566fdad06667a79ec0ce6f43cc466c54
# Date 2024-06-13 16:55:22 +0200
# Author Jan Beulich <jbeul...@suse.com>
# Committer Jan Beulich <jbeul...@suse.com>
x86/EPT: drop questionable mfn_valid() from epte_get_entry_emt()

mfn_valid() is RAM-focused; it will often return false for MMIO. Yet
access to actual MMIO space should not generally be restricted to UC
only; especially video frame buffer accesses are unduly affected by such
a restriction.

Since, as of 777c71d31325 ("x86/EPT: avoid marking non-present entries
for re-configuring"), the function won't be called with INVALID_MFN or,
worse, truncated forms thereof anymore, we can fully drop that check.

Fixes: 81fd0d3ca4b2 ("x86/hvm: simplify 'mmio_direct' check in epte_get_entry_emt()")
Signed-off-by: Jan Beulich <jbeul...@suse.com>
Reviewed-by: Roger Pau Monné <roger....@citrix.com>

--- a/xen/arch/x86/mm/p2m-ept.c
+++ b/xen/arch/x86/mm/p2m-ept.c
@@ -500,12 +500,6 @@ int epte_get_entry_emt(struct domain *d,
         return -1;
     }
 
-    if ( !mfn_valid(mfn) )
-    {
-        *ipat = true;
-        return X86_MT_UC;
-    }
-
     /*
      * Conditional must be kept in sync with the code in
      * {iomem,ioports}_{permit,deny}_access().

++++++ 667187cc-x86-Intel-unlock-CPUID-earlier.patch ++++++
# Commit fa4d026737a47cd1d66ffb797a29150b4453aa9f
# Date 2024-06-18 15:12:44 +0200
# Author Jan Beulich <jbeul...@suse.com>
# Committer Jan Beulich <jbeul...@suse.com>
x86/Intel: unlock CPUID earlier for the BSP

Intel CPUs have a MSR bit to limit CPUID enumeration to leaf two. If
this bit is set by the BIOS then CPUID evaluation does not work when
data from any leaf greater than two is needed; early_cpu_init() in
particular wants to collect leaf 7 data.

Cure this by unlocking CPUID right before evaluating anything which
depends on the maximum CPUID leaf being greater than two.

Inspired by (and description cloned from) Linux commit 0c2f6d04619e
("x86/topology/intel: Unlock CPUID before evaluating anything").

Signed-off-by: Jan Beulich <jbeul...@suse.com>
Reviewed-by: Roger Pau Monné <roger....@citrix.com>

--- a/xen/arch/x86/cpu/common.c
+++ b/xen/arch/x86/cpu/common.c
@@ -336,7 +336,8 @@ void __init early_cpu_init(bool verbose)
 
        c->x86_vendor = x86_cpuid_lookup_vendor(ebx, ecx, edx);
        switch (c->x86_vendor) {
-       case X86_VENDOR_INTEL:    actual_cpu = intel_cpu_dev;    break;
+       case X86_VENDOR_INTEL:    intel_unlock_cpuid_leaves(c);
+                                 actual_cpu = intel_cpu_dev;    break;
        case X86_VENDOR_AMD:      actual_cpu = amd_cpu_dev;      break;
        case X86_VENDOR_CENTAUR:  actual_cpu = centaur_cpu_dev;  break;
        case X86_VENDOR_SHANGHAI: actual_cpu = shanghai_cpu_dev; break;
--- a/xen/arch/x86/cpu/cpu.h
+++ b/xen/arch/x86/cpu/cpu.h
@@ -24,3 +24,5 @@ void amd_init_lfence(struct cpuinfo_x86
 void amd_init_ssbd(const struct cpuinfo_x86 *c);
 void amd_init_spectral_chicken(void);
 void detect_zen2_null_seg_behaviour(void);
+
+void intel_unlock_cpuid_leaves(struct cpuinfo_x86 *c);
--- a/xen/arch/x86/cpu/intel.c
+++ b/xen/arch/x86/cpu/intel.c
@@ -303,10 +303,24 @@ static void __init noinline intel_init_l
                ctxt_switch_masking = intel_ctxt_switch_masking;
 }
 
-static void cf_check early_init_intel(struct cpuinfo_x86 *c)
+/* Unmask CPUID levels if masked. */
+void intel_unlock_cpuid_leaves(struct cpuinfo_x86 *c)
 {
-       u64 misc_enable, disable;
+       uint64_t misc_enable, disable;
+
+       rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
+
+       disable = misc_enable & MSR_IA32_MISC_ENABLE_LIMIT_CPUID;
+       if (disable) {
+               wrmsrl(MSR_IA32_MISC_ENABLE, misc_enable & ~disable);
+               bootsym(trampoline_misc_enable_off) |= disable;
+               c->cpuid_level = cpuid_eax(0);
+               printk(KERN_INFO "revised cpuid level: %u\n", c->cpuid_level);
+       }
+}
 
+static void cf_check early_init_intel(struct cpuinfo_x86 *c)
+{
        /* Netburst reports 64 bytes clflush size, but does IO in 128 bytes */
        if (c->x86 == 15 && c->x86_cache_alignment == 64)
                c->x86_cache_alignment = 128;
@@ -315,16 +329,7 @@ static void cf_check early_init_intel(st
             bootsym(trampoline_misc_enable_off) & MSR_IA32_MISC_ENABLE_XD_DISABLE)
                printk(KERN_INFO "re-enabled NX (Execute Disable) protection\n");
 
-       /* Unmask CPUID levels and NX if masked: */
-       rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
-
-       disable = misc_enable & MSR_IA32_MISC_ENABLE_LIMIT_CPUID;
-       if (disable) {
-               wrmsrl(MSR_IA32_MISC_ENABLE, misc_enable & ~disable);
-               bootsym(trampoline_misc_enable_off) |= disable;
-               printk(KERN_INFO "revised cpuid level: %d\n",
-                      cpuid_eax(0));
-       }
+       intel_unlock_cpuid_leaves(c);
 
        /* CPUID workaround for Intel 0F33/0F34 CPU */
        if (boot_cpu_data.x86 == 0xF && boot_cpu_data.x86_model == 3 &&

++++++ 66718849-x86-IRQ-old_cpu_mask-in-fixup_irqs.patch ++++++

References: bsc#1214718

# Commit 817d1cd627be668c358d038f0fadbf7d24d417d3
# Date 2024-06-18 15:14:49 +0200
# Author Roger Pau Monné <roger....@citrix.com>
# Committer Jan Beulich <jbeul...@suse.com>
x86/irq: deal with old_cpu_mask for interrupts in movement in fixup_irqs()

Given the current logic it's possible for ->arch.old_cpu_mask to get out of
sync: if a CPU set in old_cpu_mask is offlined and then onlined
again without old_cpu_mask having been updated the data in the mask will no
longer be accurate, as when brought back online the CPU will no longer have
old_vector configured to handle the old interrupt source.

If there's an interrupt movement in progress, and the to be offlined CPU (which
is the call context) is in the old_cpu_mask, clear it and update the mask, so
it doesn't contain stale data.

Note that when the system is going down fixup_irqs() will be called by
smp_send_stop() from CPU 0 with a mask with only CPU 0 on it, effectively
asking to move all interrupts to the current caller (CPU 0) which is the only
CPU to remain online.  In that case we don't care to migrate interrupts that
are in the process of being moved, as it's likely we won't be able to move all
interrupts to CPU 0 due to vector shortage anyway.

Signed-off-by: Roger Pau Monné <roger....@citrix.com>
Reviewed-by: Jan Beulich <jbeul...@suse.com>

--- a/xen/arch/x86/irq.c
+++ b/xen/arch/x86/irq.c
@@ -2539,7 +2539,7 @@ void fixup_irqs(const cpumask_t *mask, b
     for ( irq = 0; irq < nr_irqs; irq++ )
     {
         bool break_affinity = false, set_affinity = true;
-        unsigned int vector;
+        unsigned int vector, cpu = smp_processor_id();
         cpumask_t *affinity = this_cpu(scratch_cpumask);
 
         if ( irq == 2 )
@@ -2582,6 +2582,33 @@ void fixup_irqs(const cpumask_t *mask, b
                                affinity);
         }
 
+        if ( desc->arch.move_in_progress &&
+             /*
+              * Only attempt to adjust the mask if the current CPU is going
+              * offline, otherwise the whole system is going down and leaving
+              * stale data in the masks is fine.
+              */
+             !cpu_online(cpu) &&
+             cpumask_test_cpu(cpu, desc->arch.old_cpu_mask) )
+        {
+            /*
+             * This CPU is going offline, remove it from ->arch.old_cpu_mask
+             * and possibly release the old vector if the old mask becomes
+             * empty.
+             *
+             * Note cleaning ->arch.old_cpu_mask is required if the CPU is
+             * brought offline and then online again, as when re-onlined the
+             * per-cpu vector table will no longer have ->arch.old_vector
+             * setup, and hence ->arch.old_cpu_mask would be stale.
+             */
+            cpumask_clear_cpu(cpu, desc->arch.old_cpu_mask);
+            if ( cpumask_empty(desc->arch.old_cpu_mask) )
+            {
+                desc->arch.move_in_progress = 0;
+                release_old_vec(desc);
+            }
+        }
+
         /*
          * Avoid shuffling the interrupt around as long as current target CPUs
          * are a subset of the input mask.  What fixup_irqs() cares about is

++++++ 6671885e-x86-IRQ-handle-moving-in-_assign_irq_vector.patch ++++++

References: bsc#1214718

# Commit 369558924a642bbb0cb731e9a3375958867cb17b
# Date 2024-06-18 15:15:10 +0200
# Author Roger Pau Monné <roger....@citrix.com>
# Committer Jan Beulich <jbeul...@suse.com>
x86/irq: handle moving interrupts in _assign_irq_vector()

Currently there's logic in fixup_irqs() that attempts to prevent
_assign_irq_vector() from failing, as fixup_irqs() is required to evacuate all
interrupts from the CPUs not present in the input mask.  The current logic in
fixup_irqs() is incomplete, as it doesn't deal with interrupts that have
move_cleanup_count > 0 and a non-empty ->arch.old_cpu_mask field.

Instead of attempting to fixup the interrupt descriptor in fixup_irqs() so that
_assign_irq_vector() cannot fail, introduce logic in _assign_irq_vector()
to deal with interrupts that have either move_{in_progress,cleanup_count} set
and no remaining online CPUs in ->arch.cpu_mask.

If _assign_irq_vector() is requested to move an interrupt in the state
described above, first attempt to see if ->arch.old_cpu_mask contains any valid
CPUs that could be used as fallback, and if that's the case do move the
interrupt back to the previous destination.  Note this is easier because the
vector hasn't been released yet, so there's no need to allocate and setup a new
vector on the destination.

Due to the logic in fixup_irqs() that clears offline CPUs from
->arch.old_cpu_mask (and releases the old vector if the mask becomes empty) it
shouldn't be possible to get into _assign_irq_vector() with
->arch.move_{in_progress,cleanup_count} set but no online CPUs in
->arch.old_cpu_mask.

However, if ->arch.move_{in_progress,cleanup_count} is set and the interrupt has
also changed affinity, it's possible the members of ->arch.old_cpu_mask are no
longer part of the affinity set; in that case move the interrupt to a different
CPU that is part of the provided mask and keep the current
->arch.old_{cpu_mask,vector} for the pending interrupt movement to be completed.

Signed-off-by: Roger Pau Monné <roger....@citrix.com>
Reviewed-by: Jan Beulich <jbeul...@suse.com>

--- a/xen/arch/x86/irq.c
+++ b/xen/arch/x86/irq.c
@@ -553,7 +553,58 @@ static int _assign_irq_vector(struct irq
     }
 
     if ( desc->arch.move_in_progress || desc->arch.move_cleanup_count )
-        return -EAGAIN;
+    {
+        /*
+         * If the current destination is online refuse to shuffle.  Retry after
+         * the in-progress movement has finished.
+         */
+        if ( cpumask_intersects(desc->arch.cpu_mask, &cpu_online_map) )
+            return -EAGAIN;
+
+        /*
+         * Due to the logic in fixup_irqs() that clears offlined CPUs from
+         * ->arch.old_cpu_mask it shouldn't be possible to get here with
+         * ->arch.move_{in_progress,cleanup_count} set and no online CPUs in
+         * ->arch.old_cpu_mask.
+         */
+        ASSERT(valid_irq_vector(desc->arch.old_vector));
+        ASSERT(cpumask_intersects(desc->arch.old_cpu_mask, &cpu_online_map));
+
+        if ( cpumask_intersects(desc->arch.old_cpu_mask, mask) )
+        {
+            /*
+             * Fallback to the old destination if moving is in progress and the
+             * current destination is to be offlined.  This is only possible if
+             * the CPUs in old_cpu_mask intersect with the affinity mask passed
+             * in the 'mask' parameter.
+             */
+            desc->arch.vector = desc->arch.old_vector;
+            cpumask_and(desc->arch.cpu_mask, desc->arch.old_cpu_mask, mask);
+
+            /* Undo any possibly done cleanup. */
+            for_each_cpu(cpu, desc->arch.cpu_mask)
+                per_cpu(vector_irq, cpu)[desc->arch.vector] = irq;
+
+            /* Cancel the pending move and release the current vector. */
+            desc->arch.old_vector = IRQ_VECTOR_UNASSIGNED;
+            cpumask_clear(desc->arch.old_cpu_mask);
+            desc->arch.move_in_progress = 0;
+            desc->arch.move_cleanup_count = 0;
+            if ( desc->arch.used_vectors )
+            {
+                ASSERT(test_bit(old_vector, desc->arch.used_vectors));
+                clear_bit(old_vector, desc->arch.used_vectors);
+            }
+
+            return 0;
+        }
+
+        /*
+         * There's an interrupt movement in progress but the destination(s) in
+         * ->arch.old_cpu_mask are not suitable given the 'mask' parameter, go
+         * through the full logic to find a new vector in a suitable CPU.
+         */
+    }
 
     err = -ENOSPC;
 
@@ -609,7 +660,22 @@ next:
         current_vector = vector;
         current_offset = offset;
 
-        if ( valid_irq_vector(old_vector) )
+        if ( desc->arch.move_in_progress || desc->arch.move_cleanup_count )
+        {
+            ASSERT(!cpumask_intersects(desc->arch.cpu_mask, &cpu_online_map));
+            /*
+             * Special case when evacuating an interrupt from a CPU to be
+             * offlined and the interrupt was already in the process of being
+             * moved.  Leave ->arch.old_{vector,cpu_mask} as-is and just
+             * replace ->arch.{cpu_mask,vector} with the new destination.
+             * Cleanup will be done normally for the old fields, just release
+             * the current vector here.
+             */
+            if ( desc->arch.used_vectors &&
+                 !test_and_clear_bit(old_vector, desc->arch.used_vectors) )
+                ASSERT_UNREACHABLE();
+        }
+        else if ( valid_irq_vector(old_vector) )
         {
             cpumask_and(desc->arch.old_cpu_mask, desc->arch.cpu_mask,
                         &cpu_online_map);
@@ -2620,33 +2686,6 @@ void fixup_irqs(const cpumask_t *mask, b
             continue;
         }
 
-        /*
-         * In order for the affinity adjustment below to be successful, we
-         * need _assign_irq_vector() to succeed. This in particular means
-         * clearing desc->arch.move_in_progress if this would otherwise
-         * prevent the function from succeeding. Since there's no way for the
-         * flag to get cleared anymore when there's no possible destination
-         * left (the only possibility then would be the IRQs enabled window
-         * after this loop), there's then also no race with us doing it here.
-         *
-         * Therefore the logic here and there need to remain in sync.
-         */
-        if ( desc->arch.move_in_progress &&
-             !cpumask_intersects(mask, desc->arch.cpu_mask) )
-        {
-            unsigned int cpu;
-
-            cpumask_and(affinity, desc->arch.old_cpu_mask, &cpu_online_map);
-
-            spin_lock(&vector_lock);
-            for_each_cpu(cpu, affinity)
-                per_cpu(vector_irq, cpu)[desc->arch.old_vector] = ~irq;
-            spin_unlock(&vector_lock);
-
-            release_old_vec(desc);
-            desc->arch.move_in_progress = 0;
-        }
-
         if ( !cpumask_intersects(mask, desc->affinity) )
         {
             break_affinity = true;

++++++ 6672c846-x86-xstate-initialisation-of-XSS-cache.patch ++++++
# Commit 9e6dbbe8bf400aacb99009ddffa91d2a0c312b39
# Date 2024-06-19 13:00:06 +0100
# Author Andrew Cooper <andrew.coop...@citrix.com>
# Committer Andrew Cooper <andrew.coop...@citrix.com>
x86/xstate: Fix initialisation of XSS cache

The clobbering of this_cpu(xcr0) and this_cpu(xss) to architecturally invalid
values is to force the subsequent set_xcr0() and set_msr_xss() to reload the
hardware register.

While XCR0 is reloaded in xstate_init(), MSR_XSS isn't.  This causes
get_msr_xss() to return the invalid value, and logic of the form:

    old = get_msr_xss();
    set_msr_xss(new);
    ...
    set_msr_xss(old);

to try and restore said invalid value.

The architecturally invalid value must be purged from the cache, meaning the
hardware register must be written at least once.  This in turn highlights that
the invalid value must only be used in the case that the hardware register is
available.

Fixes: f7f4a523927f ("x86/xstate: reset cached register values on resume")
Signed-off-by: Andrew Cooper <andrew.coop...@citrix.com>
Reviewed-by: Jan Beulich <jbeul...@suse.com>

--- a/xen/arch/x86/xstate.c
+++ b/xen/arch/x86/xstate.c
@@ -641,13 +641,6 @@ void xstate_init(struct cpuinfo_x86 *c)
         return;
     }
 
-    /*
-     * Zap the cached values to make set_xcr0() and set_msr_xss() really
-     * write it.
-     */
-    this_cpu(xcr0) = 0;
-    this_cpu(xss) = ~0;
-
     cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx);
     feature_mask = (((u64)edx << 32) | eax) & XCNTXT_MASK;
     BUG_ON(!valid_xcr0(feature_mask));
@@ -657,8 +650,19 @@ void xstate_init(struct cpuinfo_x86 *c)
      * Set CR4_OSXSAVE and run "cpuid" to get xsave_cntxt_size.
      */
     set_in_cr4(X86_CR4_OSXSAVE);
+
+    /*
+     * Zap the cached values to make set_xcr0() and set_msr_xss() really write
+     * the hardware register.
+     */
+    this_cpu(xcr0) = 0;
     if ( !set_xcr0(feature_mask) )
         BUG();
+    if ( cpu_has_xsaves )
+    {
+        this_cpu(xss) = ~0;
+        set_msr_xss(0);
+    }
 
     if ( bsp )
     {

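A hypothetical sketch (not Xen code) of the cached-register idiom this fix repairs: the cache is seeded with an impossible value so the first write is guaranteed to reach hardware, but that sentinel must then actually be overwritten by a write, or a later read of the cache hands the invalid value to save/restore logic.

#include <stdint.h>

static uint64_t cached = ~0ULL;               /* architecturally impossible */

static void hw_write(uint64_t v) { (void)v; } /* wrmsr in the real code */

static void set_reg(uint64_t v)
{
    if (v == cached)          /* skip redundant hardware writes */
        return;
    hw_write(v);
    cached = v;               /* only now is the sentinel flushed */
}

static uint64_t get_reg(void)
{
    return cached;            /* stale ~0 if set_reg() was never called */
}
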
++++++ 6672c847-x86-CPUID-XSAVE-dynamic-leaves.patch ++++++
# Commit 71cacfb035f4a78ee10970dc38a3baa04d387451
# Date 2024-06-19 13:00:06 +0100
# Author Andrew Cooper <andrew.coop...@citrix.com>
# Committer Andrew Cooper <andrew.coop...@citrix.com>
x86/cpuid: Fix handling of XSAVE dynamic leaves

[ This is a minimal backport of commit 71cacfb035f4 ("x86/cpuid: Fix handling
  of XSAVE dynamic leaves") to fix the bugs without depending on the large
  rework of XSTATE handling in Xen 4.19 ]

First, if XSAVE is available in hardware but not visible to the guest, the
dynamic leaves shouldn't be filled in.

Second, the comment concerning XSS state is wrong.  VT-x doesn't manage
host/guest state automatically, but there is provision for "host only" bits to
be set, so the implications are still accurate.

In Xen 4.18, no XSS states are supported, so it's safe to keep deferring to
real hardware.

Signed-off-by: Andrew Cooper <andrew.coop...@citrix.com>
Reviewed-by: Jan Beulich <jbeul...@suse.com>

--- a/xen/arch/x86/cpuid.c
+++ b/xen/arch/x86/cpuid.c
@@ -330,24 +330,20 @@ void guest_cpuid(const struct vcpu *v, u
     case XSTATE_CPUID:
         switch ( subleaf )
         {
-        case 1:
-            if ( p->xstate.xsavec || p->xstate.xsaves )
-            {
-                /*
-                 * TODO: Figure out what to do for XSS state.  VT-x manages
-                 * host vs guest MSR_XSS automatically, so as soon as we start
-                 * supporting any XSS states, the wrong XSS will be in
-                 * context.
-                 */
-                BUILD_BUG_ON(XSTATE_XSAVES_ONLY != 0);
-
-                /*
-                 * Read CPUID[0xD,0/1].EBX from hardware.  They vary with
-                 * enabled XSTATE, and appropraite XCR0|XSS are in context.
-                 */
+            /*
+             * Read CPUID[0xd,0/1].EBX from hardware.  They vary with enabled
+             * XSTATE, and the appropriate XCR0 is in context.
+             */
         case 0:
-                res->b = cpuid_count_ebx(leaf, subleaf);
-            }
+            if ( p->basic.xsave )
+                res->b = cpuid_count_ebx(0xd, 0);
+            break;
+
+        case 1:
+            /* This only works because Xen doesn't support XSS states yet. */
+            BUILD_BUG_ON(XSTATE_XSAVES_ONLY != 0);
+            if ( p->xstate.xsavec )
+                res->b = cpuid_count_ebx(0xd, 1);
             break;
         }
         break;

++++++ 6673ffdc-x86-IRQ-forward-pending-to-new-dest-in-fixup_irqs.patch ++++++

References: bsc#1214718

# Commit e2bb28d621584fce15c907002ddc7c6772644b64
# Date 2024-06-20 12:09:32 +0200
# Author Roger Pau Monné <roger....@citrix.com>
# Committer Jan Beulich <jbeul...@suse.com>
x86/irq: forward pending interrupts to new destination in fixup_irqs()

fixup_irqs() is used to evacuate interrupts from CPUs that are to be offlined.  Given
the CPU is to become offline, the normal migration logic used by Xen where the
vector in the previous target(s) is left configured until the interrupt is
received on the new destination is not suitable.

Instead attempt to do as much as possible in order to prevent losing
interrupts.  If fixup_irqs() is called from the CPU to be offlined (as is
currently the case for CPU hot unplug) attempt to forward pending vectors when
interrupts that target the current CPU are migrated to a different destination.

Additionally, for interrupts that have already been moved from the current CPU
prior to the call to fixup_irqs() but that haven't been delivered to the new
destination (iow: interrupts with move_in_progress set and the current CPU set
in ->arch.old_cpu_mask) also check whether the previous vector is pending and
forward it to the new destination.

This allows us to remove the window with interrupts enabled at the bottom of
fixup_irqs().  Such a window wasn't safe anyway: references to the CPU to become
offline are removed from interrupts masks, but the per-CPU vector_irq[] array
is not updated to reflect those changes (as the CPU is going offline anyway).

Signed-off-by: Roger Pau Monné <roger....@citrix.com>
Reviewed-by: Jan Beulich <jbeul...@suse.com>

--- a/xen/arch/x86/include/asm/apic.h
+++ b/xen/arch/x86/include/asm/apic.h
@@ -145,6 +145,11 @@ static __inline bool_t apic_isr_read(u8
             (vector & 0x1f)) & 1;
 }
 
+static inline bool apic_irr_read(unsigned int vector)
+{
+    return apic_read(APIC_IRR + (vector / 32 * 0x10)) & (1U << (vector % 32));
+}
+
 static __inline u32 get_apic_id(void) /* Get the physical APIC id */
 {
     u32 id = apic_read(APIC_ID);
--- a/xen/arch/x86/irq.c
+++ b/xen/arch/x86/irq.c
@@ -2604,7 +2604,7 @@ void fixup_irqs(const cpumask_t *mask, b
 
     for ( irq = 0; irq < nr_irqs; irq++ )
     {
-        bool break_affinity = false, set_affinity = true;
+        bool break_affinity = false, set_affinity = true, check_irr = false;
         unsigned int vector, cpu = smp_processor_id();
         cpumask_t *affinity = this_cpu(scratch_cpumask);
 
@@ -2658,6 +2658,25 @@ void fixup_irqs(const cpumask_t *mask, b
              cpumask_test_cpu(cpu, desc->arch.old_cpu_mask) )
         {
             /*
+             * This to be offlined CPU was the target of an interrupt that's
+             * been moved, and the new destination target hasn't yet
+             * acknowledged any interrupt from it.
+             *
+             * We know the interrupt is configured to target the new CPU at
+             * this point, so we can check IRR for any pending vectors and
+             * forward them to the new destination.
+             *
+             * Note that for the other case of an interrupt movement being in
+             * progress (move_cleanup_count being non-zero) we know the new
+             * destination has already acked at least one interrupt from this
+             * source, and hence there's no need to forward any stale
+             * interrupts.
+             */
+            if ( apic_irr_read(desc->arch.old_vector) )
+                send_IPI_mask(cpumask_of(cpumask_any(desc->arch.cpu_mask)),
+                              desc->arch.vector);
+
+            /*
              * This CPU is going offline, remove it from ->arch.old_cpu_mask
              * and possibly release the old vector if the old mask becomes
              * empty.
@@ -2697,6 +2716,14 @@ void fixup_irqs(const cpumask_t *mask, b
         if ( desc->handler->disable )
             desc->handler->disable(desc);
 
+        /*
+         * If the current CPU is going offline and is (one of) the target(s) of
+         * the interrupt, signal to check whether there are any pending vectors
+         * to be handled in the local APIC after the interrupt has been moved.
+         */
+        if ( !cpu_online(cpu) && cpumask_test_cpu(cpu, desc->arch.cpu_mask) )
+            check_irr = true;
+
         if ( desc->handler->set_affinity )
             desc->handler->set_affinity(desc, affinity);
         else if ( !(warned++) )
@@ -2707,6 +2734,18 @@ void fixup_irqs(const cpumask_t *mask, b
 
         cpumask_copy(affinity, desc->affinity);
 
+        if ( check_irr && apic_irr_read(vector) )
+            /*
+             * Forward pending interrupt to the new destination, this CPU is
+             * going offline and otherwise the interrupt would be lost.
+             *
+             * Do the IRR check as late as possible before releasing the irq
+             * desc in order for any in-flight interrupts to be delivered to
+             * the lapic.
+             */
+            send_IPI_mask(cpumask_of(cpumask_any(desc->arch.cpu_mask)),
+                          desc->arch.vector);
+
         spin_unlock(&desc->lock);
 
         if ( !verbose )
@@ -2718,11 +2757,6 @@ void fixup_irqs(const cpumask_t *mask, b
             printk("Broke affinity for IRQ%u, new: %*pb\n",
                    irq, CPUMASK_PR(affinity));
     }
-
-    /* That doesn't seem sufficient.  Give it 1ms. */
-    local_irq_enable();
-    mdelay(1);
-    local_irq_disable();
 }
 
 void fixup_eoi(void)

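For reference, a standalone illustration of the lookup arithmetic in the apic_irr_read() helper added above: the IRR is exposed as eight 32-bit words spaced 0x10 apart (APIC_IRR assumed to be the conventional 0x200 offset).

#include <stdio.h>

#define APIC_IRR 0x200   /* conventional offset of the first IRR word */

int main(void)
{
    unsigned int vector = 0x59;   /* arbitrary example vector */
    unsigned int reg = APIC_IRR + (vector / 32) * 0x10;
    unsigned int bit = vector % 32;

    /* vector 0x59 -> word at offset 0x220, bit 25 */
    printf("vector 0x%02x -> IRR word at offset 0x%03x, bit %u\n",
           vector, reg, bit);
    return 0;
}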