Hello community,

here is the log from the commit of package xen for openSUSE:Factory checked in 
at 2020-09-25 16:21:02
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Comparing /work/SRC/openSUSE:Factory/xen (Old)
 and      /work/SRC/openSUSE:Factory/.xen.new.4249 (New)
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

Package is "xen"

Fri Sep 25 16:21:02 2020 rev:291 rq:836146 version:4.14.0_08

Changes:
--------
--- /work/SRC/openSUSE:Factory/xen/xen.changes  2020-09-18 14:28:08.459274689 +0200
+++ /work/SRC/openSUSE:Factory/.xen.new.4249/xen.changes        2020-09-25 16:21:30.839358028 +0200
@@ -1,0 +2,59 @@
+Tue Sep 22 10:54:28 MDT 2020 - carn...@suse.com
+
+- bsc#1176339 - VUL-0: CVE-2020-25602: xen: x86 pv: Crash when
+  handling guest access to MSR_MISC_ENABLE (XSA-333)
+  5f6a05a0-pv-Handle-the-Intel-specific-MSR_MISC_ENABLE-correctly.patch
+- bsc#1176341 - VUL-0: CVE-2020-25598: xen: Missing unlock in
+  XENMEM_acquire_resource error path (XSA-334)
+  5f6a05b7-xen-memory-Dont-skip-the-RCU-unlock-path-in-acquire_resource.patch
+- bsc#1176343 - VUL-0: CVE-2020-25604: xen: race when migrating
+  timers between x86 HVM vCPU-s (XSA-336)
+  5f6a05dd-vpt-fix-race-when-migrating-timers-between-vCPUs.patch
+- bsc#1176344 - VUL-0: CVE-2020-25595: xen: PCI passthrough code
+  reading back hardware registers (XSA-337)
+  5f6a05fa-msi-get-rid-of-read_msi_msg.patch
+  5f6a061a-MSI-X-restrict-reading-of-table-PBA-bases-from-BARs.patch
+- bsc#1176346 - VUL-0: CVE-2020-25597: xen: once valid event
+  channels may not turn invalid (XSA-338)
+  5f6a062c-evtchn-relax-port_is_valid.patch
+- bsc#1176345 - VUL-0: CVE-2020-25596: xen: x86 pv guest kernel
+  DoS via SYSENTER (XSA-339)
+  5f6a065c-pv-Avoid-double-exception-injection.patch
+- bsc#1176347 - VUL-0: CVE-2020-25603: xen: Missing memory
+  barriers when accessing/allocating an event channel (XSA-340)
+  5f6a0674-xen-evtchn-Add-missing-barriers-when-accessing-allocating-an-event-channel.patch
+- bsc#1176348 - VUL-0: CVE-2020-25600: xen: out of bounds event
+  channels available to 32-bit x86 domains (XSA-342)
+  5f6a068e-evtchn-x86-enforce-correct-upper-limit-for-32-bit-guests.patch
+- bsc#1176349 - VUL-0: CVE-2020-25599: xen: races with
+  evtchn_reset() (XSA-343)
+  5f6a06be-evtchn-evtchn_reset-shouldnt-succeed-with-still-open-ports.patch
+  5f6a06e0-evtchn-convert-per-channel-lock-to-be-IRQ-safe.patch
+  5f6a06f2-evtchn-address-races-with-evtchn_reset.patch
+- bsc#1176350 - VUL-0: CVE-2020-25601: xen: lack of preemption in
+  evtchn_reset() / evtchn_destroy() (XSA-344)
+  5f6a071f-evtchn-arrange-for-preemption-in-evtchn_destroy.patch
+  5f6a0754-evtchn-arrange-for-preemption-in-evtchn_reset.patch
+- Upstream bug fix (bsc#1027519)
+  5f5b6951-x86-PV-64bit-segbase-consistency.patch
+
+-------------------------------------------------------------------
+Mon Sep 21 14:03:02 MDT 2020 - carn...@suse.com
+
+- Fix problems in xen.spec with building on aarch64
+
+-------------------------------------------------------------------
+Fri Sep 18 15:20:31 MDT 2020 - carn...@suse.com
+
+- Make use of %service_del_postun_without_restart while preserving
+  the old behavior for older distros.
+- In %post tools, remove unnecessary qemu symlinks.
+
+-------------------------------------------------------------------
+Thu Sep 17 11:11:11 UTC 2020 - oher...@suse.de
+
+- Fix error in xen-tools %post when linking pvgrub64.bin
+- Make paths below libexec more explicit
+- Create symlink also for pvgrub32.bin
+
+-------------------------------------------------------------------

New:
----
  5f5b6951-x86-PV-64bit-segbase-consistency.patch
  5f6a05a0-pv-Handle-the-Intel-specific-MSR_MISC_ENABLE-correctly.patch
  5f6a05b7-xen-memory-Dont-skip-the-RCU-unlock-path-in-acquire_resource.patch
  5f6a05dd-vpt-fix-race-when-migrating-timers-between-vCPUs.patch
  5f6a05fa-msi-get-rid-of-read_msi_msg.patch
  5f6a061a-MSI-X-restrict-reading-of-table-PBA-bases-from-BARs.patch
  5f6a062c-evtchn-relax-port_is_valid.patch
  5f6a065c-pv-Avoid-double-exception-injection.patch
  5f6a0674-xen-evtchn-Add-missing-barriers-when-accessing-allocating-an-event-channel.patch
  5f6a068e-evtchn-x86-enforce-correct-upper-limit-for-32-bit-guests.patch
  5f6a06be-evtchn-evtchn_reset-shouldnt-succeed-with-still-open-ports.patch
  5f6a06e0-evtchn-convert-per-channel-lock-to-be-IRQ-safe.patch
  5f6a06f2-evtchn-address-races-with-evtchn_reset.patch
  5f6a071f-evtchn-arrange-for-preemption-in-evtchn_destroy.patch
  5f6a0754-evtchn-arrange-for-preemption-in-evtchn_reset.patch

++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

Other differences:
------------------
++++++ xen.spec ++++++
--- /var/tmp/diff_new_pack.juoBW6/_old  2020-09-25 16:21:32.367359381 +0200
+++ /var/tmp/diff_new_pack.juoBW6/_new  2020-09-25 16:21:32.371359385 +0200
@@ -38,6 +38,7 @@
 %bcond_with    xen_stubdom
 %endif
 #
+%define qemu_arch i386
 %ifarch x86_64
 %define with_gdbsx 1
 %define with_dom0_support 1
@@ -45,6 +46,7 @@
 #
 %ifarch %arm aarch64
 %define with_dom0_support 1
+%define qemu_arch aarch64
 %endif
 #
 %define xen_install_suffix %{nil}
@@ -123,7 +125,7 @@
 BuildRequires:  pesign-obs-integration
 %endif
 
-Version:        4.14.0_06
+Version:        4.14.0_08
 Release:        0
 Summary:        Xen Virtualization: Hypervisor (aka VMM aka Microkernel)
 License:        GPL-2.0-only
@@ -163,6 +165,21 @@
 # Upstream patches
 Patch1:         5f1a9916-x86-S3-put-data-sregs-into-known-state.patch
 Patch2:         5f21b9fd-x86-cpuid-APIC-bit-clearing.patch
+Patch3:         5f5b6951-x86-PV-64bit-segbase-consistency.patch
+Patch4:         5f6a05a0-pv-Handle-the-Intel-specific-MSR_MISC_ENABLE-correctly.patch
+Patch5:         5f6a05b7-xen-memory-Dont-skip-the-RCU-unlock-path-in-acquire_resource.patch
+Patch6:         5f6a05dd-vpt-fix-race-when-migrating-timers-between-vCPUs.patch
+Patch7:         5f6a05fa-msi-get-rid-of-read_msi_msg.patch
+Patch8:         5f6a061a-MSI-X-restrict-reading-of-table-PBA-bases-from-BARs.patch
+Patch9:         5f6a062c-evtchn-relax-port_is_valid.patch
+Patch10:        5f6a065c-pv-Avoid-double-exception-injection.patch
+Patch11:        5f6a0674-xen-evtchn-Add-missing-barriers-when-accessing-allocating-an-event-channel.patch
+Patch12:        5f6a068e-evtchn-x86-enforce-correct-upper-limit-for-32-bit-guests.patch
+Patch13:        5f6a06be-evtchn-evtchn_reset-shouldnt-succeed-with-still-open-ports.patch
+Patch14:        5f6a06e0-evtchn-convert-per-channel-lock-to-be-IRQ-safe.patch
+Patch15:        5f6a06f2-evtchn-address-races-with-evtchn_reset.patch
+Patch16:        5f6a071f-evtchn-arrange-for-preemption-in-evtchn_destroy.patch
+Patch17:        5f6a0754-evtchn-arrange-for-preemption-in-evtchn_reset.patch
 # Our platform specific patches
 Patch400:       xen-destdir.patch
 Patch401:       vif-bridge-no-iptables.patch
@@ -392,6 +409,21 @@
 # Upstream patches
 %patch1 -p1
 %patch2 -p1
+%patch3 -p1
+%patch4 -p1
+%patch5 -p1
+%patch6 -p1
+%patch7 -p1
+%patch8 -p1
+%patch9 -p1
+%patch10 -p1
+%patch11 -p1
+%patch12 -p1
+%patch13 -p1
+%patch14 -p1
+%patch15 -p1
+%patch16 -p1
+%patch17 -p1
 # Our platform specific patches
 %patch400 -p1
 %patch401 -p1
@@ -478,8 +510,10 @@
 export GIT=$(type -P false)
 %ifarch aarch64
 # GCC10+ enables outline-atomics option by default and breaks the build, so disable it
+%if 0%{?suse_version} >= 1550
 export CFLAGS="%{optflags} -mno-outline-atomics"
 %endif
+%endif
 export EXTRA_CFLAGS_XEN_TOOLS="%{optflags}"
 export EXTRA_CFLAGS_QEMU_TRADITIONAL="%{optflags}"
 export SMBIOS_REL_DATE="$SMBIOS_REL_DATE"
@@ -505,13 +539,15 @@
 then
        : no changes?
 fi
+
 configure_flags=
+configure_flags="--with-system-qemu=%{_bindir}/qemu-system-%{qemu_arch}"
 %if %{with xen_stubdom}
-configure_flags=--enable-stubdom
+configure_flags="${configure_flags} --enable-stubdom"
 %else
 # change the/our default to daemon due to lack of stubdom
 sed -i~ 's/ XENSTORETYPE=domain$/ XENSTORETYPE=daemon/' tools/hotplug/Linux/launch-xenstore.in
-configure_flags=--disable-stubdom
+configure_flags="${configure_flags} --disable-stubdom"
 %endif
 export PYTHON="/usr/bin/python3"
 configure_flags="${configure_flags} --disable-qemu-traditional"
@@ -525,6 +561,7 @@
         --sbindir=%{_sbindir} \
         --libdir=%{_libdir} \
         --libexecdir=%{_libexecdir} \
+        --with-libexec-leaf-dir=%{name} \
         --datadir=%{_datadir} \
         --mandir=%{_mandir} \
         --includedir=%{_includedir} \
@@ -540,7 +577,6 @@
        --with-systemd-modules-load=%{with_systemd_modules_load} \
        --with-system-ovmf=%{_datadir}/qemu/ovmf-x86_64-ms.bin \
        --with-system-seabios=%{_datadir}/qemu/bios-256k.bin \
-       --with-system-qemu=%{_bindir}/qemu-system-i386 \
         ${configure_flags}
 make -C tools/include/xen-foreign %{?_smp_mflags}
 make %{?_smp_mflags}
@@ -798,15 +834,15 @@
 # preserve the path. For x86_64, create a simple wrapper that invokes
 # /usr/bin/qemu-system-i386
 # Using qemu-system-x86_64 will result in an incompatible VM
-%ifarch x86_64
+%ifarch x86_64 aarch64
 hardcoded_path_in_existing_domU_xml='/usr/lib/xen/bin'
 mkdir -vp %{buildroot}${hardcoded_path_in_existing_domU_xml}
-tee %{buildroot}${hardcoded_path_in_existing_domU_xml}/qemu-system-i386 << 'EOF'
+tee %{buildroot}${hardcoded_path_in_existing_domU_xml}/qemu-system-%{qemu_arch} << 'EOF'
 #!/bin/sh
 
-exec %{_bindir}/qemu-system-i386 "$@"
+exec %{_bindir}/qemu-system-%{qemu_arch} "$@"
 EOF
-chmod 0755 %{buildroot}${hardcoded_path_in_existing_domU_xml}/qemu-system-i386
+chmod 0755 %{buildroot}${hardcoded_path_in_existing_domU_xml}/qemu-system-%{qemu_arch}
 #
 unit='%{_libexecdir}/%{name}/bin/xendomains-wait-disks'
 mkdir -vp '%{buildroot}%{_libexecdir}/%{name}/bin'
@@ -947,7 +983,7 @@
 # 32 bit hypervisor no longer supported.  Remove dom0 tools.
 rm -rf %{buildroot}/%{_datadir}/doc
 rm -rf %{buildroot}/%{_datadir}/man
-rm -rf %{buildroot}/%{_libexecdir}/xen
+rm -rf %{buildroot}/%{_libexecdir}/%{name}
 rm -rf %{buildroot}/%{_libdir}/python*
 rm -rf %{buildroot}/%{_libdir}/ocaml*
 rm -rf %{buildroot}/%{_unitdir}
@@ -1059,8 +1095,8 @@
 /usr/lib/supportconfig/plugins/xen
 %dir /usr/lib/xen
 %dir /usr/lib/xen/bin
-/usr/lib/xen/bin/qemu-system-i386
-%{_libexecdir}/xen
+/usr/lib/xen/bin/qemu-system-%{qemu_arch}
+%{_libexecdir}/%{name}
 %exclude %{_libexecdir}/%{name}-tools-domU
 %ifarch x86_64
 %exclude %{_libexecdir}/%{name}/bin/xendomains-wait-disks
@@ -1253,32 +1289,17 @@
 %service_add_post xen-init-dom0.service
 %service_add_post xen-qemu-dom0-disk-backend.service
 
-if [ -f /usr/bin/qemu-img ]; then
-    if [ -f /usr/bin/qemu-img-xen ]; then
-        rm /usr/bin/qemu-img-xen
-    fi
-    rm -f %{_libexecdir}/xen/bin/qemu-img-xen
-    ln -s /usr/bin/qemu-img %{_libexecdir}/xen/bin/qemu-img-xen
-fi
-if [ -f /usr/bin/qemu-nbd ]; then
-    if [ -f /usr/bin/qemu-nbd-xen ]; then
-        rm /usr/bin/qemu-nbd-xen
-    fi
-    rm -f %{_libexecdir}/xen/bin/qemu-nbd-xen
-    ln -s /usr/bin/qemu-nbd %{_libexecdir}/xen/bin/qemu-nbd-xen
-fi
-if [ -f /usr/bin/qemu-io ]; then
-    rm -f %{_libexecdir}/xen/bin/qemu-io-xen
-    ln -s /usr/bin/qemu-io %{_libexecdir}/xen/bin/qemu-io-xen
-fi
 if [ -f /etc/default/grub ] && ! (/usr/bin/grep GRUB_CMDLINE_XEN /etc/default/grub >/dev/null); then
     echo '# Xen boot parameters for all Xen boots' >> /etc/default/grub
     echo 'GRUB_CMDLINE_XEN=""' >> /etc/default/grub
     echo '# Xen boot parameters for non-recovery Xen boots (in addition to GRUB_CMDLINE_XEN)' >> /etc/default/grub
     echo 'GRUB_CMDLINE_XEN_DEFAULT=""' >> /etc/default/grub
 fi
-if [ -f /usr/lib/grub2/x86_64-xen/grub.xen -a ! -f /usr/lib/xen/boot/pvgrub64.bin ]; then
-    ln -s /usr/lib/grub2/x86_64-xen/grub.xen /usr/lib/xen/boot/pvgrub64.bin
+if [ -f %{_datadir}/grub2/i386-xen/grub.xen ] && [ ! -f %{_libexecdir}/%{name}/boot/pvgrub32.bin ]; then
+ ln -sv %{_datadir}/grub2/i386-xen/grub.xen             %{_libexecdir}/%{name}/boot/pvgrub32.bin
+fi
+if [ -f %{_datadir}/grub2/x86_64-xen/grub.xen ] && [ ! -f %{_libexecdir}/%{name}/boot/pvgrub64.bin ]; then
+ ln -sv %{_datadir}/grub2/x86_64-xen/grub.xen             %{_libexecdir}/%{name}/boot/pvgrub64.bin
 fi
 
 %preun tools
@@ -1292,6 +1313,16 @@
 %service_del_preun xen-qemu-dom0-disk-backend.service
 
 %postun tools
+%if %{defined service_del_postun_without_restart}
+%service_del_postun_without_restart xencommons.service
+%service_del_postun_without_restart xendomains.service
+%service_del_postun_without_restart xen-watchdog.service
+%service_del_postun_without_restart xenstored.service
+%service_del_postun_without_restart xen-dom0-modules.service
+%service_del_postun_without_restart xenconsoled.service
+%service_del_postun_without_restart xen-init-dom0.service
+%service_del_postun_without_restart xen-qemu-dom0-disk-backend.service
+%else
 export DISABLE_RESTART_ON_UPDATE=yes
 %service_del_postun xencommons.service
 %service_del_postun xendomains.service
@@ -1301,6 +1332,7 @@
 %service_del_postun xenconsoled.service
 %service_del_postun xen-init-dom0.service
 %service_del_postun xen-qemu-dom0-disk-backend.service
+%endif
 
 %endif
 

++++++ 5f5b6951-x86-PV-64bit-segbase-consistency.patch ++++++
Subject: x86/pv: Fix consistency of 64bit segment bases
From: Andrew Cooper andrew.coop...@citrix.com Fri Sep 11 14:10:57 2020 +0200
Date: Fri Sep 11 14:10:57 2020 +0200:
Git: 431d52afd9438a3a126dfd787bd2d69b76906cb5

The comments in save_segments(), _toggle_guest_pt() and write_cr() are false.
The %fs and %gs bases can be updated at any time by the guest.

As a consequence, Xen's fs_base/etc tracking state is always stale when the
vcpu is in context, and must not be used to complete MSR_{FS,GS}_BASE reads, etc.

In particular, a sequence such as:

  wrmsr(MSR_FS_BASE, 0x1ull << 32);
  write_fs(__USER_DS);
  base = rdmsr(MSR_FS_BASE);

will return the stale base, not the new base.  This may cause a guest
kernel's context switching of userspace to malfunction.

Therefore:
 * Update save_segments(), _toggle_guest_pt() and read_msr() to always read
   the segment bases from hardware.
 * Update write_cr(), write_msr() and do_set_segment_base() to not waste
   time caching data which is instantly going to become stale again.
 * Provide comments explaining when the tracking state is and isn't stale.

This bug has been present for 14 years, but several bugfixes since have built
on and extended the original flawed logic.

Fixes: ba9adb737ba ("Apply stricter checking to RDMSR/WRMSR emulations.")
Fixes: c42494acb2f ("x86: fix FS/GS base handling when using the fsgsbase feature")
Fixes: eccc170053e ("x86/pv: Don't have %cr4.fsgsbase active behind a guest kernels back")
Signed-off-by: Andrew Cooper <andrew.coop...@citrix.com>
Reviewed-by: Jan Beulich <jbeul...@suse.com>
master commit: a5eaac9245f4f382a3cd0e9710e9d1cba7db20e4
master date: 2020-09-07 11:32:34 +0100

diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
index fee6c3931a..8351391cdd 100644
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -1562,6 +1562,16 @@ static void load_segments(struct vcpu *n)
     }
 }
 
+/*
+ * Record all guest segment state.  The guest can load segment selectors
+ * without trapping, which will also alter the 64bit FS/GS bases.  Arbitrary
+ * changes to bases can also be made with the WR{FS,GS}BASE instructions, when
+ * enabled.
+ *
+ * Guests however cannot use SWAPGS, so there is no mechanism to modify the
+ * inactive GS base behind Xen's back.  Therefore, Xen's copy of the inactive
+ * GS base is still accurate, and doesn't need reading back from hardware.
+ */
 static void save_segments(struct vcpu *v)
 {
     struct cpu_user_regs *regs = &v->arch.user_regs;
@@ -1572,14 +1582,15 @@ static void save_segments(struct vcpu *v)
     regs->fs = read_sreg(fs);
     regs->gs = read_sreg(gs);
 
-    /* %fs/%gs bases can only be stale if WR{FS,GS}BASE are usable. */
-    if ( (read_cr4() & X86_CR4_FSGSBASE) && !is_pv_32bit_vcpu(v) )
+    if ( !is_pv_32bit_vcpu(v) )
     {
-        v->arch.pv.fs_base = __rdfsbase();
+        unsigned long gs_base = rdgsbase();
+
+        v->arch.pv.fs_base = rdfsbase();
         if ( v->arch.flags & TF_kernel_mode )
-            v->arch.pv.gs_base_kernel = __rdgsbase();
+            v->arch.pv.gs_base_kernel = gs_base;
         else
-            v->arch.pv.gs_base_user = __rdgsbase();
+            v->arch.pv.gs_base_user = gs_base;
     }
 
     if ( regs->ds )
diff --git a/xen/arch/x86/pv/domain.c b/xen/arch/x86/pv/domain.c
index ec5a7d2dca..44e4ea2582 100644
--- a/xen/arch/x86/pv/domain.c
+++ b/xen/arch/x86/pv/domain.c
@@ -444,17 +444,19 @@ static void _toggle_guest_pt(struct vcpu *v)
 void toggle_guest_mode(struct vcpu *v)
 {
     const struct domain *d = v->domain;
+    unsigned long gs_base;
 
     ASSERT(!is_pv_32bit_vcpu(v));
 
-    /* %fs/%gs bases can only be stale if WR{FS,GS}BASE are usable. */
-    if ( read_cr4() & X86_CR4_FSGSBASE )
-    {
-        if ( v->arch.flags & TF_kernel_mode )
-            v->arch.pv.gs_base_kernel = __rdgsbase();
-        else
-            v->arch.pv.gs_base_user = __rdgsbase();
-    }
+    /*
+     * Update the cached value of the GS base about to become inactive, as a
+     * subsequent context switch won't bother re-reading it.
+     */
+    gs_base = rdgsbase();
+    if ( v->arch.flags & TF_kernel_mode )
+        v->arch.pv.gs_base_kernel = gs_base;
+    else
+        v->arch.pv.gs_base_user = gs_base;
     asm volatile ( "swapgs" );
 
     _toggle_guest_pt(v);
diff --git a/xen/arch/x86/pv/emul-priv-op.c b/xen/arch/x86/pv/emul-priv-op.c
index 254da2b849..7c21076dd0 100644
--- a/xen/arch/x86/pv/emul-priv-op.c
+++ b/xen/arch/x86/pv/emul-priv-op.c
@@ -801,17 +801,6 @@ static int write_cr(unsigned int reg, unsigned long val,
     }
 
     case 4: /* Write CR4 */
-        /*
-         * If this write will disable FSGSBASE, refresh Xen's idea of the
-         * guest bases now that they can no longer change.
-         */
-        if ( (curr->arch.pv.ctrlreg[4] & X86_CR4_FSGSBASE) &&
-             !(val & X86_CR4_FSGSBASE) )
-        {
-            curr->arch.pv.fs_base = __rdfsbase();
-            curr->arch.pv.gs_base_kernel = __rdgsbase();
-        }
-
         curr->arch.pv.ctrlreg[4] = pv_fixup_guest_cr4(curr, val);
         write_cr4(pv_make_cr4(curr));
         ctxt_switch_levelling(curr);
@@ -860,15 +849,13 @@ static int read_msr(unsigned int reg, uint64_t *val,
     case MSR_FS_BASE:
         if ( is_pv_32bit_domain(currd) )
             break;
-        *val = (read_cr4() & X86_CR4_FSGSBASE) ? __rdfsbase()
-                                               : curr->arch.pv.fs_base;
+        *val = rdfsbase();
         return X86EMUL_OKAY;
 
     case MSR_GS_BASE:
         if ( is_pv_32bit_domain(currd) )
             break;
-        *val = (read_cr4() & X86_CR4_FSGSBASE) ? __rdgsbase()
-                                               : curr->arch.pv.gs_base_kernel;
+        *val = rdgsbase();
         return X86EMUL_OKAY;
 
     case MSR_SHADOW_GS_BASE:
@@ -997,14 +984,12 @@ static int write_msr(unsigned int reg, uint64_t val,
         if ( is_pv_32bit_domain(currd) || !is_canonical_address(val) )
             break;
         wrfsbase(val);
-        curr->arch.pv.fs_base = val;
         return X86EMUL_OKAY;
 
     case MSR_GS_BASE:
         if ( is_pv_32bit_domain(currd) || !is_canonical_address(val) )
             break;
         wrgsbase(val);
-        curr->arch.pv.gs_base_kernel = val;
         return X86EMUL_OKAY;
 
     case MSR_SHADOW_GS_BASE:
diff --git a/xen/arch/x86/x86_64/mm.c b/xen/arch/x86/x86_64/mm.c
index 3b726f7c00..48fd60a876 100644
--- a/xen/arch/x86/x86_64/mm.c
+++ b/xen/arch/x86/x86_64/mm.c
@@ -1002,10 +1002,7 @@ long do_set_segment_base(unsigned int which, unsigned long base)
     {
     case SEGBASE_FS:
         if ( is_canonical_address(base) )
-        {
             wrfsbase(base);
-            v->arch.pv.fs_base = base;
-        }
         else
             ret = -EINVAL;
         break;
@@ -1022,10 +1019,7 @@ long do_set_segment_base(unsigned int which, unsigned long base)
 
     case SEGBASE_GS_KERNEL:
         if ( is_canonical_address(base) )
-        {
             wrgsbase(base);
-            v->arch.pv.gs_base_kernel = base;
-        }
         else
             ret = -EINVAL;
         break;
diff --git a/xen/include/asm-x86/domain.h b/xen/include/asm-x86/domain.h
index 6fd94c2e14..40385f5eaa 100644
--- a/xen/include/asm-x86/domain.h
+++ b/xen/include/asm-x86/domain.h
@@ -516,7 +516,24 @@ struct pv_vcpu
     bool_t syscall32_disables_events;
     bool_t sysenter_disables_events;
 
-    /* Segment base addresses. */
+    /*
+     * 64bit segment bases.
+     *
+     * FS and the active GS are always stale when the vCPU is in context, as
+     * the guest can change them behind Xen's back with MOV SREG, or
+     * WR{FS,GS}BASE on capable hardware.
+     *
+     * The inactive GS base is never stale, as guests can't use SWAPGS to
+     * access it - all modification is performed by Xen either directly
+     * (hypercall, #GP emulation), or indirectly (toggle_guest_mode()).
+     *
+     * The vCPU context switch path is optimised based on this fact, so any
+     * path updating or swapping the inactive base must update the cached
+     * value as well.
+     *
+     * Which GS base is active and inactive depends on whether the vCPU is in
+     * user or kernel context.
+     */
     unsigned long fs_base;
     unsigned long gs_base_kernel;
     unsigned long gs_base_user;
++++++ 5f6a05a0-pv-Handle-the-Intel-specific-MSR_MISC_ENABLE-correctly.patch ++++++
Subject: x86/pv: Handle the Intel-specific MSR_MISC_ENABLE correctly
From: Andrew Cooper andrew.coop...@citrix.com Tue Sep 22 16:09:36 2020 +0200
Date: Tue Sep 22 16:09:36 2020 +0200:
Git: b04d6731eedd639e078e0f0d8147c6b156875ac3

This MSR doesn't exist on AMD hardware, and switching away from the safe
functions in the common MSR path was an erroneous change.

Partially revert the change.

This is XSA-333.

Fixes: 4fdc932b3cc ("x86/Intel: drop another 32-bit leftover")
Signed-off-by: Andrew Cooper <andrew.coop...@citrix.com>
Reviewed-by: Jan Beulich <jbeul...@suse.com>
Reviewed-by: Wei Liu <w...@xen.org>

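For background, a minimal sketch of the distinction the partial revert
restores (rdmsrl() and rdmsr_safe() are the hypervisor's existing helpers;
the error handling shown is invented for illustration):

    /* rdmsrl() assumes the MSR exists; the #GP raised by a missing MSR
     * (e.g. MSR_MISC_ENABLE on AMD) is fatal to Xen.  rdmsr_safe()
     * instead catches that fault and reports it via its return value. */
    uint64_t val;

    if ( rdmsr_safe(MSR_IA32_MISC_ENABLE, val) )
        return X86EMUL_UNHANDLEABLE;   /* hypothetical: caller injects #GP */

    val = guest_misc_enable(val);      /* sanitised guest view of the MSR */
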
diff --git a/xen/arch/x86/pv/emul-priv-op.c b/xen/arch/x86/pv/emul-priv-op.c
index 7c21076dd0..85a9fd4767 100644
--- a/xen/arch/x86/pv/emul-priv-op.c
+++ b/xen/arch/x86/pv/emul-priv-op.c
@@ -913,7 +913,8 @@ static int read_msr(unsigned int reg, uint64_t *val,
         return X86EMUL_OKAY;
 
     case MSR_IA32_MISC_ENABLE:
-        rdmsrl(reg, *val);
+        if ( rdmsr_safe(reg, *val) )
+            break;
         *val = guest_misc_enable(*val);
         return X86EMUL_OKAY;
 
@@ -1053,7 +1054,8 @@ static int write_msr(unsigned int reg, uint64_t val,
         break;
 
     case MSR_IA32_MISC_ENABLE:
-        rdmsrl(reg, temp);
+        if ( rdmsr_safe(reg, temp) )
+            break;
         if ( val != guest_misc_enable(temp) )
             goto invalid;
         return X86EMUL_OKAY;
++++++ 5f6a05b7-xen-memory-Dont-skip-the-RCU-unlock-path-in-acquire_resource.patch ++++++
Subject: xen/memory: Don't skip the RCU unlock path in acquire_resource()
From: Andrew Cooper andrew.coop...@citrix.com Tue Sep 22 16:09:59 2020 +0200
Date: Tue Sep 22 16:09:59 2020 +0200:
Git: 5eab5f0543e4f5fc52f7e2d823a29a6b1567fc16

In the case that an HVM Stubdomain makes an XENMEM_acquire_resource hypercall,
the FIXME path will bypass rcu_unlock_domain() on the way out of the function.

Move the check to the start of the function.  This does change the behaviour
of the get-size path for HVM Stubdomains, but that functionality is currently
broken and unused anyway, as well as being quite useless to entities which
can't actually map the resource anyway.

This is XSA-334.

Fixes: 83fa6552ce ("common: add a new mappable resource type: XENMEM_resource_grant_table")
Signed-off-by: Andrew Cooper <andrew.coop...@citrix.com>
Reviewed-by: Jan Beulich <jbeul...@suse.com>

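To illustrate the hazard class (a hedged sketch, not the patched function;
rcu_lock_domain_by_any_id() and rcu_unlock_domain() are Xen's real
primitives, everything else is invented):

    static int sketch_acquire(domid_t domid, bool privileged)
    {
        struct domain *d = rcu_lock_domain_by_any_id(domid);
        int rc = 0;

        if ( d == NULL )
            return -ESRCH;

        if ( !privileged )
        {
            rc = -EACCES;
            goto out;   /* never plain "return" here: the RCU lock is held */
        }

        /* ... operate on d ... */

     out:
        rcu_unlock_domain(d);
        return rc;
    }

Moving the privilege check ahead of any locking, as the patch does, removes
the need for such an error path entirely.
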
diff --git a/xen/common/memory.c b/xen/common/memory.c
index 714077c1e5..1bab0e80c2 100644
--- a/xen/common/memory.c
+++ b/xen/common/memory.c
@@ -1058,6 +1058,14 @@ static int acquire_resource(
     xen_pfn_t mfn_list[32];
     int rc;
 
+    /*
+     * FIXME: Until foreign pages inserted into the P2M are properly
+     *        reference counted, it is unsafe to allow mapping of
+     *        resource pages unless the caller is the hardware domain.
+     */
+    if ( paging_mode_translate(currd) && !is_hardware_domain(currd) )
+        return -EACCES;
+
     if ( copy_from_guest(&xmar, arg, 1) )
         return -EFAULT;
 
@@ -1114,14 +1122,6 @@ static int acquire_resource(
         xen_pfn_t gfn_list[ARRAY_SIZE(mfn_list)];
         unsigned int i;
 
-        /*
-         * FIXME: Until foreign pages inserted into the P2M are properly
-         *        reference counted, it is unsafe to allow mapping of
-         *        resource pages unless the caller is the hardware domain.
-         */
-        if ( !is_hardware_domain(currd) )
-            return -EACCES;
-
         if ( copy_from_guest(gfn_list, xmar.frame_list, xmar.nr_frames) )
             rc = -EFAULT;
 
++++++ 5f6a05dd-vpt-fix-race-when-migrating-timers-between-vCPUs.patch ++++++
Subject: x86/vpt: fix race when migrating timers between vCPUs
From: Roger Pau Monné roger....@citrix.com Tue Sep 22 16:10:37 2020 +0200
Date: Tue Sep 22 16:10:37 2020 +0200:
Git: fc8200a6ad5248d8fb6b0fbf3f3dc80f29dfe0c7

The current vPT code will migrate the emulated timers between vCPUs
(change the pt->vcpu field) while just holding the destination lock,
either from create_periodic_time or pt_adjust_global_vcpu_target if
the global target is adjusted. Changing the periodic_timer vCPU field
in this way creates a race where a third party could grab the lock in
the unlocked region of pt_adjust_global_vcpu_target (or before
create_periodic_time performs the vcpu change) and then release the
lock from a different vCPU, creating a locking imbalance.

Introduce a per-domain rwlock in order to protect periodic_time
migration between vCPU lists. Taking the lock in read mode prevents
any timer from being migrated to a different vCPU, while taking it in
write mode allows performing migration of timers across vCPUs. The
per-vcpu locks are still used to protect all the other fields from the
periodic_timer struct.

Note that such migration shouldn't happen frequently, and hence
there's no performance drop as a result of such locking.

This is XSA-336.

Reported-by: Igor Druzhinin <igor.druzhi...@citrix.com>
Tested-by: Igor Druzhinin <igor.druzhi...@citrix.com>
Signed-off-by: Roger Pau Monné <roger....@citrix.com>
Reviewed-by: Jan Beulich <jbeul...@suse.com>

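The essence of the scheme, as a hedged sketch (pt_migrate and tm_lock are
the real fields; the two fragments condense the diff below):

    /* Reader side: every walk of a vCPU's timer list nests the per-vCPU
     * spinlock inside the domain-wide rwlock taken in read mode, so the
     * pt->vcpu fields it traverses cannot change underneath it. */
    read_lock(&v->domain->arch.hvm.pl_time->pt_migrate);
    spin_lock(&v->arch.hvm.tm_lock);

    /* Writer side: changing pt->vcpu takes the same rwlock in write mode,
     * excluding all readers on every vCPU of the domain at once. */
    write_lock(&d->arch.hvm.pl_time->pt_migrate);
    pt->vcpu = new_target;                  /* new_target: invented name */
    write_unlock(&d->arch.hvm.pl_time->pt_migrate);
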
diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c
index 5bb47583b3..f965ca3155 100644
--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -658,6 +658,8 @@ int hvm_domain_initialise(struct domain *d)
     /* need link to containing domain */
     d->arch.hvm.pl_time->domain = d;
 
+    rwlock_init(&d->arch.hvm.pl_time->pt_migrate);
+
     /* Set the default IO Bitmap. */
     if ( is_hardware_domain(d) )
     {
diff --git a/xen/arch/x86/hvm/vpt.c b/xen/arch/x86/hvm/vpt.c
index 47f2c2aa64..d64467b631 100644
--- a/xen/arch/x86/hvm/vpt.c
+++ b/xen/arch/x86/hvm/vpt.c
@@ -153,23 +153,32 @@ static int pt_irq_masked(struct periodic_time *pt)
     return 1;
 }
 
-static void pt_lock(struct periodic_time *pt)
+static void pt_vcpu_lock(struct vcpu *v)
 {
-    struct vcpu *v;
+    read_lock(&v->domain->arch.hvm.pl_time->pt_migrate);
+    spin_lock(&v->arch.hvm.tm_lock);
+}
 
-    for ( ; ; )
-    {
-        v = pt->vcpu;
-        spin_lock(&v->arch.hvm.tm_lock);
-        if ( likely(pt->vcpu == v) )
-            break;
-        spin_unlock(&v->arch.hvm.tm_lock);
-    }
+static void pt_vcpu_unlock(struct vcpu *v)
+{
+    spin_unlock(&v->arch.hvm.tm_lock);
+    read_unlock(&v->domain->arch.hvm.pl_time->pt_migrate);
+}
+
+static void pt_lock(struct periodic_time *pt)
+{
+    /*
+     * We cannot use pt_vcpu_lock here, because we need to acquire the
+     * per-domain lock first and then (re-)fetch the value of pt->vcpu, or
+     * else we might be using a stale value of pt->vcpu.
+     */
+    read_lock(&pt->vcpu->domain->arch.hvm.pl_time->pt_migrate);
+    spin_lock(&pt->vcpu->arch.hvm.tm_lock);
 }
 
 static void pt_unlock(struct periodic_time *pt)
 {
-    spin_unlock(&pt->vcpu->arch.hvm.tm_lock);
+    pt_vcpu_unlock(pt->vcpu);
 }
 
 static void pt_process_missed_ticks(struct periodic_time *pt)
@@ -219,7 +228,7 @@ void pt_save_timer(struct vcpu *v)
     if ( v->pause_flags & VPF_blocked )
         return;
 
-    spin_lock(&v->arch.hvm.tm_lock);
+    pt_vcpu_lock(v);
 
     list_for_each_entry ( pt, head, list )
         if ( !pt->do_not_freeze )
@@ -227,7 +236,7 @@ void pt_save_timer(struct vcpu *v)
 
     pt_freeze_time(v);
 
-    spin_unlock(&v->arch.hvm.tm_lock);
+    pt_vcpu_unlock(v);
 }
 
 void pt_restore_timer(struct vcpu *v)
@@ -235,7 +244,7 @@ void pt_restore_timer(struct vcpu *v)
     struct list_head *head = &v->arch.hvm.tm_list;
     struct periodic_time *pt;
 
-    spin_lock(&v->arch.hvm.tm_lock);
+    pt_vcpu_lock(v);
 
     list_for_each_entry ( pt, head, list )
     {
@@ -248,7 +257,7 @@ void pt_restore_timer(struct vcpu *v)
 
     pt_thaw_time(v);
 
-    spin_unlock(&v->arch.hvm.tm_lock);
+    pt_vcpu_unlock(v);
 }
 
 static void pt_timer_fn(void *data)
@@ -309,7 +318,7 @@ int pt_update_irq(struct vcpu *v)
     int irq, pt_vector = -1;
     bool level;
 
-    spin_lock(&v->arch.hvm.tm_lock);
+    pt_vcpu_lock(v);
 
     earliest_pt = NULL;
     max_lag = -1ULL;
@@ -339,7 +348,7 @@ int pt_update_irq(struct vcpu *v)
 
     if ( earliest_pt == NULL )
     {
-        spin_unlock(&v->arch.hvm.tm_lock);
+        pt_vcpu_unlock(v);
         return -1;
     }
 
@@ -347,7 +356,7 @@ int pt_update_irq(struct vcpu *v)
     irq = earliest_pt->irq;
     level = earliest_pt->level;
 
-    spin_unlock(&v->arch.hvm.tm_lock);
+    pt_vcpu_unlock(v);
 
     switch ( earliest_pt->source )
     {
@@ -394,7 +403,7 @@ int pt_update_irq(struct vcpu *v)
                 time_cb *cb = NULL;
                 void *cb_priv;
 
-                spin_lock(&v->arch.hvm.tm_lock);
+                pt_vcpu_lock(v);
                 /* Make sure the timer is still on the list. */
                 list_for_each_entry ( pt, &v->arch.hvm.tm_list, list )
                     if ( pt == earliest_pt )
@@ -404,7 +413,7 @@ int pt_update_irq(struct vcpu *v)
                         cb_priv = pt->priv;
                         break;
                     }
-                spin_unlock(&v->arch.hvm.tm_lock);
+                pt_vcpu_unlock(v);
 
                 if ( cb != NULL )
                     cb(v, cb_priv);
@@ -441,12 +450,12 @@ void pt_intr_post(struct vcpu *v, struct hvm_intack intack)
     if ( intack.source == hvm_intsrc_vector )
         return;
 
-    spin_lock(&v->arch.hvm.tm_lock);
+    pt_vcpu_lock(v);
 
     pt = is_pt_irq(v, intack);
     if ( pt == NULL )
     {
-        spin_unlock(&v->arch.hvm.tm_lock);
+        pt_vcpu_unlock(v);
         return;
     }
 
@@ -455,7 +464,7 @@ void pt_intr_post(struct vcpu *v, struct hvm_intack intack)
     cb = pt->cb;
     cb_priv = pt->priv;
 
-    spin_unlock(&v->arch.hvm.tm_lock);
+    pt_vcpu_unlock(v);
 
     if ( cb != NULL )
         cb(v, cb_priv);
@@ -466,12 +475,12 @@ void pt_migrate(struct vcpu *v)
     struct list_head *head = &v->arch.hvm.tm_list;
     struct periodic_time *pt;
 
-    spin_lock(&v->arch.hvm.tm_lock);
+    pt_vcpu_lock(v);
 
     list_for_each_entry ( pt, head, list )
         migrate_timer(&pt->timer, v->processor);
 
-    spin_unlock(&v->arch.hvm.tm_lock);
+    pt_vcpu_unlock(v);
 }
 
 void create_periodic_time(
@@ -490,7 +499,7 @@ void create_periodic_time(
 
     destroy_periodic_time(pt);
 
-    spin_lock(&v->arch.hvm.tm_lock);
+    write_lock(&v->domain->arch.hvm.pl_time->pt_migrate);
 
     pt->pending_intr_nr = 0;
     pt->do_not_freeze = 0;
@@ -540,7 +549,7 @@ void create_periodic_time(
     init_timer(&pt->timer, pt_timer_fn, pt, v->processor);
     set_timer(&pt->timer, pt->scheduled);
 
-    spin_unlock(&v->arch.hvm.tm_lock);
+    write_unlock(&v->domain->arch.hvm.pl_time->pt_migrate);
 }
 
 void destroy_periodic_time(struct periodic_time *pt)
@@ -565,30 +574,20 @@ void destroy_periodic_time(struct periodic_time *pt)
 
 static void pt_adjust_vcpu(struct periodic_time *pt, struct vcpu *v)
 {
-    int on_list;
-
     ASSERT(pt->source == PTSRC_isa || pt->source == PTSRC_ioapic);
 
     if ( pt->vcpu == NULL )
         return;
 
-    pt_lock(pt);
-    on_list = pt->on_list;
-    if ( pt->on_list )
-        list_del(&pt->list);
-    pt->on_list = 0;
-    pt_unlock(pt);
-
-    spin_lock(&v->arch.hvm.tm_lock);
+    write_lock(&pt->vcpu->domain->arch.hvm.pl_time->pt_migrate);
     pt->vcpu = v;
-    if ( on_list )
+    if ( pt->on_list )
     {
-        pt->on_list = 1;
+        list_del(&pt->list);
         list_add(&pt->list, &v->arch.hvm.tm_list);
-
         migrate_timer(&pt->timer, v->processor);
     }
-    spin_unlock(&v->arch.hvm.tm_lock);
+    write_unlock(&pt->vcpu->domain->arch.hvm.pl_time->pt_migrate);
 }
 
 void pt_adjust_global_vcpu_target(struct vcpu *v)
diff --git a/xen/include/asm-x86/hvm/vpt.h b/xen/include/asm-x86/hvm/vpt.h
index f0e0eaec83..39d26cbda4 100644
--- a/xen/include/asm-x86/hvm/vpt.h
+++ b/xen/include/asm-x86/hvm/vpt.h
@@ -128,6 +128,13 @@ struct pl_time {    /* platform time */
     struct RTCState  vrtc;
     struct HPETState vhpet;
     struct PMTState  vpmt;
+    /*
+     * rwlock to prevent periodic_time vCPU migration. Take the lock in read
+     * mode in order to prevent the vcpu field of periodic_time from changing.
+     * Lock must be taken in write mode when changes to the vcpu field are
+     * performed, as it allows exclusive access to all the timers of a domain.
+     */
+    rwlock_t pt_migrate;
     /* guest_time = Xen sys time + stime_offset */
     int64_t stime_offset;
     /* Ensures monotonicity in appropriate timer modes. */
++++++ 5f6a05fa-msi-get-rid-of-read_msi_msg.patch ++++++
Subject: x86/msi: get rid of read_msi_msg
From: Roger Pau Monné roger....@citrix.com Tue Sep 22 16:11:06 2020 +0200
Date: Tue Sep 22 16:11:06 2020 +0200:
Git: 5ad31525c9aeed6d3a349799fe491b5a7c678049

It's safer and faster to just use the cached last written
(untranslated) MSI message stored in msi_desc for the single user that
calls read_msi_msg.

This also prevents relying on the data read from the device MSI
registers in order to figure out the index into the IOMMU interrupt
remapping table, which is not safe.

This is part of XSA-337.

Reported-by: Andrew Cooper <andrew.coop...@citrix.com>
Requested-by: Andrew Cooper <andrew.coop...@citrix.com>
Signed-off-by: Roger Pau Monné <roger....@citrix.com>
Reviewed-by: Jan Beulich <jbeul...@suse.com>

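The substitution itself is one line (msi_desc->msg being the copy recorded
by write_msi_msg(), i.e. the last value Xen itself programmed):

    /* Before: msg was re-read from device registers, which a less trusted
     * guest may have altered behind Xen's back.  After: */
    msg = msi_desc->msg;
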
diff --git a/xen/arch/x86/msi.c b/xen/arch/x86/msi.c
index 161ee60dbe..6a00bc9312 100644
--- a/xen/arch/x86/msi.c
+++ b/xen/arch/x86/msi.c
@@ -185,54 +185,6 @@ void msi_compose_msg(unsigned vector, const cpumask_t *cpu_mask, struct msi_msg
                 MSI_DATA_VECTOR(vector);
 }
 
-static bool read_msi_msg(struct msi_desc *entry, struct msi_msg *msg)
-{
-    switch ( entry->msi_attrib.type )
-    {
-    case PCI_CAP_ID_MSI:
-    {
-        struct pci_dev *dev = entry->dev;
-        int pos = entry->msi_attrib.pos;
-        uint16_t data;
-
-        msg->address_lo = pci_conf_read32(dev->sbdf,
-                                          msi_lower_address_reg(pos));
-        if ( entry->msi_attrib.is_64 )
-        {
-            msg->address_hi = pci_conf_read32(dev->sbdf,
-                                              msi_upper_address_reg(pos));
-            data = pci_conf_read16(dev->sbdf, msi_data_reg(pos, 1));
-        }
-        else
-        {
-            msg->address_hi = 0;
-            data = pci_conf_read16(dev->sbdf, msi_data_reg(pos, 0));
-        }
-        msg->data = data;
-        break;
-    }
-    case PCI_CAP_ID_MSIX:
-    {
-        void __iomem *base = entry->mask_base;
-
-        if ( unlikely(!msix_memory_decoded(entry->dev,
-                                           entry->msi_attrib.pos)) )
-            return false;
-        msg->address_lo = readl(base + PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET);
-        msg->address_hi = readl(base + PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET);
-        msg->data = readl(base + PCI_MSIX_ENTRY_DATA_OFFSET);
-        break;
-    }
-    default:
-        BUG();
-    }
-
-    if ( iommu_intremap )
-        iommu_read_msi_from_ire(entry, msg);
-
-    return true;
-}
-
 static int write_msi_msg(struct msi_desc *entry, struct msi_msg *msg)
 {
     entry->msg = *msg;
@@ -304,10 +256,7 @@ void set_msi_affinity(struct irq_desc *desc, const cpumask_t *mask)
 
     ASSERT(spin_is_locked(&desc->lock));
 
-    memset(&msg, 0, sizeof(msg));
-    if ( !read_msi_msg(msi_desc, &msg) )
-        return;
-
+    msg = msi_desc->msg;
     msg.data &= ~MSI_DATA_VECTOR_MASK;
     msg.data |= MSI_DATA_VECTOR(desc->arch.vector);
     msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
++++++ 5f6a061a-MSI-X-restrict-reading-of-table-PBA-bases-from-BARs.patch ++++++
Subject: x86/MSI-X: restrict reading of table/PBA bases from BARs
From: Jan Beulich jbeul...@suse.com Tue Sep 22 16:11:38 2020 +0200
Date: Tue Sep 22 16:11:38 2020 +0200:
Git: 0bc4177e6b0d7a98464913af95d3bfe4b59b7a2c

When assigned to less trusted or un-trusted guests, devices may change
state behind our backs (they may e.g. get reset by means we may not know
about). Therefore we should avoid reading BARs from hardware once a
device is no longer owned by Dom0. Furthermore when we can't read a BAR,
or when we read zero, we shouldn't instead use the caller provided
address unless that caller can be trusted.

Re-arrange the logic in msix_capability_init() such that only Dom0 (and
only if the device isn't DomU-owned yet) or calls through
PHYSDEVOP_prepare_msix will actually result in the reading of the
respective BAR register(s). Additionally do so only as long as in-use
table entries are known (note that invocation of PHYSDEVOP_prepare_msix
counts as a "pseudo" entry). In all other uses the value already
recorded will get used instead.

Clear the recorded values in _pci_cleanup_msix() as well as on the one
affected error path. (Adjust this error path to also avoid blindly
disabling MSI-X when it was enabled on entry to the function.)

While moving around variable declarations (in many cases to reduce their
scopes), also adjust some of their types.

This is part of XSA-337.

Signed-off-by: Jan Beulich <jbeul...@suse.com>
Reviewed-by: Roger Pau Monné <roger....@citrix.com>

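Condensed into a hedged sketch (variable names as in the diff below; which
callers the conditions correspond to follows the description above):

    /* Read the BAR only while no table entries are in use yet and the
     * request comes from a context the change deems trustworthy. */
    if ( !msix->used_entries &&
         (!msi ||
          (is_hardware_domain(current->domain) &&
           (dev->domain == current->domain || dev->domain == dom_io))) )
        table_paddr = read_pci_mem_bar(seg, pbus, pslot, pfunc, bir, vf);
    else if ( msix->table.first )     /* otherwise reuse the recorded value */
        table_paddr = (msix->table.first << PAGE_SHIFT) +
                      PAGE_OFFSET(table_offset & ~PCI_MSIX_BIRMASK);
    else
        return -ENODATA;              /* nothing trustworthy on record */
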
diff --git a/xen/arch/x86/msi.c b/xen/arch/x86/msi.c
index 6a00bc9312..c8b0e5334f 100644
--- a/xen/arch/x86/msi.c
+++ b/xen/arch/x86/msi.c
@@ -771,16 +771,14 @@ static int msix_capability_init(struct pci_dev *dev,
 {
     struct arch_msix *msix = dev->msix;
     struct msi_desc *entry = NULL;
-    int vf;
     u16 control;
     u64 table_paddr;
     u32 table_offset;
-    u8 bir, pbus, pslot, pfunc;
     u16 seg = dev->seg;
     u8 bus = dev->bus;
     u8 slot = PCI_SLOT(dev->devfn);
     u8 func = PCI_FUNC(dev->devfn);
-    bool maskall = msix->host_maskall;
+    bool maskall = msix->host_maskall, zap_on_error = false;
     unsigned int pos = pci_find_cap_offset(seg, bus, slot, func,
                                            PCI_CAP_ID_MSIX);
 
@@ -822,43 +820,45 @@ static int msix_capability_init(struct pci_dev *dev,
 
     /* Locate MSI-X table region */
     table_offset = pci_conf_read32(dev->sbdf, msix_table_offset_reg(pos));
-    bir = (u8)(table_offset & PCI_MSIX_BIRMASK);
-    table_offset &= ~PCI_MSIX_BIRMASK;
-
-    if ( !dev->info.is_virtfn )
+    if ( !msix->used_entries &&
+         (!msi ||
+          (is_hardware_domain(current->domain) &&
+           (dev->domain == current->domain || dev->domain == dom_io))) )
     {
-        pbus = bus;
-        pslot = slot;
-        pfunc = func;
-        vf = -1;
-    }
-    else
-    {
-        pbus = dev->info.physfn.bus;
-        pslot = PCI_SLOT(dev->info.physfn.devfn);
-        pfunc = PCI_FUNC(dev->info.physfn.devfn);
-        vf = PCI_BDF2(dev->bus, dev->devfn);
-    }
+        unsigned int bir = table_offset & PCI_MSIX_BIRMASK, pbus, pslot, pfunc;
+        int vf;
+        paddr_t pba_paddr;
+        unsigned int pba_offset;
 
-    table_paddr = read_pci_mem_bar(seg, pbus, pslot, pfunc, bir, vf);
-    WARN_ON(msi && msi->table_base != table_paddr);
-    if ( !table_paddr )
-    {
-        if ( !msi || !msi->table_base )
+        if ( !dev->info.is_virtfn )
         {
-            pci_conf_write16(dev->sbdf, msix_control_reg(pos),
-                             control & ~PCI_MSIX_FLAGS_ENABLE);
-            xfree(entry);
-            return -ENXIO;
+            pbus = bus;
+            pslot = slot;
+            pfunc = func;
+            vf = -1;
+        }
+        else
+        {
+            pbus = dev->info.physfn.bus;
+            pslot = PCI_SLOT(dev->info.physfn.devfn);
+            pfunc = PCI_FUNC(dev->info.physfn.devfn);
+            vf = PCI_BDF2(dev->bus, dev->devfn);
         }
-        table_paddr = msi->table_base;
-    }
-    table_paddr += table_offset;
 
-    if ( !msix->used_entries )
-    {
-        u64 pba_paddr;
-        u32 pba_offset;
+        table_paddr = read_pci_mem_bar(seg, pbus, pslot, pfunc, bir, vf);
+        WARN_ON(msi && msi->table_base != table_paddr);
+        if ( !table_paddr )
+        {
+            if ( !msi || !msi->table_base )
+            {
+                pci_conf_write16(dev->sbdf, msix_control_reg(pos),
+                                 control & ~PCI_MSIX_FLAGS_ENABLE);
+                xfree(entry);
+                return -ENXIO;
+            }
+            table_paddr = msi->table_base;
+        }
+        table_paddr += table_offset & ~PCI_MSIX_BIRMASK;
 
         msix->table.first = PFN_DOWN(table_paddr);
         msix->table.last = PFN_DOWN(table_paddr +
@@ -877,7 +877,18 @@ static int msix_capability_init(struct pci_dev *dev,
                                   BITS_TO_LONGS(msix->nr_entries) - 1);
         WARN_ON(rangeset_overlaps_range(mmio_ro_ranges, msix->pba.first,
                                         msix->pba.last));
+
+        zap_on_error = true;
+    }
+    else if ( !msix->table.first )
+    {
+        pci_conf_write16(dev->sbdf, msix_control_reg(pos), control);
+        xfree(entry);
+        return -ENODATA;
     }
+    else
+        table_paddr = (msix->table.first << PAGE_SHIFT) +
+                      PAGE_OFFSET(table_offset & ~PCI_MSIX_BIRMASK);
 
     if ( entry )
     {
@@ -888,8 +899,15 @@ static int msix_capability_init(struct pci_dev *dev,
 
         if ( idx < 0 )
         {
-            pci_conf_write16(dev->sbdf, msix_control_reg(pos),
-                             control & ~PCI_MSIX_FLAGS_ENABLE);
+            if ( zap_on_error )
+            {
+                msix->table.first = 0;
+                msix->pba.first = 0;
+
+                control &= ~PCI_MSIX_FLAGS_ENABLE;
+            }
+
+            pci_conf_write16(dev->sbdf, msix_control_reg(pos), control);
             xfree(entry);
             return idx;
         }
@@ -1078,9 +1096,14 @@ static void _pci_cleanup_msix(struct arch_msix *msix)
         if ( rangeset_remove_range(mmio_ro_ranges, msix->table.first,
                                    msix->table.last) )
             WARN();
+        msix->table.first = 0;
+        msix->table.last = 0;
+
         if ( rangeset_remove_range(mmio_ro_ranges, msix->pba.first,
                                    msix->pba.last) )
             WARN();
+        msix->pba.first = 0;
+        msix->pba.last = 0;
     }
 }
 
++++++ 5f6a062c-evtchn-relax-port_is_valid.patch ++++++
Subject: evtchn: relax port_is_valid()
From: Jan Beulich jbeul...@suse.com Tue Sep 22 16:11:56 2020 +0200
Date: Tue Sep 22 16:11:56 2020 +0200:
Git: e417504febf4ef5cc3442ebaa00da1a5c7939b22

To avoid ports potentially becoming invalid behind the back of certain
other functions (due to ->max_evtchn shrinking) because of
- a guest invoking evtchn_reset() and from a 2nd vCPU opening new
  channels in parallel (see also XSA-343),
- alloc_unbound_xen_event_channel() produced channels living above the
  2-level range (see also XSA-342),
drop the max_evtchns check from port_is_valid(). For a port for which
the function once returned "true", the returned value may not turn into
"false" later on. The function's result may only depend on bounds which
can only ever grow (which is the case for d->valid_evtchns).

This also eliminates a false sense of safety, utilized by some of the
users (see again XSA-343): Without a suitable lock held, d->max_evtchns
may change at any time, and hence deducing that certain other operations
are safe when port_is_valid() returned true is not legitimate. The
opportunities to abuse this may get widened by the change here
(depending on guest and host configuration), but will be taken care of
by the other XSA.

This is XSA-338.

Fixes: 48974e6ce52e ("evtchn: use a per-domain variable for the max number of event channels")
Signed-off-by: Jan Beulich <jbeul...@suse.com>
Reviewed-by: Stefano Stabellini <sstabell...@kernel.org>
Reviewed-by: Julien Grall <jgr...@amazon.com>

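Put differently (a hedged restatement; the one-line function is what the
diff below leaves behind): the predicate now depends only on a
monotonically growing bound, so its result can never flip from "true" back
to "false":

    static inline bool_t port_is_valid(struct domain *d, unsigned int p)
    {
        /* d->valid_evtchns only ever grows (evtchn_allocate_port()),
         * whereas d->max_evtchns could shrink on evtchn_reset(). */
        return p < read_atomic(&d->valid_evtchns);
    }
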
diff --git a/xen/include/xen/event.h b/xen/include/xen/event.h
index a7798f6765..ce45298377 100644
--- a/xen/include/xen/event.h
+++ b/xen/include/xen/event.h
@@ -107,8 +107,6 @@ void notify_via_xen_event_channel(struct domain *ld, int lport);
 
 static inline bool_t port_is_valid(struct domain *d, unsigned int p)
 {
-    if ( p >= d->max_evtchns )
-        return 0;
     return p < read_atomic(&d->valid_evtchns);
 }
 
++++++ 5f6a065c-pv-Avoid-double-exception-injection.patch ++++++
Subject: x86/pv: Avoid double exception injection
From: Andrew Cooper andrew.coop...@citrix.com Tue Sep 22 16:12:44 2020 +0200
Date: Tue Sep 22 16:12:44 2020 +0200:
Git: eb4a543a47df33038f4f6e915375ea898670c7dc

There is at least one path (SYSENTER with NT set, Xen converts to #GP) which
ends up injecting the #GP fault twice, first in compat_sysenter(), and then a
second time in compat_test_all_events(), due to the stale TBF_EXCEPTION left
in TRAPBOUNCE_flags.

The guest kernel sees the second fault first, which is a kernel level #GP
pointing at the head of the #GP handler, and is therefore a userspace
trigger-able DoS.

This particular bug has bitten us several times before, so rearrange
{compat_,}create_bounce_frame() to clobber TRAPBOUNCE on success, rather than
leaving this task to one area of code which isn't used uniformly.

Other scenarios which might result in a double injection (e.g. two calls
directly to compat_create_bounce_frame) will now crash the guest, which is far
more obvious than letting the kernel run with corrupt state.

This is XSA-339

Fixes: fdac9515607b ("x86: clear EFLAGS.NT in SYSENTER entry path")
Signed-off-by: Andrew Cooper <andrew.coop...@citrix.com>
Reviewed-by: Jan Beulich <jbeul...@suse.com>

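In C terms the change amounts to a consume-once discipline (a hedged
rendering of the assembly; deliver_bounce() is an invented stand-in for
{compat_,}create_bounce_frame()):

    static void deliver_bounce(struct trap_bounce *tb)
    {
        /* ... write the exception frame onto the guest stack ... */

        /* Trapbounce complete: clobber the state here, on the success
         * path itself, so no later pass through test_all_events can see
         * a stale TBF_EXCEPTION and inject the same fault twice. */
        tb->cs = 0;
        tb->flags = 0;
    }
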
diff --git a/xen/arch/x86/x86_64/compat/entry.S b/xen/arch/x86/x86_64/compat/entry.S
index c3e62f8734..73619f57ca 100644
--- a/xen/arch/x86/x86_64/compat/entry.S
+++ b/xen/arch/x86/x86_64/compat/entry.S
@@ -78,7 +78,6 @@ compat_process_softirqs:
         sti
 .Lcompat_bounce_exception:
         call  compat_create_bounce_frame
-        movb  $0, TRAPBOUNCE_flags(%rdx)
         jmp   compat_test_all_events
 
        ALIGN
@@ -352,7 +351,13 @@ __UNLIKELY_END(compat_bounce_null_selector)
         movl  %eax,UREGS_cs+8(%rsp)
         movl  TRAPBOUNCE_eip(%rdx),%eax
         movl  %eax,UREGS_rip+8(%rsp)
+
+        /* Trapbounce complete.  Clobber state to avoid an erroneous second injection. */
+        xor   %eax, %eax
+        mov   %ax,  TRAPBOUNCE_cs(%rdx)
+        mov   %al,  TRAPBOUNCE_flags(%rdx)
         ret
+
 .section .fixup,"ax"
 .Lfx13:
         xorl  %edi,%edi
diff --git a/xen/arch/x86/x86_64/entry.S b/xen/arch/x86/x86_64/entry.S
index 1e880eb9f6..71a00e846b 100644
--- a/xen/arch/x86/x86_64/entry.S
+++ b/xen/arch/x86/x86_64/entry.S
@@ -90,7 +90,6 @@ process_softirqs:
         sti
 .Lbounce_exception:
         call  create_bounce_frame
-        movb  $0, TRAPBOUNCE_flags(%rdx)
         jmp   test_all_events
 
         ALIGN
@@ -512,6 +511,11 @@ UNLIKELY_START(z, create_bounce_frame_bad_bounce_ip)
         jmp   asm_domain_crash_synchronous  /* Does not return */
 __UNLIKELY_END(create_bounce_frame_bad_bounce_ip)
         movq  %rax,UREGS_rip+8(%rsp)
+
+        /* Trapbounce complete.  Clobber state to avoid an erroneous second injection. */
+        xor   %eax, %eax
+        mov   %rax, TRAPBOUNCE_eip(%rdx)
+        mov   %al,  TRAPBOUNCE_flags(%rdx)
         ret
 
         .pushsection .fixup, "ax", @progbits
++++++ 5f6a0674-xen-evtchn-Add-missing-barriers-when-accessing-allocating-an-event-channel.patch ++++++
Subject: xen/evtchn: Add missing barriers when accessing/allocating an event channel
From: Julien Grall jgr...@amazon.com Tue Sep 22 16:13:08 2020 +0200
Date: Tue Sep 22 16:13:08 2020 +0200:
Git: f5469067ee0260673ca1e554ff8888512a55ccfc

While the allocation of a bucket is always performed with the per-domain
lock, the bucket may be accessed without the lock taken (for instance, see
evtchn_send()).

Instead such sites rely on port_is_valid() to return a non-zero value
when the port has a struct evtchn associated with it. The function will
mostly check whether the port is less than d->valid_evtchns as all the
buckets/event channels should be allocated up to that point.

Unfortunately a compiler is free to re-order the assignment in
evtchn_allocate_port() so it would be possible to have d->valid_evtchns
updated before the new bucket has finished being allocated.

Additionally on Arm, even if this was compiled "correctly", the
processor can still re-order the memory access.

Add a write memory barrier in the allocation side and a read memory
barrier when the port is valid to prevent any re-ordering issue.

This is XSA-340.

Reported-by: Julien Grall <jgr...@amazon.com>
Signed-off-by: Julien Grall <jgr...@amazon.com>
Reviewed-by: Stefano Stabellini <sstabell...@kernel.org>

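The required pairing, condensed (smp_wmb()/smp_rmb() and
write_atomic()/read_atomic() are Xen's primitives; both fragments appear in
the diff below):

    /* Producer, evtchn_allocate_port(): publish the bucket before the
     * counter that advertises it to lock-free readers. */
    bucket_from_port(d, port) = chn;
    smp_wmb();
    write_atomic(&d->valid_evtchns, d->valid_evtchns + EVTCHNS_PER_BUCKET);

    /* Consumer, port_is_valid(): read the counter before the bucket, so a
     * port seen as valid implies a fully initialised bucket behind it. */
    if ( p >= read_atomic(&d->valid_evtchns) )
        return false;
    smp_rmb();
    return true;
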
diff --git a/xen/common/event_channel.c b/xen/common/event_channel.c
index a8d182b584..53c17bd354 100644
--- a/xen/common/event_channel.c
+++ b/xen/common/event_channel.c
@@ -178,6 +178,13 @@ int evtchn_allocate_port(struct domain *d, evtchn_port_t port)
             return -ENOMEM;
         bucket_from_port(d, port) = chn;
 
+        /*
+         * d->valid_evtchns is used to check whether the bucket can be
+         * accessed without the per-domain lock. Therefore,
+         * d->valid_evtchns should be seen *after* the new bucket has
+         * been setup.
+         */
+        smp_wmb();
         write_atomic(&d->valid_evtchns, d->valid_evtchns + EVTCHNS_PER_BUCKET);
     }
 
diff --git a/xen/include/xen/event.h b/xen/include/xen/event.h
index ce45298377..c35f4b23b6 100644
--- a/xen/include/xen/event.h
+++ b/xen/include/xen/event.h
@@ -107,7 +107,17 @@ void notify_via_xen_event_channel(struct domain *ld, int lport);
 
 static inline bool_t port_is_valid(struct domain *d, unsigned int p)
 {
-    return p < read_atomic(&d->valid_evtchns);
+    if ( p >= read_atomic(&d->valid_evtchns) )
+        return false;
+
+    /*
+     * The caller will usually access the event channel afterwards and
+     * may be done without taking the per-domain lock. The barrier is
+     * going in pair the smp_wmb() barrier in evtchn_allocate_port().
+     */
+    smp_rmb();
+
+    return true;
 }
 
 static inline struct evtchn *evtchn_from_port(struct domain *d, unsigned int p)
++++++ 5f6a068e-evtchn-x86-enforce-correct-upper-limit-for-32-bit-guests.patch ++++++
Subject: evtchn/x86: enforce correct upper limit for 32-bit guests
From: Jan Beulich jbeul...@suse.com Tue Sep 22 16:13:34 2020 +0200
Date: Tue Sep 22 16:13:34 2020 +0200:
Git: b8c2efbe7b3e8fa5f0b0a3679afccd1204949070

The recording of d->max_evtchns in evtchn_2l_init(), in particular with
the limited set of callers of the function, is insufficient. Neither for
PV nor for HVM guests the bitness is known at domain_create() time, yet
the upper bound in 2-level mode depends upon guest bitness. Recording
too high a limit "allows" x86 32-bit domains to open not properly usable
event channels, management of which (inside Xen) would then result in
corruption of the shared info and vCPU info structures.

Keep the upper limit dynamic for the 2-level case, introducing a helper
function to retrieve the effective limit. This helper is now supposed to
be private to the event channel code. The used in do_poll() and
domain_dump_evtchn_info() weren't consistent with port uses elsewhere
and hence get switched to port_is_valid().

Furthermore FIFO mode's setup_ports() gets adjusted to loop only up to
the prior ABI limit, rather than all the way up to the new one.

Finally a word on the change to do_poll(): Accessing ->max_evtchns
without holding a suitable lock was never safe, as it as well as
->evtchn_port_ops may change behind do_poll()'s back. Using
port_is_valid() instead somewhat widens the window for potential abuse,
until we've dealt with the race altogether (see XSA-343).

This is XSA-342.

Reported-by: Julien Grall <jgr...@amazon.com>
Fixes: 48974e6ce52e ("evtchn: use a per-domain variable for the max number of event channels")
Signed-off-by: Jan Beulich <jbeul...@suse.com>
Reviewed-by: Stefano Stabellini <sstabell...@kernel.org>
Reviewed-by: Julien Grall <jgr...@amazon.com>

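For concreteness, the effective 2-level bounds per guest bitness (the
helper is the one added by the diff below; the arithmetic follows from
BITS_PER_EVTCHN_WORD()):

    /* 64-bit guest: 64 * 64 = 4096 ports; 32-bit guest: 32 * 32 = 1024.
     * Latching the 64-bit figure at domain_create() time let a 32-bit
     * domain open ports 1024..4095, corrupting shared-info state. */
    static inline unsigned int max_evtchns(const struct domain *d)
    {
        return d->evtchn_fifo ? EVTCHN_FIFO_NR_CHANNELS
                              : BITS_PER_EVTCHN_WORD(d) * BITS_PER_EVTCHN_WORD(d);
    }
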
diff --git a/xen/common/event_2l.c b/xen/common/event_2l.c
index e1dbb860f4..a229d35271 100644
--- a/xen/common/event_2l.c
+++ b/xen/common/event_2l.c
@@ -103,7 +103,6 @@ static const struct evtchn_port_ops evtchn_port_ops_2l =
 void evtchn_2l_init(struct domain *d)
 {
     d->evtchn_port_ops = &evtchn_port_ops_2l;
-    d->max_evtchns = BITS_PER_EVTCHN_WORD(d) * BITS_PER_EVTCHN_WORD(d);
 }
 
 /*
diff --git a/xen/common/event_channel.c b/xen/common/event_channel.c
index 53c17bd354..08ffe0f063 100644
--- a/xen/common/event_channel.c
+++ b/xen/common/event_channel.c
@@ -151,7 +151,7 @@ static void free_evtchn_bucket(struct domain *d, struct evtchn *bucket)
 
 int evtchn_allocate_port(struct domain *d, evtchn_port_t port)
 {
-    if ( port > d->max_evtchn_port || port >= d->max_evtchns )
+    if ( port > d->max_evtchn_port || port >= max_evtchns(d) )
         return -ENOSPC;
 
     if ( port_is_valid(d, port) )
@@ -1396,13 +1396,11 @@ static void domain_dump_evtchn_info(struct domain *d)
 
     spin_lock(&d->event_lock);
 
-    for ( port = 1; port < d->max_evtchns; ++port )
+    for ( port = 1; port_is_valid(d, port); ++port )
     {
         const struct evtchn *chn;
         char *ssid;
 
-        if ( !port_is_valid(d, port) )
-            continue;
         chn = evtchn_from_port(d, port);
         if ( chn->state == ECS_FREE )
             continue;
diff --git a/xen/common/event_fifo.c b/xen/common/event_fifo.c
index 230f440f14..2f13d92128 100644
--- a/xen/common/event_fifo.c
+++ b/xen/common/event_fifo.c
@@ -478,7 +478,7 @@ static void cleanup_event_array(struct domain *d)
     d->evtchn_fifo = NULL;
 }
 
-static void setup_ports(struct domain *d)
+static void setup_ports(struct domain *d, unsigned int prev_evtchns)
 {
     unsigned int port;
 
@@ -488,7 +488,7 @@ static void setup_ports(struct domain *d)
      * - save its pending state.
      * - set default priority.
      */
-    for ( port = 1; port < d->max_evtchns; port++ )
+    for ( port = 1; port < prev_evtchns; port++ )
     {
         struct evtchn *evtchn;
 
@@ -546,6 +546,8 @@ int evtchn_fifo_init_control(struct evtchn_init_control *init_control)
     if ( !d->evtchn_fifo )
     {
         struct vcpu *vcb;
+        /* Latch the value before it changes during setup_event_array(). */
+        unsigned int prev_evtchns = max_evtchns(d);
 
         for_each_vcpu ( d, vcb ) {
             rc = setup_control_block(vcb);
@@ -562,8 +564,7 @@ int evtchn_fifo_init_control(struct evtchn_init_control *init_control)
             goto error;
 
         d->evtchn_port_ops = &evtchn_port_ops_fifo;
-        d->max_evtchns = EVTCHN_FIFO_NR_CHANNELS;
-        setup_ports(d);
+        setup_ports(d, prev_evtchns);
     }
     else
         rc = map_control_block(v, gfn, offset);
diff --git a/xen/common/sched/core.c b/xen/common/sched/core.c
index cb49a8bc02..ab94d2ec3a 100644
--- a/xen/common/sched/core.c
+++ b/xen/common/sched/core.c
@@ -1428,7 +1428,7 @@ static long do_poll(struct sched_poll *sched_poll)
             goto out;
 
         rc = -EINVAL;
-        if ( port >= d->max_evtchns )
+        if ( !port_is_valid(d, port) )
             goto out;
 
         rc = 0;
diff --git a/xen/include/xen/event.h b/xen/include/xen/event.h
index c35f4b23b6..e1b299e8df 100644
--- a/xen/include/xen/event.h
+++ b/xen/include/xen/event.h
@@ -105,6 +105,12 @@ void notify_via_xen_event_channel(struct domain *ld, int lport);
 #define bucket_from_port(d, p) \
     ((group_from_port(d, p))[((p) % EVTCHNS_PER_GROUP) / EVTCHNS_PER_BUCKET])
 
+static inline unsigned int max_evtchns(const struct domain *d)
+{
+    return d->evtchn_fifo ? EVTCHN_FIFO_NR_CHANNELS
+                          : BITS_PER_EVTCHN_WORD(d) * BITS_PER_EVTCHN_WORD(d);
+}
+
 static inline bool_t port_is_valid(struct domain *d, unsigned int p)
 {
     if ( p >= read_atomic(&d->valid_evtchns) )
diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h
index ac53519d7f..545f2bdcd0 100644
--- a/xen/include/xen/sched.h
+++ b/xen/include/xen/sched.h
@@ -359,7 +359,6 @@ struct domain
     /* Event channel information. */
     struct evtchn   *evtchn;                         /* first bucket only */
     struct evtchn  **evtchn_group[NR_EVTCHN_GROUPS]; /* all other buckets */
-    unsigned int     max_evtchns;     /* number supported by ABI */
     unsigned int     max_evtchn_port; /* max permitted port number */
     unsigned int     valid_evtchns;   /* number of allocated event channels */
     spinlock_t       event_lock;
++++++ 5f6a06be-evtchn-evtchn_reset-shouldnt-succeed-with-still-open-ports.patch ++++++
Subject: evtchn: evtchn_reset() shouldn't succeed with still-open ports
From: Jan Beulich jbeul...@suse.com Tue Sep 22 16:14:22 2020 +0200
Date: Tue Sep 22 16:14:22 2020 +0200:
Git: 9b9fc8e391b6d5afa83f90271fdbd0e13871e841

While the function closes all ports, it does so without holding any
lock, and hence racing requests may be issued causing new ports to get
opened. This would have been problematic in particular if such a newly
opened port had a port number above the new implementation limit (i.e.
when switching from FIFO to 2-level) after the reset, as prior to
"evtchn: relax port_is_valid()" this could have led to e.g.
evtchn_close()'s "BUG_ON(!port_is_valid(d2, port2))" to trigger.

Introduce a counter of active ports and check that it's (still) no
larger than the number of Xen internally used ones after obtaining the
necessary lock in evtchn_reset().

As to the access model of the new {active,xen}_evtchns fields - while
all writes get done using write_atomic(), reads ought to use
read_atomic() only when outside of a suitably locked region.

Note that as of now evtchn_bind_virq() and evtchn_bind_ipi() don't have
a need to call check_free_port().

This is part of XSA-343.

Signed-off-by: Jan Beulich <jbeul...@suse.com>
Reviewed-by: Stefano Stabellini <sstabell...@kernel.org>
Reviewed-by: Julien Grall <jgr...@amazon.com>
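
For orientation, a minimal sketch of the counting scheme and its access
model; the helper names here are hypothetical, not from the patch:

    /* Hypothetical helpers: writes always go through write_atomic() under
     * the event lock; a lock-less reader pairs with read_atomic(). */
    static void count_port_opened(struct domain *d)
    {
        ASSERT(spin_is_locked(&d->event_lock));
        write_atomic(&d->active_evtchns, d->active_evtchns + 1);
    }

    static unsigned int count_ports_lockless(const struct domain *d)
    {
        return read_atomic(&d->active_evtchns);   /* no lock held */
    }

evtchn_reset() can then refuse (with -EAGAIN) whenever active_evtchns
still exceeds xen_evtchns once the event lock is held.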

diff --git a/xen/common/event_channel.c b/xen/common/event_channel.c
index 08ffe0f063..6a566917b1 100644
--- a/xen/common/event_channel.c
+++ b/xen/common/event_channel.c
@@ -188,6 +188,8 @@ int evtchn_allocate_port(struct domain *d, evtchn_port_t port)
         write_atomic(&d->valid_evtchns, d->valid_evtchns + EVTCHNS_PER_BUCKET);
     }
 
+    write_atomic(&d->active_evtchns, d->active_evtchns + 1);
+
     return 0;
 }
 
@@ -211,11 +213,26 @@ static int get_free_port(struct domain *d)
     return -ENOSPC;
 }
 
+/*
+ * Check whether a port is still marked free, and if so update the domain
+ * counter accordingly.  To be used on function exit paths.
+ */
+static void check_free_port(struct domain *d, evtchn_port_t port)
+{
+    if ( port_is_valid(d, port) &&
+         evtchn_from_port(d, port)->state == ECS_FREE )
+        write_atomic(&d->active_evtchns, d->active_evtchns - 1);
+}
+
 void evtchn_free(struct domain *d, struct evtchn *chn)
 {
     /* Clear pending event to avoid unexpected behavior on re-bind. */
     evtchn_port_clear_pending(d, chn);
 
+    if ( consumer_is_xen(chn) )
+        write_atomic(&d->xen_evtchns, d->xen_evtchns - 1);
+    write_atomic(&d->active_evtchns, d->active_evtchns - 1);
+
     /* Reset binding to vcpu0 when the channel is freed. */
     chn->state          = ECS_FREE;
     chn->notify_vcpu_id = 0;
@@ -258,6 +275,7 @@ static long evtchn_alloc_unbound(evtchn_alloc_unbound_t *alloc)
     alloc->port = port;
 
  out:
+    check_free_port(d, port);
     spin_unlock(&d->event_lock);
     rcu_unlock_domain(d);
 
@@ -351,6 +369,7 @@ static long evtchn_bind_interdomain(evtchn_bind_interdomain_t *bind)
     bind->local_port = lport;
 
  out:
+    check_free_port(ld, lport);
     spin_unlock(&ld->event_lock);
     if ( ld != rd )
         spin_unlock(&rd->event_lock);
@@ -488,7 +507,7 @@ static long evtchn_bind_pirq(evtchn_bind_pirq_t *bind)
     struct domain *d = current->domain;
     struct vcpu   *v = d->vcpu[0];
     struct pirq   *info;
-    int            port, pirq = bind->pirq;
+    int            port = 0, pirq = bind->pirq;
     long           rc;
 
     if ( (pirq < 0) || (pirq >= d->nr_pirqs) )
@@ -536,6 +555,7 @@ static long evtchn_bind_pirq(evtchn_bind_pirq_t *bind)
     arch_evtchn_bind_pirq(d, pirq);
 
  out:
+    check_free_port(d, port);
     spin_unlock(&d->event_lock);
 
     return rc;
@@ -1011,10 +1031,10 @@ int evtchn_unmask(unsigned int port)
     return 0;
 }
 
-
 int evtchn_reset(struct domain *d)
 {
     unsigned int i;
+    int rc = 0;
 
     if ( d != current->domain && !d->controller_pause_count )
         return -EINVAL;
@@ -1024,7 +1044,9 @@ int evtchn_reset(struct domain *d)
 
     spin_lock(&d->event_lock);
 
-    if ( d->evtchn_fifo )
+    if ( d->active_evtchns > d->xen_evtchns )
+        rc = -EAGAIN;
+    else if ( d->evtchn_fifo )
     {
         /* Switching back to 2-level ABI. */
         evtchn_fifo_destroy(d);
@@ -1033,7 +1055,7 @@ int evtchn_reset(struct domain *d)
 
     spin_unlock(&d->event_lock);
 
-    return 0;
+    return rc;
 }
 
 static long evtchn_set_priority(const struct evtchn_set_priority *set_priority)
@@ -1219,10 +1241,9 @@ int alloc_unbound_xen_event_channel(
 
     spin_lock(&ld->event_lock);
 
-    rc = get_free_port(ld);
+    port = rc = get_free_port(ld);
     if ( rc < 0 )
         goto out;
-    port = rc;
     chn = evtchn_from_port(ld, port);
 
     rc = xsm_evtchn_unbound(XSM_TARGET, ld, chn, remote_domid);
@@ -1238,7 +1259,10 @@ int alloc_unbound_xen_event_channel(
 
     spin_unlock(&chn->lock);
 
+    write_atomic(&ld->xen_evtchns, ld->xen_evtchns + 1);
+
  out:
+    check_free_port(ld, port);
     spin_unlock(&ld->event_lock);
 
     return rc < 0 ? rc : port;
@@ -1314,6 +1338,7 @@ int evtchn_init(struct domain *d, unsigned int max_port)
         return -EINVAL;
     }
     evtchn_from_port(d, 0)->state = ECS_RESERVED;
+    write_atomic(&d->active_evtchns, 0);
 
 #if MAX_VIRT_CPUS > BITS_PER_LONG
     d->poll_mask = xzalloc_array(unsigned long, BITS_TO_LONGS(d->max_vcpus));
@@ -1340,6 +1365,8 @@ void evtchn_destroy(struct domain *d)
     for ( i = 0; port_is_valid(d, i); i++ )
         evtchn_close(d, i, 0);
 
+    ASSERT(!d->active_evtchns);
+
     clear_global_virq_handlers(d);
 
     evtchn_fifo_destroy(d);
diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h
index 545f2bdcd0..b9c70da4b0 100644
--- a/xen/include/xen/sched.h
+++ b/xen/include/xen/sched.h
@@ -361,6 +361,16 @@ struct domain
     struct evtchn  **evtchn_group[NR_EVTCHN_GROUPS]; /* all other buckets */
     unsigned int     max_evtchn_port; /* max permitted port number */
     unsigned int     valid_evtchns;   /* number of allocated event channels */
+    /*
+     * Number of in-use event channels.  Writers should use write_atomic().
+     * Readers need to use read_atomic() only when not holding event_lock.
+     */
+    unsigned int     active_evtchns;
+    /*
+     * Number of event channels used internally by Xen (not subject to
+     * EVTCHNOP_reset).  Read/write access like for active_evtchns.
+     */
+    unsigned int     xen_evtchns;
     spinlock_t       event_lock;
     const struct evtchn_port_ops *evtchn_port_ops;
     struct evtchn_fifo_domain *evtchn_fifo;
++++++ 5f6a06e0-evtchn-convert-per-channel-lock-to-be-IRQ-safe.patch ++++++
Subject: evtchn: convert per-channel lock to be IRQ-safe
From: Jan Beulich jbeul...@suse.com Tue Sep 22 16:14:56 2020 +0200
Date: Tue Sep 22 16:14:56 2020 +0200:
Git: 2ee270e126458471b178ca1e5d7d8d0afc48be39

... in order for send_guest_{global,vcpu}_virq() to be able to make use
of it.

This is part of XSA-343.

Signed-off-by: Jan Beulich <jbeul...@suse.com>
Acked-by: Julien Grall <jgr...@amazon.com>
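
The conversion also reworks the double-lock helper. The underlying idea,
sketched below with hypothetical names (this is not the patch itself), is
address-ordered locking with a single IRQ-flags save per pair:

    /* Hypothetical sketch: lock the lower-addressed channel first so two
     * CPUs locking the same pair cannot deadlock; save the IRQ flags once,
     * then take the second lock plainly (IRQs are already off). */
    static unsigned long lock_channel_pair(struct evtchn *a, struct evtchn *b)
    {
        unsigned long flags;

        if ( a <= b )
        {
            spin_lock_irqsave(&a->lock, flags);
            if ( a != b )
                spin_lock(&b->lock);
        }
        else
        {
            spin_lock_irqsave(&b->lock, flags);
            spin_lock(&a->lock);
        }

        return flags;
    }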

diff --git a/xen/common/event_channel.c b/xen/common/event_channel.c
index 6a566917b1..0e550e9c7a 100644
--- a/xen/common/event_channel.c
+++ b/xen/common/event_channel.c
@@ -248,6 +248,7 @@ static long evtchn_alloc_unbound(evtchn_alloc_unbound_t *alloc)
     int            port;
     domid_t        dom = alloc->dom;
     long           rc;
+    unsigned long  flags;
 
     d = rcu_lock_domain_by_any_id(dom);
     if ( d == NULL )
@@ -263,14 +264,14 @@ static long evtchn_alloc_unbound(evtchn_alloc_unbound_t *alloc)
     if ( rc )
         goto out;
 
-    spin_lock(&chn->lock);
+    spin_lock_irqsave(&chn->lock, flags);
 
     chn->state = ECS_UNBOUND;
     if ( (chn->u.unbound.remote_domid = alloc->remote_dom) == DOMID_SELF )
         chn->u.unbound.remote_domid = current->domain->domain_id;
     evtchn_port_init(d, chn);
 
-    spin_unlock(&chn->lock);
+    spin_unlock_irqrestore(&chn->lock, flags);
 
     alloc->port = port;
 
@@ -283,26 +284,32 @@ static long evtchn_alloc_unbound(evtchn_alloc_unbound_t *alloc)
 }
 
 
-static void double_evtchn_lock(struct evtchn *lchn, struct evtchn *rchn)
+static unsigned long double_evtchn_lock(struct evtchn *lchn,
+                                        struct evtchn *rchn)
 {
-    if ( lchn < rchn )
+    unsigned long flags;
+
+    if ( lchn <= rchn )
     {
-        spin_lock(&lchn->lock);
-        spin_lock(&rchn->lock);
+        spin_lock_irqsave(&lchn->lock, flags);
+        if ( lchn != rchn )
+            spin_lock(&rchn->lock);
     }
     else
     {
-        if ( lchn != rchn )
-            spin_lock(&rchn->lock);
+        spin_lock_irqsave(&rchn->lock, flags);
         spin_lock(&lchn->lock);
     }
+
+    return flags;
 }
 
-static void double_evtchn_unlock(struct evtchn *lchn, struct evtchn *rchn)
+static void double_evtchn_unlock(struct evtchn *lchn, struct evtchn *rchn,
+                                 unsigned long flags)
 {
-    spin_unlock(&lchn->lock);
     if ( lchn != rchn )
-        spin_unlock(&rchn->lock);
+        spin_unlock(&lchn->lock);
+    spin_unlock_irqrestore(&rchn->lock, flags);
 }
 
 static long evtchn_bind_interdomain(evtchn_bind_interdomain_t *bind)
@@ -312,6 +319,7 @@ static long evtchn_bind_interdomain(evtchn_bind_interdomain_t *bind)
     int            lport, rport = bind->remote_port;
     domid_t        rdom = bind->remote_dom;
     long           rc;
+    unsigned long  flags;
 
     if ( rdom == DOMID_SELF )
         rdom = current->domain->domain_id;
@@ -347,7 +355,7 @@ static long evtchn_bind_interdomain(evtchn_bind_interdomain_t *bind)
     if ( rc )
         goto out;
 
-    double_evtchn_lock(lchn, rchn);
+    flags = double_evtchn_lock(lchn, rchn);
 
     lchn->u.interdomain.remote_dom  = rd;
     lchn->u.interdomain.remote_port = rport;
@@ -364,7 +372,7 @@ static long evtchn_bind_interdomain(evtchn_bind_interdomain_t *bind)
      */
     evtchn_port_set_pending(ld, lchn->notify_vcpu_id, lchn);
 
-    double_evtchn_unlock(lchn, rchn);
+    double_evtchn_unlock(lchn, rchn, flags);
 
     bind->local_port = lport;
 
@@ -387,6 +395,7 @@ int evtchn_bind_virq(evtchn_bind_virq_t *bind, evtchn_port_t port)
     struct domain *d = current->domain;
     int            virq = bind->virq, vcpu = bind->vcpu;
     int            rc = 0;
+    unsigned long  flags;
 
     if ( (virq < 0) || (virq >= ARRAY_SIZE(v->virq_to_evtchn)) )
         return -EINVAL;
@@ -424,14 +433,14 @@ int evtchn_bind_virq(evtchn_bind_virq_t *bind, evtchn_port_t port)
 
     chn = evtchn_from_port(d, port);
 
-    spin_lock(&chn->lock);
+    spin_lock_irqsave(&chn->lock, flags);
 
     chn->state          = ECS_VIRQ;
     chn->notify_vcpu_id = vcpu;
     chn->u.virq         = virq;
     evtchn_port_init(d, chn);
 
-    spin_unlock(&chn->lock);
+    spin_unlock_irqrestore(&chn->lock, flags);
 
     v->virq_to_evtchn[virq] = bind->port = port;
 
@@ -448,6 +457,7 @@ static long evtchn_bind_ipi(evtchn_bind_ipi_t *bind)
     struct domain *d = current->domain;
     int            port, vcpu = bind->vcpu;
     long           rc = 0;
+    unsigned long  flags;
 
     if ( domain_vcpu(d, vcpu) == NULL )
         return -ENOENT;
@@ -459,13 +469,13 @@ static long evtchn_bind_ipi(evtchn_bind_ipi_t *bind)
 
     chn = evtchn_from_port(d, port);
 
-    spin_lock(&chn->lock);
+    spin_lock_irqsave(&chn->lock, flags);
 
     chn->state          = ECS_IPI;
     chn->notify_vcpu_id = vcpu;
     evtchn_port_init(d, chn);
 
-    spin_unlock(&chn->lock);
+    spin_unlock_irqrestore(&chn->lock, flags);
 
     bind->port = port;
 
@@ -509,6 +519,7 @@ static long evtchn_bind_pirq(evtchn_bind_pirq_t *bind)
     struct pirq   *info;
     int            port = 0, pirq = bind->pirq;
     long           rc;
+    unsigned long  flags;
 
     if ( (pirq < 0) || (pirq >= d->nr_pirqs) )
         return -EINVAL;
@@ -541,14 +552,14 @@ static long evtchn_bind_pirq(evtchn_bind_pirq_t *bind)
         goto out;
     }
 
-    spin_lock(&chn->lock);
+    spin_lock_irqsave(&chn->lock, flags);
 
     chn->state  = ECS_PIRQ;
     chn->u.pirq.irq = pirq;
     link_pirq_port(port, chn, v);
     evtchn_port_init(d, chn);
 
-    spin_unlock(&chn->lock);
+    spin_unlock_irqrestore(&chn->lock, flags);
 
     bind->port = port;
 
@@ -569,6 +580,7 @@ int evtchn_close(struct domain *d1, int port1, bool guest)
     struct evtchn *chn1, *chn2;
     int            port2;
     long           rc = 0;
+    unsigned long  flags;
 
  again:
     spin_lock(&d1->event_lock);
@@ -668,14 +680,14 @@ int evtchn_close(struct domain *d1, int port1, bool guest)
         BUG_ON(chn2->state != ECS_INTERDOMAIN);
         BUG_ON(chn2->u.interdomain.remote_dom != d1);
 
-        double_evtchn_lock(chn1, chn2);
+        flags = double_evtchn_lock(chn1, chn2);
 
         evtchn_free(d1, chn1);
 
         chn2->state = ECS_UNBOUND;
         chn2->u.unbound.remote_domid = d1->domain_id;
 
-        double_evtchn_unlock(chn1, chn2);
+        double_evtchn_unlock(chn1, chn2, flags);
 
         goto out;
 
@@ -683,9 +695,9 @@ int evtchn_close(struct domain *d1, int port1, bool guest)
         BUG();
     }
 
-    spin_lock(&chn1->lock);
+    spin_lock_irqsave(&chn1->lock, flags);
     evtchn_free(d1, chn1);
-    spin_unlock(&chn1->lock);
+    spin_unlock_irqrestore(&chn1->lock, flags);
 
  out:
     if ( d2 != NULL )
@@ -705,13 +717,14 @@ int evtchn_send(struct domain *ld, unsigned int lport)
     struct evtchn *lchn, *rchn;
     struct domain *rd;
     int            rport, ret = 0;
+    unsigned long  flags;
 
     if ( !port_is_valid(ld, lport) )
         return -EINVAL;
 
     lchn = evtchn_from_port(ld, lport);
 
-    spin_lock(&lchn->lock);
+    spin_lock_irqsave(&lchn->lock, flags);
 
     /* Guest cannot send via a Xen-attached event channel. */
     if ( unlikely(consumer_is_xen(lchn)) )
@@ -746,7 +759,7 @@ int evtchn_send(struct domain *ld, unsigned int lport)
     }
 
 out:
-    spin_unlock(&lchn->lock);
+    spin_unlock_irqrestore(&lchn->lock, flags);
 
     return ret;
 }
@@ -1238,6 +1251,7 @@ int alloc_unbound_xen_event_channel(
 {
     struct evtchn *chn;
     int            port, rc;
+    unsigned long  flags;
 
     spin_lock(&ld->event_lock);
 
@@ -1250,14 +1264,14 @@ int alloc_unbound_xen_event_channel(
     if ( rc )
         goto out;
 
-    spin_lock(&chn->lock);
+    spin_lock_irqsave(&chn->lock, flags);
 
     chn->state = ECS_UNBOUND;
     chn->xen_consumer = get_xen_consumer(notification_fn);
     chn->notify_vcpu_id = lvcpu;
     chn->u.unbound.remote_domid = remote_domid;
 
-    spin_unlock(&chn->lock);
+    spin_unlock_irqrestore(&chn->lock, flags);
 
     write_atomic(&ld->xen_evtchns, ld->xen_evtchns + 1);
 
@@ -1280,11 +1294,12 @@ void notify_via_xen_event_channel(struct domain *ld, int lport)
 {
     struct evtchn *lchn, *rchn;
     struct domain *rd;
+    unsigned long flags;
 
     ASSERT(port_is_valid(ld, lport));
     lchn = evtchn_from_port(ld, lport);
 
-    spin_lock(&lchn->lock);
+    spin_lock_irqsave(&lchn->lock, flags);
 
     if ( likely(lchn->state == ECS_INTERDOMAIN) )
     {
@@ -1294,7 +1309,7 @@ void notify_via_xen_event_channel(struct domain *ld, int lport)
         evtchn_port_set_pending(rd, rchn->notify_vcpu_id, rchn);
     }
 
-    spin_unlock(&lchn->lock);
+    spin_unlock_irqrestore(&lchn->lock, flags);
 }
 
 void evtchn_check_pollers(struct domain *d, unsigned int port)
++++++ 5f6a06f2-evtchn-address-races-with-evtchn_reset.patch ++++++
Subject: evtchn: address races with evtchn_reset()
From: Jan Beulich jbeul...@suse.com Tue Sep 22 16:15:14 2020 +0200
Date: Tue Sep 22 16:15:14 2020 +0200:
Git: ecc6428b7ea63a24e244f747e8568c0ccc03a6f8

Neither d->evtchn_port_ops nor max_evtchns(d) may be used in an entirely
lock-less manner, as both may be changed by a racing evtchn_reset(). In the
common case, at least one of the domain's event lock or the per-channel
lock needs to be held. In the specific case of the inter-domain sending
by evtchn_send() and notify_via_xen_event_channel() holding the other
side's per-channel lock is sufficient, as the channel can't change state
without both per-channel locks held. Without such a channel changing
state, evtchn_reset() can't complete successfully.

Lock-free accesses continue to be permitted for the shim (calling some
otherwise internal event channel functions), as this happens while the
domain is in effectively single-threaded mode. Special care also needs
taking for the shim's marking of in-use ports as ECS_RESERVED (allowing
use of such ports in the shim case is okay because switching into and
hence also out of FIFO mode is impossible there).

As a side effect, certain operations on Xen bound event channels which
were mistakenly permitted so far (e.g. unmask or poll) will be refused
now.

This is part of XSA-343.

Reported-by: Julien Grall <jgr...@amazon.com>
Signed-off-by: Jan Beulich <jbeul...@suse.com>
Acked-by: Julien Grall <jgr...@amazon.com>
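
The practical upshot for senders, in sketch form (a hypothetical wrapper
mirroring the send_guest_pirq() hunk below, not new logic):

    /* Sketch: holding the per-channel lock around set_pending means the
     * channel cannot change state/ABI via evtchn_reset() mid-send. */
    static void notify_port_locked(struct domain *d, evtchn_port_t port)
    {
        struct evtchn *chn = evtchn_from_port(d, port);
        unsigned long flags;

        spin_lock_irqsave(&chn->lock, flags);
        evtchn_port_set_pending(d, chn->notify_vcpu_id, chn);
        spin_unlock_irqrestore(&chn->lock, flags);
    }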

diff --git a/xen/arch/x86/irq.c b/xen/arch/x86/irq.c
index a69937c840..93c4fb9a79 100644
--- a/xen/arch/x86/irq.c
+++ b/xen/arch/x86/irq.c
@@ -2488,14 +2488,24 @@ static void dump_irqs(unsigned char key)
 
             for ( i = 0; i < action->nr_guests; )
             {
+                struct evtchn *evtchn;
+                unsigned int pending = 2, masked = 2;
+
                 d = action->guest[i++];
                 pirq = domain_irq_to_pirq(d, irq);
                 info = pirq_info(d, pirq);
+                evtchn = evtchn_from_port(d, info->evtchn);
+                local_irq_disable();
+                if ( spin_trylock(&evtchn->lock) )
+                {
+                    pending = evtchn_is_pending(d, evtchn);
+                    masked = evtchn_is_masked(d, evtchn);
+                    spin_unlock(&evtchn->lock);
+                }
+                local_irq_enable();
                 printk("d%d:%3d(%c%c%c)%c",
-                       d->domain_id, pirq,
-                       evtchn_port_is_pending(d, info->evtchn) ? 'P' : '-',
-                       evtchn_port_is_masked(d, info->evtchn) ? 'M' : '-',
-                       info->masked ? 'M' : '-',
+                       d->domain_id, pirq, "-P?"[pending],
+                       "-M?"[masked], info->masked ? 'M' : '-',
                        i < action->nr_guests ? ',' : '\n');
             }
         }
diff --git a/xen/arch/x86/pv/shim.c b/xen/arch/x86/pv/shim.c
index 3a0525c209..9aef7a860a 100644
--- a/xen/arch/x86/pv/shim.c
+++ b/xen/arch/x86/pv/shim.c
@@ -660,8 +660,11 @@ void pv_shim_inject_evtchn(unsigned int port)
     if ( port_is_valid(guest, port) )
     {
         struct evtchn *chn = evtchn_from_port(guest, port);
+        unsigned long flags;
 
+        spin_lock_irqsave(&chn->lock, flags);
         evtchn_port_set_pending(guest, chn->notify_vcpu_id, chn);
+        spin_unlock_irqrestore(&chn->lock, flags);
     }
 }
 
diff --git a/xen/common/event_2l.c b/xen/common/event_2l.c
index a229d35271..083d04be3c 100644
--- a/xen/common/event_2l.c
+++ b/xen/common/event_2l.c
@@ -63,8 +63,10 @@ static void evtchn_2l_unmask(struct domain *d, struct evtchn *evtchn)
     }
 }
 
-static bool evtchn_2l_is_pending(const struct domain *d, evtchn_port_t port)
+static bool evtchn_2l_is_pending(const struct domain *d,
+                                 const struct evtchn *evtchn)
 {
+    evtchn_port_t port = evtchn->port;
     unsigned int max_ports = BITS_PER_EVTCHN_WORD(d) * BITS_PER_EVTCHN_WORD(d);
 
     ASSERT(port < max_ports);
@@ -72,8 +74,10 @@ static bool evtchn_2l_is_pending(const struct domain *d, evtchn_port_t port)
             guest_test_bit(d, port, &shared_info(d, evtchn_pending)));
 }
 
-static bool evtchn_2l_is_masked(const struct domain *d, evtchn_port_t port)
+static bool evtchn_2l_is_masked(const struct domain *d,
+                                const struct evtchn *evtchn)
 {
+    evtchn_port_t port = evtchn->port;
     unsigned int max_ports = BITS_PER_EVTCHN_WORD(d) * BITS_PER_EVTCHN_WORD(d);
 
     ASSERT(port < max_ports);
diff --git a/xen/common/event_channel.c b/xen/common/event_channel.c
index 0e550e9c7a..878e4250ed 100644
--- a/xen/common/event_channel.c
+++ b/xen/common/event_channel.c
@@ -156,8 +156,9 @@ int evtchn_allocate_port(struct domain *d, evtchn_port_t port)
 
     if ( port_is_valid(d, port) )
     {
-        if ( evtchn_from_port(d, port)->state != ECS_FREE ||
-             evtchn_port_is_busy(d, port) )
+        const struct evtchn *chn = evtchn_from_port(d, port);
+
+        if ( chn->state != ECS_FREE || evtchn_is_busy(d, chn) )
             return -EBUSY;
     }
     else
@@ -774,6 +775,7 @@ void send_guest_vcpu_virq(struct vcpu *v, uint32_t virq)
     unsigned long flags;
     int port;
     struct domain *d;
+    struct evtchn *chn;
 
     ASSERT(!virq_is_global(virq));
 
@@ -784,7 +786,10 @@ void send_guest_vcpu_virq(struct vcpu *v, uint32_t virq)
         goto out;
 
     d = v->domain;
-    evtchn_port_set_pending(d, v->vcpu_id, evtchn_from_port(d, port));
+    chn = evtchn_from_port(d, port);
+    spin_lock(&chn->lock);
+    evtchn_port_set_pending(d, v->vcpu_id, chn);
+    spin_unlock(&chn->lock);
 
  out:
     spin_unlock_irqrestore(&v->virq_lock, flags);
@@ -813,7 +818,9 @@ void send_guest_global_virq(struct domain *d, uint32_t virq)
         goto out;
 
     chn = evtchn_from_port(d, port);
+    spin_lock(&chn->lock);
     evtchn_port_set_pending(d, chn->notify_vcpu_id, chn);
+    spin_unlock(&chn->lock);
 
  out:
     spin_unlock_irqrestore(&v->virq_lock, flags);
@@ -823,6 +830,7 @@ void send_guest_pirq(struct domain *d, const struct pirq *pirq)
 {
     int port;
     struct evtchn *chn;
+    unsigned long flags;
 
     /*
      * PV guests: It should not be possible to race with __evtchn_close(). The
@@ -837,7 +845,9 @@ void send_guest_pirq(struct domain *d, const struct pirq *pirq)
     }
 
     chn = evtchn_from_port(d, port);
+    spin_lock_irqsave(&chn->lock, flags);
     evtchn_port_set_pending(d, chn->notify_vcpu_id, chn);
+    spin_unlock_irqrestore(&chn->lock, flags);
 }
 
 static struct domain *global_virq_handlers[NR_VIRQS] __read_mostly;
@@ -1034,12 +1044,15 @@ int evtchn_unmask(unsigned int port)
 {
     struct domain *d = current->domain;
     struct evtchn *evtchn;
+    unsigned long flags;
 
     if ( unlikely(!port_is_valid(d, port)) )
         return -EINVAL;
 
     evtchn = evtchn_from_port(d, port);
+    spin_lock_irqsave(&evtchn->lock, flags);
     evtchn_port_unmask(d, evtchn);
+    spin_unlock_irqrestore(&evtchn->lock, flags);
 
     return 0;
 }
@@ -1449,8 +1462,8 @@ static void domain_dump_evtchn_info(struct domain *d)
 
         printk("    %4u [%d/%d/",
                port,
-               evtchn_port_is_pending(d, port),
-               evtchn_port_is_masked(d, port));
+               evtchn_is_pending(d, chn),
+               evtchn_is_masked(d, chn));
         evtchn_port_print_state(d, chn);
         printk("]: s=%d n=%d x=%d",
                chn->state, chn->notify_vcpu_id, chn->xen_consumer);
diff --git a/xen/common/event_fifo.c b/xen/common/event_fifo.c
index 2f13d92128..68d0c7a632 100644
--- a/xen/common/event_fifo.c
+++ b/xen/common/event_fifo.c
@@ -296,23 +296,26 @@ static void evtchn_fifo_unmask(struct domain *d, struct evtchn *evtchn)
         evtchn_fifo_set_pending(v, evtchn);
 }
 
-static bool evtchn_fifo_is_pending(const struct domain *d, evtchn_port_t port)
+static bool evtchn_fifo_is_pending(const struct domain *d,
+                                   const struct evtchn *evtchn)
 {
-    const event_word_t *word = evtchn_fifo_word_from_port(d, port);
+    const event_word_t *word = evtchn_fifo_word_from_port(d, evtchn->port);
 
     return word && guest_test_bit(d, EVTCHN_FIFO_PENDING, word);
 }
 
-static bool_t evtchn_fifo_is_masked(const struct domain *d, evtchn_port_t port)
+static bool_t evtchn_fifo_is_masked(const struct domain *d,
+                                    const struct evtchn *evtchn)
 {
-    const event_word_t *word = evtchn_fifo_word_from_port(d, port);
+    const event_word_t *word = evtchn_fifo_word_from_port(d, evtchn->port);
 
     return !word || guest_test_bit(d, EVTCHN_FIFO_MASKED, word);
 }
 
-static bool_t evtchn_fifo_is_busy(const struct domain *d, evtchn_port_t port)
+static bool_t evtchn_fifo_is_busy(const struct domain *d,
+                                  const struct evtchn *evtchn)
 {
-    const event_word_t *word = evtchn_fifo_word_from_port(d, port);
+    const event_word_t *word = evtchn_fifo_word_from_port(d, evtchn->port);
 
     return word && guest_test_bit(d, EVTCHN_FIFO_LINKED, word);
 }
diff --git a/xen/include/asm-x86/event.h b/xen/include/asm-x86/event.h
index 98a85233cb..5e09ede6d7 100644
--- a/xen/include/asm-x86/event.h
+++ b/xen/include/asm-x86/event.h
@@ -47,4 +47,10 @@ static inline bool arch_virq_is_global(unsigned int virq)
     return true;
 }
 
+#ifdef CONFIG_PV_SHIM
+# include <asm/pv/shim.h>
+# define arch_evtchn_is_special(chn) \
+             (pv_shim && (chn)->port && (chn)->state == ECS_RESERVED)
+#endif
+
 #endif
diff --git a/xen/include/xen/event.h b/xen/include/xen/event.h
index e1b299e8df..bc9aa68650 100644
--- a/xen/include/xen/event.h
+++ b/xen/include/xen/event.h
@@ -133,6 +133,24 @@ static inline struct evtchn *evtchn_from_port(struct domain *d, unsigned int p)
     return bucket_from_port(d, p) + (p % EVTCHNS_PER_BUCKET);
 }
 
+/*
+ * "usable" as in "by a guest", i.e. Xen consumed channels are assumed to be
+ * taken care of separately where used for Xen's internal purposes.
+ */
+static bool evtchn_usable(const struct evtchn *evtchn)
+{
+    if ( evtchn->xen_consumer )
+        return false;
+
+#ifdef arch_evtchn_is_special
+    if ( arch_evtchn_is_special(evtchn) )
+        return true;
+#endif
+
+    BUILD_BUG_ON(ECS_FREE > ECS_RESERVED);
+    return evtchn->state > ECS_RESERVED;
+}
+
 /* Wait on a Xen-attached event channel. */
 #define wait_on_xen_event_channel(port, condition)                      \
     do {                                                                \
@@ -165,19 +183,24 @@ int evtchn_reset(struct domain *d);
 
 /*
  * Low-level event channel port ops.
+ *
+ * All hooks have to be called with a lock held which prevents the channel
+ * from changing state. This may be the domain event lock, the per-channel
+ * lock, or in the case of sending interdomain events also the other side's
+ * per-channel lock. Exceptions apply in certain cases for the PV shim.
  */
 struct evtchn_port_ops {
     void (*init)(struct domain *d, struct evtchn *evtchn);
     void (*set_pending)(struct vcpu *v, struct evtchn *evtchn);
     void (*clear_pending)(struct domain *d, struct evtchn *evtchn);
     void (*unmask)(struct domain *d, struct evtchn *evtchn);
-    bool (*is_pending)(const struct domain *d, evtchn_port_t port);
-    bool (*is_masked)(const struct domain *d, evtchn_port_t port);
+    bool (*is_pending)(const struct domain *d, const struct evtchn *evtchn);
+    bool (*is_masked)(const struct domain *d, const struct evtchn *evtchn);
     /*
      * Is the port unavailable because it's still being cleaned up
      * after being closed?
      */
-    bool (*is_busy)(const struct domain *d, evtchn_port_t port);
+    bool (*is_busy)(const struct domain *d, const struct evtchn *evtchn);
     int (*set_priority)(struct domain *d, struct evtchn *evtchn,
                         unsigned int priority);
     void (*print_state)(struct domain *d, const struct evtchn *evtchn);
@@ -193,38 +216,67 @@ static inline void evtchn_port_set_pending(struct domain *d,
                                            unsigned int vcpu_id,
                                            struct evtchn *evtchn)
 {
-    d->evtchn_port_ops->set_pending(d->vcpu[vcpu_id], evtchn);
+    if ( evtchn_usable(evtchn) )
+        d->evtchn_port_ops->set_pending(d->vcpu[vcpu_id], evtchn);
 }
 
 static inline void evtchn_port_clear_pending(struct domain *d,
                                              struct evtchn *evtchn)
 {
-    d->evtchn_port_ops->clear_pending(d, evtchn);
+    if ( evtchn_usable(evtchn) )
+        d->evtchn_port_ops->clear_pending(d, evtchn);
 }
 
 static inline void evtchn_port_unmask(struct domain *d,
                                       struct evtchn *evtchn)
 {
-    d->evtchn_port_ops->unmask(d, evtchn);
+    if ( evtchn_usable(evtchn) )
+        d->evtchn_port_ops->unmask(d, evtchn);
 }
 
-static inline bool evtchn_port_is_pending(const struct domain *d,
-                                          evtchn_port_t port)
+static inline bool evtchn_is_pending(const struct domain *d,
+                                     const struct evtchn *evtchn)
 {
-    return d->evtchn_port_ops->is_pending(d, port);
+    return evtchn_usable(evtchn) && d->evtchn_port_ops->is_pending(d, evtchn);
 }
 
-static inline bool evtchn_port_is_masked(const struct domain *d,
-                                         evtchn_port_t port)
+static inline bool evtchn_port_is_pending(struct domain *d, evtchn_port_t port)
 {
-    return d->evtchn_port_ops->is_masked(d, port);
+    struct evtchn *evtchn = evtchn_from_port(d, port);
+    bool rc;
+    unsigned long flags;
+
+    spin_lock_irqsave(&evtchn->lock, flags);
+    rc = evtchn_is_pending(d, evtchn);
+    spin_unlock_irqrestore(&evtchn->lock, flags);
+
+    return rc;
+}
+
+static inline bool evtchn_is_masked(const struct domain *d,
+                                    const struct evtchn *evtchn)
+{
+    return !evtchn_usable(evtchn) || d->evtchn_port_ops->is_masked(d, evtchn);
+}
+
+static inline bool evtchn_port_is_masked(struct domain *d, evtchn_port_t port)
+{
+    struct evtchn *evtchn = evtchn_from_port(d, port);
+    bool rc;
+    unsigned long flags;
+
+    spin_lock_irqsave(&evtchn->lock, flags);
+    rc = evtchn_is_masked(d, evtchn);
+    spin_unlock_irqrestore(&evtchn->lock, flags);
+
+    return rc;
 }
 
-static inline bool evtchn_port_is_busy(const struct domain *d,
-                                       evtchn_port_t port)
+static inline bool evtchn_is_busy(const struct domain *d,
+                                  const struct evtchn *evtchn)
 {
     return d->evtchn_port_ops->is_busy &&
-           d->evtchn_port_ops->is_busy(d, port);
+           d->evtchn_port_ops->is_busy(d, evtchn);
 }
 
 static inline int evtchn_port_set_priority(struct domain *d,
@@ -233,6 +285,8 @@ static inline int evtchn_port_set_priority(struct domain *d,
 {
     if ( !d->evtchn_port_ops->set_priority )
         return -ENOSYS;
+    if ( !evtchn_usable(evtchn) )
+        return -EACCES;
     return d->evtchn_port_ops->set_priority(d, evtchn, priority);
 }
 
++++++ 5f6a071f-evtchn-arrange-for-preemption-in-evtchn_destroy.patch ++++++
Subject: evtchn: arrange for preemption in evtchn_destroy()
From: Jan Beulich jbeul...@suse.com Tue Sep 22 16:15:59 2020 +0200
Date: Tue Sep 22 16:15:59 2020 +0200:
Git: 66cdf341428ae38f6426408a95de9830b5c9c83c

Especially closing of fully established interdomain channels can take
quite some time, due to the locking involved. Therefore we shouldn't
assume we can clean up still active ports all in one go. Besides adding
the necessary preemption check, also avoid pointlessly starting from
(or now really ending at) 0; 1 is the lowest numbered port which may
need closing.

Since we're now reducing ->valid_evtchns, free_xen_event_channel(),
and (at least to be on the safe side) notify_via_xen_event_channel()
need to cope with attempts to close / unbind from / send through already
closed (and no longer valid, as per port_is_valid()) ports.

This is part of XSA-344.

Signed-off-by: Jan Beulich <jbeul...@suse.com>
Acked-by: Julien Grall <jgr...@amazon.com>
Reviewed-by: Stefano Stabellini <sstabell...@kernel.org>
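
The barrier pairing mentioned above, condensed into a hypothetical helper
for orientation (it restates the free_xen_event_channel() hunk below):

    /* Sketch: evtchn_destroy() sets d->is_dying (in domain_kill()), issues
     * spin_barrier(), and only then shrinks d->valid_evtchns.  A caller
     * seeing an invalid port can therefore rely on smp_rmb() ordering to
     * confirm that teardown has indeed begun. */
    static bool port_gone_due_to_teardown(struct domain *d, evtchn_port_t port)
    {
        if ( port_is_valid(d, port) )
            return false;

        smp_rmb();               /* ->valid_evtchns read before ->is_dying */
        BUG_ON(!d->is_dying);
        return true;
    }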

diff --git a/xen/common/domain.c b/xen/common/domain.c
index e9be05f1d0..364fcbabb2 100644
--- a/xen/common/domain.c
+++ b/xen/common/domain.c
@@ -715,12 +715,14 @@ int domain_kill(struct domain *d)
             return domain_kill(d);
         d->is_dying = DOMDYING_dying;
         argo_destroy(d);
-        evtchn_destroy(d);
         gnttab_release_mappings(d);
         vnuma_destroy(d->vnuma);
         domain_set_outstanding_pages(d, 0);
         /* fallthrough */
     case DOMDYING_dying:
+        rc = evtchn_destroy(d);
+        if ( rc )
+            break;
         rc = domain_relinquish_resources(d);
         if ( rc != 0 )
             break;
diff --git a/xen/common/event_channel.c b/xen/common/event_channel.c
index 878e4250ed..3b1fe423df 100644
--- a/xen/common/event_channel.c
+++ b/xen/common/event_channel.c
@@ -1297,7 +1297,16 @@ int alloc_unbound_xen_event_channel(
 
 void free_xen_event_channel(struct domain *d, int port)
 {
-    BUG_ON(!port_is_valid(d, port));
+    if ( !port_is_valid(d, port) )
+    {
+        /*
+         * Make sure ->is_dying is read /after/ ->valid_evtchns, pairing
+         * with the spin_barrier() and BUG_ON() in evtchn_destroy().
+         */
+        smp_rmb();
+        BUG_ON(!d->is_dying);
+        return;
+    }
 
     evtchn_close(d, port, 0);
 }
@@ -1309,7 +1318,17 @@ void notify_via_xen_event_channel(struct domain *ld, int lport)
     struct domain *rd;
     unsigned long flags;
 
-    ASSERT(port_is_valid(ld, lport));
+    if ( !port_is_valid(ld, lport) )
+    {
+        /*
+         * Make sure ->is_dying is read /after/ ->valid_evtchns, pairing
+         * with the spin_barrier() and BUG_ON() in evtchn_destroy().
+         */
+        smp_rmb();
+        ASSERT(ld->is_dying);
+        return;
+    }
+
     lchn = evtchn_from_port(ld, lport);
 
     spin_lock_irqsave(&lchn->lock, flags);
@@ -1380,8 +1399,7 @@ int evtchn_init(struct domain *d, unsigned int max_port)
     return 0;
 }
 
-
-void evtchn_destroy(struct domain *d)
+int evtchn_destroy(struct domain *d)
 {
     unsigned int i;
 
@@ -1390,14 +1408,29 @@ void evtchn_destroy(struct domain *d)
     spin_barrier(&d->event_lock);
 
     /* Close all existing event channels. */
-    for ( i = 0; port_is_valid(d, i); i++ )
+    for ( i = d->valid_evtchns; --i; )
+    {
         evtchn_close(d, i, 0);
 
+        /*
+         * Avoid preempting when called from domain_create()'s error path,
+         * and don't check too often (choice of frequency is arbitrary).
+         */
+        if ( i && !(i & 0x3f) && d->is_dying != DOMDYING_dead &&
+             hypercall_preempt_check() )
+        {
+            write_atomic(&d->valid_evtchns, i);
+            return -ERESTART;
+        }
+    }
+
     ASSERT(!d->active_evtchns);
 
     clear_global_virq_handlers(d);
 
     evtchn_fifo_destroy(d);
+
+    return 0;
 }
 
 
diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h
index b9c70da4b0..57d34b2205 100644
--- a/xen/include/xen/sched.h
+++ b/xen/include/xen/sched.h
@@ -138,7 +138,7 @@ struct evtchn
 } __attribute__((aligned(64)));
 
 int  evtchn_init(struct domain *d, unsigned int max_port);
-void evtchn_destroy(struct domain *d); /* from domain_kill */
+int  evtchn_destroy(struct domain *d); /* from domain_kill */
 void evtchn_destroy_final(struct domain *d); /* from complete_domain_destroy */
 
 struct waitqueue_vcpu;
++++++ 5f6a0754-evtchn-arrange-for-preemption-in-evtchn_reset.patch ++++++
Subject: evtchn: arrange for preemption in evtchn_reset()
From: Jan Beulich jbeul...@suse.com Tue Sep 22 16:16:52 2020 +0200
Date: Tue Sep 22 16:16:52 2020 +0200:
Git: 03019c20b516be53ba0cd393f5291974a9a6c9a8

Like for evtchn_destroy(), looping over all possible event channels to
close them can take a significant amount of time. Unlike there, we
can't alter domain properties (i.e. d->valid_evtchns) here. Borrow, in a
lightweight form, the paging domctl continuation concept, redirecting
the continuations to different sub-ops. Just like there this is to be
able to allow for predictable overall results of the involved sub-ops:
Racing requests should either complete or be refused.

Note that a domain can't interfere with an already started (by a remote
domain) reset, due to being paused. It can prevent a remote reset from
happening by leaving a reset unfinished, but that's only going to affect
itself.

This is part of XSA-344.

Signed-off-by: Jan Beulich <jbeul...@suse.com>
Acked-by: Julien Grall <jgr...@amazon.com>
Reviewed-by: Stefano Stabellini <sstabell...@kernel.org>
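
The continuation bookkeeping pivots on d->next_evtchn, where 0 means no
reset is in flight. The expression in the patch is compact, so here is an
annotated restatement of the same start/resume logic (a sketch under that
assumption, with a hypothetical function name):

    static unsigned int reset_start_port(struct domain *d, bool resuming)
    {
        unsigned int i;

        spin_lock(&d->event_lock);
        /* Resume where we stopped; a fresh request gets port 1 when idle,
         * or 0 (turned into -EBUSY by the caller) when another reset
         * already holds the slot. */
        i = resuming ? d->next_evtchn : !d->next_evtchn;
        if ( i > d->next_evtchn )
            d->next_evtchn = i;          /* claim the in-progress slot */
        spin_unlock(&d->event_lock);

        return i;
    }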

diff --git a/xen/common/domain.c b/xen/common/domain.c
index 364fcbabb2..a275a35c4f 100644
--- a/xen/common/domain.c
+++ b/xen/common/domain.c
@@ -1145,7 +1145,7 @@ void domain_unpause_except_self(struct domain *d)
         domain_unpause(d);
 }
 
-int domain_soft_reset(struct domain *d)
+int domain_soft_reset(struct domain *d, bool resuming)
 {
     struct vcpu *v;
     int rc;
@@ -1159,7 +1159,7 @@ int domain_soft_reset(struct domain *d)
         }
     spin_unlock(&d->shutdown_lock);
 
-    rc = evtchn_reset(d);
+    rc = evtchn_reset(d, resuming);
     if ( rc )
         return rc;
 
diff --git a/xen/common/domctl.c b/xen/common/domctl.c
index a69b3b59a8..2b25653b49 100644
--- a/xen/common/domctl.c
+++ b/xen/common/domctl.c
@@ -562,12 +562,22 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl)
     }
 
     case XEN_DOMCTL_soft_reset:
+    case XEN_DOMCTL_soft_reset_cont:
         if ( d == current->domain ) /* no domain_pause() */
         {
             ret = -EINVAL;
             break;
         }
-        ret = domain_soft_reset(d);
+        ret = domain_soft_reset(d, op->cmd == XEN_DOMCTL_soft_reset_cont);
+        if ( ret == -ERESTART )
+        {
+            op->cmd = XEN_DOMCTL_soft_reset_cont;
+            if ( !__copy_field_to_guest(u_domctl, op, cmd) )
+                ret = hypercall_create_continuation(__HYPERVISOR_domctl,
+                                                    "h", u_domctl);
+            else
+                ret = -EFAULT;
+        }
         break;
 
     case XEN_DOMCTL_destroydomain:
diff --git a/xen/common/event_channel.c b/xen/common/event_channel.c
index 3b1fe423df..d2ee83bbbf 100644
--- a/xen/common/event_channel.c
+++ b/xen/common/event_channel.c
@@ -1057,7 +1057,7 @@ int evtchn_unmask(unsigned int port)
     return 0;
 }
 
-int evtchn_reset(struct domain *d)
+int evtchn_reset(struct domain *d, bool resuming)
 {
     unsigned int i;
     int rc = 0;
@@ -1065,11 +1065,40 @@ int evtchn_reset(struct domain *d)
     if ( d != current->domain && !d->controller_pause_count )
         return -EINVAL;
 
-    for ( i = 0; port_is_valid(d, i); i++ )
+    spin_lock(&d->event_lock);
+
+    /*
+     * If we are resuming, then start where we stopped. Otherwise, check
+     * that a reset operation is not already in progress, and if none is,
+     * record that this is now the case.
+     */
+    i = resuming ? d->next_evtchn : !d->next_evtchn;
+    if ( i > d->next_evtchn )
+        d->next_evtchn = i;
+
+    spin_unlock(&d->event_lock);
+
+    if ( !i )
+        return -EBUSY;
+
+    for ( ; port_is_valid(d, i); i++ )
+    {
         evtchn_close(d, i, 1);
 
+        /* NB: Choice of frequency is arbitrary. */
+        if ( !(i & 0x3f) && hypercall_preempt_check() )
+        {
+            spin_lock(&d->event_lock);
+            d->next_evtchn = i;
+            spin_unlock(&d->event_lock);
+            return -ERESTART;
+        }
+    }
+
     spin_lock(&d->event_lock);
 
+    d->next_evtchn = 0;
+
     if ( d->active_evtchns > d->xen_evtchns )
         rc = -EAGAIN;
     else if ( d->evtchn_fifo )
@@ -1204,7 +1233,8 @@ long do_event_channel_op(int cmd, XEN_GUEST_HANDLE_PARAM(void) arg)
         break;
     }
 
-    case EVTCHNOP_reset: {
+    case EVTCHNOP_reset:
+    case EVTCHNOP_reset_cont: {
         struct evtchn_reset reset;
         struct domain *d;
 
@@ -1217,9 +1247,13 @@ long do_event_channel_op(int cmd, XEN_GUEST_HANDLE_PARAM(void) arg)
 
         rc = xsm_evtchn_reset(XSM_TARGET, current->domain, d);
         if ( !rc )
-            rc = evtchn_reset(d);
+            rc = evtchn_reset(d, cmd == EVTCHNOP_reset_cont);
 
         rcu_unlock_domain(d);
+
+        if ( rc == -ERESTART )
+            rc = hypercall_create_continuation(__HYPERVISOR_event_channel_op,
+                                               "ih", EVTCHNOP_reset_cont, arg);
         break;
     }
 
diff --git a/xen/include/public/domctl.h b/xen/include/public/domctl.h
index 59bdc28c89..f416722491 100644
--- a/xen/include/public/domctl.h
+++ b/xen/include/public/domctl.h
@@ -1159,7 +1159,10 @@ struct xen_domctl {
 #define XEN_DOMCTL_iomem_permission              20
 #define XEN_DOMCTL_ioport_permission             21
 #define XEN_DOMCTL_hypercall_init                22
-#define XEN_DOMCTL_arch_setup                    23 /* Obsolete IA64 only */
+#ifdef __XEN__
+/* #define XEN_DOMCTL_arch_setup                 23 Obsolete IA64 only */
+#define XEN_DOMCTL_soft_reset_cont               23
+#endif
 #define XEN_DOMCTL_settimeoffset                 24
 #define XEN_DOMCTL_getvcpuaffinity               25
 #define XEN_DOMCTL_real_mode_area                26 /* Obsolete PPC only */
diff --git a/xen/include/public/event_channel.h b/xen/include/public/event_channel.h
index cfb7929fef..b896d8da2a 100644
--- a/xen/include/public/event_channel.h
+++ b/xen/include/public/event_channel.h
@@ -74,6 +74,9 @@
 #define EVTCHNOP_init_control    11
 #define EVTCHNOP_expand_array    12
 #define EVTCHNOP_set_priority    13
+#ifdef __XEN__
+#define EVTCHNOP_reset_cont      14
+#endif
 /* ` } */
 
 typedef uint32_t evtchn_port_t;
diff --git a/xen/include/xen/event.h b/xen/include/xen/event.h
index bc9aa68650..fa93a3684a 100644
--- a/xen/include/xen/event.h
+++ b/xen/include/xen/event.h
@@ -179,7 +179,7 @@ void evtchn_check_pollers(struct domain *d, unsigned int port);
 void evtchn_2l_init(struct domain *d);
 
 /* Close all event channels and reset to 2-level ABI. */
-int evtchn_reset(struct domain *d);
+int evtchn_reset(struct domain *d, bool resuming);
 
 /*
  * Low-level event channel port ops.
diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h
index 57d34b2205..a0d87ef9d0 100644
--- a/xen/include/xen/sched.h
+++ b/xen/include/xen/sched.h
@@ -371,6 +371,8 @@ struct domain
      * EVTCHNOP_reset).  Read/write access like for active_evtchns.
      */
     unsigned int     xen_evtchns;
+    /* Port to resume from in evtchn_reset(), when in a continuation. */
+    unsigned int     next_evtchn;
     spinlock_t       event_lock;
     const struct evtchn_port_ops *evtchn_port_ops;
     struct evtchn_fifo_domain *evtchn_fifo;
@@ -663,7 +665,7 @@ int domain_kill(struct domain *d);
 int domain_shutdown(struct domain *d, u8 reason);
 void domain_resume(struct domain *d);
 
-int domain_soft_reset(struct domain *d);
+int domain_soft_reset(struct domain *d, bool resuming);
 
 int vcpu_start_shutdown_deferral(struct vcpu *v);
 void vcpu_end_shutdown_deferral(struct vcpu *v);
