Hello community,

here is the log from the commit of package xen for openSUSE:Factory checked in 
at 2018-12-03 10:04:05
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Comparing /work/SRC/openSUSE:Factory/xen (Old)
 and      /work/SRC/openSUSE:Factory/.xen.new.19453 (New)
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

Package is "xen"

Mon Dec  3 10:04:05 2018 rev:257 rq:652068 version:4.11.0_09

Changes:
--------
--- /work/SRC/openSUSE:Factory/xen/xen.changes  2018-11-13 16:23:56.974832220 +0100
+++ /work/SRC/openSUSE:Factory/.xen.new.19453/xen.changes       2018-12-03 10:04:29.840033854 +0100
@@ -1,0 +2,58 @@
+Wed Nov 21 15:44:39 MST 2018 - carn...@suse.com
+
+- bsc#1116524 - Package xen-tools-4.11.0_09-2.1.x86_64 broken:
+  Missing /bin/domu-xenstore.  This was broken by the "make
+  package build reproducible" change (boo#1047218, boo#1062303).
+  This fix reverts that change to the patch below:
+  tmp_build.patch
+
+-------------------------------------------------------------------
+Mon Nov 12 09:47:39 MST 2018 - carn...@suse.com
+
+- bsc#1115040 - VUL-0: xen: insufficient TLB flushing / improper
+  large page mappings with AMD IOMMUs (XSA-275)
+  xsa275-1.patch
+  xsa275-2.patch
+- bsc#1115043 - VUL-0: xen: resource accounting issues in x86 IOREQ
+  server handling (XSA-276)
+  xsa276-1.patch
+  xsa276-2.patch
+- bsc#1115044 - VUL-0: xen: x86: incorrect error handling for guest
+  p2m page removals (XSA-277)
+  xsa277.patch
+- bsc#1114405 - VUL-0: CVE-2018-18883: xen: Nested VT-x usable even
+  when disabled (XSA-278)
+  5bd0e11b-x86-disallow-VT-x-insns-without-nested-virt.patch
+- bsc#1115045 - VUL-0: xen: x86: DoS from attempting to use INVPCID
+  with a non-canonical address (XSA-279)
+  xsa279.patch
+- bsc#1115047 - VUL-0: xen: Fix for XSA-240 conflicts with shadow
+  paging (XSA-280)
+  xsa280-1.patch
+  xsa280-2.patch
+- bsc#1114988 - VUL-0: xen: guest use of HLE constructs may lock up
+  host (XSA-282)
+  5be2a308-x86-extend-get_platform_badpages.patch
+  5be2a354-x86-work-around-HLE-host-lockup-erratum.patch
+- bsc#1108940 - L3: XEN SLE12-SP1 domU hang on SLE12-SP3 HV
+  5bdc31d5-VMX-fix-vmx_handle_eoi.patch
+- Upstream bug fixes (bsc#1027519)
+  5b752762-x86-hvm-emul-rep-IO-should-not-cross-GFN-boundaries.patch
+  5ba11ed4-credit2-fix-moving-CPUs-between-cpupools.patch
+  5bacae4b-x86-boot-allocate-extra-module-slot.patch
+  5bae44ce-x86-silence-false-log-messages.patch
+  5bb60c12-x86-split-opt_xpti.patch
+  5bb60c4f-x86-split-opt_pv_l1tf.patch
+  5bb60c74-x86-fix-xpti-and-pv-l1tf.patch
+  5bcf0722-x86-boot-enable-NMIs.patch
+  5bd076e9-dombuilder-init-vcpu-debug-regs-correctly.patch
+  5bd076e9-x86-boot-init-debug-regs-correctly.patch
+  5bd076e9-x86-init-vcpu-debug-regs-correctly.patch
+  5bd85bfd-x86-fix-crash-on-xl-set-parameter-pcid.patch
+
+-------------------------------------------------------------------
+Wed Oct 24 20:08:24 UTC 2018 - oher...@suse.de
+
+- Use SMBIOS_REL_DATE instead of SMBIOS_DATE for reproducible binaries
+
+-------------------------------------------------------------------

New:
----
  5b752762-x86-hvm-emul-rep-IO-should-not-cross-GFN-boundaries.patch
  5ba11ed4-credit2-fix-moving-CPUs-between-cpupools.patch
  5bacae4b-x86-boot-allocate-extra-module-slot.patch
  5bae44ce-x86-silence-false-log-messages.patch
  5bb60c12-x86-split-opt_xpti.patch
  5bb60c4f-x86-split-opt_pv_l1tf.patch
  5bb60c74-x86-fix-xpti-and-pv-l1tf.patch
  5bcf0722-x86-boot-enable-NMIs.patch
  5bd076e9-dombuilder-init-vcpu-debug-regs-correctly.patch
  5bd076e9-x86-boot-init-debug-regs-correctly.patch
  5bd076e9-x86-init-vcpu-debug-regs-correctly.patch
  5bd0e11b-x86-disallow-VT-x-insns-without-nested-virt.patch
  5bd85bfd-x86-fix-crash-on-xl-set-parameter-pcid.patch
  5bdc31d5-VMX-fix-vmx_handle_eoi.patch
  5be2a308-x86-extend-get_platform_badpages.patch
  5be2a354-x86-work-around-HLE-host-lockup-erratum.patch
  xsa275-1.patch
  xsa275-2.patch
  xsa276-1.patch
  xsa276-2.patch
  xsa277.patch
  xsa279.patch
  xsa280-1.patch
  xsa280-2.patch

++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

Other differences:
------------------
++++++ xen.spec ++++++
--- /var/tmp/diff_new_pack.1BrBU4/_old  2018-12-03 10:04:31.752032107 +0100
+++ /var/tmp/diff_new_pack.1BrBU4/_new  2018-12-03 10:04:31.760032101 +0100
@@ -205,18 +205,42 @@
 Patch41:        5b72fbbf-x86-Make-spec-ctrl-no-a-global-disable-of-all-mitigations.patch
 Patch42:        5b72fbbf-xl.conf-Add-global-affinity-masks.patch
 Patch43:        5b74190e-x86-hvm-ioreq-MMIO-range-check-honor-DF.patch
-Patch44:        5b75afef-x86-setup-avoid-OoB-E820-lookup.patch
-Patch45:        5b76b780-rangeset-inquiry-functions-tolerate-NULL.patch
-Patch46:        5b83c654-VT-d-dmar-iommu-mem-leak-fix.patch
-Patch47:        5b8d5832-x86-assorted-array_index_nospec-insertions.patch
-Patch48:        5b8fae26-tools-libxl-correct-vcpu-affinity-output-with-sparse-physical-cpu-map.patch
-Patch49:        5b8fae26-xen-fill-topology-info-for-all-present-cpus.patch
-Patch50:        5b8fb5af-tools-xl-refuse-to-set-number-of-vcpus-to-0-via-xl-vcpu-set.patch
-Patch51:        5b9784ad-x86-HVM-drop-hvm_fetch_from_guest_linear.patch
-Patch52:        5b9784d2-x86-HVM-add-known_gla-helper.patch
-Patch53:        5b9784f2-x86-HVM-split-page-straddling-accesses.patch
-Patch98:        xen.b8f33431f3dd23fb43a879f4bdb4283fdc9465ad.patch
-Patch99:        xen.2b50cdbc444c637575580dcfa6c9525a84d5cc62.patch
+Patch44:        5b752762-x86-hvm-emul-rep-IO-should-not-cross-GFN-boundaries.patch
+Patch45:        5b75afef-x86-setup-avoid-OoB-E820-lookup.patch
+Patch46:        5b76b780-rangeset-inquiry-functions-tolerate-NULL.patch
+Patch47:        5b83c654-VT-d-dmar-iommu-mem-leak-fix.patch
+Patch48:        5b8d5832-x86-assorted-array_index_nospec-insertions.patch
+Patch49:        5b8fae26-tools-libxl-correct-vcpu-affinity-output-with-sparse-physical-cpu-map.patch
+Patch50:        5b8fae26-xen-fill-topology-info-for-all-present-cpus.patch
+Patch51:        5b8fb5af-tools-xl-refuse-to-set-number-of-vcpus-to-0-via-xl-vcpu-set.patch
+Patch52:        5b9784ad-x86-HVM-drop-hvm_fetch_from_guest_linear.patch
+Patch53:        5b9784d2-x86-HVM-add-known_gla-helper.patch
+Patch54:        5b9784f2-x86-HVM-split-page-straddling-accesses.patch
+Patch55:        5ba11ed4-credit2-fix-moving-CPUs-between-cpupools.patch
+Patch56:        5bacae4b-x86-boot-allocate-extra-module-slot.patch
+Patch57:        5bae44ce-x86-silence-false-log-messages.patch
+Patch58:        5bb60c12-x86-split-opt_xpti.patch
+Patch59:        5bb60c4f-x86-split-opt_pv_l1tf.patch
+Patch60:        5bb60c74-x86-fix-xpti-and-pv-l1tf.patch
+Patch61:        5bcf0722-x86-boot-enable-NMIs.patch
+Patch62:        5bd076e9-x86-boot-init-debug-regs-correctly.patch
+Patch63:        5bd076e9-x86-init-vcpu-debug-regs-correctly.patch
+Patch64:        5bd076e9-dombuilder-init-vcpu-debug-regs-correctly.patch
+Patch65:        5bd0e11b-x86-disallow-VT-x-insns-without-nested-virt.patch
+Patch66:        5bd85bfd-x86-fix-crash-on-xl-set-parameter-pcid.patch
+Patch67:        5bdc31d5-VMX-fix-vmx_handle_eoi.patch
+Patch68:        5be2a308-x86-extend-get_platform_badpages.patch
+Patch69:        5be2a354-x86-work-around-HLE-host-lockup-erratum.patch
+Patch70:        xsa275-1.patch
+Patch71:        xsa275-2.patch
+Patch72:        xsa276-1.patch
+Patch73:        xsa276-2.patch
+Patch74:        xsa277.patch
+Patch75:        xsa279.patch
+Patch76:        xsa280-1.patch
+Patch77:        xsa280-2.patch
+Patch78:        xen.b8f33431f3dd23fb43a879f4bdb4283fdc9465ad.patch
+Patch79:        xen.2b50cdbc444c637575580dcfa6c9525a84d5cc62.patch
 # Our platform specific patches
 Patch400:       xen-destdir.patch
 Patch401:       vif-bridge-no-iptables.patch
@@ -465,8 +489,32 @@
 %patch51 -p1
 %patch52 -p1
 %patch53 -p1
-%patch98 -p1
-%patch99 -p1
+%patch54 -p1
+%patch55 -p1
+%patch56 -p1
+%patch57 -p1
+%patch58 -p1
+%patch59 -p1
+%patch60 -p1
+%patch61 -p1
+%patch62 -p1
+%patch63 -p1
+%patch64 -p1
+%patch65 -p1
+%patch66 -p1
+%patch67 -p1
+%patch68 -p1
+%patch69 -p1
+%patch70 -p1
+%patch71 -p1
+%patch72 -p1
+%patch73 -p1
+%patch74 -p1
+%patch75 -p1
+%patch76 -p1
+%patch77 -p1
+%patch78 -p1
+%patch79 -p1
 # Our platform specific patches
 %patch400 -p1
 %patch401 -p1
@@ -540,13 +588,13 @@
 XEN_FULLVERSION="$XEN_VERSION.$XEN_SUBVERSION.$XEN_EXTRAVERSION"
 XEN_BUILD_DATE="`date -u -d '1970-01-01'`"
 XEN_BUILD_TIME="`date -u -d '1970-01-01' +%%T`"
-SMBIOS_DATE="`date -u -d '1970-01-01' +%%m/%%d/%%Y`"
+SMBIOS_REL_DATE="`date -u -d '1970-01-01' +%%m/%%d/%%Y`"
 RELDATE="`date -u -d '1970-01-01' '+%%d %%b %%Y'`"
 if test -r %{S:9}
 then
        XEN_BUILD_DATE="` date -u -d \"$(sed -n '/@/{s/ - .*$//p;q}' %{S:9})\" `"
        XEN_BUILD_TIME="` date -u -d \"$(sed -n '/@/{s/ - .*$//p;q}' %{S:9})\" +%%T`"
-       SMBIOS_DATE="` date -u -d \"$(sed -n '/@/{s/ - .*$//p;q}' %{S:9})\" +%%m/%%d/%%Y`"
+       SMBIOS_REL_DATE="` date -u -d \"$(sed -n '/@/{s/ - .*$//p;q}' %{S:9})\" +%%m/%%d/%%Y`"
        RELDATE="` date -u -d \"$(sed -n '/@/{s/ - .*$//p;q}' %{S:9})\" '+%%d %%b %%Y'`"
 fi
 cat > .our_xenversion <<_EOV_
@@ -555,7 +603,7 @@
 export GIT=$(type -P false)
 export EXTRA_CFLAGS_XEN_TOOLS="$RPM_OPT_FLAGS"
 export EXTRA_CFLAGS_QEMU_TRADITIONAL="$RPM_OPT_FLAGS"
-export SMBIOS_REL_DATE="$SMBIOS_DATE"
+export SMBIOS_REL_DATE="$SMBIOS_REL_DATE"
 export RELDATE="$RELDATE"
 XEN_VERSION=$XEN_VERSION
 XEN_SUBVERSION=$XEN_SUBVERSION

++++++ 5b752762-x86-hvm-emul-rep-IO-should-not-cross-GFN-boundaries.patch ++++++
# Commit 7626edeaca972e3e823535dcc44338f6b2f0b21f
# Date 2018-08-16 09:27:30 +0200
# Author Paul Durrant <paul.durr...@citrix.com>
# Committer Jan Beulich <jbeul...@suse.com>
x86/hvm/emulate: make sure rep I/O emulation does not cross GFN boundaries

When emulating a rep I/O operation it is possible that the ioreq will
describe a single operation that spans multiple GFNs. This is fine as long
as all those GFNs fall within an MMIO region covered by a single device
model, but unfortunately the higher levels of the emulation code do not
guarantee that. This is something that should almost certainly be fixed,
but in the meantime this patch makes sure that MMIO is truncated at GFN
boundaries and hence the appropriate device model is re-evaluated for each
target GFN.

NOTE: This patch does not deal with the case of a single MMIO operation
      spanning a GFN boundary. That is more complex to deal with and is
      deferred to a subsequent patch.

Signed-off-by: Paul Durrant <paul.durr...@citrix.com>

Convert calculations to be 32-bit only.

Signed-off-by: Jan Beulich <jbeul...@suse.com>

--- a/xen/arch/x86/hvm/emulate.c
+++ b/xen/arch/x86/hvm/emulate.c
@@ -184,6 +184,24 @@ static int hvmemul_do_io(
         hvmtrace_io_assist(&p);
     }
 
+    /*
+     * Make sure that we truncate rep MMIO at any GFN boundary. This is
+     * necessary to ensure that the correct device model is targetted
+     * or that we correctly handle a rep op spanning MMIO and RAM.
+     */
+    if ( unlikely(p.count > 1) && p.type == IOREQ_TYPE_COPY )
+    {
+        unsigned int off = p.addr & ~PAGE_MASK;
+        unsigned int tail = PAGE_SIZE - off;
+
+        if ( tail < p.size ) /* single rep spans GFN */
+            p.count = 1;
+        else
+            p.count = min(p.count,
+                          (p.df ? (off + p.size) : tail) / p.size);
+    }
+    ASSERT(p.count);
+
     vio->io_req = p;
 
     rc = hvm_io_intercept(&p);
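
To illustrate the truncation above with concrete numbers (a sketch only,
assuming 4k pages; not part of the patch):

    /*
     * Example, assuming PAGE_SIZE = 0x1000:
     *   p.addr = 0x10ff8, p.size = 4, p.count = 16, p.df = 0 (forward)
     *   off  = 0x10ff8 & ~PAGE_MASK = 0xff8
     *   tail = 0x1000 - 0xff8       = 8
     *   tail >= p.size, so p.count = min(16, tail / p.size) = 2
     * Only the two elements still fitting in this GFN are emulated now;
     * the remaining reps are re-issued and re-routed for the next GFN.
     */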
++++++ 5b9784ad-x86-HVM-drop-hvm_fetch_from_guest_linear.patch ++++++
--- /var/tmp/diff_new_pack.1BrBU4/_old  2018-12-03 10:04:31.932031943 +0100
+++ /var/tmp/diff_new_pack.1BrBU4/_new  2018-12-03 10:04:31.932031943 +0100
@@ -17,7 +17,7 @@
 
 --- a/xen/arch/x86/hvm/emulate.c
 +++ b/xen/arch/x86/hvm/emulate.c
-@@ -1046,6 +1046,8 @@ static int __hvmemul_read(
+@@ -1064,6 +1064,8 @@ static int __hvmemul_read(
          pfec |= PFEC_implicit;
      else if ( hvmemul_ctxt->seg_reg[x86_seg_ss].dpl == 3 )
          pfec |= PFEC_user_mode;
@@ -26,7 +26,7 @@
  
      rc = hvmemul_virtual_to_linear(
          seg, offset, bytes, &reps, access_type, hvmemul_ctxt, &addr);
-@@ -1057,9 +1059,7 @@ static int __hvmemul_read(
+@@ -1075,9 +1077,7 @@ static int __hvmemul_read(
           (vio->mmio_gla == (addr & PAGE_MASK)) )
          return hvmemul_linear_mmio_read(addr, bytes, p_data, pfec, hvmemul_ctxt, 1);
  
@@ -37,7 +37,7 @@
  
      switch ( rc )
      {
-@@ -2498,9 +2498,10 @@ void hvm_emulate_init_per_insn(
+@@ -2516,9 +2516,10 @@ void hvm_emulate_init_per_insn(
                                          hvm_access_insn_fetch,
                                          &hvmemul_ctxt->seg_reg[x86_seg_cs],
                                          &addr) &&

++++++ 5b9784d2-x86-HVM-add-known_gla-helper.patch ++++++
--- /var/tmp/diff_new_pack.1BrBU4/_old  2018-12-03 10:04:31.948031929 +0100
+++ /var/tmp/diff_new_pack.1BrBU4/_new  2018-12-03 10:04:31.948031929 +0100
@@ -14,7 +14,7 @@
 
 --- a/xen/arch/x86/hvm/emulate.c
 +++ b/xen/arch/x86/hvm/emulate.c
-@@ -1027,6 +1027,26 @@ static inline int hvmemul_linear_mmio_wr
+@@ -1045,6 +1045,26 @@ static inline int hvmemul_linear_mmio_wr
                                        pfec, hvmemul_ctxt, translate);
  }
  
@@ -41,7 +41,7 @@
  static int __hvmemul_read(
      enum x86_segment seg,
      unsigned long offset,
-@@ -1035,11 +1055,9 @@ static int __hvmemul_read(
+@@ -1053,11 +1073,9 @@ static int __hvmemul_read(
      enum hvm_access_type access_type,
      struct hvm_emulate_ctxt *hvmemul_ctxt)
  {
@@ -53,7 +53,7 @@
      int rc;
  
      if ( is_x86_system_segment(seg) )
-@@ -1053,10 +1071,7 @@ static int __hvmemul_read(
+@@ -1071,10 +1089,7 @@ static int __hvmemul_read(
          seg, offset, bytes, &reps, access_type, hvmemul_ctxt, &addr);
      if ( rc != X86EMUL_OKAY || !bytes )
          return rc;
@@ -65,7 +65,7 @@
          return hvmemul_linear_mmio_read(addr, bytes, p_data, pfec, hvmemul_ctxt, 1);
  
      rc = hvm_copy_from_guest_linear(p_data, addr, bytes, pfec, &pfinfo);
-@@ -1157,10 +1172,8 @@ static int hvmemul_write(
+@@ -1175,10 +1190,8 @@ static int hvmemul_write(
  {
      struct hvm_emulate_ctxt *hvmemul_ctxt =
          container_of(ctxt, struct hvm_emulate_ctxt, ctxt);
@@ -76,7 +76,7 @@
      int rc;
      void *mapping;
  
-@@ -1174,8 +1187,7 @@ static int hvmemul_write(
+@@ -1192,8 +1205,7 @@ static int hvmemul_write(
      if ( rc != X86EMUL_OKAY || !bytes )
          return rc;
  
@@ -86,7 +86,7 @@
          return hvmemul_linear_mmio_write(addr, bytes, p_data, pfec, hvmemul_ctxt, 1);
  
      mapping = hvmemul_map_linear_addr(addr, bytes, pfec, hvmemul_ctxt);
-@@ -1204,7 +1216,6 @@ static int hvmemul_rmw(
+@@ -1222,7 +1234,6 @@ static int hvmemul_rmw(
          container_of(ctxt, struct hvm_emulate_ctxt, ctxt);
      unsigned long addr, reps = 1;
      uint32_t pfec = PFEC_page_present | PFEC_write_access;
@@ -94,7 +94,7 @@
      int rc;
      void *mapping;
  
-@@ -1230,8 +1241,7 @@ static int hvmemul_rmw(
+@@ -1248,8 +1259,7 @@ static int hvmemul_rmw(
      else
      {
          unsigned long data = 0;

++++++ 5b9784f2-x86-HVM-split-page-straddling-accesses.patch ++++++
--- /var/tmp/diff_new_pack.1BrBU4/_old  2018-12-03 10:04:31.956031921 +0100
+++ /var/tmp/diff_new_pack.1BrBU4/_new  2018-12-03 10:04:31.956031921 +0100
@@ -26,7 +26,7 @@
 
 --- a/xen/arch/x86/hvm/emulate.c
 +++ b/xen/arch/x86/hvm/emulate.c
-@@ -1044,7 +1044,91 @@ static bool known_gla(unsigned long addr
+@@ -1062,7 +1062,91 @@ static bool known_gla(unsigned long addr
      else if ( !vio->mmio_access.read_access )
              return false;
  
@@ -119,7 +119,7 @@
  }
  
  static int __hvmemul_read(
-@@ -1055,7 +1139,6 @@ static int __hvmemul_read(
+@@ -1073,7 +1157,6 @@ static int __hvmemul_read(
      enum hvm_access_type access_type,
      struct hvm_emulate_ctxt *hvmemul_ctxt)
  {
@@ -127,7 +127,7 @@
      unsigned long addr, reps = 1;
      uint32_t pfec = PFEC_page_present;
      int rc;
-@@ -1071,31 +1154,8 @@ static int __hvmemul_read(
+@@ -1089,31 +1172,8 @@ static int __hvmemul_read(
          seg, offset, bytes, &reps, access_type, hvmemul_ctxt, &addr);
      if ( rc != X86EMUL_OKAY || !bytes )
          return rc;
@@ -160,7 +160,7 @@
  }
  
  static int hvmemul_read(
-@@ -1175,7 +1235,7 @@ static int hvmemul_write(
+@@ -1193,7 +1253,7 @@ static int hvmemul_write(
      unsigned long addr, reps = 1;
      uint32_t pfec = PFEC_page_present | PFEC_write_access;
      int rc;
@@ -169,7 +169,7 @@
  
      if ( is_x86_system_segment(seg) )
          pfec |= PFEC_implicit;
-@@ -1187,15 +1247,15 @@ static int hvmemul_write(
+@@ -1205,15 +1265,15 @@ static int hvmemul_write(
      if ( rc != X86EMUL_OKAY || !bytes )
          return rc;
  
@@ -192,7 +192,7 @@
  
      memcpy(mapping, p_data, bytes);
  
-@@ -1217,7 +1277,7 @@ static int hvmemul_rmw(
+@@ -1235,7 +1295,7 @@ static int hvmemul_rmw(
      unsigned long addr, reps = 1;
      uint32_t pfec = PFEC_page_present | PFEC_write_access;
      int rc;
@@ -201,7 +201,7 @@
  
      rc = hvmemul_virtual_to_linear(
          seg, offset, bytes, &reps, hvm_access_write, hvmemul_ctxt, &addr);
-@@ -1229,9 +1289,12 @@ static int hvmemul_rmw(
+@@ -1247,9 +1307,12 @@ static int hvmemul_rmw(
      else if ( hvmemul_ctxt->seg_reg[x86_seg_ss].dpl == 3 )
          pfec |= PFEC_user_mode;
  
@@ -217,7 +217,7 @@
  
      if ( mapping )
      {
-@@ -1241,17 +1304,14 @@ static int hvmemul_rmw(
+@@ -1259,17 +1322,14 @@ static int hvmemul_rmw(
      else
      {
          unsigned long data = 0;

++++++ 5ba11ed4-credit2-fix-moving-CPUs-between-cpupools.patch ++++++
# Commit 6e395f477fb854f11de83a951a070d3aacb6dc59
# Date 2018-09-18 16:50:44 +0100
# Author Dario Faggioli <dfaggi...@suse.com>
# Committer George Dunlap <george.dun...@citrix.com>
xen: sched/Credit2: fix bug when moving CPUs between two Credit2 cpupools

Whether or not a CPU is assigned to a runqueue (and, if yes, to which
one) within a Credit2 scheduler instance must be both a per-cpu and
per-scheduler instance one.

In fact, when we move a CPU between cpupools, we first set up its per-cpu
data in the new pool, and then clean up its per-cpu data from the old
pool. Since Credit2 currently has no per-scheduler, per-cpu data (the
cpu-to-runqueue map is stored on a per-cpu basis only), the cleanup of
the old per-cpu data can mess with the new per-cpu data, leading to
crashes like these:

https://www.mail-archive.com/xen-devel@lists.xenproject.org/msg23306.html
https://www.mail-archive.com/xen-devel@lists.xenproject.org/msg23350.html

Basically, when csched2_deinit_pdata() is called for CPU 13, to fully
remove the CPU from Pool-0, per_cpu(13,runq_map) already contains the
id of the runqueue to which the CPU has been assigned in the scheduler
of Pool-1, which means wrong runqueue manipulations happen in Pool-0's
scheduler. Furthermore, at the end of that call, the same runq_map is
updated with -1, which is what causes the BUG_ON in csched2_schedule(),
on CPU 13, to trigger.

So, instead of reverting a2c4e5ab59d "xen: credit2: make the cpu to
runqueue map per-cpu" (as we don't want to go back to having the huge
array in struct csched2_private) add a per-cpu scheduler specific data
structure, like, for instance, Credit1 has already. That (for now) only
contains one field: the id of the runqueue the CPU is assigned to.

Signed-off-by: Dario Faggioli <dfaggi...@suse.com>
Reviewed-by: Juergen Gross <jgr...@suse.com>
Reviewed-by: George Dunlap <george.dun...@citrix.com>

--- a/xen/common/sched_credit2.c
+++ b/xen/common/sched_credit2.c
@@ -508,11 +508,10 @@ struct csched2_private {
 
 /*
  * Physical CPU
- *
- * The only per-pCPU information we need to maintain is of which runqueue
- * each CPU is part of.
  */
-static DEFINE_PER_CPU(int, runq_map);
+struct csched2_pcpu {
+    int runq_id;
+};
 
 /*
  * Virtual CPU
@@ -571,6 +570,11 @@ static inline struct csched2_private *cs
     return ops->sched_data;
 }
 
+static inline struct csched2_pcpu *csched2_pcpu(unsigned int cpu)
+{
+    return per_cpu(schedule_data, cpu).sched_priv;
+}
+
 static inline struct csched2_vcpu *csched2_vcpu(const struct vcpu *v)
 {
     return v->sched_priv;
@@ -584,7 +588,7 @@ static inline struct csched2_dom *csched
 /* CPU to runq_id macro */
 static inline int c2r(unsigned int cpu)
 {
-    return per_cpu(runq_map, cpu);
+    return csched2_pcpu(cpu)->runq_id;
 }
 
 /* CPU to runqueue struct macro */
@@ -3778,31 +3782,45 @@ csched2_dump(const struct scheduler *ops
 #undef cpustr
 }
 
+static void *
+csched2_alloc_pdata(const struct scheduler *ops, int cpu)
+{
+    struct csched2_pcpu *spc;
+
+    spc = xzalloc(struct csched2_pcpu);
+    if ( spc == NULL )
+        return ERR_PTR(-ENOMEM);
+
+    /* Not in any runqueue yet */
+    spc->runq_id = -1;
+
+    return spc;
+}
+
 /* Returns the ID of the runqueue the cpu is assigned to. */
 static unsigned
-init_pdata(struct csched2_private *prv, unsigned int cpu)
+init_pdata(struct csched2_private *prv, struct csched2_pcpu *spc,
+           unsigned int cpu)
 {
-    unsigned rqi;
     struct csched2_runqueue_data *rqd;
 
     ASSERT(rw_is_write_locked(&prv->lock));
     ASSERT(!cpumask_test_cpu(cpu, &prv->initialized));
+    /* CPU data needs to be allocated, but still uninitialized. */
+    ASSERT(spc && spc->runq_id == -1);
 
     /* Figure out which runqueue to put it in */
-    rqi = cpu_to_runqueue(prv, cpu);
+    spc->runq_id = cpu_to_runqueue(prv, cpu);
 
-    rqd = prv->rqd + rqi;
+    rqd = prv->rqd + spc->runq_id;
 
-    printk(XENLOG_INFO "Adding cpu %d to runqueue %d\n", cpu, rqi);
-    if ( ! cpumask_test_cpu(rqi, &prv->active_queues) )
+    printk(XENLOG_INFO "Adding cpu %d to runqueue %d\n", cpu, spc->runq_id);
+    if ( ! cpumask_test_cpu(spc->runq_id, &prv->active_queues) )
     {
         printk(XENLOG_INFO " First cpu on runqueue, activating\n");
-        activate_runqueue(prv, rqi);
+        activate_runqueue(prv, spc->runq_id);
     }
     
-    /* Set the runqueue map */
-    per_cpu(runq_map, cpu) = rqi;
-    
     __cpumask_set_cpu(cpu, &rqd->idle);
     __cpumask_set_cpu(cpu, &rqd->active);
     __cpumask_set_cpu(cpu, &prv->initialized);
@@ -3811,7 +3829,7 @@ init_pdata(struct csched2_private *prv,
     if ( cpumask_weight(&rqd->active) == 1 )
         rqd->pick_bias = cpu;
 
-    return rqi;
+    return spc->runq_id;
 }
 
 static void
@@ -3822,16 +3840,10 @@ csched2_init_pdata(const struct schedule
     unsigned long flags;
     unsigned rqi;
 
-    /*
-     * pdata contains what alloc_pdata returned. But since we don't (need to)
-     * implement alloc_pdata, either that's NULL, or something is very wrong!
-     */
-    ASSERT(!pdata);
-
     write_lock_irqsave(&prv->lock, flags);
     old_lock = pcpu_schedule_lock(cpu);
 
-    rqi = init_pdata(prv, cpu);
+    rqi = init_pdata(prv, pdata, cpu);
     /* Move the scheduler lock to the new runq lock. */
     per_cpu(schedule_data, cpu).schedule_lock = &prv->rqd[rqi].lock;
 
@@ -3849,7 +3861,7 @@ csched2_switch_sched(struct scheduler *n
     struct csched2_vcpu *svc = vdata;
     unsigned rqi;
 
-    ASSERT(!pdata && svc && is_idle_vcpu(svc->vcpu));
+    ASSERT(pdata && svc && is_idle_vcpu(svc->vcpu));
 
     /*
      * We own one runqueue lock already (from schedule_cpu_switch()). This
@@ -3864,7 +3876,7 @@ csched2_switch_sched(struct scheduler *n
 
     idle_vcpu[cpu]->sched_priv = vdata;
 
-    rqi = init_pdata(prv, cpu);
+    rqi = init_pdata(prv, pdata, cpu);
 
     /*
      * Now that we know what runqueue we'll go in, double check what's said
@@ -3875,7 +3887,7 @@ csched2_switch_sched(struct scheduler *n
     ASSERT(per_cpu(schedule_data, cpu).schedule_lock != &prv->rqd[rqi].lock);
 
     per_cpu(scheduler, cpu) = new_ops;
-    per_cpu(schedule_data, cpu).sched_priv = NULL; /* no pdata */
+    per_cpu(schedule_data, cpu).sched_priv = pdata;
 
     /*
      * (Re?)route the lock to the per pCPU lock as /last/ thing. In fact,
@@ -3894,7 +3906,7 @@ csched2_deinit_pdata(const struct schedu
     unsigned long flags;
     struct csched2_private *prv = csched2_priv(ops);
     struct csched2_runqueue_data *rqd;
-    int rqi;
+    struct csched2_pcpu *spc = pcpu;
 
     write_lock_irqsave(&prv->lock, flags);
 
@@ -3902,17 +3914,24 @@ csched2_deinit_pdata(const struct schedu
      * alloc_pdata is not implemented, so pcpu must be NULL. On the other
      * hand, init_pdata must have been called for this pCPU.
      */
-    ASSERT(!pcpu && cpumask_test_cpu(cpu, &prv->initialized));
+    /*
+     * Scheduler specific data for this pCPU must still be there and and be
+     * valid. In fact, if we are here:
+     *  1. alloc_pdata must have been called for this cpu, and free_pdata
+     *     must not have been called on it before us,
+     *  2. init_pdata must have been called on this cpu, and deinit_pdata
+     *     (us!) must not have been called on it already.
+     */
+    ASSERT(spc && spc->runq_id != -1);
+    ASSERT(cpumask_test_cpu(cpu, &prv->initialized));
     
     /* Find the old runqueue and remove this cpu from it */
-    rqi = per_cpu(runq_map, cpu);
-
-    rqd = prv->rqd + rqi;
+    rqd = prv->rqd + spc->runq_id;
 
     /* No need to save IRQs here, they're already disabled */
     spin_lock(&rqd->lock);
 
-    printk(XENLOG_INFO "Removing cpu %d from runqueue %d\n", cpu, rqi);
+    printk(XENLOG_INFO "Removing cpu %d from runqueue %d\n", cpu, spc->runq_id);
 
     __cpumask_clear_cpu(cpu, &rqd->idle);
     __cpumask_clear_cpu(cpu, &rqd->smt_idle);
@@ -3921,12 +3940,12 @@ csched2_deinit_pdata(const struct schedu
     if ( cpumask_empty(&rqd->active) )
     {
         printk(XENLOG_INFO " No cpus left on runqueue, disabling\n");
-        deactivate_runqueue(prv, rqi);
+        deactivate_runqueue(prv, spc->runq_id);
     }
     else if ( rqd->pick_bias == cpu )
         rqd->pick_bias = cpumask_first(&rqd->active);
 
-    per_cpu(runq_map, cpu) = -1;
+    spc->runq_id = -1;
 
     spin_unlock(&rqd->lock);
 
@@ -3937,6 +3956,24 @@ csched2_deinit_pdata(const struct schedu
     return;
 }
 
+static void
+csched2_free_pdata(const struct scheduler *ops, void *pcpu, int cpu)
+{
+    struct csched2_pcpu *spc = pcpu;
+
+    /*
+     * pcpu either points to a valid struct csched2_pcpu, or is NULL (if
+     * CPU bringup failed, and we're beeing called from CPU_UP_CANCELLED).
+     * xfree() does not really mind, but we want to be sure that either
+     * init_pdata has never been called, or deinit_pdata has been called
+     * already.
+     */
+    ASSERT(!pcpu || spc->runq_id == -1);
+    ASSERT(!cpumask_test_cpu(cpu, &csched2_priv(ops)->initialized));
+
+    xfree(pcpu);
+}
+
 static int
 csched2_init(struct scheduler *ops)
 {
@@ -4052,8 +4089,10 @@ static const struct scheduler sched_cred
     .deinit         = csched2_deinit,
     .alloc_vdata    = csched2_alloc_vdata,
     .free_vdata     = csched2_free_vdata,
+    .alloc_pdata    = csched2_alloc_pdata,
     .init_pdata     = csched2_init_pdata,
     .deinit_pdata   = csched2_deinit_pdata,
+    .free_pdata     = csched2_free_pdata,
     .switch_sched   = csched2_switch_sched,
     .alloc_domdata  = csched2_alloc_domdata,
     .free_domdata   = csched2_free_domdata,
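
As a side note (a rough sketch inferred from the hunks above, not verbatim
code), the per-CPU data now follows the usual scheduler pdata lifecycle:

    /*
     * csched2_alloc_pdata()  -> xzalloc(struct csched2_pcpu), runq_id = -1
     * csched2_init_pdata()   -> init_pdata() picks a runqueue, sets runq_id
     * ... CPU is used by this cpupool ...
     * csched2_deinit_pdata() -> leaves the runqueue, runq_id = -1
     * csched2_free_pdata()   -> xfree()
     */

Because the runqueue assignment now lives in per-scheduler pdata rather than
in a global per_cpu() variable, the old pool's cleanup can no longer clobber
the new pool's assignment while a CPU is being moved between cpupools.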
++++++ 5bacae4b-x86-boot-allocate-extra-module-slot.patch ++++++
# Commit 4c5f9dbebc0bd2afee1ecd936c74ffe65756950f
# Date 2018-09-27 11:17:47 +0100
# Author Daniel Kiper <daniel.ki...@oracle.com>
# Committer Andrew Cooper <andrew.coop...@citrix.com>
x86/boot: Allocate one extra module slot for Xen image placement

Commit 9589927 (x86/mb2: avoid Xen image when looking for
module/crashkernel position) fixed relocation issues for
the Multiboot2 protocol. Unfortunately it failed to allocate a
module slot for Xen image placement in the early boot path.
So, let's fix that right now.

Reported-by: Wei Liu <wei.l...@citrix.com>
Signed-off-by: Daniel Kiper <daniel.ki...@oracle.com>
Acked-by: Andrew Cooper <andrew.coop...@citrix.com>

--- a/xen/arch/x86/boot/reloc.c
+++ b/xen/arch/x86/boot/reloc.c
@@ -177,7 +177,12 @@ static multiboot_info_t *mbi2_reloc(u32
     if ( mbi_out->mods_count )
     {
         mbi_out->flags |= MBI_MODULES;
-        mbi_out->mods_addr = alloc_mem(mbi_out->mods_count * sizeof(*mbi_out_mods));
+        /*
+         * We have to allocate one more module slot here. At some point
+         * __start_xen() may put Xen image placement into it.
+         */
+        mbi_out->mods_addr = alloc_mem((mbi_out->mods_count + 1) *
+                                       sizeof(*mbi_out_mods));
         mbi_out_mods = _p(mbi_out->mods_addr);
     }
 
++++++ 5bae44ce-x86-silence-false-log-messages.patch ++++++
# Commit 2fb57e4beefeda923446b73f88b392e59b07d847
# Date 2018-09-28 17:12:14 +0200
# Author Jan Beulich <jbeul...@suse.com>
# Committer Jan Beulich <jbeul...@suse.com>
x86: silence false log messages for plain "xpti" / "pv-l1tf"

While commit 2a3b34ec47 ("x86/spec-ctrl: Yet more fixes for xpti=
parsing")  claimed to have got rid of the 'parameter "xpti" has invalid
value "", rc=-22!' log message for "xpti" alone on the command line,
this wasn't the case (the option took effect nevertheless).

Fix this there as well as for plain "pv-l1tf".

Signed-off-by: Jan Beulich <jbeul...@suse.com>
Acked-by: Andrew Cooper <andrew.coop...@citrix.com>

--- a/xen/arch/x86/spec_ctrl.c
+++ b/xen/arch/x86/spec_ctrl.c
@@ -256,7 +256,7 @@ static __init int parse_pv_l1tf(const ch
             else if ( (val = parse_boolean("domu", s, ss)) >= 0 )
                 opt_pv_l1tf = ((opt_pv_l1tf & ~OPT_PV_L1TF_DOMU) |
                                (val ? OPT_PV_L1TF_DOMU : 0));
-            else
+            else if ( *s )
                 rc = -EINVAL;
             break;
         }
@@ -707,7 +707,7 @@ static __init int parse_xpti(const char
             else if ( (val = parse_boolean("domu", s, ss)) >= 0 )
                 opt_xpti = (opt_xpti & ~OPT_XPTI_DOMU) |
                            (val ? OPT_XPTI_DOMU : 0);
-            else
+            else if ( *s )
                 rc = -EINVAL;
             break;
         }
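
Put differently (a sketch of the parsing flow, assuming the command line
parser hands the custom handler an empty value for a bare "xpti"):

    /*
     * "xpti" alone  ->  parse_xpti("")  ->  the *s == '\0' check enables
     * XPTI for Dom0 and DomU, but the do/while loop still sees one empty
     * token, and the plain "else" branch returned -EINVAL for it, which
     * is what produced
     *   parameter "xpti" has invalid value "", rc=-22!
     * With "else if ( *s )" the empty token is now silently accepted.
     * The same reasoning applies to plain "pv-l1tf".
     */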
++++++ 5bb60c12-x86-split-opt_xpti.patch ++++++
# Commit 51e0cb45932d80d4eeb59994ee2c3f3c597b0212
# Date 2018-10-04 14:48:18 +0200
# Author Jan Beulich <jbeul...@suse.com>
# Committer Jan Beulich <jbeul...@suse.com>
x86: split opt_xpti

Use separate tracking variables for the hardware domain and DomU-s.

No functional change intended.

Signed-off-by: Jan Beulich <jbeul...@suse.com>
Acked-by: Andrew Cooper <andrew.coop...@citrix.com>

--- a/xen/arch/x86/pv/domain.c
+++ b/xen/arch/x86/pv/domain.c
@@ -259,8 +259,8 @@ int pv_domain_initialise(struct domain *
     /* 64-bit PV guest by default. */
     d->arch.is_32bit_pv = d->arch.has_32bit_shinfo = 0;
 
-    d->arch.pv_domain.xpti = opt_xpti & (is_hardware_domain(d)
-                                         ? OPT_XPTI_DOM0 : OPT_XPTI_DOMU);
+    d->arch.pv_domain.xpti = is_hardware_domain(d) ? opt_xpti_hwdom
+                                                   : opt_xpti_domu;
 
     if ( !is_pv_32bit_domain(d) && use_invpcid && cpu_has_pcid )
         switch ( opt_pcid )
--- a/xen/arch/x86/spec_ctrl.c
+++ b/xen/arch/x86/spec_ctrl.c
@@ -134,8 +134,10 @@ static int __init parse_spec_ctrl(const
 
             opt_eager_fpu = 0;
 
-            if ( opt_xpti < 0 )
-                opt_xpti = 0;
+            if ( opt_xpti_hwdom < 0 )
+                opt_xpti_hwdom = 0;
+            if ( opt_xpti_domu < 0 )
+                opt_xpti_domu = 0;
 
             if ( opt_smt < 0 )
                 opt_smt = 1;
@@ -343,8 +345,8 @@ static void __init print_details(enum in
            opt_eager_fpu                             ? " EAGER_FPU"     : "");
 
     printk("  XPTI (64-bit PV only): Dom0 %s, DomU %s\n",
-           opt_xpti & OPT_XPTI_DOM0 ? "enabled" : "disabled",
-           opt_xpti & OPT_XPTI_DOMU ? "enabled" : "disabled");
+           opt_xpti_hwdom ? "enabled" : "disabled",
+           opt_xpti_domu  ? "enabled" : "disabled");
 
     printk("  PV L1TF shadowing: Dom0 %s, DomU %s\n",
            opt_pv_l1tf & OPT_PV_L1TF_DOM0  ? "enabled"  : "disabled",
@@ -657,7 +659,8 @@ static __init void l1tf_calculations(uin
                                             : (3ul << (paddr_bits - 2))));
 }
 
-int8_t __read_mostly opt_xpti = -1;
+int8_t __read_mostly opt_xpti_hwdom = -1;
+int8_t __read_mostly opt_xpti_domu = -1;
 
 static __init void xpti_init_default(uint64_t caps)
 {
@@ -665,9 +668,19 @@ static __init void xpti_init_default(uin
         caps = ARCH_CAPABILITIES_RDCL_NO;
 
     if ( caps & ARCH_CAPABILITIES_RDCL_NO )
-        opt_xpti = 0;
+    {
+        if ( opt_xpti_hwdom < 0 )
+            opt_xpti_hwdom = 0;
+        if ( opt_xpti_domu < 0 )
+            opt_xpti_domu = 0;
+    }
     else
-        opt_xpti = OPT_XPTI_DOM0 | OPT_XPTI_DOMU;
+    {
+        if ( opt_xpti_hwdom < 0 )
+            opt_xpti_hwdom = 1;
+        if ( opt_xpti_domu < 0 )
+            opt_xpti_domu = 1;
+    }
 }
 
 static __init int parse_xpti(const char *s)
@@ -676,12 +689,14 @@ static __init int parse_xpti(const char
     int val, rc = 0;
 
     /* Inhibit the defaults as an explicit choice has been given. */
-    if ( opt_xpti == -1 )
-        opt_xpti = 0;
+    if ( opt_xpti_hwdom == -1 )
+        opt_xpti_hwdom = 0;
+    if ( opt_xpti_domu == -1 )
+        opt_xpti_domu = 0;
 
     /* Interpret 'xpti' alone in its positive boolean form. */
     if ( *s == '\0' )
-        opt_xpti = OPT_XPTI_DOM0 | OPT_XPTI_DOMU;
+        opt_xpti_hwdom = opt_xpti_domu = 1;
 
     do {
         ss = strchr(s, ',');
@@ -691,22 +706,20 @@ static __init int parse_xpti(const char
         switch ( parse_bool(s, ss) )
         {
         case 0:
-            opt_xpti = 0;
+            opt_xpti_hwdom = opt_xpti_domu = 0;
             break;
 
         case 1:
-            opt_xpti = OPT_XPTI_DOM0 | OPT_XPTI_DOMU;
+            opt_xpti_hwdom = opt_xpti_domu = 1;
             break;
 
         default:
             if ( !strcmp(s, "default") )
-                opt_xpti = -1;
+                opt_xpti_hwdom = opt_xpti_domu = -1;
             else if ( (val = parse_boolean("dom0", s, ss)) >= 0 )
-                opt_xpti = (opt_xpti & ~OPT_XPTI_DOM0) |
-                           (val ? OPT_XPTI_DOM0 : 0);
+                opt_xpti_hwdom = val;
             else if ( (val = parse_boolean("domu", s, ss)) >= 0 )
-                opt_xpti = (opt_xpti & ~OPT_XPTI_DOMU) |
-                           (val ? OPT_XPTI_DOMU : 0);
+                opt_xpti_domu = val;
             else if ( *s )
                 rc = -EINVAL;
             break;
@@ -862,10 +875,9 @@ void __init init_speculation_mitigations
     if ( default_xen_spec_ctrl )
         setup_force_cpu_cap(X86_FEATURE_SC_MSR_IDLE);
 
-    if ( opt_xpti == -1 )
-        xpti_init_default(caps);
+    xpti_init_default(caps);
 
-    if ( opt_xpti == 0 )
+    if ( !opt_xpti_hwdom && !opt_xpti_domu )
         setup_force_cpu_cap(X86_FEATURE_NO_XPTI);
     else
         setup_clear_cpu_cap(X86_FEATURE_NO_XPTI);
--- a/xen/include/asm-x86/spec_ctrl.h
+++ b/xen/include/asm-x86/spec_ctrl.h
@@ -35,9 +35,7 @@ extern bool bsp_delay_spec_ctrl;
 extern uint8_t default_xen_spec_ctrl;
 extern uint8_t default_spec_ctrl_flags;
 
-extern int8_t opt_xpti;
-#define OPT_XPTI_DOM0  0x01
-#define OPT_XPTI_DOMU  0x02
+extern int8_t opt_xpti_hwdom, opt_xpti_domu;
 
 extern int8_t opt_pv_l1tf;
 #define OPT_PV_L1TF_DOM0  0x01
++++++ 5bb60c4f-x86-split-opt_pv_l1tf.patch ++++++
# Commit 0b89643ef6ef14e2c2b731ca675d23e405ed69b1
# Date 2018-10-04 14:49:19 +0200
# Author Jan Beulich <jbeul...@suse.com>
# Committer Jan Beulich <jbeul...@suse.com>
x86: split opt_pv_l1tf

Use separate tracking variables for the hardware domain and DomU-s.

No functional change intended, but adjust the comment in
init_speculation_mitigations() to match prior as well as resulting code.

Signed-off-by: Jan Beulich <jbeul...@suse.com>
Acked-by: Andrew Cooper <andrew.coop...@citrix.com>

--- a/xen/arch/x86/spec_ctrl.c
+++ b/xen/arch/x86/spec_ctrl.c
@@ -142,8 +142,10 @@ static int __init parse_spec_ctrl(const
             if ( opt_smt < 0 )
                 opt_smt = 1;
 
-            if ( opt_pv_l1tf < 0 )
-                opt_pv_l1tf = 0;
+            if ( opt_pv_l1tf_hwdom < 0 )
+                opt_pv_l1tf_hwdom = 0;
+            if ( opt_pv_l1tf_domu < 0 )
+                opt_pv_l1tf_domu = 0;
 
         disable_common:
             opt_rsb_pv = false;
@@ -221,7 +223,8 @@ static int __init parse_spec_ctrl(const
 }
 custom_param("spec-ctrl", parse_spec_ctrl);
 
-int8_t __read_mostly opt_pv_l1tf = -1;
+int8_t __read_mostly opt_pv_l1tf_hwdom = -1;
+int8_t __read_mostly opt_pv_l1tf_domu = -1;
 
 static __init int parse_pv_l1tf(const char *s)
 {
@@ -229,12 +232,14 @@ static __init int parse_pv_l1tf(const ch
     int val, rc = 0;
 
     /* Inhibit the defaults as an explicit choice has been given. */
-    if ( opt_pv_l1tf == -1 )
-        opt_pv_l1tf = 0;
+    if ( opt_pv_l1tf_hwdom == -1 )
+        opt_pv_l1tf_hwdom = 0;
+    if ( opt_pv_l1tf_domu == -1 )
+        opt_pv_l1tf_domu = 0;
 
     /* Interpret 'pv-l1tf' alone in its positive boolean form. */
     if ( *s == '\0' )
-        opt_pv_l1tf = OPT_PV_L1TF_DOM0 | OPT_PV_L1TF_DOMU;
+        opt_pv_l1tf_hwdom = opt_pv_l1tf_domu = 1;
 
     do {
         ss = strchr(s, ',');
@@ -244,20 +249,18 @@ static __init int parse_pv_l1tf(const ch
         switch ( parse_bool(s, ss) )
         {
         case 0:
-            opt_pv_l1tf = 0;
+            opt_pv_l1tf_hwdom = opt_pv_l1tf_domu = 0;
             break;
 
         case 1:
-            opt_pv_l1tf = OPT_PV_L1TF_DOM0 | OPT_PV_L1TF_DOMU;
+            opt_pv_l1tf_hwdom = opt_pv_l1tf_domu = 1;
             break;
 
         default:
             if ( (val = parse_boolean("dom0", s, ss)) >= 0 )
-                opt_pv_l1tf = ((opt_pv_l1tf & ~OPT_PV_L1TF_DOM0) |
-                               (val ? OPT_PV_L1TF_DOM0 : 0));
+                opt_pv_l1tf_hwdom = val;
             else if ( (val = parse_boolean("domu", s, ss)) >= 0 )
-                opt_pv_l1tf = ((opt_pv_l1tf & ~OPT_PV_L1TF_DOMU) |
-                               (val ? OPT_PV_L1TF_DOMU : 0));
+                opt_pv_l1tf_domu = val;
             else if ( *s )
                 rc = -EINVAL;
             break;
@@ -320,7 +323,7 @@ static void __init print_details(enum in
            opt_l1d_flush                             ? " L1D_FLUSH" : "");
 
     /* L1TF diagnostics, printed if vulnerable or PV shadowing is in use. */
-    if ( cpu_has_bug_l1tf || opt_pv_l1tf )
+    if ( cpu_has_bug_l1tf || opt_pv_l1tf_hwdom || opt_pv_l1tf_domu )
         printk("  L1TF: believed%s vulnerable, maxphysaddr L1D %u, CPUID %u"
                ", Safe address %"PRIx64"\n",
                cpu_has_bug_l1tf ? "" : " not",
@@ -349,8 +352,8 @@ static void __init print_details(enum in
            opt_xpti_domu  ? "enabled" : "disabled");
 
     printk("  PV L1TF shadowing: Dom0 %s, DomU %s\n",
-           opt_pv_l1tf & OPT_PV_L1TF_DOM0  ? "enabled"  : "disabled",
-           opt_pv_l1tf & OPT_PV_L1TF_DOMU  ? "enabled"  : "disabled");
+           opt_pv_l1tf_hwdom ? "enabled"  : "disabled",
+           opt_pv_l1tf_domu  ? "enabled"  : "disabled");
 }
 
 /* Calculate whether Retpoline is known-safe on this CPU. */
@@ -891,13 +894,10 @@ void __init init_speculation_mitigations
      * In shim mode, SHADOW is expected to be compiled out, and a malicious
      * guest kernel can only attack the shim Xen, not the host Xen.
      */
-    if ( opt_pv_l1tf == -1 )
-    {
-        if ( pv_shim || !cpu_has_bug_l1tf )
-            opt_pv_l1tf = 0;
-        else
-            opt_pv_l1tf = OPT_PV_L1TF_DOMU;
-    }
+    if ( opt_pv_l1tf_hwdom == -1 )
+        opt_pv_l1tf_hwdom = 0;
+    if ( opt_pv_l1tf_domu == -1 )
+        opt_pv_l1tf_domu = !pv_shim && cpu_has_bug_l1tf;
 
     /*
      * By default, enable L1D_FLUSH on L1TF-vulnerable hardware, unless
--- a/xen/include/asm-x86/shadow.h
+++ b/xen/include/asm-x86/shadow.h
@@ -224,9 +224,8 @@ void pv_l1tf_tasklet(unsigned long data)
 
 static inline void pv_l1tf_domain_init(struct domain *d)
 {
-    d->arch.pv_domain.check_l1tf =
-        opt_pv_l1tf & (is_hardware_domain(d)
-                       ? OPT_PV_L1TF_DOM0 : OPT_PV_L1TF_DOMU);
+    d->arch.pv_domain.check_l1tf = is_hardware_domain(d) ? opt_pv_l1tf_hwdom
+                                                         : opt_pv_l1tf_domu;
 
 #if defined(CONFIG_SHADOW_PAGING) && defined(CONFIG_PV)
     tasklet_init(&d->arch.paging.shadow.pv_l1tf_tasklet,
--- a/xen/include/asm-x86/spec_ctrl.h
+++ b/xen/include/asm-x86/spec_ctrl.h
@@ -37,9 +37,7 @@ extern uint8_t default_spec_ctrl_flags;
 
 extern int8_t opt_xpti_hwdom, opt_xpti_domu;
 
-extern int8_t opt_pv_l1tf;
-#define OPT_PV_L1TF_DOM0  0x01
-#define OPT_PV_L1TF_DOMU  0x02
+extern int8_t opt_pv_l1tf_hwdom, opt_pv_l1tf_domu;
 
 /*
  * The L1D address mask, which might be wider than reported in CPUID, and the
++++++ 5bb60c74-x86-fix-xpti-and-pv-l1tf.patch ++++++
# Commit 8743d2dea539617e237c77556a91dc357098a8af
# Date 2018-10-04 14:49:56 +0200
# Author Jan Beulich <jbeul...@suse.com>
# Committer Jan Beulich <jbeul...@suse.com>
x86: fix "xpti=" and "pv-l1tf=" yet again

While commit 2a3b34ec47 ("x86/spec-ctrl: Yet more fixes for xpti=
parsing") indeed fixed "xpti=dom0", it broke "xpti=no-dom0", in that
this then became equivalent to "xpti=no". In particular, the presence
of "xpti=" alone on the command line means nothing as to which default
is to be overridden; "xpti=no-dom0", for example, ought to have no
effect for DomU-s, as this is distinct from both "xpti=no-dom0,domu"
and "xpti=no-dom0,no-domu".

Signed-off-by: Jan Beulich <jbeul...@suse.com>
Acked-by: Andrew Cooper <andrew.coop...@citrix.com>

--- a/xen/arch/x86/spec_ctrl.c
+++ b/xen/arch/x86/spec_ctrl.c
@@ -231,12 +231,6 @@ static __init int parse_pv_l1tf(const ch
     const char *ss;
     int val, rc = 0;
 
-    /* Inhibit the defaults as an explicit choice has been given. */
-    if ( opt_pv_l1tf_hwdom == -1 )
-        opt_pv_l1tf_hwdom = 0;
-    if ( opt_pv_l1tf_domu == -1 )
-        opt_pv_l1tf_domu = 0;
-
     /* Interpret 'pv-l1tf' alone in its positive boolean form. */
     if ( *s == '\0' )
         opt_pv_l1tf_hwdom = opt_pv_l1tf_domu = 1;
@@ -691,12 +685,6 @@ static __init int parse_xpti(const char
     const char *ss;
     int val, rc = 0;
 
-    /* Inhibit the defaults as an explicit choice has been given. */
-    if ( opt_xpti_hwdom == -1 )
-        opt_xpti_hwdom = 0;
-    if ( opt_xpti_domu == -1 )
-        opt_xpti_domu = 0;
-
     /* Interpret 'xpti' alone in its positive boolean form. */
     if ( *s == '\0' )
         opt_xpti_hwdom = opt_xpti_domu = 1;
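
To make the intended semantics concrete, a few example command-line forms and
the settings they should yield after this fix (illustration only, not part of
the patch):

    /*
     * "xpti"              -> opt_xpti_hwdom = 1, opt_xpti_domu = 1
     * "xpti=no"           -> opt_xpti_hwdom = 0, opt_xpti_domu = 0
     * "xpti=no-dom0"      -> opt_xpti_hwdom = 0, opt_xpti_domu stays at its
     *                        default, chosen later by xpti_init_default()
     * "xpti=no-dom0,domu" -> opt_xpti_hwdom = 0, opt_xpti_domu = 1
     * The same pattern applies to "pv-l1tf=".
     */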
++++++ 5bcf0722-x86-boot-enable-NMIs.patch ++++++
# Commit 072e054359a4d4a4f6c3fa09585667472c4f0f1d
# Date 2018-10-23 12:33:54 +0100
# Author Sergey Dyasli <sergey.dya...@citrix.com>
# Committer Andrew Cooper <andrew.coop...@citrix.com>
x86/boot: enable NMIs after traps init

In certain scenarios, NMIs might be disabled during the Xen boot process.
Such a situation will cause alternative_instructions() to:

    panic("Timed out waiting for alternatives self-NMI to hit\n");

This bug was originally seen when using Tboot to boot Xen 4.11.

To prevent this from happening, enable NMIs during cpu_init() and
during __start_xen() for the BSP.

Signed-off-by: Sergey Dyasli <sergey.dya...@citrix.com>
Reviewed-by: Andrew Cooper <andrew.coop...@citrix.com>

--- a/xen/arch/x86/cpu/common.c
+++ b/xen/arch/x86/cpu/common.c
@@ -831,6 +831,9 @@ void cpu_init(void)
 #define CD(register) asm volatile ( "mov %0,%%db" #register : : "r"(0UL) );
        CD(0); CD(1); CD(2); CD(3); /* no db4 and db5 */; CD(6); CD(7);
 #undef CD
+
+       /* Enable NMIs.  Our loader (e.g. Tboot) may have left them disabled. */
+       enable_nmis();
 }
 
 void cpu_uninit(unsigned int cpu)
--- a/xen/arch/x86/setup.c
+++ b/xen/arch/x86/setup.c
@@ -702,6 +702,9 @@ void __init noreturn __start_xen(unsigne
 
     /* Full exception support from here on in. */
 
+    /* Enable NMIs.  Our loader (e.g. Tboot) may have left them disabled. */
+    enable_nmis();
+
     if ( pvh_boot )
     {
         ASSERT(mbi_p == 0);
++++++ 5bd076e9-dombuilder-init-vcpu-debug-regs-correctly.patch ++++++
# Commit 46029da12e5efeca6d957e5793bd34f2965fa0a1
# Date 2018-10-24 14:43:05 +0100
# Author Andrew Cooper <andrew.coop...@citrix.com>
# Committer Andrew Cooper <andrew.coop...@citrix.com>
tools/dombuilder: Initialise vcpu debug registers correctly

In particular, initialising %dr6 with the value 0 is buggy, because on
hardware supporting Transactional Memory, it will cause the sticky RTM bit to
be asserted, even though a debug exception from a transaction hasn't actually
been observed.

Signed-off-by: Andrew Cooper <andrew.coop...@citrix.com>
Reviewed-by: Jan Beulich <jbeul...@suse.com>
Reviewed-by: Roger Pau Monné <roger....@citrix.com>
Acked-by: Wei Liu <wei.l...@citrix.com>

--- a/tools/libxc/xc_dom_x86.c
+++ b/tools/libxc/xc_dom_x86.c
@@ -53,6 +53,9 @@
 #define X86_CR0_PE 0x01
 #define X86_CR0_ET 0x10
 
+#define X86_DR6_DEFAULT 0xffff0ff0u
+#define X86_DR7_DEFAULT 0x00000400u
+
 #define SPECIALPAGE_PAGING   0
 #define SPECIALPAGE_ACCESS   1
 #define SPECIALPAGE_SHARING  2
@@ -860,6 +863,9 @@ static int vcpu_x86_32(struct xc_dom_ima
         dom->parms.virt_base + (dom->start_info_pfn) * PAGE_SIZE_X86;
     ctxt->user_regs.eflags = 1 << 9; /* Interrupt Enable */
 
+    ctxt->debugreg[6] = X86_DR6_DEFAULT;
+    ctxt->debugreg[7] = X86_DR7_DEFAULT;
+
     ctxt->flags = VGCF_in_kernel_X86_32 | VGCF_online_X86_32;
     if ( dom->parms.pae == XEN_PAE_EXTCR3 ||
          dom->parms.pae == XEN_PAE_BIMODAL )
@@ -907,6 +913,9 @@ static int vcpu_x86_64(struct xc_dom_ima
         dom->parms.virt_base + (dom->start_info_pfn) * PAGE_SIZE_X86;
     ctxt->user_regs.rflags = 1 << 9; /* Interrupt Enable */
 
+    ctxt->debugreg[6] = X86_DR6_DEFAULT;
+    ctxt->debugreg[7] = X86_DR7_DEFAULT;
+
     ctxt->flags = VGCF_in_kernel_X86_64 | VGCF_online_X86_64;
     cr3_pfn = xc_dom_p2m(dom, dom->pgtables_seg.pfn);
     ctxt->ctrlreg[3] = xen_pfn_to_cr3_x86_64(cr3_pfn);
@@ -1011,6 +1020,9 @@ static int vcpu_hvm(struct xc_dom_image
     /* Set the IP. */
     bsp_ctx.cpu.rip = dom->parms.phys_entry;
 
+    bsp_ctx.cpu.dr6 = X86_DR6_DEFAULT;
+    bsp_ctx.cpu.dr7 = X86_DR7_DEFAULT;
+
     if ( dom->start_info_seg.pfn )
         bsp_ctx.cpu.rbx = dom->start_info_seg.pfn << PAGE_SHIFT;
 
++++++ 5bd076e9-x86-boot-init-debug-regs-correctly.patch ++++++
# Commit 721da6d41a70fe08b3fcd9c31a62f6709a54c6ba
# Date 2018-10-24 14:43:05 +0100
# Author Andrew Cooper <andrew.coop...@citrix.com>
# Committer Andrew Cooper <andrew.coop...@citrix.com>
x86/boot: Initialise the debug registers correctly

In particular, initialising %dr6 with the value 0 is buggy, because on
hardware supporting Transactional Memory, it will cause the sticky RTM bit to
be asserted, even though a debug exception from a transaction hasn't actually
been observed.

Move X86_DR6_DEFAULT into x86-defns.h along with the other architectural
register constants, and introduce a new X86_DR7_DEFAULT.  Use the existing
write_debugreg() helper, rather than opencoded inline assembly.

Signed-off-by: Andrew Cooper <andrew.coop...@citrix.com>
Reviewed-by: Jan Beulich <jbeul...@suse.com>
Reviewed-by: Roger Pau Monné <roger....@citrix.com>

--- a/xen/arch/x86/cpu/common.c
+++ b/xen/arch/x86/cpu/common.c
@@ -3,6 +3,7 @@
 #include <xen/delay.h>
 #include <xen/smp.h>
 #include <asm/current.h>
+#include <asm/debugreg.h>
 #include <asm/processor.h>
 #include <asm/xstate.h>
 #include <asm/msr.h>
@@ -827,10 +828,13 @@ void cpu_init(void)
        /* Ensure FPU gets initialised for each domain. */
        stts();
 
-       /* Clear all 6 debug registers: */
-#define CD(register) asm volatile ( "mov %0,%%db" #register : : "r"(0UL) );
-       CD(0); CD(1); CD(2); CD(3); /* no db4 and db5 */; CD(6); CD(7);
-#undef CD
+       /* Reset debug registers: */
+       write_debugreg(0, 0);
+       write_debugreg(1, 0);
+       write_debugreg(2, 0);
+       write_debugreg(3, 0);
+       write_debugreg(6, X86_DR6_DEFAULT);
+       write_debugreg(7, X86_DR7_DEFAULT);
 
        /* Enable NMIs.  Our loader (e.g. Tboot) may have left them disabled. */
        enable_nmis();
--- a/xen/include/asm-x86/debugreg.h
+++ b/xen/include/asm-x86/debugreg.h
@@ -24,8 +24,6 @@
 #define DR_STATUS_RESERVED_ZERO (~0xffffeffful) /* Reserved, read as zero */
 #define DR_STATUS_RESERVED_ONE  0xffff0ff0ul /* Reserved, read as one */
 
-#define X86_DR6_DEFAULT 0xffff0ff0ul    /* Default %dr6 value. */
-
 /* Now define a bunch of things for manipulating the control register.
    The top two bytes of the control register consist of 4 fields of 4
    bits - each field corresponds to one of the four debug registers,
--- a/xen/include/asm-x86/x86-defns.h
+++ b/xen/include/asm-x86/x86-defns.h
@@ -97,4 +97,14 @@
 #define X86_XCR0_LWP_POS          62
 #define X86_XCR0_LWP              (1ULL << X86_XCR0_LWP_POS)
 
+/*
+ * Debug status flags in DR6.
+ */
+#define X86_DR6_DEFAULT         0xffff0ff0  /* Default %dr6 value. */
+
+/*
+ * Debug control flags in DR7.
+ */
+#define X86_DR7_DEFAULT         0x00000400  /* Default %dr7 value. */
+
 #endif /* __XEN_X86_DEFNS_H__ */
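
For context (based on the architectural register definitions and the
DR_STATUS_RESERVED_ONE constant visible above, not spelled out in the patch):
the two defaults match the reset values of the debug registers.

    /*
     * %dr6: reserved bits 4-11 and 16-31 read as 1 (bit 16 being the sticky
     *       RTM bit mentioned in the commit message)  -> 0xffff0ff0
     * %dr7: only the reserved bit 10 reads as 1       -> 0x00000400
     */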
++++++ 5bd076e9-x86-init-vcpu-debug-regs-correctly.patch ++++++
# Commit dfba4d2e91f63a8f40493c4fc2db03fd8287f6cb
# Date 2018-10-24 14:43:05 +0100
# Author Andrew Cooper <andrew.coop...@citrix.com>
# Committer Andrew Cooper <andrew.coop...@citrix.com>
x86/domain: Initialise vcpu debug registers correctly

In particular, initialising %dr6 with the value 0 is buggy, because on
hardware supporting Transactional Memory, it will cause the sticky RTM bit to
be asserted, even though a debug exception from a transaction hasn't actually
been observed.

Introduce arch_vcpu_regs_init() to set various architectural defaults, and
reuse this in the hvm_vcpu_reset_state() path.

Architecturally, %edx's init state contains the processor's model information,
and 0xf looks to be a remnant of the old Intel processors.  We clearly have no
software which cares, seeing as it is wrong for the last decade's worth of
Intel hardware and for all other vendors, so let's use the value 0 for
simplicity.

Signed-off-by: Andrew Cooper <andrew.coop...@citrix.com>
Reviewed-by: Jan Beulich <jbeul...@suse.com>
Reviewed-by: Roger Pau Monné <roger....@citrix.com>

# Commit 0a1fa635029d100d4b6b7eddb31d49603217cab7
# Date 2018-10-30 13:26:21 +0000
# Author Andrew Cooper <andrew.coop...@citrix.com>
# Committer Andrew Cooper <andrew.coop...@citrix.com>
x86/domain: Fix build with GCC 4.3.x

GCC 4.3.x can't initialise the user_regs structure like this.

Reported-by: Jan Beulich <jbeul...@suse.com>
Signed-off-by: Andrew Cooper <andrew.coop...@citrix.com>
Reviewed-by: Wei Liu <wei.l...@citrix.com>
Acked-by: Jan Beulich <jbeul...@suse.com>

--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -323,6 +323,17 @@ void free_vcpu_struct(struct vcpu *v)
     free_xenheap_page(v);
 }
 
+/* Initialise various registers to their architectural INIT/RESET state. */
+void arch_vcpu_regs_init(struct vcpu *v)
+{
+    memset(&v->arch.user_regs, 0, sizeof(v->arch.user_regs));
+    v->arch.user_regs.eflags = X86_EFLAGS_MBS;
+
+    memset(v->arch.debugreg, 0, sizeof(v->arch.debugreg));
+    v->arch.debugreg[6] = X86_DR6_DEFAULT;
+    v->arch.debugreg[7] = X86_DR7_DEFAULT;
+}
+
 int vcpu_initialise(struct vcpu *v)
 {
     struct domain *d = v->domain;
@@ -342,6 +353,8 @@ int vcpu_initialise(struct vcpu *v)
             return rc;
 
         vmce_init_vcpu(v);
+
+        arch_vcpu_regs_init(v);
     }
     else if ( (rc = xstate_alloc_save_area(v)) != 0 )
         return rc;
--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -3896,11 +3896,9 @@ void hvm_vcpu_reset_state(struct vcpu *v
     }
 
     v->arch.vgc_flags = VGCF_online;
-    memset(&v->arch.user_regs, 0, sizeof(v->arch.user_regs));
-    v->arch.user_regs.rflags = X86_EFLAGS_MBS;
-    v->arch.user_regs.rdx = 0x00000f00;
+
+    arch_vcpu_regs_init(v);
     v->arch.user_regs.rip = ip;
-    memset(&v->arch.debugreg, 0, sizeof(v->arch.debugreg));
 
     v->arch.hvm_vcpu.guest_cr[0] = X86_CR0_ET;
     hvm_update_guest_cr(v, 0);
--- a/xen/include/asm-x86/domain.h
+++ b/xen/include/asm-x86/domain.h
@@ -633,6 +633,8 @@ static inline void free_vcpu_guest_conte
     vfree(vgc);
 }
 
+void arch_vcpu_regs_init(struct vcpu *v);
+
 struct vcpu_hvm_context;
 int arch_set_info_hvm_guest(struct vcpu *v, const struct vcpu_hvm_context *ctx);
 
++++++ 5bd0e11b-x86-disallow-VT-x-insns-without-nested-virt.patch ++++++
# Commit 35cd5ba367515ffbd274ca529c5e946447f4ba48
# Date 2018-10-24 22:16:11 +0100
# Author Andrew Cooper <andrew.coop...@citrix.com>
# Committer Andrew Cooper <andrew.coop...@citrix.com>
x86/vvmx: Disallow the use of VT-x instructions when nested virt is disabled

c/s ac6a4500b "vvmx: set vmxon_region_pa of vcpu out of VMX operation to an
invalid address" was a real bugfix as described, but has a very subtle bug
which results in all VT-x instructions being usable by a guest.

The toolstack constructs a guest by issuing:

  XEN_DOMCTL_createdomain
  XEN_DOMCTL_max_vcpus

and optionally later, HVMOP_set_param to enable nested virt.

As a result, the call to nvmx_vcpu_initialise() in hvm_vcpu_initialise()
(which is what makes the above patch look correct during review) is actually
dead code.  In practice, nvmx_vcpu_initialise() first gets called when nested
virt is enabled, which is typically never.

As a result, the zeroed memory of struct vcpu causes nvmx_vcpu_in_vmx() to
return true before nested virt is enabled for the guest.

Fixing the order of initialisation is a work in progress for other reasons,
but not viable for security backports.

A compounding factor is that the vmexit handlers for all instructions, other
than VMXON, pass 0 into vmx_inst_check_privilege()'s vmxop_check parameter,
which skips the CR4.VMXE check.  (This is one of many reasons why nested virt
isn't a supported feature yet.)

However, the overall result is that when nested virt is not enabled by the
toolstack (i.e. the default configuration for all production guests), the VT-x
instructions (other than VMXON) are actually usable, and Xen very quickly
falls over the fact that the nvmx structure is uninitialised.

In order to fail safe in the supported case, re-implement all the VT-x
instruction handling using a single function with a common prologue, covering
all the checks which should cause #UD or #GP faults.  This deliberately
doesn't use any state from the nvmx structure, in case there are other lurking
issues.

This is XSA-278

Reported-by: Sergey Dyasli <sergey.dya...@citrix.com>
Signed-off-by: Andrew Cooper <andrew.coop...@citrix.com>
Reviewed-by: Sergey Dyasli <sergey.dya...@citrix.com>

--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -3982,57 +3982,17 @@ void vmx_vmexit_handler(struct cpu_user_
         break;
 
     case EXIT_REASON_VMXOFF:
-        if ( nvmx_handle_vmxoff(regs) == X86EMUL_OKAY )
-            update_guest_eip();
-        break;
-
     case EXIT_REASON_VMXON:
-        if ( nvmx_handle_vmxon(regs) == X86EMUL_OKAY )
-            update_guest_eip();
-        break;
-
     case EXIT_REASON_VMCLEAR:
-        if ( nvmx_handle_vmclear(regs) == X86EMUL_OKAY )
-            update_guest_eip();
-        break;
- 
     case EXIT_REASON_VMPTRLD:
-        if ( nvmx_handle_vmptrld(regs) == X86EMUL_OKAY )
-            update_guest_eip();
-        break;
-
     case EXIT_REASON_VMPTRST:
-        if ( nvmx_handle_vmptrst(regs) == X86EMUL_OKAY )
-            update_guest_eip();
-        break;
-
     case EXIT_REASON_VMREAD:
-        if ( nvmx_handle_vmread(regs) == X86EMUL_OKAY )
-            update_guest_eip();
-        break;
- 
     case EXIT_REASON_VMWRITE:
-        if ( nvmx_handle_vmwrite(regs) == X86EMUL_OKAY )
-            update_guest_eip();
-        break;
-
     case EXIT_REASON_VMLAUNCH:
-        if ( nvmx_handle_vmlaunch(regs) == X86EMUL_OKAY )
-            update_guest_eip();
-        break;
-
     case EXIT_REASON_VMRESUME:
-        if ( nvmx_handle_vmresume(regs) == X86EMUL_OKAY )
-            update_guest_eip();
-        break;
-
     case EXIT_REASON_INVEPT:
-        if ( nvmx_handle_invept(regs) == X86EMUL_OKAY )
-            update_guest_eip();
-        break;
-
     case EXIT_REASON_INVVPID:
-        if ( nvmx_handle_invvpid(regs) == X86EMUL_OKAY )
+        if ( nvmx_handle_vmx_insn(regs, exit_reason) == X86EMUL_OKAY )
             update_guest_eip();
         break;
 
--- a/xen/arch/x86/hvm/vmx/vvmx.c
+++ b/xen/arch/x86/hvm/vmx/vvmx.c
@@ -1470,7 +1470,7 @@ void nvmx_switch_guest(void)
  * VMX instructions handling
  */
 
-int nvmx_handle_vmxon(struct cpu_user_regs *regs)
+static int nvmx_handle_vmxon(struct cpu_user_regs *regs)
 {
     struct vcpu *v=current;
     struct nestedvmx *nvmx = &vcpu_2_nvmx(v);
@@ -1522,7 +1522,7 @@ int nvmx_handle_vmxon(struct cpu_user_re
     return X86EMUL_OKAY;
 }
 
-int nvmx_handle_vmxoff(struct cpu_user_regs *regs)
+static int nvmx_handle_vmxoff(struct cpu_user_regs *regs)
 {
     struct vcpu *v=current;
     struct nestedvmx *nvmx = &vcpu_2_nvmx(v);
@@ -1611,7 +1611,7 @@ static int nvmx_vmresume(struct vcpu *v,
     return X86EMUL_OKAY;
 }
 
-int nvmx_handle_vmresume(struct cpu_user_regs *regs)
+static int nvmx_handle_vmresume(struct cpu_user_regs *regs)
 {
     bool_t launched;
     struct vcpu *v = current;
@@ -1645,7 +1645,7 @@ int nvmx_handle_vmresume(struct cpu_user
     return nvmx_vmresume(v,regs);
 }
 
-int nvmx_handle_vmlaunch(struct cpu_user_regs *regs)
+static int nvmx_handle_vmlaunch(struct cpu_user_regs *regs)
 {
     bool_t launched;
     struct vcpu *v = current;
@@ -1688,7 +1688,7 @@ int nvmx_handle_vmlaunch(struct cpu_user
     return rc;
 }
 
-int nvmx_handle_vmptrld(struct cpu_user_regs *regs)
+static int nvmx_handle_vmptrld(struct cpu_user_regs *regs)
 {
     struct vcpu *v = current;
     struct vmx_inst_decoded decode;
@@ -1759,7 +1759,7 @@ out:
     return X86EMUL_OKAY;
 }
 
-int nvmx_handle_vmptrst(struct cpu_user_regs *regs)
+static int nvmx_handle_vmptrst(struct cpu_user_regs *regs)
 {
     struct vcpu *v = current;
     struct vmx_inst_decoded decode;
@@ -1784,7 +1784,7 @@ int nvmx_handle_vmptrst(struct cpu_user_
     return X86EMUL_OKAY;
 }
 
-int nvmx_handle_vmclear(struct cpu_user_regs *regs)
+static int nvmx_handle_vmclear(struct cpu_user_regs *regs)
 {
     struct vcpu *v = current;
     struct vmx_inst_decoded decode;
@@ -1836,7 +1836,7 @@ int nvmx_handle_vmclear(struct cpu_user_
     return X86EMUL_OKAY;
 }
 
-int nvmx_handle_vmread(struct cpu_user_regs *regs)
+static int nvmx_handle_vmread(struct cpu_user_regs *regs)
 {
     struct vcpu *v = current;
     struct vmx_inst_decoded decode;
@@ -1878,7 +1878,7 @@ int nvmx_handle_vmread(struct cpu_user_r
     return X86EMUL_OKAY;
 }
 
-int nvmx_handle_vmwrite(struct cpu_user_regs *regs)
+static int nvmx_handle_vmwrite(struct cpu_user_regs *regs)
 {
     struct vcpu *v = current;
     struct vmx_inst_decoded decode;
@@ -1926,7 +1926,7 @@ int nvmx_handle_vmwrite(struct cpu_user_
     return X86EMUL_OKAY;
 }
 
-int nvmx_handle_invept(struct cpu_user_regs *regs)
+static int nvmx_handle_invept(struct cpu_user_regs *regs)
 {
     struct vmx_inst_decoded decode;
     unsigned long eptp;
@@ -1954,7 +1954,7 @@ int nvmx_handle_invept(struct cpu_user_r
     return X86EMUL_OKAY;
 }
 
-int nvmx_handle_invvpid(struct cpu_user_regs *regs)
+static int nvmx_handle_invvpid(struct cpu_user_regs *regs)
 {
     struct vmx_inst_decoded decode;
     unsigned long vpid;
@@ -1980,6 +1980,81 @@ int nvmx_handle_invvpid(struct cpu_user_
     return X86EMUL_OKAY;
 }
 
+int nvmx_handle_vmx_insn(struct cpu_user_regs *regs, unsigned int exit_reason)
+{
+    struct vcpu *curr = current;
+    int ret;
+
+    if ( !(curr->arch.hvm_vcpu.guest_cr[4] & X86_CR4_VMXE) ||
+         !nestedhvm_enabled(curr->domain) ||
+         (vmx_guest_x86_mode(curr) < (hvm_long_mode_active(curr) ? 8 : 2)) )
+    {
+        hvm_inject_hw_exception(TRAP_invalid_op, X86_EVENT_NO_EC);
+        return X86EMUL_EXCEPTION;
+    }
+
+    if ( vmx_get_cpl() > 0 )
+    {
+        hvm_inject_hw_exception(TRAP_gp_fault, 0);
+        return X86EMUL_EXCEPTION;
+    }
+
+    switch ( exit_reason )
+    {
+    case EXIT_REASON_VMXOFF:
+        ret = nvmx_handle_vmxoff(regs);
+        break;
+
+    case EXIT_REASON_VMXON:
+        ret = nvmx_handle_vmxon(regs);
+        break;
+
+    case EXIT_REASON_VMCLEAR:
+        ret = nvmx_handle_vmclear(regs);
+        break;
+
+    case EXIT_REASON_VMPTRLD:
+        ret = nvmx_handle_vmptrld(regs);
+        break;
+
+    case EXIT_REASON_VMPTRST:
+        ret = nvmx_handle_vmptrst(regs);
+        break;
+
+    case EXIT_REASON_VMREAD:
+        ret = nvmx_handle_vmread(regs);
+        break;
+
+    case EXIT_REASON_VMWRITE:
+        ret = nvmx_handle_vmwrite(regs);
+        break;
+
+    case EXIT_REASON_VMLAUNCH:
+        ret = nvmx_handle_vmlaunch(regs);
+        break;
+
+    case EXIT_REASON_VMRESUME:
+        ret = nvmx_handle_vmresume(regs);
+        break;
+
+    case EXIT_REASON_INVEPT:
+        ret = nvmx_handle_invept(regs);
+        break;
+
+    case EXIT_REASON_INVVPID:
+        ret = nvmx_handle_invvpid(regs);
+        break;
+
+    default:
+        ASSERT_UNREACHABLE();
+        domain_crash(curr->domain);
+        ret = X86EMUL_UNHANDLEABLE;
+        break;
+    }
+
+    return ret;
+}
+
 #define __emul_value(enable1, default1) \
     ((enable1 | default1) << 32 | (default1))
 
--- a/xen/include/asm-x86/hvm/vmx/vvmx.h
+++ b/xen/include/asm-x86/hvm/vmx/vvmx.h
@@ -94,9 +94,6 @@ void nvmx_domain_relinquish_resources(st
 
 bool_t nvmx_ept_enabled(struct vcpu *v);
 
-int nvmx_handle_vmxon(struct cpu_user_regs *regs);
-int nvmx_handle_vmxoff(struct cpu_user_regs *regs);
-
 #define EPT_TRANSLATE_SUCCEED       0
 #define EPT_TRANSLATE_VIOLATION     1
 #define EPT_TRANSLATE_MISCONFIG     2
@@ -191,15 +188,7 @@ enum vmx_insn_errno set_vvmcs_real_safe(
 uint64_t get_shadow_eptp(struct vcpu *v);
 
 void nvmx_destroy_vmcs(struct vcpu *v);
-int nvmx_handle_vmptrld(struct cpu_user_regs *regs);
-int nvmx_handle_vmptrst(struct cpu_user_regs *regs);
-int nvmx_handle_vmclear(struct cpu_user_regs *regs);
-int nvmx_handle_vmread(struct cpu_user_regs *regs);
-int nvmx_handle_vmwrite(struct cpu_user_regs *regs);
-int nvmx_handle_vmresume(struct cpu_user_regs *regs);
-int nvmx_handle_vmlaunch(struct cpu_user_regs *regs);
-int nvmx_handle_invept(struct cpu_user_regs *regs);
-int nvmx_handle_invvpid(struct cpu_user_regs *regs);
+int nvmx_handle_vmx_insn(struct cpu_user_regs *regs, unsigned int exit_reason);
 int nvmx_msr_read_intercept(unsigned int msr,
                                 u64 *msr_content);
 
++++++ 5bd85bfd-x86-fix-crash-on-xl-set-parameter-pcid.patch ++++++
# Commit f993c3e90728705dacd834b49a6e5608c1360409
# Date 2018-10-30 13:26:21 +0000
# Author Andrew Cooper <andrew.coop...@citrix.com>
# Committer Andrew Cooper <andrew.coop...@citrix.com>
x86/pv: Fix crash when using `xl set-parameter pcid=...`

"pcid=" is registered as a runtime parameter, which means that parse_pcid()
must not reside in .init, or the following happens when parse_params() tries
to call an unmapped function pointer.
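
Roughly speaking (a sketch only; "parse_example" is a made-up name, and
custom_runtime_param()/custom_param() are the registration macros as I
understand them), the distinction is:

    /* Runtime parameters can be invoked via `xl set-parameter` long after
     * .init.text has been freed, so their handlers must stay resident: */
    static int parse_example(const char *s);
    custom_runtime_param("example", parse_example);

    /* Boot-only parameters may safely use __init handlers: */
    static int __init parse_boot_example(const char *s);
    custom_param("boot-example", parse_boot_example);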

  (XEN) ----[ Xen-4.12-unstable  x86_64  debug=y   Not tainted ]----
  (XEN) CPU:    0
  (XEN) RIP:    e008:[<ffff82d080407fb3>] ffff82d080407fb3
  (XEN) RFLAGS: 0000000000010292   CONTEXT: hypervisor (d0v1)
  (XEN) rax: ffff82d080407fb3   rbx: ffff82d0803cf270   rcx: 0000000000000000
  (XEN) rdx: ffff8300abe67fff   rsi: 000000000000000a   rdi: ffff8300abe67bfd
  (XEN) rbp: ffff8300abe67ca8   rsp: ffff8300abe67ba0   r8:  ffff83084d980000
  (XEN) r9:  0000000000000000   r10: 0000000000000000   r11: 0000000000000000
  (XEN) r12: ffff8300abe67bfd   r13: ffff82d0803cb628   r14: 0000000000000000
  (XEN) r15: ffff8300abe67bf8   cr0: 0000000080050033   cr4: 0000000000172660
  (XEN) cr3: 0000000828efd000   cr2: ffff82d080407fb3
  (XEN) fsb: 00007fb810d4b780   gsb: ffff88007ce20000   gss: 0000000000000000
  (XEN) ds: 0000   es: 0000   fs: 0000   gs: 0000   ss: e010   cs: e008
  (XEN) Xen code around <ffff82d080407fb3> (ffff82d080407fb3) [fault on access]:
  (XEN)  -- -- -- -- -- -- -- -- <--> -- -- -- -- -- -- -- -- -- -- -- -- -- -- --
  (XEN) Xen stack trace from rsp=ffff8300abe67ba0:
  (XEN)    ffff82d080217f61 ffff830826db0f09 ffff8300abe67bf8 ffff82d0803cf1e0
  (XEN)    00007cff54198409 ffff8300abe67bf0 010001d000000000 0000000000000000
  (XEN)    ffff82d0803cf288 ffff8300abe67c88 ffff82d0805a09c0 616c620064696370
  (XEN)    00000000aaaa0068 0000000000000296 ffff82d08023d60e aaaaaaaaaaaaaaaa
  (XEN)    ffff83084d9b4000 ffff8300abe67c68 ffff82d08024940e ffff83083736e000
  (XEN)    0000000000000080 000000000000007a 000000000000000a ffff82d08045e61c
  (XEN)    ffff82d080573d80 ffff8300abe67cb8 ffff82d080249805 80000007fce54067
  (XEN)    fffffffffffffff2 ffff830826db0f00 ffff8300abfa7000 ffff82d08045e61c
  (XEN)    ffff82d080573d80 ffff8300abe67cb8 ffff82d08021801e ffff8300abe67e48
  (XEN)    ffff82d08023f60a ffff83083736e000 0000000000000000 ffff8300abe67d58
  (XEN)    ffff82d080293d90 0000000000000092 ffff82d08023d60e ffff820040006ae0
  (XEN)    0000000000000000 0000000000000000 00007fb810d5c010 ffff83083736e248
  (XEN)    0000000000000286 ffff8300abe67d58 0000000000000000 ffff82e010521b00
  (XEN)    0000000000000206 0000000000000000 0000000000000000 ffff8300abe67e48
  (XEN)    ffff82d080295270 00000000ffffffff ffff83083736e000 ffff8300abe67e48
  (XEN)    ffff820040006ae0 ffff8300abe67d98 000000120000001c 00007fb810d5d010
  (XEN)    0000000000000009 0000000000000002 0000000000000001 00007fb810b53260
  (XEN)    0000000000000001 0000000000000000 0000000000638bc0 00007fb81066a748
  (XEN)    00007ffe11087881 0000000000000002 0000000000000001 00007fb810b53260
  (XEN)    0000000000638b60 0000000000000000 00007fb8100322a0 ffff82d08035d444
  (XEN) Xen call trace:
  (XEN)    [<ffff82d080217f61>] kernel.c#parse_params+0x34a/0x3eb
  (XEN)    [<ffff82d08021801e>] runtime_parse+0x1c/0x1e
  (XEN)    [<ffff82d08023f60a>] do_sysctl+0x108d/0x1241
  (XEN)    [<ffff82d0803535cb>] pv_hypercall+0x1ac/0x4c5
  (XEN)    [<ffff82d08035d4a2>] lstar_enter+0x112/0x120
  (XEN)
  (XEN) Pagetable walk from ffff82d080407fb3:
  (XEN)  L4[0x105] = 00000000abe5c063 ffffffffffffffff
  (XEN)  L3[0x142] = 00000000abe59063 ffffffffffffffff
  (XEN)  L2[0x002] = 000000084d9bf063 ffffffffffffffff
  (XEN)  L1[0x007] = 0000000000000000 ffffffffffffffff
  (XEN)
  (XEN) ****************************************
  (XEN) Panic on CPU 0:
  (XEN) FATAL PAGE FAULT
  (XEN) [error_code=0010]
  (XEN) Faulting linear address: ffff82d080407fb3
  (XEN) ****************************************

Signed-off-by: Andrew Cooper <andrew.coop...@citrix.com>
Reviewed-by: Jan Beulich <jbeul...@suse.com>

--- a/xen/arch/x86/pv/domain.c
+++ b/xen/arch/x86/pv/domain.c
@@ -22,7 +22,7 @@ static __read_mostly enum {
     PCID_NOXPTI
 } opt_pcid = PCID_XPTI;
 
-static __init int parse_pcid(const char *s)
+static int parse_pcid(const char *s)
 {
     int rc = 0;
 
++++++ 5bdc31d5-VMX-fix-vmx_handle_eoi.patch ++++++
References: bsc#1108940

# Commit 45cb9a4123b5550eb1f84846fe5482acae1c13a3
# Date 2018-11-02 12:15:33 +0100
# Author Jan Beulich <jbeul...@suse.com>
# Committer Jan Beulich <jbeul...@suse.com>
VMX: fix vmx_handle_eoi()

In commit 303066fdb1e ("VMX: fix interaction of APIC-V and Viridian
emulation") I screwed up: Instead of clearing SVI, other ISR bits
should be taken into account.

Introduce a new helper set_svi(), split out of vmx_process_isr(), and
use it also from vmx_handle_eoi().
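
The extra argument passed to handle_eoi() below is the highest vector still
set in the ISR.  As a rough standalone sketch of such a lookup (not the Xen
implementation, which lives in the vlapic code):

    #include <stdint.h>

    /* 256 vectors tracked in eight 32-bit words; returns -1 if nothing is
     * still in service, in which case SVI is simply written as 0. */
    static int highest_isr_vector(const uint32_t isr[8])
    {
        for ( int i = 7; i >= 0; i-- )
            if ( isr[i] )
                return i * 32 + (31 - __builtin_clz(isr[i]));

        return -1;
    }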

Following the problems in vmx_intr_assist() (see the still present big
block of debugging code there) also warn (once) if EOI'd vector and
original SVI don't match.

Signed-off-by: Jan Beulich <jbeul...@suse.com>
Reviewed-by: Chao Gao <chao....@intel.com>
Acked-by: Kevin Tian <kevin.t...@intel.com>
Acked-by: Andrew Cooper <andrew.coop...@citrix.com>

--- a/xen/arch/x86/hvm/vlapic.c
+++ b/xen/arch/x86/hvm/vlapic.c
@@ -448,7 +448,7 @@ void vlapic_EOI_set(struct vlapic *vlapi
     vlapic_clear_vector(vector, &vlapic->regs->data[APIC_ISR]);
 
     if ( hvm_funcs.handle_eoi )
-        hvm_funcs.handle_eoi(vector);
+        hvm_funcs.handle_eoi(vector, vlapic_find_highest_isr(vlapic));
 
     vlapic_handle_EOI(vlapic, vector);
 
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -1915,17 +1915,14 @@ static int vmx_virtual_intr_delivery_ena
     return cpu_has_vmx_virtual_intr_delivery;
 }
 
-static void vmx_process_isr(int isr, struct vcpu *v)
+static u8 set_svi(int isr)
 {
     unsigned long status;
     u8 old;
-    unsigned int i;
-    const struct vlapic *vlapic = vcpu_vlapic(v);
 
     if ( isr < 0 )
         isr = 0;
 
-    vmx_vmcs_enter(v);
     __vmread(GUEST_INTR_STATUS, &status);
     old = status >> VMX_GUEST_INTR_STATUS_SVI_OFFSET;
     if ( isr != old )
@@ -1935,6 +1932,18 @@ static void vmx_process_isr(int isr, str
         __vmwrite(GUEST_INTR_STATUS, status);
     }
 
+    return old;
+}
+
+static void vmx_process_isr(int isr, struct vcpu *v)
+{
+    unsigned int i;
+    const struct vlapic *vlapic = vcpu_vlapic(v);
+
+    vmx_vmcs_enter(v);
+
+    set_svi(isr);
+
     /*
      * Theoretically, only level triggered interrupts can have their
      * corresponding bits set in the eoi exit bitmap. That is, the bits
@@ -2085,14 +2094,13 @@ static bool vmx_test_pir(const struct vc
     return pi_test_pir(vec, &v->arch.hvm_vmx.pi_desc);
 }
 
-static void vmx_handle_eoi(u8 vector)
+static void vmx_handle_eoi(uint8_t vector, int isr)
 {
-    unsigned long status;
+    uint8_t old_svi = set_svi(isr);
+    static bool warned;
 
-    /* We need to clear the SVI field. */
-    __vmread(GUEST_INTR_STATUS, &status);
-    status &= VMX_GUEST_INTR_STATUS_SUBFIELD_BITMASK;
-    __vmwrite(GUEST_INTR_STATUS, status);
+    if ( vector != old_svi && !test_and_set_bool(warned) )
+        printk(XENLOG_WARNING "EOI for %02x but SVI=%02x\n", vector, old_svi);
 }
 
 static void vmx_enable_msr_interception(struct domain *d, uint32_t msr)
--- a/xen/include/asm-x86/hvm/hvm.h
+++ b/xen/include/asm-x86/hvm/hvm.h
@@ -200,7 +200,7 @@ struct hvm_function_table {
     void (*deliver_posted_intr)(struct vcpu *v, u8 vector);
     void (*sync_pir_to_irr)(struct vcpu *v);
     bool (*test_pir)(const struct vcpu *v, uint8_t vector);
-    void (*handle_eoi)(u8 vector);
+    void (*handle_eoi)(uint8_t vector, int isr);
 
     /*Walk nested p2m  */
     int (*nhvm_hap_walk_L1_p2m)(struct vcpu *v, paddr_t L2_gpa,
++++++ 5be2a308-x86-extend-get_platform_badpages.patch ++++++
# Commit 8617e69fb8307b372eeff41d55ec966dbeba36eb
# Date 2018-11-07 09:32:08 +0100
# Author Jan Beulich <jbeul...@suse.com>
# Committer Jan Beulich <jbeul...@suse.com>
x86: extend get_platform_badpages() interface

Use a structure so that, along with an address (now a frame number), an
order can also be specified.
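
As a standalone illustration of the new { mfn, order } semantics (not part of
the patch; PAGE_SHIFT value assumed), an entry covers 2^order contiguous
frames starting at mfn, which is the range the init_boot_pages() hunk below
hands to bootmem_region_zap():

    #include <stdio.h>

    #define PAGE_SHIFT 12

    struct platform_bad_page { unsigned long mfn; unsigned int order; };

    int main(void)
    {
        struct platform_bad_page p = { .mfn = 0x20050000UL >> PAGE_SHIFT };

        /* order 0 -> one frame; order n -> 2^n frames */
        printf("zap frames [%#lx, %#lx)\n", p.mfn, p.mfn + (1UL << p.order));
        return 0;
    }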

This is part of XSA-282.

Signed-off-by: Jan Beulich <jbeul...@suse.com>
Reviewed-by: Andrew Cooper <andrew.coop...@citrix.com>

--- a/xen/arch/x86/guest/xen.c
+++ b/xen/arch/x86/guest/xen.c
@@ -40,7 +40,7 @@ bool __read_mostly xen_guest;
 static __read_mostly uint32_t xen_cpuid_base;
 extern char hypercall_page[];
 static struct rangeset *mem;
-static unsigned long __initdata reserved_pages[2];
+static struct platform_bad_page __initdata reserved_pages[2];
 
 DEFINE_PER_CPU(unsigned int, vcpu_id);
 
@@ -326,7 +326,7 @@ void __init hypervisor_fixup_e820(struct
         panic("Unable to get " #p);             \
     mark_pfn_as_ram(e820, pfn);                 \
     ASSERT(i < ARRAY_SIZE(reserved_pages));     \
-    reserved_pages[i++] = pfn << PAGE_SHIFT;    \
+    reserved_pages[i++].mfn = pfn;              \
 })
     MARK_PARAM_RAM(HVM_PARAM_STORE_PFN);
     if ( !pv_console )
@@ -334,7 +334,7 @@ void __init hypervisor_fixup_e820(struct
 #undef MARK_PARAM_RAM
 }
 
-const unsigned long *__init hypervisor_reserved_pages(unsigned int *size)
+const struct platform_bad_page *__init hypervisor_reserved_pages(unsigned int 
*size)
 {
     ASSERT(xen_guest);
 
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -5769,23 +5769,23 @@ void arch_dump_shared_mem_info(void)
             mem_sharing_get_nr_saved_mfns());
 }
 
-const unsigned long *__init get_platform_badpages(unsigned int *array_size)
+const struct platform_bad_page *__init get_platform_badpages(unsigned int *array_size)
 {
     u32 igd_id;
-    static unsigned long __initdata bad_pages[] = {
-        0x20050000,
-        0x20110000,
-        0x20130000,
-        0x20138000,
-        0x40004000,
+    static const struct platform_bad_page __initconst snb_bad_pages[] = {
+        { .mfn = 0x20050000 >> PAGE_SHIFT },
+        { .mfn = 0x20110000 >> PAGE_SHIFT },
+        { .mfn = 0x20130000 >> PAGE_SHIFT },
+        { .mfn = 0x20138000 >> PAGE_SHIFT },
+        { .mfn = 0x40004000 >> PAGE_SHIFT },
     };
 
-    *array_size = ARRAY_SIZE(bad_pages);
+    *array_size = ARRAY_SIZE(snb_bad_pages);
     igd_id = pci_conf_read32(0, 0, 2, 0, 0);
-    if ( !IS_SNB_GFX(igd_id) )
-        return NULL;
+    if ( IS_SNB_GFX(igd_id) )
+        return snb_bad_pages;
 
-    return bad_pages;
+    return NULL;
 }
 
 void paging_invlpg(struct vcpu *v, unsigned long va)
--- a/xen/common/page_alloc.c
+++ b/xen/common/page_alloc.c
@@ -270,7 +270,7 @@ void __init init_boot_pages(paddr_t ps,
     unsigned long bad_spfn, bad_epfn;
     const char *p;
 #ifdef CONFIG_X86
-    const unsigned long *badpage = NULL;
+    const struct platform_bad_page *badpage;
     unsigned int i, array_size;
 
     BUILD_BUG_ON(8 * sizeof(frame_table->u.free.first_dirty) <
@@ -299,8 +299,8 @@ void __init init_boot_pages(paddr_t ps,
     {
         for ( i = 0; i < array_size; i++ )
         {
-            bootmem_region_zap(*badpage >> PAGE_SHIFT,
-                               (*badpage >> PAGE_SHIFT) + 1);
+            bootmem_region_zap(badpage->mfn,
+                               badpage->mfn + (1U << badpage->order));
             badpage++;
         }
     }
@@ -312,8 +312,8 @@ void __init init_boot_pages(paddr_t ps,
         {
             for ( i = 0; i < array_size; i++ )
             {
-                bootmem_region_zap(*badpage >> PAGE_SHIFT,
-                                   (*badpage >> PAGE_SHIFT) + 1);
+                bootmem_region_zap(badpage->mfn,
+                                   badpage->mfn + (1U << badpage->order));
                 badpage++;
             }
         }
--- a/xen/include/asm-x86/guest/xen.h
+++ b/xen/include/asm-x86/guest/xen.h
@@ -37,7 +37,7 @@ void hypervisor_ap_setup(void);
 int hypervisor_alloc_unused_page(mfn_t *mfn);
 int hypervisor_free_unused_page(mfn_t mfn);
 void hypervisor_fixup_e820(struct e820map *e820);
-const unsigned long *hypervisor_reserved_pages(unsigned int *size);
+const struct platform_bad_page *hypervisor_reserved_pages(unsigned int *size);
 uint32_t hypervisor_cpuid_base(void);
 void hypervisor_resume(void);
 
@@ -65,7 +65,7 @@ static inline void hypervisor_fixup_e820
     ASSERT_UNREACHABLE();
 }
 
-static inline const unsigned long *hypervisor_reserved_pages(unsigned int *size)
+static inline const struct platform_bad_page *hypervisor_reserved_pages(unsigned int *size)
 {
     ASSERT_UNREACHABLE();
     return NULL;
--- a/xen/include/asm-x86/mm.h
+++ b/xen/include/asm-x86/mm.h
@@ -341,7 +341,13 @@ void zap_ro_mpt(mfn_t mfn);
 
 bool is_iomem_page(mfn_t mfn);
 
-const unsigned long *get_platform_badpages(unsigned int *array_size);
+struct platform_bad_page {
+    unsigned long mfn;
+    unsigned int order;
+};
+
+const struct platform_bad_page *get_platform_badpages(unsigned int *array_size);
+
 /* Per page locks:
  * page_lock() is used for two purposes: pte serialization, and memory sharing.
  *
++++++ 5be2a354-x86-work-around-HLE-host-lockup-erratum.patch ++++++
# Commit cc76410d20aff2cc07b268b0713dc1d2740c6e12
# Date 2018-11-07 09:33:24 +0100
# Author Jan Beulich <jbeul...@suse.com>
# Committer Jan Beulich <jbeul...@suse.com>
x86: work around HLE host lockup erratum

XACQUIRE prefixed accesses to the 4Mb range of memory starting at 1Gb
are liable to lock up the processor. Disallow use of this memory range.
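
For reference, a quick back-of-the-envelope check (not part of the patch;
PAGE_SHIFT assumed to be 12) of how that range becomes the { mfn, order }
entry added below:

    #include <stdio.h>

    #define PAGE_SHIFT 12

    int main(void)
    {
        unsigned long base  = 0x40000000UL;     /* 1 GiB */
        unsigned long bytes = 4UL << 20;        /* 4 MiB */

        /* 0x40000000 >> 12 == 0x40000; 4 MiB is 1024 pages == 1 << 10 */
        printf("mfn %#lx, order %d\n", base >> PAGE_SHIFT,
               __builtin_ctzl(bytes >> PAGE_SHIFT));
        return 0;
    }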

Unfortunately the available Core Gen7 and Gen8 spec updates are pretty
old, so I can only guess that they're similarly affected, given that Core
Gen6 and the Xeon counterparts are.

This is part of XSA-282.

Signed-off-by: Jan Beulich <jbeul...@suse.com>
Reviewed-by: Andrew Cooper <andrew.coop...@citrix.com>

--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -5779,6 +5779,22 @@ const struct platform_bad_page *__init g
         { .mfn = 0x20138000 >> PAGE_SHIFT },
         { .mfn = 0x40004000 >> PAGE_SHIFT },
     };
+    static const struct platform_bad_page __initconst hle_bad_page = {
+        .mfn = 0x40000000 >> PAGE_SHIFT, .order = 10
+    };
+
+    switch ( cpuid_eax(1) & 0x000f3ff0 )
+    {
+    case 0x000406e0: /* erratum SKL167 */
+    case 0x00050650: /* erratum SKZ63 */
+    case 0x000506e0: /* errata SKL167 / SKW159 */
+    case 0x000806e0: /* erratum KBL??? */
+    case 0x000906e0: /* errata KBL??? / KBW114 / CFW103 */
+        *array_size = (cpuid_eax(0) >= 7 &&
+                       !(cpuid_ecx(1) & cpufeat_mask(X86_FEATURE_HYPERVISOR)) &&
+                       (cpuid_count_ebx(7, 0) & cpufeat_mask(X86_FEATURE_HLE)));
+        return &hle_bad_page;
+    }
 
     *array_size = ARRAY_SIZE(snb_bad_pages);
     igd_id = pci_conf_read32(0, 0, 2, 0, 0);
++++++ reproducible.patch ++++++
--- /var/tmp/diff_new_pack.1BrBU4/_old  2018-12-03 10:04:32.276031629 +0100
+++ /var/tmp/diff_new_pack.1BrBU4/_new  2018-12-03 10:04:32.276031629 +0100
@@ -11,10 +11,10 @@
     
     Signed-off-by: Bernhard M. Wiedemann <bwiedem...@suse.de>
 
-diff --git a/Config.mk b/Config.mk
-index 9b13e75a3e..46b064bcae 100644
---- a/Config.mk
-+++ b/Config.mk
+Index: xen-4.11.0-testing/Config.mk
+===================================================================
+--- xen-4.11.0-testing.orig/Config.mk
++++ xen-4.11.0-testing/Config.mk
 @@ -151,6 +151,14 @@ export XEN_HAS_BUILD_ID=y
  build_id_linker := --build-id=sha1
  endif
@@ -30,11 +30,11 @@
  ifndef XEN_HAS_CHECKPOLICY
      CHECKPOLICY ?= checkpolicy
     XEN_HAS_CHECKPOLICY := $(shell $(CHECKPOLICY) -h 2>&1 | grep -q xen && echo y || echo n)
-diff --git a/xen/arch/x86/Makefile b/xen/arch/x86/Makefile
-index 162b0b94c0..866125a8ac 100644
---- a/xen/arch/x86/Makefile
-+++ b/xen/arch/x86/Makefile
-@@ -158,6 +158,7 @@ note.o: $(TARGET)-syms
+Index: xen-4.11.0-testing/xen/arch/x86/Makefile
+===================================================================
+--- xen-4.11.0-testing.orig/xen/arch/x86/Makefile
++++ xen-4.11.0-testing/xen/arch/x86/Makefile
+@@ -157,6 +157,7 @@ note.o: $(TARGET)-syms
  
  EFI_LDFLAGS = $(patsubst -m%,-mi386pep,$(LDFLAGS)) --subsystem=10
  EFI_LDFLAGS += --image-base=$(1) --stack=0,0 --heap=0,0 --strip-debug

++++++ tmp_build.patch ++++++
--- /var/tmp/diff_new_pack.1BrBU4/_old  2018-12-03 10:04:32.312031596 +0100
+++ /var/tmp/diff_new_pack.1BrBU4/_new  2018-12-03 10:04:32.312031596 +0100
@@ -22,7 +22,7 @@
  
  xenstore: xenstore_client.o $(LIBXENSTORE)
       $(CC) $< $(LDFLAGS) $(LDLIBS_libxenstore) $(LDLIBS_libxentoolcore) $(SOCKET_LIBS) -o $@ $(APPEND_LDFLAGS)
-+      $(CC) $< $(CFLAGS) $(LDFLAGS) -Wl,--build-id=sha1 -L. -lxenstore $(LDLIBS_libxentoolcore) $(SOCKET_LIBS) -o domu-$@
++      $(CC) $< $(CFLAGS) $(LDFLAGS) -Wl,--build-id=uuid -L. -lxenstore $(LDLIBS_libxentoolcore) $(SOCKET_LIBS) -o domu-$@
  
  xenstore-control: xenstore_control.o $(LIBXENSTORE)
       $(CC) $< $(LDFLAGS) $(LDLIBS_libxenstore) $(LDLIBS_libxentoolcore) $(SOCKET_LIBS) -o $@ $(APPEND_LDFLAGS)

++++++ xsa275-1.patch ++++++
amd/iommu: fix flush checks

Flush checking for AMD IOMMU didn't check whether the previous entry
was present, or whether the flags (writable/readable) changed in order
to decide whether a flush should be executed.

Fix this by taking the writable/readable/next-level fields into account,
together with the present bit.
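
Distilled into a standalone predicate (a sketch of the rule as described
above, not the code added below, which works on the raw PTE fields):

    #include <stdbool.h>
    #include <stdint.h>

    /* A flush is only needed when a previously present entry changed its
     * address, permissions or next-level field; newly created entries can
     * rely on non-present entries not being cached. */
    static bool iommu_pte_update_needs_flush(bool old_present,
                                             uint64_t old_maddr, uint64_t new_maddr,
                                             bool old_r, bool new_r,
                                             bool old_w, bool new_w,
                                             unsigned int old_level,
                                             unsigned int new_level)
    {
        if ( !old_present )
            return false;

        return old_maddr != new_maddr || old_r != new_r || old_w != new_w ||
               old_level != new_level;
    }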

Along these lines the flushing in amd_iommu_map_page() must not be
omitted for PV domains. The comment there was simply wrong: Mappings may
very well change, both their addresses and their permissions. Ultimately
this should honor iommu_dont_flush_iotlb, but to achieve this
amd_iommu_ops first needs to gain an .iotlb_flush hook.

Also make clear_iommu_pte_present() static, to demonstrate there's no
caller omitting the (subsequent) flush.

This is part of XSA-275.

Signed-off-by: Roger Pau Monné <roger....@citrix.com>
Signed-off-by: Jan Beulich <jbeul...@suse.com>

--- a/xen/drivers/passthrough/amd/iommu_map.c
+++ b/xen/drivers/passthrough/amd/iommu_map.c
@@ -35,7 +35,7 @@ static unsigned int pfn_to_pde_idx(unsig
     return idx;
 }
 
-void clear_iommu_pte_present(unsigned long l1_mfn, unsigned long gfn)
+static void clear_iommu_pte_present(unsigned long l1_mfn, unsigned long gfn)
 {
     u64 *table, *pte;
 
@@ -49,23 +49,42 @@ static bool_t set_iommu_pde_present(u32
                                     unsigned int next_level,
                                     bool_t iw, bool_t ir)
 {
-    u64 addr_lo, addr_hi, maddr_old, maddr_next;
+    uint64_t addr_lo, addr_hi, maddr_next;
     u32 entry;
-    bool_t need_flush = 0;
+    bool need_flush = false, old_present;
 
     maddr_next = (u64)next_mfn << PAGE_SHIFT;
 
-    addr_hi = get_field_from_reg_u32(pde[1],
-                                     IOMMU_PTE_ADDR_HIGH_MASK,
-                                     IOMMU_PTE_ADDR_HIGH_SHIFT);
-    addr_lo = get_field_from_reg_u32(pde[0],
-                                     IOMMU_PTE_ADDR_LOW_MASK,
-                                     IOMMU_PTE_ADDR_LOW_SHIFT);
-
-    maddr_old = (addr_hi << 32) | (addr_lo << PAGE_SHIFT);
-
-    if ( maddr_old != maddr_next )
-        need_flush = 1;
+    old_present = get_field_from_reg_u32(pde[0], IOMMU_PTE_PRESENT_MASK,
+                                         IOMMU_PTE_PRESENT_SHIFT);
+    if ( old_present )
+    {
+        bool old_r, old_w;
+        unsigned int old_level;
+        uint64_t maddr_old;
+
+        addr_hi = get_field_from_reg_u32(pde[1],
+                                         IOMMU_PTE_ADDR_HIGH_MASK,
+                                         IOMMU_PTE_ADDR_HIGH_SHIFT);
+        addr_lo = get_field_from_reg_u32(pde[0],
+                                         IOMMU_PTE_ADDR_LOW_MASK,
+                                         IOMMU_PTE_ADDR_LOW_SHIFT);
+        old_level = get_field_from_reg_u32(pde[0],
+                                           IOMMU_PDE_NEXT_LEVEL_MASK,
+                                           IOMMU_PDE_NEXT_LEVEL_SHIFT);
+        old_w = get_field_from_reg_u32(pde[1],
+                                       IOMMU_PTE_IO_WRITE_PERMISSION_MASK,
+                                       IOMMU_PTE_IO_WRITE_PERMISSION_SHIFT);
+        old_r = get_field_from_reg_u32(pde[1],
+                                       IOMMU_PTE_IO_READ_PERMISSION_MASK,
+                                       IOMMU_PTE_IO_READ_PERMISSION_SHIFT);
+
+        maddr_old = (addr_hi << 32) | (addr_lo << PAGE_SHIFT);
+
+        if ( maddr_old != maddr_next || iw != old_w || ir != old_r ||
+             old_level != next_level )
+            need_flush = true;
+    }
 
     addr_lo = maddr_next & DMA_32BIT_MASK;
     addr_hi = maddr_next >> 32;
@@ -687,10 +706,7 @@ int amd_iommu_map_page(struct domain *d,
     if ( !need_flush )
         goto out;
 
-    /* 4K mapping for PV guests never changes, 
-     * no need to flush if we trust non-present bits */
-    if ( is_hvm_domain(d) )
-        amd_iommu_flush_pages(d, gfn, 0);
+    amd_iommu_flush_pages(d, gfn, 0);
 
     for ( merge_level = IOMMU_PAGING_MODE_LEVEL_2;
           merge_level <= hd->arch.paging_mode; merge_level++ )
++++++ xsa275-2.patch ++++++
AMD/IOMMU: suppress PTE merging after initial table creation

The logic is not fit for this purpose, so simply disable its use until
it can be fixed / replaced. Note that this re-enables merging for the
table creation case, which was disabled as a (perhaps unintended) side
effect of the earlier "amd/iommu: fix flush checks". It relies on no
page getting mapped more than once (with different properties) in this
process, as that would still be beyond what the merging logic can cope
with. But arch_iommu_populate_page_table() guarantees this afaict.

This is part of XSA-275.

Signed-off-by: Jan Beulich <jbeul...@suse.com>

--- a/xen/drivers/passthrough/amd/iommu_map.c
+++ b/xen/drivers/passthrough/amd/iommu_map.c
@@ -702,11 +702,24 @@ int amd_iommu_map_page(struct domain *d,
                                        !!(flags & IOMMUF_writable),
                                        !!(flags & IOMMUF_readable));
 
-    /* Do not increase pde count if io mapping has not been changed */
-    if ( !need_flush )
-        goto out;
+    if ( need_flush )
+    {
+        amd_iommu_flush_pages(d, gfn, 0);
+        /* No further merging, as the logic doesn't cope. */
+        hd->arch.no_merge = true;
+    }
 
-    amd_iommu_flush_pages(d, gfn, 0);
+    /*
+     * Suppress merging of non-R/W mappings or after initial table creation,
+     * as the merge logic does not cope with this.
+     */
+    if ( hd->arch.no_merge || flags != (IOMMUF_writable | IOMMUF_readable) )
+        goto out;
+    if ( d->creation_finished )
+    {
+        hd->arch.no_merge = true;
+        goto out;
+    }
 
     for ( merge_level = IOMMU_PAGING_MODE_LEVEL_2;
           merge_level <= hd->arch.paging_mode; merge_level++ )
@@ -780,6 +793,10 @@ int amd_iommu_unmap_page(struct domain *
 
     /* mark PTE as 'page not present' */
     clear_iommu_pte_present(pt_mfn[1], gfn);
+
+    /* No further merging in amd_iommu_map_page(), as the logic doesn't cope. */
+    hd->arch.no_merge = true;
+
     spin_unlock(&hd->arch.mapping_lock);
 
     amd_iommu_flush_pages(d, gfn, 0);
--- a/xen/include/asm-x86/iommu.h
+++ b/xen/include/asm-x86/iommu.h
@@ -40,6 +40,7 @@ struct arch_iommu
 
     /* amd iommu support */
     int paging_mode;
+    bool no_merge;
     struct page_info *root_table;
     struct guest_iommu *g_iommu;
 };
++++++ xsa276-1.patch ++++++
x86/hvm/ioreq: fix page referencing

The code does not take a page reference in hvm_alloc_ioreq_mfn(), only a
type reference. This can lead to a situation where a malicious domain with
XSM_DM_PRIV can engineer a sequence as follows:

- create IOREQ server: no pages as yet.
- acquire resource: page allocated, total 0.
- decrease reservation: -1 ref, total -1.

This will cause Xen to hit a BUG_ON() in free_domheap_pages().

This patch fixes the issue by changing the call to get_page_type() in
hvm_alloc_ioreq_mfn() to a call to get_page_and_type(). This change
in turn requires an extra put_page() in hvm_free_ioreq_mfn() in the case
that _PGC_allocated is still set (i.e. a decrease reservation has not
occurred) to avoid the page being leaked.

This is part of XSA-276.

Signed-off-by: Paul Durrant <paul.durr...@citrix.com>
Signed-off-by: Jan Beulich <jbeul...@suse.com>

--- a/xen/arch/x86/hvm/ioreq.c
+++ b/xen/arch/x86/hvm/ioreq.c
@@ -327,6 +327,7 @@ static int hvm_map_ioreq_gfn(struct hvm_
 static int hvm_alloc_ioreq_mfn(struct hvm_ioreq_server *s, bool buf)
 {
     struct hvm_ioreq_page *iorp = buf ? &s->bufioreq : &s->ioreq;
+    struct page_info *page;
 
     if ( iorp->page )
     {
@@ -349,27 +350,33 @@ static int hvm_alloc_ioreq_mfn(struct hv
      * could fail if the emulating domain has already reached its
      * maximum allocation.
      */
-    iorp->page = alloc_domheap_page(s->emulator, MEMF_no_refcount);
+    page = alloc_domheap_page(s->emulator, MEMF_no_refcount);
 
-    if ( !iorp->page )
+    if ( !page )
         return -ENOMEM;
 
-    if ( !get_page_type(iorp->page, PGT_writable_page) )
-        goto fail1;
+    if ( !get_page_and_type(page, s->emulator, PGT_writable_page) )
+    {
+        /*
+         * The domain can't possibly know about this page yet, so failure
+         * here is a clear indication of something fishy going on.
+         */
+        domain_crash(s->emulator);
+        return -ENODATA;
+    }
 
-    iorp->va = __map_domain_page_global(iorp->page);
+    iorp->va = __map_domain_page_global(page);
     if ( !iorp->va )
-        goto fail2;
+        goto fail;
 
+    iorp->page = page;
     clear_page(iorp->va);
     return 0;
 
- fail2:
-    put_page_type(iorp->page);
-
- fail1:
-    put_page(iorp->page);
-    iorp->page = NULL;
+ fail:
+    if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
+        put_page(page);
+    put_page_and_type(page);
 
     return -ENOMEM;
 }
@@ -377,15 +384,24 @@ static int hvm_alloc_ioreq_mfn(struct hv
 static void hvm_free_ioreq_mfn(struct hvm_ioreq_server *s, bool buf)
 {
     struct hvm_ioreq_page *iorp = buf ? &s->bufioreq : &s->ioreq;
+    struct page_info *page = iorp->page;
 
-    if ( !iorp->page )
+    if ( !page )
         return;
 
+    iorp->page = NULL;
+
     unmap_domain_page_global(iorp->va);
     iorp->va = NULL;
 
-    put_page_and_type(iorp->page);
-    iorp->page = NULL;
+    /*
+     * Check whether we need to clear the allocation reference before
+     * dropping the explicit references taken by get_page_and_type().
+     */
+    if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
+        put_page(page);
+
+    put_page_and_type(page);
 }
 
 bool is_ioreq_server_page(struct domain *d, const struct page_info *page)
++++++ xsa276-2.patch ++++++
x86/hvm/ioreq: use ref-counted target-assigned shared pages

Passing MEMF_no_refcount to alloc_domheap_pages() will allocate, as
expected, a page that is assigned to the specified domain but is not
accounted for in tot_pages. Unfortunately there is no logic for tracking
such allocations and avoiding any adjustment to tot_pages when the page
is freed.

The only caller of alloc_domheap_pages() that passes MEMF_no_refcount is
hvm_alloc_ioreq_mfn() so this patch removes use of the flag from that
call-site to avoid the possibility of a domain using an ioreq server as
a means to adjust its tot_pages and hence allocate more memory than it
should be able to.

However, the reason for using the flag in the first place was to avoid
the allocation failing if the emulator domain is already at its maximum
memory limit. Hence this patch switches to allocating memory from the
target domain instead of the emulator domain. There is already an extra
memory allowance of 2MB (LIBXL_HVM_EXTRA_MEMORY) applied to HVM guests,
which is sufficient to cover the pages required by the supported
configuration of a single IOREQ server for QEMU. (Stub-domains do not,
so far, use resource mapping). It is also the case that QEMU will have
mapped the IOREQ server pages before the guest boots, hence it is not
possible for the guest to inflate its balloon to consume these pages.

Signed-off-by: Paul Durrant <paul.durr...@citrix.com>

--- a/xen/arch/x86/hvm/ioreq.c
+++ b/xen/arch/x86/hvm/ioreq.c
@@ -342,20 +342,12 @@ static int hvm_alloc_ioreq_mfn(struct hv
         return 0;
     }
 
-    /*
-     * Allocated IOREQ server pages are assigned to the emulating
-     * domain, not the target domain. This is safe because the emulating
-     * domain cannot be destroyed until the ioreq server is destroyed.
-     * Also we must use MEMF_no_refcount otherwise page allocation
-     * could fail if the emulating domain has already reached its
-     * maximum allocation.
-     */
-    page = alloc_domheap_page(s->emulator, MEMF_no_refcount);
+    page = alloc_domheap_page(s->target, 0);
 
     if ( !page )
         return -ENOMEM;
 
-    if ( !get_page_and_type(page, s->emulator, PGT_writable_page) )
+    if ( !get_page_and_type(page, s->target, PGT_writable_page) )
     {
         /*
          * The domain can't possibly know about this page yet, so failure
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -4396,12 +4396,6 @@ int arch_acquire_resource(struct domain
 
             mfn_list[i] = mfn_x(mfn);
         }
-
-        /*
-         * The frames will have been assigned to the domain that created
-         * the ioreq server.
-         */
-        *flags |= XENMEM_rsrc_acq_caller_owned;
         break;
     }
 
++++++ xsa277.patch ++++++
x86/mm: Put the gfn on all paths after get_gfn_query()

get_gfn_query() internally takes the p2m lock, and must be matched with a
put_gfn() call later.
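
The rule being enforced, shown as a simplified sketch of the common/memory.c
hunk below rather than as the exact code:

    /* Every get_gfn_query() must reach a put_gfn() on *every* path,
     * including early error exits, or the p2m lock is leaked. */
    mfn = get_gfn_query(d, gmfn, &p2mt);
    if ( p2mt == p2m_invalid || p2mt == p2m_mmio_dm )
    {
        put_gfn(d, gmfn);
        return -ENOENT;
    }
    /* ... use the translation ... */
    put_gfn(d, gmfn);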

This is XSA-277.

Signed-off-by: Andrew Cooper <andrew.coop...@citrix.com>
Reviewed-by: Jan Beulich <jbeul...@suse.com>

--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -696,7 +696,7 @@ int arch_domain_soft_reset(struct domain
         printk(XENLOG_G_ERR "Failed to get Dom%d's shared_info GFN (%lx)\n",
                d->domain_id, gfn);
         ret = -EINVAL;
-        goto exit_put_page;
+        goto exit_put_gfn;
     }
 
     new_page = alloc_domheap_page(d, 0);
--- a/xen/common/memory.c
+++ b/xen/common/memory.c
@@ -303,7 +303,11 @@ int guest_remove_page(struct domain *d,
 #ifdef CONFIG_X86
     mfn = get_gfn_query(d, gmfn, &p2mt);
     if ( unlikely(p2mt == p2m_invalid) || unlikely(p2mt == p2m_mmio_dm) )
+    {
+        put_gfn(d, gmfn);
+
         return -ENOENT;
+    }
 
     if ( unlikely(p2m_is_paging(p2mt)) )
     {
--- a/xen/include/asm-x86/p2m.h
+++ b/xen/include/asm-x86/p2m.h
@@ -432,10 +432,7 @@ static inline mfn_t __nonnull(3) get_gfn
     return get_gfn_type_access(p2m_get_hostp2m(d), gfn, t, &a, q, NULL);
 }
 
-/* Syntactic sugar: most callers will use one of these. 
- * N.B. get_gfn_query() is the _only_ one guaranteed not to take the
- * p2m lock; none of the others can be called with the p2m or paging
- * lock held. */
+/* Syntactic sugar: most callers will use one of these. */
 #define get_gfn(d, g, t)         get_gfn_type((d), (g), (t), P2M_ALLOC)
 #define get_gfn_query(d, g, t)   get_gfn_type((d), (g), (t), 0)
 #define get_gfn_unshare(d, g, t) get_gfn_type((d), (g), (t), \
++++++ xsa279.patch ++++++
x86/mm: Don't perform flush after failing to update a guest's L1e

If the L1e update hasn't occurred, the flush cannot do anything useful.  This
skips the potentially expensive vcpumask_to_pcpumask() conversion, and
broadcast TLB shootdown.

More importantly however, we might be in the error path due to a bad va
parameter from the guest, and this should not propagate into the TLB flushing
logic.  The INVPCID instruction for example raises #GP for a non-canonical
address.
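
As a self-contained aside (not part of the patch), "non-canonical" means the
upper address bits are not a sign-extension of bit 47, which is easy to test:

    #include <stdbool.h>
    #include <stdint.h>

    /* Assumes 48 implemented virtual address bits and an arithmetic right
     * shift, as on all current x86-64 compilers. */
    static bool is_canonical_va(uint64_t va)
    {
        return ((int64_t)(va << 16) >> 16) == (int64_t)va;
    }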

This is XSA-279.

Signed-off-by: Andrew Cooper <andrew.coop...@citrix.com>
Reviewed-by: Jan Beulich <jbeul...@suse.com>

--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -4089,6 +4089,14 @@ static int __do_update_va_mapping(
     if ( pl1e )
         unmap_domain_page(pl1e);
 
+    /*
+     * Any error at this point means that we haven't change the l1e.  Skip the
+     * flush, as it won't do anything useful.  Furthermore, va is guest
+     * controlled and not necesserily audited by this point.
+     */
+    if ( rc )
+        return rc;
+
     switch ( flags & UVMF_FLUSHTYPE_MASK )
     {
     case UVMF_TLB_FLUSH:
++++++ xsa280-1.patch ++++++
x86/shadow: move OOS flag bit positions

In preparation of reducing struct page_info's shadow_flags field to 16
bits, lower the bit positions used for SHF_out_of_sync and
SHF_oos_may_write.
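
The constraint can be sanity-checked in isolation (a sketch mirroring the
BUILD_BUG_ON()s added below; the SH_type_max_shadow value is an assumption,
not taken from the tree):

    #include <stdint.h>

    #define SH_type_max_shadow 13u   /* assumed; per-shadow-type bits sit below this */

    #define SHF_out_of_sync   (1u << (SH_type_max_shadow + 1))
    #define SHF_oos_may_write (1u << (SH_type_max_shadow + 2))

    /* Both flags must survive truncation to the future 16-bit field. */
    _Static_assert((uint16_t)SHF_out_of_sync != 0,   "OOS flag out of range");
    _Static_assert((uint16_t)SHF_oos_may_write != 0, "OOS-write flag out of range");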

Instead of also adjusting the open coded use in _get_page_type(),
introduce shadow_prepare_page_type_change() to contain knowledge of the
bit positions to shadow code.

This is part of XSA-280.

Signed-off-by: Jan Beulich <jbeul...@suse.com>
Reviewed-by: Tim Deegan <t...@xen.org>

--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -2654,17 +2654,8 @@ static int _get_page_type(struct page_in
         {
             struct domain *d = page_get_owner(page);
 
-            /*
-             * Normally we should never let a page go from type count 0
-             * to type count 1 when it is shadowed. One exception:
-             * out-of-sync shadowed pages are allowed to become
-             * writeable.
-             */
-            if ( d && shadow_mode_enabled(d)
-                 && (page->count_info & PGC_page_table)
-                 && !((page->shadow_flags & (1u<<29))
-                      && type == PGT_writable_page) )
-               shadow_remove_all_shadows(d, page_to_mfn(page));
+            if ( d && shadow_mode_enabled(d) )
+               shadow_prepare_page_type_change(d, page, type);
 
             ASSERT(!(x & PGT_pae_xen_l2));
             if ( (x & PGT_type_mask) != type )
--- a/xen/arch/x86/mm/shadow/common.c
+++ b/xen/arch/x86/mm/shadow/common.c
@@ -990,6 +990,9 @@ int sh_unsync(struct vcpu *v, mfn_t gmfn
          || !v->domain->arch.paging.shadow.oos_active )
         return 0;
 
+    BUILD_BUG_ON(!(typeof(pg->shadow_flags))SHF_out_of_sync);
+    BUILD_BUG_ON(!(typeof(pg->shadow_flags))SHF_oos_may_write);
+
     pg->shadow_flags |= SHF_out_of_sync|SHF_oos_may_write;
     oos_hash_add(v, gmfn);
     perfc_incr(shadow_unsync);
@@ -2930,6 +2933,26 @@ void sh_remove_shadows(struct domain *d,
     paging_unlock(d);
 }
 
+void shadow_prepare_page_type_change(struct domain *d, struct page_info *page,
+                                     unsigned long new_type)
+{
+    if ( !(page->count_info & PGC_page_table) )
+        return;
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+    /*
+     * Normally we should never let a page go from type count 0 to type
+     * count 1 when it is shadowed. One exception: out-of-sync shadowed
+     * pages are allowed to become writeable.
+     */
+    if ( (page->shadow_flags & SHF_oos_may_write) &&
+         new_type == PGT_writable_page )
+        return;
+#endif
+
+    shadow_remove_all_shadows(d, page_to_mfn(page));
+}
+
 static void
 sh_remove_all_shadows_and_parents(struct domain *d, mfn_t gmfn)
 /* Even harsher: this is a HVM page that we thing is no longer a pagetable.
--- a/xen/arch/x86/mm/shadow/private.h
+++ b/xen/arch/x86/mm/shadow/private.h
@@ -287,8 +287,8 @@ static inline void sh_terminate_list(str
  * codepath is called during that time and is sensitive to oos issues, it may
  * need to use the second flag.
  */
-#define SHF_out_of_sync (1u<<30)
-#define SHF_oos_may_write (1u<<29)
+#define SHF_out_of_sync (1u << (SH_type_max_shadow + 1))
+#define SHF_oos_may_write (1u << (SH_type_max_shadow + 2))
 
 #endif /* (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) */
 
--- a/xen/include/asm-x86/shadow.h
+++ b/xen/include/asm-x86/shadow.h
@@ -81,6 +81,10 @@ void shadow_final_teardown(struct domain
 
 void sh_remove_shadows(struct domain *d, mfn_t gmfn, int fast, int all);
 
+/* Adjust shadows ready for a guest page to change its type. */
+void shadow_prepare_page_type_change(struct domain *d, struct page_info *page,
+                                     unsigned long new_type);
+
 /* Discard _all_ mappings from the domain's shadows. */
 void shadow_blow_tables_per_domain(struct domain *d);
 
@@ -105,6 +109,10 @@ int shadow_set_allocation(struct domain
 static inline void sh_remove_shadows(struct domain *d, mfn_t gmfn,
                                      int fast, int all) {}
 
+static inline void shadow_prepare_page_type_change(struct domain *d,
+                                                   struct page_info *page,
+                                                   unsigned long new_type) {}
+
 static inline void shadow_blow_tables_per_domain(struct domain *d) {}
 
 static inline int shadow_domctl(struct domain *d,
++++++ xsa280-2.patch ++++++
x86/shadow: shrink struct page_info's shadow_flags to 16 bits

This is to avoid it overlapping the linear_pt_count field needed for PV
domains. Introduce a separate, HVM-only pagetable_dying field to replace
the sole one left in the upper 16 bits.

Note that the accesses to ->shadow_flags in shadow_{pro,de}mote() get
switched to non-atomic, non-bitops operations, as {test,set,clear}_bit()
are not allowed on uint16_t fields and hence their use would have
required ugly casts. This is fine because all updates of the field ought
to occur with the paging lock held, and other updates of it use |= and
&= as well (i.e. using atomic operations here didn't really guard
against potentially racing updates elsewhere).

This is part of XSA-280.

Signed-off-by: Jan Beulich <jbeul...@suse.com>
Reviewed-by: Tim Deegan <t...@xen.org>

--- a/xen/arch/x86/mm/shadow/common.c
+++ b/xen/arch/x86/mm/shadow/common.c
@@ -1028,10 +1028,14 @@ void shadow_promote(struct domain *d, mf
 
     /* Is the page already shadowed? */
     if ( !test_and_set_bit(_PGC_page_table, &page->count_info) )
+    {
         page->shadow_flags = 0;
+        if ( is_hvm_domain(d) )
+            page->pagetable_dying = false;
+    }
 
-    ASSERT(!test_bit(type, &page->shadow_flags));
-    set_bit(type, &page->shadow_flags);
+    ASSERT(!(page->shadow_flags & (1u << type)));
+    page->shadow_flags |= 1u << type;
     TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_PROMOTE);
 }
 
@@ -1040,9 +1044,9 @@ void shadow_demote(struct domain *d, mfn
     struct page_info *page = mfn_to_page(gmfn);
 
     ASSERT(test_bit(_PGC_page_table, &page->count_info));
-    ASSERT(test_bit(type, &page->shadow_flags));
+    ASSERT(page->shadow_flags & (1u << type));
 
-    clear_bit(type, &page->shadow_flags);
+    page->shadow_flags &= ~(1u << type);
 
     if ( (page->shadow_flags & SHF_page_type_mask) == 0 )
     {
@@ -2921,7 +2925,7 @@ void sh_remove_shadows(struct domain *d,
     if ( !fast && all && (pg->count_info & PGC_page_table) )
     {
         SHADOW_ERROR("can't find all shadows of mfn %"PRI_mfn" "
-                     "(shadow_flags=%08x)\n",
+                     "(shadow_flags=%04x)\n",
                       mfn_x(gmfn), pg->shadow_flags);
         domain_crash(d);
     }
--- a/xen/arch/x86/mm/shadow/multi.c
+++ b/xen/arch/x86/mm/shadow/multi.c
@@ -3299,8 +3299,8 @@ static int sh_page_fault(struct vcpu *v,
 
     /* Unshadow if we are writing to a toplevel pagetable that is
      * flagged as a dying process, and that is not currently used. */
-    if ( sh_mfn_is_a_page_table(gmfn)
-         && (mfn_to_page(gmfn)->shadow_flags & SHF_pagetable_dying) )
+    if ( sh_mfn_is_a_page_table(gmfn) && is_hvm_domain(d) &&
+         mfn_to_page(gmfn)->pagetable_dying )
     {
         int used = 0;
         struct vcpu *tmp;
@@ -4254,9 +4254,9 @@ int sh_rm_write_access_from_sl1p(struct
     ASSERT(mfn_valid(smfn));
 
     /* Remember if we've been told that this process is being torn down */
-    if ( curr->domain == d )
+    if ( curr->domain == d && is_hvm_domain(d) )
         curr->arch.paging.shadow.pagetable_dying
-            = !!(mfn_to_page(gmfn)->shadow_flags & SHF_pagetable_dying);
+            = mfn_to_page(gmfn)->pagetable_dying;
 
     sp = mfn_to_page(smfn);
 
@@ -4572,10 +4572,10 @@ static void sh_pagetable_dying(struct vc
                    : shadow_hash_lookup(d, mfn_x(gmfn), SH_type_l2_pae_shadow);
         }
 
-        if ( mfn_valid(smfn) )
+        if ( mfn_valid(smfn) && is_hvm_domain(d) )
         {
             gmfn = _mfn(mfn_to_page(smfn)->v.sh.back);
-            mfn_to_page(gmfn)->shadow_flags |= SHF_pagetable_dying;
+            mfn_to_page(gmfn)->pagetable_dying = true;
             shadow_unhook_mappings(d, smfn, 1/* user pages only */);
             flush = 1;
         }
@@ -4612,9 +4612,9 @@ static void sh_pagetable_dying(struct vc
     smfn = shadow_hash_lookup(d, mfn_x(gmfn), SH_type_l4_64_shadow);
 #endif
 
-    if ( mfn_valid(smfn) )
+    if ( mfn_valid(smfn) && is_hvm_domain(d) )
     {
-        mfn_to_page(gmfn)->shadow_flags |= SHF_pagetable_dying;
+        mfn_to_page(gmfn)->pagetable_dying = true;
         shadow_unhook_mappings(d, smfn, 1/* user pages only */);
         /* Now flush the TLB: we removed toplevel mappings. */
         flush_tlb_mask(d->dirty_cpumask);
--- a/xen/arch/x86/mm/shadow/private.h
+++ b/xen/arch/x86/mm/shadow/private.h
@@ -292,8 +292,6 @@ static inline void sh_terminate_list(str
 
 #endif /* (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) */
 
-#define SHF_pagetable_dying (1u<<31)
-
 static inline int sh_page_has_multiple_shadows(struct page_info *pg)
 {
     u32 shadows;
--- a/xen/include/asm-x86/mm.h
+++ b/xen/include/asm-x86/mm.h
@@ -259,8 +259,15 @@ struct page_info
          * Guest pages with a shadow.  This does not conflict with
          * tlbflush_timestamp since page table pages are explicitly not
          * tracked for TLB-flush avoidance when a guest runs in shadow mode.
+         *
+         * pagetable_dying is used for HVM domains only. The layout here has
+         * to avoid re-use of the space used by linear_pt_count, which (only)
+         * PV guests use.
          */
-        u32 shadow_flags;
+        struct {
+            uint16_t shadow_flags;
+            bool pagetable_dying;
+        };
 
         /* When in use as a shadow, next shadow in this hash chain. */
         __pdx_t next_shadow;
