Hello community, here is the log from the commit of package xen for openSUSE:Factory checked in at 2020-02-25 16:01:56 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Comparing /work/SRC/openSUSE:Factory/xen (Old) and /work/SRC/openSUSE:Factory/.xen.new.26092 (New) ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Package is "xen" Tue Feb 25 16:01:56 2020 rev:280 rq:777747 version:4.13.0_08 Changes: -------- --- /work/SRC/openSUSE:Factory/xen/xen.changes 2020-02-03 11:11:48.981799384 +0100 +++ /work/SRC/openSUSE:Factory/.xen.new.26092/xen.changes 2020-02-25 16:03:08.124206851 +0100 @@ -1,0 +2,48 @@ +Thu Feb 20 08:18:37 MST 2020 - carn...@suse.com + +- bsc#1160932 - VUL-0: xen: XSA-312 v1: arm: a CPU may speculate + past the ERET instruction + 5e1dcedd-Arm-place-speculation-barrier-after-ERET.patch +- bsc#1164425 - x86: "spec-ctrl=no-xen" should also disable branch + hardening + 5e4e614d-x86-spec-ctrl-no-xen-also-disables-branch-hardening.patch +- Upstream bug fixes (bsc#1027519) + 5e21ce98-x86-time-update-TSC-stamp-after-deep-C-state.patch + 5e286cce-VT-d-dont-pass-bridges-to-domain_context_mapping_one.patch + 5e318cd4-x86-apic-fix-disabling-LVT0.patch + 5e344c11-x86-HVM-relinquish-resources-from-domain_destroy.patch + 5e3bd385-EFI-recheck-variable-name-strings.patch + 5e3bd3d1-EFI-dont-leak-heap-VIA-XEN_EFI_get_next_variable_name.patch + 5e3bd3f8-xmalloc-guard-against-overflow.patch + 5e46e090-x86-smp-reset-x2apic_enabled-in-smp_send_stop.patch + 5e4c00ef-VT-d-check-full-RMRR-for-E820-reserved.patch + 5e4d4f5b-sched-fix-get_cpu_idle_time-with-core-sched.patch + +------------------------------------------------------------------- +Tue Feb 18 18:18:18 UTC 2020 - oher...@suse.de + +- bsc#1159755 - use fixed qemu-3.1 machine type for HVM + This must be done in qemu to preserve PCI layout + remove libxl.lock-qemu-machine-for-hvm.patch + +------------------------------------------------------------------- +Fri Feb 7 12:37:35 UTC 2020 - oher...@suse.de + +- jsc#SLE-10183 - script to calculate cpuid= mask + add helper script from https://github.com/twizted/xen_maskcalc + domUs may be migrated between different cpus from the same vendor + if their visible cpuid value has incompatible feature bits masked. 
+ +------------------------------------------------------------------- +Wed Feb 5 15:16:06 UTC 2020 - oher...@suse.de + +- jsc#SLE-10172, bsc#1055731 - handle degraded raid for xendomains + add helper script and systemd service from + https://github.com/luizluca/xen-tools-xendomains-wait-disk + in new sub package xen-tools-xendomains-wait-disk + See included README for usage instructions + xendomains-wait-disks.LICENSE + xendomains-wait-disks.README.md + xendomains-wait-disks.sh + +------------------------------------------------------------------- Old: ---- libxl.lock-qemu-machine-for-hvm.patch New: ---- 5e1dcedd-Arm-place-speculation-barrier-after-ERET.patch 5e21ce98-x86-time-update-TSC-stamp-after-deep-C-state.patch 5e286cce-VT-d-dont-pass-bridges-to-domain_context_mapping_one.patch 5e318cd4-x86-apic-fix-disabling-LVT0.patch 5e344c11-x86-HVM-relinquish-resources-from-domain_destroy.patch 5e3bd385-EFI-recheck-variable-name-strings.patch 5e3bd3d1-EFI-dont-leak-heap-VIA-XEN_EFI_get_next_variable_name.patch 5e3bd3f8-xmalloc-guard-against-overflow.patch 5e46e090-x86-smp-reset-x2apic_enabled-in-smp_send_stop.patch 5e4c00ef-VT-d-check-full-RMRR-for-E820-reserved.patch 5e4d4f5b-sched-fix-get_cpu_idle_time-with-core-sched.patch 5e4e614d-x86-spec-ctrl-no-xen-also-disables-branch-hardening.patch xen_maskcalc.py xendomains-wait-disks.LICENSE xendomains-wait-disks.README.md xendomains-wait-disks.sh ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Other differences: ------------------ ++++++ xen.spec ++++++ --- /var/tmp/diff_new_pack.5nXBtx/_old 2020-02-25 16:03:19.156228943 +0100 +++ /var/tmp/diff_new_pack.5nXBtx/_new 2020-02-25 16:03:19.160228951 +0100 @@ -127,7 +127,7 @@ BuildRequires: pesign-obs-integration %endif -Version: 4.13.0_06 +Version: 4.13.0_08 Release: 0 Summary: Xen Virtualization: Hypervisor (aka VMM aka Microkernel) License: GPL-2.0-only @@ -159,6 +159,10 @@ Source41: xencommons.service Source42: xen-dom0-modules.service Source57: 
xen-utils-0.1.tar.bz2 +Source10172: xendomains-wait-disks.sh +Source10173: xendomains-wait-disks.LICENSE +Source10174: xendomains-wait-disks.README.md +Source10183: xen_maskcalc.py # For xen-libs Source99: baselibs.conf # Upstream patches @@ -167,6 +171,18 @@ Patch3: 5e15e03d-sched-fix-S3-resume-with-smt=0.patch Patch4: 5e16fb6a-x86-clear-per-cpu-stub-page-info.patch Patch5: 5e1da013-IRQ-u16-is-too-narrow-for-evtchn.patch +Patch6: 5e1dcedd-Arm-place-speculation-barrier-after-ERET.patch +Patch7: 5e21ce98-x86-time-update-TSC-stamp-after-deep-C-state.patch +Patch8: 5e286cce-VT-d-dont-pass-bridges-to-domain_context_mapping_one.patch +Patch9: 5e318cd4-x86-apic-fix-disabling-LVT0.patch +Patch10: 5e344c11-x86-HVM-relinquish-resources-from-domain_destroy.patch +Patch11: 5e3bd385-EFI-recheck-variable-name-strings.patch +Patch12: 5e3bd3d1-EFI-dont-leak-heap-VIA-XEN_EFI_get_next_variable_name.patch +Patch13: 5e3bd3f8-xmalloc-guard-against-overflow.patch +Patch14: 5e46e090-x86-smp-reset-x2apic_enabled-in-smp_send_stop.patch +Patch15: 5e4c00ef-VT-d-check-full-RMRR-for-E820-reserved.patch +Patch16: 5e4d4f5b-sched-fix-get_cpu_idle_time-with-core-sched.patch +Patch17: 5e4e614d-x86-spec-ctrl-no-xen-also-disables-branch-hardening.patch # Our platform specific patches Patch400: xen-destdir.patch Patch401: vif-bridge-no-iptables.patch @@ -198,7 +214,6 @@ Patch465: xen.libxl.dmmd.patch Patch466: libxl.set-migration-constraints-from-cmdline.patch Patch467: xenstore-run-in-studomain.patch -Patch468: libxl.lock-qemu-machine-for-hvm.patch Patch469: libxl.helper_done-crash.patch Patch470: libxl.LIBXL_HOTPLUG_TIMEOUT.patch # python3 conversion patches @@ -231,6 +246,7 @@ %package libs Summary: Xen Virtualization: Libraries +License: GPL-2.0-only Group: System/Kernel %description libs @@ -254,6 +270,7 @@ %package tools Summary: Xen Virtualization: Control tools for domain 0 +License: GPL-2.0-only Group: System/Kernel %ifarch x86_64 %if 0%{?suse_version} >= 1315 @@ -296,10 +313,36 @@ Ian Pratt 
<ian.pr...@cl.cam.ac.uk> +%ifarch x86_64 +%package tools-xendomains-wait-disk +Summary: Adds a new xendomains-wait-disks.service +License: GPL-3.0+ +Group: System/Kernel +Requires: %{name}-tools = %{version}-%{release} +Requires: coreutils +Requires: sed +Requires: vim +BuildArch: noarch + +%description tools-xendomains-wait-disk +This package adds a new service named xendomains-wait-disks.service, +that simply calls xendomains-wait-disks. xendomains-wait-disks script +loops checking for the presence of every disk used by domU that +xendomains.service will try to launch. The script returns when +all disks become available or xendomains-wait-disks.service expires. + +xendomains-wait-disks.service has the same dependencies as +xendomains.service, but it adds itself as a Wanted service for xendomains. +If xendomains-wait-disks.service fails, xendomains.service is launched anyway. + +https://github.com/luizluca/xen-tools-xendomains-wait-disk +%endif + %endif %package tools-domU Summary: Xen Virtualization: Control tools for domain U +License: GPL-2.0-only Group: System/Kernel Conflicts: %{name}-tools Requires: %{name}-libs = %{version}-%{release} @@ -320,6 +363,7 @@ %package devel Summary: Xen Virtualization: Headers and libraries for development +License: GPL-2.0-only Group: System/Kernel Requires: %{name}-libs = %{version} Requires: libuuid-devel @@ -342,6 +386,7 @@ %package doc-html Summary: Xen Virtualization: HTML documentation +License: GPL-2.0-only Group: Documentation/HTML %description doc-html @@ -367,6 +412,18 @@ %patch3 -p1 %patch4 -p1 %patch5 -p1 +%patch6 -p1 +%patch7 -p1 +%patch8 -p1 +%patch9 -p1 +%patch10 -p1 +%patch11 -p1 +%patch12 -p1 +%patch13 -p1 +%patch14 -p1 +%patch15 -p1 +%patch16 -p1 +%patch17 -p1 # Our platform specific patches %patch400 -p1 %patch401 -p1 @@ -398,7 +455,6 @@ %patch465 -p1 %patch466 -p1 %patch467 -p1 -%patch468 -p1 %patch469 -p1 %patch470 -p1 # python3 conversion patches @@ -776,6 +832,32 @@ exec %{_bindir}/qemu-system-i386 "$@" 
EOF chmod 0755 %{buildroot}/usr/lib/xen/bin/qemu-system-i386 +# +unit='%{_libexecdir}/%{name}/bin/xendomains-wait-disks' +mkdir -vp '%{buildroot}%{_libexecdir}/%{name}/bin' +cp -avL '%{SOURCE10172}' "%{buildroot}${unit}" +mkdir xendomains-wait-disk +cp -avL '%{SOURCE10173}' xendomains-wait-disk/LICENSE +cp -avL '%{SOURCE10174}' xendomains-wait-disk/README.md +tee %{buildroot}%{_unitdir}/xendomains-wait-disks.service <<'_EOS_' +[Unit] +Description=Xendomains - for those machines that will start, wait for their disks to apear +Requires=proc-xen.mount xenstored.service +After=proc-xen.mount xenstored.service xenconsoled.service xen-init-dom0.service +After=network-online.target +After=remote-fs.target +Before=xendomains.service +ConditionPathExists=/proc/xen/capabilities + +[Service] +Type=oneshot +ExecStart=${unit} +TimeoutSec=5min + +[Install] +WantedBy=xendomains.service +_EOS_ +# %endif # Stubdom @@ -796,6 +878,7 @@ # xen-utils make -C tools/xen-utils-0.1 install DESTDIR=%{buildroot} XEN_INTREE_BUILD=yes XEN_ROOT=$PWD install -m755 %SOURCE37 %{buildroot}/usr/sbin/xen2libvirt +install -m755 %SOURCE10183 %{buildroot}/usr/sbin/xen_maskcalc rm -f %{buildroot}/etc/xen/README* # Example config @@ -968,6 +1051,7 @@ %endif /usr/sbin/xl /usr/sbin/xen2libvirt +/usr/sbin/xen_maskcalc %ifarch %ix86 x86_64 /usr/sbin/xen-hptool /usr/sbin/xen-hvmcrash @@ -1000,6 +1084,9 @@ /usr/lib/supportconfig/plugins/xen %{_libexecdir}/xen %exclude %{_libexecdir}/%{name}-tools-domU +%ifarch x86_64 +%exclude %{_libexecdir}/%{name}/bin/xendomains-wait-disks +%endif %{_fillupdir}/sysconfig.pciback %{_fillupdir}/sysconfig.xencommons %{_fillupdir}/sysconfig.xendomains @@ -1024,6 +1111,7 @@ %config /etc/modprobe.d/xen_loop.conf %config %{_unitdir} %exclude %{_unitdir}/%{name}-vcpu-watch.service +%exclude %{_unitdir}/xendomains-wait-disks.service %config %{with_systemd_modules_load} %dir /etc/modprobe.d /etc/bash_completion.d/xl.sh @@ -1086,6 +1174,13 @@ %{_libdir}/ocaml/xentoollog/*.cmi %endif 
+%ifarch x86_64 +%files tools-xendomains-wait-disk +%license xendomains-wait-disk/LICENSE +%doc xendomains-wait-disk/README.md +%config %{_unitdir}/xendomains-wait-disks.service +%config %attr(0755,root,root) %{_libexecdir}/%{name}/bin/xendomains-wait-disks +%endif # with_dom0_support %endif ++++++ 5e1da013-IRQ-u16-is-too-narrow-for-evtchn.patch ++++++ --- /var/tmp/diff_new_pack.5nXBtx/_old 2020-02-25 16:03:19.216229063 +0100 +++ /var/tmp/diff_new_pack.5nXBtx/_new 2020-02-25 16:03:19.216229063 +0100 @@ -15,9 +15,30 @@ Signed-off-by: Jan Beulich <jbeul...@suse.com> Acked-by: Andrew Cooper <andrew.coop...@citrix.com> +# Commit b4194711ffaffa5e63d986338fb8d4020fa6bad1 +# Date 2020-01-14 16:06:27 +0100 +# Author Jan Beulich <jbeul...@suse.com> +# Committer Jan Beulich <jbeul...@suse.com> +Arm: fix build after 892b9dcebdb7 + +"IRQ: u16 is too narrow for an event channel number" introduced a use of +evetchn_port_t, but its typedef apparently surfaces indirectly here only +on x86. + +Signed-off-by: Jan Beulich <jbeul...@suse.com> +Acked-by: Andrew Cooper <andrew.coop...@citrix.com> + --- a/xen/include/xen/irq.h +++ b/xen/include/xen/irq.h -@@ -127,9 +127,10 @@ struct vcpu; +@@ -8,6 +8,7 @@ + #include <xen/list.h> + #include <asm/regs.h> + #include <asm/hardirq.h> ++#include <public/event_channel.h> + + struct irqaction { + void (*handler)(int, void *, struct cpu_user_regs *); +@@ -127,9 +128,10 @@ struct vcpu; struct pirq { int pirq; ++++++ 5e1dcedd-Arm-place-speculation-barrier-after-ERET.patch ++++++ # Commit c7de94fd6ec5aba53ce5b8fd6ceb6031c53bb28d # Date 2020-01-14 14:23:25 +0000 # Author Julien Grall <jul...@xen.org> # Committer Julien Grall <jul...@xen.org> xen/arm: Place a speculation barrier sequence following an eret instruction Some CPUs can speculate past an ERET instruction and potentially perform speculative accesses to memory before processing the exception return. 
Since the register state is often controlled by lower privilege level at the point of an ERET, this could potentially be used as part of a side-channel attack. Newer CPUs may implement a new SB barrier instruction which acts as an architected speculation barrier. For current CPUs, the sequence DSB; ISB is known to prevent speculation. The latter sequence is heavier than SB but it would never be executed (this is speculation after all!). Introduce a new macro 'sb' that could be used when a speculation barrier is required. For now it is using dsb; isb but this could easily be updated to cater SB in the future. This is XSA-312. Signed-off-by: Julien Grall <jul...@xen.org> --- a/xen/arch/arm/arm32/entry.S +++ b/xen/arch/arm/arm32/entry.S @@ -426,6 +426,7 @@ return_to_hypervisor: add sp, #(UREGS_SP_usr - UREGS_sp); /* SP, LR, SPSR, PC */ clrex eret + sb /* * struct vcpu *__context_switch(struct vcpu *prev, struct vcpu *next) --- a/xen/arch/arm/arm64/entry.S +++ b/xen/arch/arm/arm64/entry.S @@ -354,6 +354,7 @@ guest_sync: */ mov x1, xzr eret + sb check_wa2: /* ARM_SMCCC_ARCH_WORKAROUND_2 handling */ @@ -393,6 +394,7 @@ wa2_end: #endif /* !CONFIG_ARM_SSBD */ mov x0, xzr eret + sb guest_sync_slowpath: /* * x0/x1 may have been scratch by the fast path above, so avoid @@ -457,6 +459,7 @@ return_from_trap: ldr lr, [sp], #(UREGS_SPSR_el1 - UREGS_LR) /* CPSR, PC, SP, LR */ eret + sb /* * Consume pending SError generated by the guest if any. 
--- a/xen/include/asm-arm/macros.h +++ b/xen/include/asm-arm/macros.h @@ -20,4 +20,13 @@ .endr .endm + /* + * Speculative barrier + * XXX: Add support for the 'sb' instruction + */ + .macro sb + dsb nsh + isb + .endm + #endif /* __ASM_ARM_MACROS_H */ ++++++ 5e21ce98-x86-time-update-TSC-stamp-after-deep-C-state.patch ++++++ # Commit bbf283f853f8c0e4d29248dd44d3b0e0abc07629 # Date 2020-01-17 16:11:20 +0100 # Author Igor Druzhinin <igor.druzhi...@citrix.com> # Committer Jan Beulich <jbeul...@suse.com> x86/time: update TSC stamp on restore from deep C-state If ITSC is not available on CPU (e.g if running nested as PV shim) then X86_FEATURE_NONSTOP_TSC is not advertised in certain cases, i.e. all AMD and some old Intel processors. In which case TSC would need to be restored on CPU from platform time by Xen upon exiting C-states. As platform time might be behind the last TSC stamp recorded for the current CPU, invariant of TSC stamp being always behind local TSC counter is violated. This has an effect of get_s_time() going negative resulting in eventual system hang or crash. Fix this issue by updating local TSC stamp along with TSC counter write. 
Signed-off-by: Igor Druzhinin <igor.druzhi...@citrix.com> Reviewed-by: Roger Pau Monné <roger....@citrix.com> Acked-by: Jan Beulich <jbeul...@suse.com> --- a/xen/arch/x86/time.c +++ b/xen/arch/x86/time.c @@ -857,10 +857,16 @@ u64 stime2tsc(s_time_t stime) void cstate_restore_tsc(void) { + struct cpu_time *t = &this_cpu(cpu_time); + if ( boot_cpu_has(X86_FEATURE_NONSTOP_TSC) ) return; - write_tsc(stime2tsc(read_platform_stime(NULL))); + t->stamp.master_stime = read_platform_stime(NULL); + t->stamp.local_tsc = stime2tsc(t->stamp.master_stime); + t->stamp.local_stime = t->stamp.master_stime; + + write_tsc(t->stamp.local_tsc); } /*************************************************************************** ++++++ 5e286cce-VT-d-dont-pass-bridges-to-domain_context_mapping_one.patch ++++++ # Commit a4d457fd59f4ebfb524aec82cb6a3030087914ca # Date 2020-01-22 16:39:58 +0100 # Author Jan Beulich <jbeul...@suse.com> # Committer Jan Beulich <jbeul...@suse.com> VT-d: don't pass bridge devices to domain_context_mapping_one() When passed a non-NULL pdev, the function does an owner check when it finds an already existing context mapping. Bridges, however, don't get passed through to guests, and hence their owner is always going to be Dom0, leading to the assigment of all but one of the function of multi- function PCI devices behind bridges to fail. Reported-by: Marek Marczykowski-Górecki <marma...@invisiblethingslab.com> Signed-off-by: Jan Beulich <jbeul...@suse.com> Reviewed-by: Roger Pau Monné <roger....@citrix.com> Reviewed-by: Kevin Tian <kevin.t...@intel.com> --- a/xen/drivers/passthrough/vtd/iommu.c +++ b/xen/drivers/passthrough/vtd/iommu.c @@ -1498,18 +1498,28 @@ static int domain_context_mapping(struct if ( find_upstream_bridge(seg, &bus, &devfn, &secbus) < 1 ) break; + /* + * Mapping a bridge should, if anything, pass the struct pci_dev of + * that bridge. Since bridges don't normally get assigned to guests, + * their owner would be the wrong one. Pass NULL instead. 
+ */ ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn, - pci_get_pdev(seg, bus, devfn)); + NULL); /* * Devices behind PCIe-to-PCI/PCIx bridge may generate different * requester-id. It may originate from devfn=0 on the secondary bus * behind the bridge. Map that id as well if we didn't already. + * + * Somewhat similar as for bridges, we don't want to pass a struct + * pci_dev here - there may not even exist one for this (secbus,0,0) + * tuple. If there is one, without properly working device groups it + * may again not have the correct owner. */ if ( !ret && pdev_type(seg, bus, devfn) == DEV_TYPE_PCIe2PCI_BRIDGE && (secbus != pdev->bus || pdev->devfn != 0) ) ret = domain_context_mapping_one(domain, drhd->iommu, secbus, 0, - pci_get_pdev(seg, secbus, 0)); + NULL); break; ++++++ 5e318cd4-x86-apic-fix-disabling-LVT0.patch ++++++ # Commit 782b48b7f7319c07b044606d67a60875e53dd05b # Date 2020-01-29 14:47:00 +0100 # Author Roger Pau Monné <roger....@citrix.com> # Committer Jan Beulich <jbeul...@suse.com> x86/apic: fix disabling LVT0 in disconnect_bsp_APIC The Intel SDM states: "When an illegal vector value (0 to 15) is written to a LVT entry and the delivery mode is Fixed (bits 8-11 equal 0), the APIC may signal an illegal vector error, without regard to whether the mask bit is set or whether an interrupt is actually seen on the input." And that's exactly what's currently done in disconnect_bsp_APIC when virt_wire_setup is true and LVT LINT0 is being masked. By writing only APIC_LVT_MASKED Xen is actually setting the vector to 0 and the delivery mode to Fixed (0), and hence it triggers an APIC error even when the LVT entry is masked. This would usually manifest when Xen is being shut down, as that's where disconnect_bsp_APIC is called: (XEN) APIC error on CPU0: 40(00) Fix this by calling clear_local_APIC prior to setting the LVT LINT registers which already clear LVT LINT0, and hence the troublesome write can be avoided as the register is already cleared. 
Reported-by: Andrew Cooper <andrew.coop...@citrix.com> Signed-off-by: Roger Pau Monné <roger....@citrix.com> Reviewed-by: Jan Beulich <jbeul...@suse.com> --- a/xen/arch/x86/apic.c +++ b/xen/arch/x86/apic.c @@ -259,6 +259,8 @@ void disconnect_bsp_APIC(int virt_wire_s /* Go back to Virtual Wire compatibility mode */ unsigned long value; + clear_local_APIC(); + /* For the spurious interrupt use vector F, and enable it */ value = apic_read(APIC_SPIV); value &= ~APIC_VECTOR_MASK; @@ -276,10 +278,6 @@ void disconnect_bsp_APIC(int virt_wire_s value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT); apic_write(APIC_LVT0, value); } - else { - /* Disable LVT0 */ - apic_write(APIC_LVT0, APIC_LVT_MASKED); - } /* For LVT1 make it edge triggered, active high, nmi and enabled */ value = apic_read(APIC_LVT1); ++++++ 5e344c11-x86-HVM-relinquish-resources-from-domain_destroy.patch ++++++ # Commit b3344bb1cae0c9ac22a57db8ecca488ad0e4a66d # Date 2020-01-31 16:47:29 +0100 # Author Jan Beulich <jbeul...@suse.com> # Committer Jan Beulich <jbeul...@suse.com> x86/HVM: relinquish resources also from hvm_domain_destroy() Domain creation failure paths don't call domain_relinquish_resources(), yet allocations and alike done from hvm_domain_initialize() need to be undone nevertheless. Call the function also from hvm_domain_destroy(), after making sure all descendants are idempotent. Note that while viridian_{domain,vcpu}_deinit() were already used in ways suggesting they're idempotent, viridian_time_vcpu_deinit() actually wasn't: One can't kill a timer that was never initialized. For hvm_destroy_all_ioreq_servers()'s purposes make relocate_portio_handler() return whether the to be relocated port range was actually found. This seems cheaper than introducing a flag into struct hvm_domain's ioreq_server sub-structure. In hvm_domain_initialise() additionally - use XFREE() also to replace adjacent xfree(), - use hvm_domain_relinquish_resources() as being idempotent now. 
There as well as in hvm_domain_destroy() the explicit call to rtc_deinit() isn't needed anymore. In hvm_domain_relinquish_resources() additionally drop a no longer relevant if(). Fixes: e7a9b5e72f26 ("viridian: separately allocate domain and vcpu structures") Fixes: 26fba3c85571 ("viridian: add implementation of synthetic timers") Signed-off-by: Jan Beulich <jbeul...@suse.com> Reviewed-by: Roger Pau Monné <roger....@citrix.com> Acked-by: Andrew Cooper <andrew.coop...@citrix.com> Reviewed-by: Paul Durrant <pdurr...@amazon.com> --- a/xen/arch/x86/hvm/hpet.c +++ b/xen/arch/x86/hvm/hpet.c @@ -751,7 +751,7 @@ void hpet_deinit(struct domain *d) int i; HPETState *h = domain_vhpet(d); - if ( !has_vhpet(d) ) + if ( !has_vhpet(d) || !d->arch.hvm.pl_time || !h->stime_freq ) return; write_lock(&h->lock); @@ -763,6 +763,8 @@ void hpet_deinit(struct domain *d) for ( i = 0; i < HPET_TIMER_NUM; i++ ) if ( timer_enabled(h, i) ) hpet_stop_timer(h, i, guest_time); + + h->hpet.config = 0; } write_unlock(&h->lock); --- a/xen/arch/x86/hvm/hvm.c +++ b/xen/arch/x86/hvm/hvm.c @@ -695,24 +695,24 @@ int hvm_domain_initialise(struct domain return 0; fail2: - rtc_deinit(d); stdvga_deinit(d); vioapic_deinit(d); fail1: if ( is_hardware_domain(d) ) xfree(d->arch.hvm.io_bitmap); - xfree(d->arch.hvm.io_handler); - xfree(d->arch.hvm.params); - xfree(d->arch.hvm.pl_time); - xfree(d->arch.hvm.irq); + XFREE(d->arch.hvm.io_handler); + XFREE(d->arch.hvm.params); + XFREE(d->arch.hvm.pl_time); + XFREE(d->arch.hvm.irq); fail0: hvm_destroy_cacheattr_region_list(d); destroy_perdomain_mapping(d, PERDOMAIN_VIRT_START, 0); fail: - viridian_domain_deinit(d); + hvm_domain_relinquish_resources(d); return rc; } +/* This function and all its descendants need to be to be idempotent. */ void hvm_domain_relinquish_resources(struct domain *d) { if ( hvm_funcs.nhvm_domain_relinquish_resources ) @@ -726,11 +726,8 @@ void hvm_domain_relinquish_resources(str /* Stop all asynchronous timer actions. 
*/ rtc_deinit(d); - if ( d->vcpu != NULL && d->vcpu[0] != NULL ) - { - pmtimer_deinit(d); - hpet_deinit(d); - } + pmtimer_deinit(d); + hpet_deinit(d); } void hvm_domain_destroy(struct domain *d) @@ -738,13 +735,19 @@ void hvm_domain_destroy(struct domain *d struct list_head *ioport_list, *tmp; struct g2m_ioport *ioport; + /* + * This function would not be called when domain initialization fails + * (late enough), so do so here. This requires the function and all its + * descendants to be idempotent. + */ + hvm_domain_relinquish_resources(d); + XFREE(d->arch.hvm.io_handler); XFREE(d->arch.hvm.params); hvm_destroy_cacheattr_region_list(d); hvm_funcs.domain_destroy(d); - rtc_deinit(d); stdvga_deinit(d); vioapic_deinit(d); --- a/xen/arch/x86/hvm/intercept.c +++ b/xen/arch/x86/hvm/intercept.c @@ -300,7 +300,7 @@ void register_portio_handler(struct doma handler->portio.action = action; } -void relocate_portio_handler(struct domain *d, unsigned int old_port, +bool relocate_portio_handler(struct domain *d, unsigned int old_port, unsigned int new_port, unsigned int size) { unsigned int i; @@ -317,9 +317,11 @@ void relocate_portio_handler(struct doma (handler->portio.size = size) ) { handler->portio.port = new_port; - break; + return true; } } + + return false; } bool_t hvm_mmio_internal(paddr_t gpa) --- a/xen/arch/x86/hvm/ioreq.c +++ b/xen/arch/x86/hvm/ioreq.c @@ -1228,6 +1228,9 @@ void hvm_destroy_all_ioreq_servers(struc struct hvm_ioreq_server *s; unsigned int id; + if ( !relocate_portio_handler(d, 0xcf8, 0xcf8, 4) ) + return; + spin_lock_recursive(&d->arch.hvm.ioreq_server.lock); /* No need to domain_pause() as the domain is being torn down */ --- a/xen/arch/x86/hvm/pmtimer.c +++ b/xen/arch/x86/hvm/pmtimer.c @@ -373,7 +373,7 @@ void pmtimer_deinit(struct domain *d) { PMTState *s = &d->arch.hvm.pl_time->vpmt; - if ( !has_vpm(d) ) + if ( !has_vpm(d) || !d->arch.hvm.pl_time || !s->vcpu ) return; kill_timer(&s->timer); --- a/xen/arch/x86/hvm/rtc.c +++ 
b/xen/arch/x86/hvm/rtc.c @@ -836,7 +836,8 @@ void rtc_deinit(struct domain *d) { RTCState *s = domain_vrtc(d); - if ( !has_vrtc(d) ) + if ( !has_vrtc(d) || !d->arch.hvm.pl_time || + s->update_timer.status == TIMER_STATUS_invalid ) return; spin_barrier(&s->lock); --- a/xen/arch/x86/hvm/viridian/time.c +++ b/xen/arch/x86/hvm/viridian/time.c @@ -566,6 +566,8 @@ void viridian_time_vcpu_deinit(const str { struct viridian_stimer *vs = &vv->stimer[i]; + if ( !vs->v ) + continue; kill_timer(&vs->timer); vs->v = NULL; } --- a/xen/include/asm-x86/hvm/io.h +++ b/xen/include/asm-x86/hvm/io.h @@ -112,7 +112,7 @@ void register_portio_handler( struct domain *d, unsigned int port, unsigned int size, portio_action_t action); -void relocate_portio_handler( +bool relocate_portio_handler( struct domain *d, unsigned int old_port, unsigned int new_port, unsigned int size); ++++++ 5e3bd385-EFI-recheck-variable-name-strings.patch ++++++ # Commit ad38db5852f0e30d90c93c6a62b754f2861549e0 # Date 2020-02-06 09:51:17 +0100 # Author Jan Beulich <jbeul...@suse.com> # Committer Jan Beulich <jbeul...@suse.com> EFI: re-check {get,set}-variable name strings after copying in A malicious guest given permission to invoke XENPF_efi_runtime_call may play with the strings underneath Xen sizing them and copying them in. Guard against this by re-checking the copyied in data for consistency with the initial sizing. At the same time also check that the actual copy-in is in fact successful, and switch to the lighter weight non- checking flavor of the function. Reported-by: Ilja Van Sprundel <ivansprun...@ioactive.com> Signed-off-by: Jan Beulich <jbeul...@suse.com> Reviewed-by: George Dunlap <george.dun...@citrix.com> --- a/xen/common/efi/boot.c +++ b/xen/common/efi/boot.c @@ -280,16 +280,6 @@ static int __init wstrncmp(const CHAR16 return n ? *s1 - *s2 : 0; } -static const CHAR16 *__init wmemchr(const CHAR16 *s, CHAR16 c, UINTN n) -{ - while ( n && *s != c ) - { - --n; - ++s; - } - return n ? 
s : NULL; -} - static CHAR16 *__init s2w(union string *str) { const char *s = str->s; --- a/xen/common/efi/efi.h +++ b/xen/common/efi/efi.h @@ -39,3 +39,5 @@ extern UINT64 efi_boot_max_var_store_siz extern UINT64 efi_apple_properties_addr; extern UINTN efi_apple_properties_len; + +const CHAR16 *wmemchr(const CHAR16 *s, CHAR16 c, UINTN n); --- a/xen/common/efi/runtime.c +++ b/xen/common/efi/runtime.c @@ -194,7 +194,18 @@ void efi_reset_system(bool warm) } #endif /* CONFIG_ARM */ -#endif + +const CHAR16 *wmemchr(const CHAR16 *s, CHAR16 c, UINTN n) +{ + while ( n && *s != c ) + { + --n; + ++s; + } + return n ? s : NULL; +} + +#endif /* COMPAT */ #ifndef CONFIG_ARM /* TODO - disabled until implemented on ARM */ int efi_get_info(uint32_t idx, union xenpf_efi_info *info) @@ -465,7 +476,12 @@ int efi_runtime_call(struct xenpf_efi_ru name = xmalloc_array(CHAR16, ++len); if ( !name ) return -ENOMEM; - __copy_from_guest(name, op->u.get_variable.name, len); + if ( __copy_from_guest(name, op->u.get_variable.name, len) || + wmemchr(name, 0, len) != name + len - 1 ) + { + xfree(name); + return -EIO; + } size = op->u.get_variable.size; if ( size ) @@ -513,7 +529,12 @@ int efi_runtime_call(struct xenpf_efi_ru name = xmalloc_array(CHAR16, ++len); if ( !name ) return -ENOMEM; - __copy_from_guest(name, op->u.set_variable.name, len); + if ( __copy_from_guest(name, op->u.set_variable.name, len) || + wmemchr(name, 0, len) != name + len - 1 ) + { + xfree(name); + return -EIO; + } data = xmalloc_bytes(op->u.set_variable.size); if ( !data ) ++++++ 5e3bd3d1-EFI-dont-leak-heap-VIA-XEN_EFI_get_next_variable_name.patch ++++++ # Commit 4783ee894f6bfb0f4deec9f1fe8e7faceafaa1a2 # Date 2020-02-06 09:52:33 +0100 # Author Jan Beulich <jbeul...@suse.com> # Committer Jan Beulich <jbeul...@suse.com> EFI: don't leak heap contents through XEN_EFI_get_next_variable_name Commit 1f4eb9d27d0e ("EFI: fix getting EFI variable list on some systems") switched to using the caller provided size for the copy-out 
without making sure the copied buffer is properly scrubbed. Reported-by: Ilja Van Sprundel <ivansprun...@ioactive.com> Signed-off-by: Jan Beulich <jbeul...@suse.com> Reviewed-by: George Dunlap <george.dun...@citrix.com> --- a/xen/common/efi/runtime.c +++ b/xen/common/efi/runtime.c @@ -571,7 +571,7 @@ int efi_runtime_call(struct xenpf_efi_ru return -EINVAL; size = op->u.get_next_variable_name.size; - name.raw = xmalloc_bytes(size); + name.raw = xzalloc_bytes(size); if ( !name.raw ) return -ENOMEM; if ( copy_from_guest(name.raw, op->u.get_next_variable_name.name, ++++++ 5e3bd3f8-xmalloc-guard-against-overflow.patch ++++++ # Commit cf38b4926e2b55d1d7715cff5095a7444f5ed42d # Date 2020-02-06 09:53:12 +0100 # Author Jan Beulich <jbeul...@suse.com> # Committer Jan Beulich <jbeul...@suse.com> xmalloc: guard against integer overflow There are hypercall handling paths (EFI ones are what this was found with) needing to allocate buffers of a caller specified size. This is generally fine, as our page allocator enforces an upper bound on all allocations. However, certain extremely large sizes could, when adding in allocator overhead, result in an apparently tiny allocation size, which would typically result in either a successful allocation, but a severe buffer overrun when using that memory block, or in a crash right in the allocator code. Reported-by: Ilja Van Sprundel <ivansprun...@ioactive.com> Signed-off-by: Jan Beulich <jbeul...@suse.com> Reviewed-by: George Dunlap <george.dun...@citrix.com> --- a/xen/common/xmalloc_tlsf.c +++ b/xen/common/xmalloc_tlsf.c @@ -378,7 +378,17 @@ void *xmem_pool_alloc(unsigned long size int fl, sl; unsigned long tmp_size; - size = (size < MIN_BLOCK_SIZE) ? MIN_BLOCK_SIZE : ROUNDUP_SIZE(size); + if ( size < MIN_BLOCK_SIZE ) + size = MIN_BLOCK_SIZE; + else + { + tmp_size = ROUNDUP_SIZE(size); + /* Guard against overflow. 
*/ + if ( tmp_size < size ) + return NULL; + size = tmp_size; + } + /* Rounding up the requested size and calculating fl and sl */ spin_lock(&pool->lock); @@ -594,6 +604,10 @@ void *_xmalloc(unsigned long size, unsig align = MEM_ALIGN; size += align - MEM_ALIGN; + /* Guard against overflow. */ + if ( size < align - MEM_ALIGN ) + return NULL; + if ( !xenpool ) tlsf_init(); @@ -646,6 +660,10 @@ void *_xrealloc(void *ptr, unsigned long unsigned long tmp_size = size + align - MEM_ALIGN; const struct bhdr *b; + /* Guard against overflow. */ + if ( tmp_size < size ) + return NULL; + if ( tmp_size < PAGE_SIZE ) tmp_size = (tmp_size < MIN_BLOCK_SIZE) ? MIN_BLOCK_SIZE : ROUNDUP_SIZE(tmp_size); ++++++ 5e46e090-x86-smp-reset-x2apic_enabled-in-smp_send_stop.patch ++++++ # Commit 8b1002ab037aeacdece7723c07ab35ca16c1e22e # Date 2020-02-14 18:01:52 +0000 # Author David Woodhouse <d...@amazon.co.uk> # Committer Andrew Cooper <andrew.coop...@citrix.com> x86/smp: reset x2apic_enabled in smp_send_stop() Just before smp_send_stop() re-enables interrupts when shutting down for reboot or kexec, it calls __stop_this_cpu() which in turn calls disable_local_APIC(), which puts the APIC back in to the mode Xen found it in at boot. If that means turning x2APIC off and going back into xAPIC mode, then a timer interrupt occurring just after interrupts come back on will lead to a GP# when apic_timer_interrupt() attempts to ack the IRQ through the EOI register in x2APIC MSR 0x80b: (XEN) Executing kexec image on cpu0 (XEN) ----[ Xen-4.14-unstable x86_64 debug=n Not tainted ]---- (XEN) CPU: 0 (XEN) RIP: e008:[<ffff82d08026c139>] apic_timer_interrupt+0x29/0x40 (XEN) RFLAGS: 0000000000010046 CONTEXT: hypervisor (XEN) rax: 0000000000000000 rbx: 00000000000000fa rcx: 000000000000080b ... (XEN) Xen code around <ffff82d08026c139> (apic_timer_interrupt+0x29/0x40): (XEN) c0 b9 0b 08 00 00 89 c2 <0f> 30 31 ff e9 0e c9 fb ff 0f 1f 40 00 66 2e 0f ... 
(XEN) Xen call trace: (XEN) [<ffff82d08026c139>] R apic_timer_interrupt+0x29/0x40 (XEN) [<ffff82d080283825>] S do_IRQ+0x95/0x750 ... (XEN) [<ffff82d0802a0ad2>] S smp_send_stop+0x42/0xd0 We can't clear the global x2apic_enabled variable in disable_local_APIC() itself because that runs on each CPU. Instead, correct it (by using current_local_apic_mode()) in smp_send_stop() while interrupts are still disabled immediately after calling __stop_this_cpu() for the boot CPU, after all other CPUs have been stopped. cf: d639bdd9bbe ("x86/apic: Disable the LAPIC later in smp_send_stop()") ... which didn't quite fix it completely. Signed-off-by: David Woodhouse <d...@amazon.co.uk> Reviewed-by: Roger Pau Monné <roger....@citrix.com> --- a/xen/arch/x86/smp.c +++ b/xen/arch/x86/smp.c @@ -325,6 +325,7 @@ void smp_send_stop(void) disable_IO_APIC(); hpet_disable(); __stop_this_cpu(); + x2apic_enabled = (current_local_apic_mode() == APIC_MODE_X2APIC); local_irq_enable(); } } ++++++ 5e4c00ef-VT-d-check-full-RMRR-for-E820-reserved.patch ++++++ # Commit d6573bc6e6b7d95bb9de8471a6bfd7048ebc50f3 # Date 2020-02-18 16:21:19 +0100 # Author Jan Beulich <jbeul...@suse.com> # Committer Jan Beulich <jbeul...@suse.com> VT-d: check all of an RMRR for being E820-reserved Checking just the first and last page is not sufficient (and redundant for single-page regions). As we don't need to care about IA64 anymore, use an x86-specific function to get this done without looping over each individual page. 
Signed-off-by: Jan Beulich <jbeul...@suse.com> Reviewed-by: Roger Pau Monné <roger....@citrix.com> Reviewed-by: Kevin Tian <kevin.t...@intel.com> --- a/xen/drivers/passthrough/vtd/dmar.c +++ b/xen/drivers/passthrough/vtd/dmar.c @@ -28,6 +28,7 @@ #include <xen/pci.h> #include <xen/pci_regs.h> #include <asm/atomic.h> +#include <asm/e820.h> #include <asm/string.h> #include "dmar.h" #include "iommu.h" @@ -631,14 +632,11 @@ acpi_parse_one_rmrr(struct acpi_dmar_hea * not properly represented in the system memory map and * inform the user */ - if ( (!page_is_ram_type(paddr_to_pfn(base_addr), RAM_TYPE_RESERVED)) || - (!page_is_ram_type(paddr_to_pfn(end_addr), RAM_TYPE_RESERVED)) ) - { + if ( !e820_all_mapped(base_addr, end_addr + 1, RAM_TYPE_RESERVED) ) printk(XENLOG_WARNING VTDPREFIX " RMRR address range %"PRIx64"..%"PRIx64" not in reserved memory;" " need \"iommu_inclusive_mapping=1\"?\n", base_addr, end_addr); - } rmrru = xzalloc(struct acpi_rmrr_unit); if ( !rmrru ) ++++++ 5e4d4f5b-sched-fix-get_cpu_idle_time-with-core-sched.patch ++++++ # Commit 132cbe8f35632fb2fea0625ee8fdda53a19a1645 # Date 2020-02-19 16:08:11 +0100 # Author Juergen Gross <jgr...@suse.com> # Committer Jan Beulich <jbeul...@suse.com> sched: fix get_cpu_idle_time() with core scheduling get_cpu_idle_time() is calling vcpu_runstate_get() for an idle vcpu. With core scheduling active this is fragile, as idle vcpus are assigned to other scheduling units temporarily, and that assignment is changed in some cases without holding the scheduling lock, and vcpu_runstate_get() is using v->sched_unit as parameter for unit_schedule_[un]lock_irq(), resulting in an ASSERT() triggering in unlock in case v->sched_unit has changed meanwhile. Fix that by using a local unit variable holding the correct unit. 
Signed-off-by: Juergen Gross <jgr...@suse.com> Reviewed-by: Dario Faggioli <dfaggi...@suse.com> --- a/xen/common/schedule.c +++ b/xen/common/schedule.c @@ -305,17 +305,26 @@ void vcpu_runstate_get(struct vcpu *v, s { spinlock_t *lock; s_time_t delta; + struct sched_unit *unit; rcu_read_lock(&sched_res_rculock); - lock = likely(v == current) ? NULL : unit_schedule_lock_irq(v->sched_unit); + /* + * Be careful in case of an idle vcpu: the assignment to a unit might + * change even with the scheduling lock held, so be sure to use the + * correct unit for locking in order to avoid triggering an ASSERT() in + * the unlock function. + */ + unit = is_idle_vcpu(v) ? get_sched_res(v->processor)->sched_unit_idle + : v->sched_unit; + lock = likely(v == current) ? NULL : unit_schedule_lock_irq(unit); memcpy(runstate, &v->runstate, sizeof(*runstate)); delta = NOW() - runstate->state_entry_time; if ( delta > 0 ) runstate->time[runstate->state] += delta; if ( unlikely(lock != NULL) ) - unit_schedule_unlock_irq(lock, v->sched_unit); + unit_schedule_unlock_irq(lock, unit); rcu_read_unlock(&sched_res_rculock); } ++++++ 5e4e614d-x86-spec-ctrl-no-xen-also-disables-branch-hardening.patch ++++++ # Commit e6ca7afcf2ddeb72beade853ccd6fa3332210014 # Date 2020-02-20 11:37:01 +0100 # Author Jan Beulich <jbeul...@suse.com> # Committer Jan Beulich <jbeul...@suse.com> x86: "spec-ctrl=no-xen" should also disable branch hardening This is controlling Xen behavior alone, after all. 
Reported-by: Jin Nan Wang <jnw...@suse.com> Signed-off-by: Jan Beulich <jbeul...@suse.com> Acked-by: Andrew Cooper <andrew.coop...@citrix.com> --- a/xen/arch/x86/spec_ctrl.c +++ b/xen/arch/x86/spec_ctrl.c @@ -97,8 +97,6 @@ static int __init parse_spec_ctrl(const if ( opt_pv_l1tf_domu < 0 ) opt_pv_l1tf_domu = 0; - opt_branch_harden = false; - if ( opt_tsx == -1 ) opt_tsx = -3; @@ -113,6 +111,7 @@ static int __init parse_spec_ctrl(const opt_ibpb = false; opt_ssbd = false; opt_l1d_flush = 0; + opt_branch_harden = false; } else if ( val > 0 ) rc = -EINVAL; ++++++ xen.bug1026236.suse_vtsc_tolerance.patch ++++++ --- /var/tmp/diff_new_pack.5nXBtx/_old 2020-02-25 16:03:19.528229688 +0100 +++ /var/tmp/diff_new_pack.5nXBtx/_new 2020-02-25 16:03:19.528229688 +0100 @@ -20,7 +20,7 @@ unsigned long __read_mostly cpu_khz; /* CPU clock frequency in kHz. */ DEFINE_SPINLOCK(rtc_lock); unsigned long pit0_ticks; -@@ -2223,6 +2226,7 @@ int tsc_set_info(struct domain *d, +@@ -2229,6 +2232,7 @@ int tsc_set_info(struct domain *d, switch ( tsc_mode ) { @@ -28,7 +28,7 @@ case TSC_MODE_DEFAULT: case TSC_MODE_ALWAYS_EMULATE: d->arch.vtsc_offset = get_s_time() - elapsed_nsec; -@@ -2236,8 +2240,26 @@ int tsc_set_info(struct domain *d, +@@ -2242,8 +2246,26 @@ int tsc_set_info(struct domain *d, * When a guest is created, gtsc_khz is passed in as zero, making * d->arch.tsc_khz == cpu_khz. Thus no need to check incarnation. */ ++++++ xen_maskcalc.py ++++++ #!/usr/bin/python3 # Xen Mask Calculator - Calculate CPU masking information based on cpuid(1) # Copyright (C) 2017 Armando Vega # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. 
# # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see <http://www.gnu.org/licenses/>. import argparse import sys import os EAX1_MATCH = '0x00000001 0x00:' EAX7_MATCH = '0x00000007 0x00:' EXP_LINELN = 76 libxl_names_ecx1 = [] libxl_names_edx1 = [] libvirt_names_ecx1 = [] libvirt_names_edx1 = [] libxl_names_ebx7 = [] libxl_names_ecx7 = [] libvirt_names_ebx7 = [] libvirt_names_ecx7 = [] def fill_ecx1(bit, libxl, libvirt): if libxl_names_ecx1[bit]: print("ecx bit %s already set: libxl %s libvirt %s. Ignoring %s/%s\n" % (bit, libxl_names_ecx1[bit], libvirt_names_ecx1[bit], libxl, libvirt)) return libxl_names_ecx1[bit] = libxl libvirt_names_ecx1[bit] = libvirt def fill_edx1(bit, libxl, libvirt): if libxl_names_edx1[bit]: print("edx bit %s already set: libxl %s libvirt %s. Ignoring %s/%s\n" % (bit, libxl_names_edx1[bit], libvirt_names_edx1[bit], libxl, libvirt)) return libxl_names_edx1[bit] = libxl libvirt_names_edx1[bit] = libvirt def fill_ebx7(bit, libxl, libvirt): if libxl_names_ebx7[bit]: print("edx bit %s already set: libxl %s libvirt %s. Ignoring %s/%s\n" % (bit, libxl_names_ebx7[bit], libvirt_names_ebx7[bit], libxl, libvirt)) return libxl_names_ebx7[bit] = libxl libvirt_names_ebx7[bit] = libvirt def fill_ecx7(bit, libxl, libvirt): if libxl_names_ecx7[bit]: print("ecx bit %s already set: libxl %s libvirt %s. 
Ignoring %s/%s\n" % (bit, libxl_names_ecx7[bit], libvirt_names_ecx7[bit], libxl, libvirt)) return libxl_names_ecx7[bit] = libxl libvirt_names_ecx7[bit] = libvirt def fill_bit_names(): for i in range(0,32): libxl_names_ecx1.append(None) libxl_names_edx1.append(None) libxl_names_ebx7.append(None) libxl_names_ecx7.append(None) libvirt_names_ecx1.append(None) libvirt_names_edx1.append(None) libvirt_names_ebx7.append(None) libvirt_names_ecx7.append(None) fill_ecx1(0, "sse3", "pni") fill_ecx1(1, "pclmulqdq", "pclmuldq") fill_ecx1(2, "dtes64", "dtes64") fill_ecx1(3, "monitor", "monitor") fill_ecx1(4, "dscpl", "ds_cpl") fill_ecx1(5, "vmx", "vmx") fill_ecx1(6, "smx", "smx") fill_ecx1(7, "est", "est") fill_ecx1(8, "tm2", "tm2") fill_ecx1(9, "ssse3", "ssse3") fill_ecx1(10, "cntxid", "cid") fill_ecx1(12, "fma", "fma") fill_ecx1(13, "cmpxchg16", "cx16") fill_ecx1(14, "xtpr", "xtpr") fill_ecx1(15, "pdcm", "pdcm") fill_ecx1(17, "pcid", "pcid") fill_ecx1(18, "dca", "dca") fill_ecx1(19, "sse4_1", "sse4.1") fill_ecx1(20, "sse4_2", "sse4.2") fill_ecx1(21, "x2apic", "x2apic") fill_ecx1(22, "movbe", "movbe") fill_ecx1(23, "popcnt", "popcnt") fill_ecx1(24, "tsc-deadline", "tsc-deadline") fill_ecx1(25, "aes", "aes") fill_ecx1(26, "xsave", "xsave") fill_ecx1(27, "osxsave", "osxsave") fill_ecx1(28, "avx", "avx") fill_ecx1(29, "f16c", "f16c") fill_ecx1(30, "rdrand", "rdrand") fill_ecx1(31, "hypervisor", "hypervisor") fill_edx1(0, "fpu", "fpu") fill_edx1(1, "vme", "vme") fill_edx1(2, "de", "de") fill_edx1(3, "pse", "pse") fill_edx1(4, "tsc", "tsc") fill_edx1(5, "msr", "msr") fill_edx1(6, "pae", "pae") fill_edx1(7, "mce", "mce") fill_edx1(8, "cmpxchg8", "cx8") fill_edx1(9, "apic", "apic") fill_edx1(11, "sysenter", "sep") fill_edx1(12, "mtrr", "mtrr") fill_edx1(13, "pge", "pge") fill_edx1(14, "mca", "mca") fill_edx1(15, "cmov", "cmov") fill_edx1(16, "pat", "pat") fill_edx1(17, "pse36", "pse36") fill_edx1(18, "psn", "pn") fill_edx1(19, "clfsh", "clflush") fill_edx1(21, "ds", "ds") fill_edx1(22, 
"acpi", "acpi") fill_edx1(23, "mmx", "mmx") fill_edx1(24, "fxsr", "fxsr") fill_edx1(25, "sse", "sse") fill_edx1(26, "sse2", "sse2") fill_edx1(27, "ss", "ss") fill_edx1(28, "htt", "ht") fill_edx1(29, "tm", "tm") fill_edx1(30, "ia64", "ia64") fill_edx1(31, "pbe", "pbe") fill_ebx7(0, "fsgsbase", "fsgsbase") fill_ebx7(1, "tsc_adjust", "tsc_adjust") fill_ebx7(3, "bmi1", "bmi1") fill_ebx7(4, "hle", "hle") fill_ebx7(5, "avx2", "avx2") fill_ebx7(7, "smep", "smep") fill_ebx7(8, "bmi2", "bmi2") fill_ebx7(9, "erms", "erms") fill_ebx7(10, "invpcid", "invpcid") fill_ebx7(11, "rtm", "rtm") fill_ebx7(12, "cmt", "cmt") fill_ebx7(14, "mpx", "mpx") fill_ebx7(16, "avx512f", "avx512f") fill_ebx7(17, "avx512dq", "avx512dq") fill_ebx7(18, "rdseed", "rdseed") fill_ebx7(19, "adx", "adx") fill_ebx7(20, "smap", "smap") fill_ebx7(21, "avx512-ifma", "avx512-ifma") fill_ebx7(23, "clflushopt", "clflushopt") fill_ebx7(24, "clwb", "clwb") fill_ebx7(26, "avx512pf", "avx512pf") fill_ebx7(27, "avx512er", "avx512er") fill_ebx7(28, "avx512cd", "avx512cd") fill_ebx7(29, "sha", "sha") fill_ebx7(30, "avx512bw", "avx512bw") fill_ebx7(31, "avx512vl", "avx512vl") fill_ecx7(0, "prefetchwt1", "prefetchwt1") fill_ecx7(1, "avx512-vbmi", "avx512-vbmi") fill_ecx7(2, "umip", "umip") fill_ecx7(3, "pku", "pku") fill_ecx7(4, "ospke", "ospke") fill_ecx7(6, "avx512-vbmi2", "avx512-vbmi2") fill_ecx7(8, "gfni", "gfni") fill_ecx7(9, "vaes", "vaes") fill_ecx7(10, "vpclmulqdq", "vpclmulqdq") fill_ecx7(11, "avx512-vnni", "avx512-vnni") fill_ecx7(12, "avx512-bitalg", "avx512-bitalg") fill_ecx7(14, "avx512-vpopcntdq", "avx512-vpopcntdq") fill_ecx7(22, "rdpid", "rdpid") fill_ecx7(25, "cldemote", "cldemote") def get_register_mask(regs): """ Take a list of register values and return the calculated mask """ reg_n = len(regs) mask = '' for idx in range(32): counter = 0 for reg in regs: counter += 1 if (reg & (1 << idx) > 0) else 0 # if we have all 1s or all 0s we don't mask the bit if counter == reg_n or counter == 0: mask = mask + 
'x' else: mask = mask + '0' # we calculated the mask in reverse, so we reverse it again return mask[::-1] def print_xl_masking_config(nodes): """ Take a dictionary of nodes containing their registers and print out CPUID masking configuration for xl """ nomasking = 'x' * 32 libxl = [] libvirt = [] eax1_ecx_regs = [] eax1_edx_regs = [] eax7_ebx_regs = [] eax7_ecx_regs = [] for node in nodes: eax1_ecx_regs.append(nodes[node]['eax1_ecx']) eax1_edx_regs.append(nodes[node]['eax1_edx']) eax7_ebx_regs.append(nodes[node]['eax7_ebx']) eax7_ecx_regs.append(nodes[node]['eax7_ecx']) # Get masks for the EAX1 and EAX7 registers eax1_ecx_mask = get_register_mask(eax1_ecx_regs) eax1_edx_mask = get_register_mask(eax1_edx_regs) eax7_ebx_mask = get_register_mask(eax7_ebx_regs) eax7_ecx_mask = get_register_mask(eax7_ecx_regs) # Build the xl CPUID config cpuid_config = 'cpuid = [\n "0x00000001:ecx=' + eax1_ecx_mask if eax1_edx_mask != nomasking: cpuid_config += ',edx=' + eax1_edx_mask cpuid_config += '",\n' cpuid_config += ' "0x00000007,0x00:ebx=' + eax7_ebx_mask if eax7_ecx_mask != nomasking: cpuid_config += ',ecx=' + eax7_ecx_mask cpuid_config += '"\n' cpuid_config += ']' print(cpuid_config) bitnum = len(eax1_ecx_mask) while bitnum > 0: bitnum -= 1 bitval = eax1_ecx_mask[len(eax1_ecx_mask) - 1 - bitnum] if bitval == "0" and libxl_names_ecx1[bitnum]: libxl.append(libxl_names_ecx1[bitnum] + "=0") libvirt.append(libvirt_names_ecx1[bitnum]) bitnum = len(eax1_edx_mask) while bitnum > 0: bitnum -= 1 bitval = eax1_edx_mask[len(eax1_edx_mask) - 1 - bitnum] if bitval == "0" and libxl_names_edx1[bitnum]: libxl.append(libxl_names_edx1[bitnum] + "=0") libvirt.append(libvirt_names_edx1[bitnum]) bitnum = len(eax7_ebx_mask) while bitnum > 0: bitnum -= 1 bitval = eax7_ebx_mask[len(eax7_ebx_mask) - 1 - bitnum] if bitval == "0" and libxl_names_ebx7[bitnum]: libxl.append(libxl_names_ebx7[bitnum] + "=0") libvirt.append(libvirt_names_ebx7[bitnum]) bitnum = len(eax7_ecx_mask) while bitnum > 0: bitnum -= 1 
bitval = eax7_ecx_mask[len(eax7_ecx_mask) - 1 - bitnum] if bitval == "0" and libxl_names_ecx7[bitnum]: libxl.append(libxl_names_ecx7[bitnum] + "=0") libvirt.append(libvirt_names_ecx7[bitnum]) if len(libxl) > 0: output = "cpuid = [ host" for i in libxl: output += "," + i output += " ]" print(output) print("<domain>") print(" <cpu>") for i in libvirt: print(" <feature policy='optional' name='%s' />" % i) print(" </cpu>") print("</domain>") def print_verbose_masking_info(nodes): """ Take a dictionary of nodes containing their registers and print out verbose mask derivation information """ eax1_ecx_regs = [] eax1_edx_regs = [] eax7_ebx_regs = [] eax7_ecx_regs = [] for node in nodes: eax1_ecx_regs.append(nodes[node]['eax1_ecx']) eax1_edx_regs.append(nodes[node]['eax1_edx']) eax7_ebx_regs.append(nodes[node]['eax7_ebx']) eax7_ecx_regs.append(nodes[node]['eax7_ecx']) print("") print('== Detailed mask derivation info ==') print("") print('EAX1 ECX registers:') for reg in eax1_ecx_regs: print('{0:032b}'.format(reg)) print('================================') print(get_register_mask(eax1_ecx_regs)) print("") print('EAX1 EDX registers:') for reg in eax1_edx_regs: print('{0:032b}'.format(reg)) print('================================') print(get_register_mask(eax1_edx_regs)) print("") print('EAX7,0 EBX registers:') for reg in eax7_ebx_regs: print('{0:032b}'.format(reg)) print('================================') print(get_register_mask(eax7_ebx_regs)) print("") print('EAX7,0 ECX registers:') for reg in eax7_ecx_regs: print('{0:032b}'.format(reg)) print('================================') print(get_register_mask(eax7_ecx_regs)) if __name__ == '__main__': epilog = """The individual 'node_files' are generated with 'cpuid -1r': server1~$ cpuid -1r > node1 server2~$ cpuid -1r > node2 server3~$ cpuid -1r > node3 ~$ {0} node1 node2 node3 Use 'zypper install cpuid' to install the cpuid.rpm. Note: Run 'cpuid' with NATIVE boot instead of dom0 to get the complete cpid value. 
Xen hides some bits from dom0! """.format(sys.argv[0]) parser = argparse.ArgumentParser( formatter_class=argparse.RawDescriptionHelpFormatter, description='A utility that calculates a XEN CPUID difference mask', epilog=epilog ) parser.add_argument('node_files', nargs='*', help='Filenames of XEN node CPUID outputs') parser.add_argument('-v', '--verbose', action='store_true', help='Get detailed mask derivation information') args = parser.parse_args() if len(args.node_files) < 2: print('Need at least 2 files to do the comparison!') parser.print_help() sys.exit(1) fill_bit_names() nodes = dict() for node in args.node_files: if os.path.isfile(node): try: f = open(node) except IOError as e: print("I/O error({0}): {1}".format(e.errno, e.strerror)) sys.exit(1) else: lines = [line.strip() for line in f] eax1 = '' eax7 = '' # try to match the lines containing interesting registers # EAX1 - Processor Info and Feature Bits # EAX7 - Extended features for line in lines: if line.startswith(EAX1_MATCH): eax1 = line elif line.startswith(EAX7_MATCH): eax7 = line # if we get garbled data we should probably just give up if len(eax1) < EXP_LINELN or len(eax7) < EXP_LINELN: print('ERROR: invalid data format in file : ' + node) sys.exit(1) # check if we can actually parse the strings into integers try: eax1_ecx = int(eax1.split()[4].split('=')[1], 0) eax1_edx = int(eax1.split()[5].split('=')[1], 0) eax7_ebx = int(eax7.split()[3].split('=')[1], 0) eax7_ecx = int(eax7.split()[4].split('=')[1], 0) except ValueError: print('ERROR: invalid data format in file: ' + node) sys.exit(1) nodes[node] = dict() nodes[node]['eax1_ecx'] = eax1_ecx nodes[node]['eax1_edx'] = eax1_edx nodes[node]['eax7_ebx'] = eax7_ebx nodes[node]['eax7_ecx'] = eax7_ecx f.close() else: print('File not found: ' + node) sys.exit(1) print_xl_masking_config(nodes) if args.verbose: print_verbose_masking_info(nodes) ++++++ xendomains-wait-disks.LICENSE ++++++ ++++ 674 lines (skipped) ++++++ xendomains-wait-disks.README.md ++++++ 
# xen-tools-xendomains-wait-disk

[xendomains.service](https://github.com/xen-project/xen/blob/RELEASE-4.13.0/tools/hotplug/Linux/systemd/xendomains.service.in) has problems with disks that appear only later in the boot process (or even after booting is complete). This project creates a service that loops over all disks that the domUs will use and waits for them to appear.

xendomains-wait-disk.service launches a script that reads both /etc/xen/auto/ configurations and /var/lib/xen/save/ dumps. From those files, it extracts which disks are needed for all domUs that will be started (respecting /etc/sysconfig/xendomains settings). After that, it simply loops waiting for those disks to appear. There is a timeout (5 min) configured in xendomains-wait-disk.service that prevents it from blocking the boot process forever.

There are two known cases where this project is useful:

## degraded mdadm RAID

mdadm RAID arrays are assembled by [udev rules](https://github.com/neilbrown/mdadm/blob/master/udev-md-raid-assembly.rules). However, an array is only assembled when it is healthy. When a member is still missing, udev starts a [timer](https://github.com/neilbrown/mdadm/blob/master/systemd/mdadm-last-resort%40.timer) that will try to assemble the RAID anyway after 30s, even if degraded. This timer does not block xendomains from being started. So, if a domU depends on an MD RAID that is degraded (e.g. RAID 1 missing one disk), xendomains.service will be started before those 30s have passed and that domU will fail.

An alternative solution would be to add extra hard dependencies to xendomains.service for each required disk (Requires=xxx.device). However, this solution introduces another, bigger problem. Before, if a single RAID is degraded, only the domU that depends on it will fail. With Requires=xxx.device, xendomains will never start if a RAID could not be assembled even after 30s (e.g. RAID5 with two missing disks).

With xendomains-wait-disk.service, xendomains.service will be blocked up to 5 min waiting for those MD RAID used by domUs. If it fails, xendomains.service continues anyway. ## iSCSI disks domU that uses iSCSI disk (mapped by host OS) also fails to start during boot. open-iscsi.service returns before it connect to the remote target and rescan iscsi disks. As in mdadm RAID case, xendomains.service is started and domU that depends on iSCSI disks will fail. ++++++ xendomains-wait-disks.sh ++++++ #!/bin/bash # # Generates xendomains unit # read_conf_from_file() { ${sbindir}/xl create --quiet --dryrun --defconfig "$1" } big2littleendian_32bit(){ echo ${1:6:2}${1:4:2}${1:2:2}${1:0:2} } read_hex() { local out_var=$1; shift local input=$1; shift local pos_var=$1; shift local length=$1; shift local hex=$(dd bs=1 skip=${!pos_var} count=$length status=none <$input | xxd -p -c$length -l$length) read -r $pos_var <<<"$((${!pos_var} + $length))" read -r $out_var <<<"$hex" } hex2dec() { local hex=$1; shift local little_endian=$1; shift if $little_endian; then hex=$(big2littleendian_32bit $hex) fi echo $((0x$hex)) } read_conf_from_image(){ local pos=0 length=0 local magic_header byte_order mandatory_flags optional_flags optional_data_len config_len config_json read_hex magic_header $1 pos 32 # "Xen saved domain, xl format\n \0 \r" if [ "$magic_header" != "58656e20736176656420646f6d61696e2c20786c20666f726d61740a2000200d" ]; then log $err "Unknown file format in $1. 
Wrong magic header: '0x$magic_header'" return 1 fi read_hex byte_order $1 pos 4 case "$byte_order" in 04030201) little_endian=true;; 01020304) little_endian=false;; *) log $err "Unknown byte order 0x$byte_order in $1"; return 1;; esac #define XL_MANDATORY_FLAG_JSON (1U << 0) /* config data is in JSON format */ #define XL_MANDATORY_FLAG_STREAMv2 (1U << 1) /* stream is v2 */ read_hex mandatory_flags $1 pos 4 if [ "$(($(hex2dec $mandatory_flags $little_endian) & 0x3))" -ne 3 ]; then log $err "Unknown config format or stream version. Mandatory flags are 0x$mandatory_flag" return 1 fi read_hex optional_flags $1 pos 4 read_hex optional_data_len $1 pos 4 optional_data_len=$(hex2dec $optional_data_len $little_endian) # I'll not use but saved memory dump will begin at $((pos+optional_data_len)) read_hex config_len $1 pos 4 config_len=$(hex2dec $config_len $little_endian) # null terminated string read_hex config_json $1 pos $config_len xxd -p -r <<<"$config_json" } log() { local msg_loglevel=$1; shift if [ "$msg_loglevel" -gt "$LOGLEVEL" ]; then return 0 fi echo "$@" >&2 } emerg=0; alert=1; crit=2; err=3 warning=4; notice=5; info=6; debug=7 LOGLEVEL=${LOGLEVEL:-4} if [ "$SYSTEMD_LOG_LEVEL" ]; then LOGLEVEL=${!SYSTEMD_LOG_LEVEL} fi log $debug "Using loglevel $LOGLEVEL" trap "log $err Error on \$LINENO: \$(caller)" ERR log $debug "loading /etc/xen/scripts/hotplugpath.sh..." . /etc/xen/scripts/hotplugpath.sh #log $debug "testing for ${sbindir}/xl..." #CMD=${sbindir}/xl #if ! $CMD list &> /dev/null; then # log $err "${sbindir}/xl list failed!" # log $err "$($CMD list &>&1)" # exit $? #fi #log $debug "${sbindir}/xl list OK!" log $debug "loading /etc/sysconfig/xendomains..." XENDOM_CONFIG=/etc/sysconfig/xendomains if ! test -r $XENDOM_CONFIG; then echo "$XENDOM_CONFIG not existing" >&2; exit 6 fi . $XENDOM_CONFIG doms_conf=() doms_restore=() doms_source=() log $debug "Reading saved domains..." 
# Collect the domains that are going to be started and wait until every
# physical disk they reference exists.
#
# Relies on names set up earlier in this script: log() and the numeric level
# variables ($debug, $warning, $err), read_conf_from_image(),
# read_conf_from_file(), and the arrays doms_conf / doms_restore / doms_source.
# XENDOMAINS_RESTORE, XENDOMAINS_SAVE and XENDOMAINS_AUTO are presumably
# provided by the sourced /etc/sysconfig/xendomains -- TODO confirm.

# collect_domains DIR READER RESTORED
#   DIR       directory whose entries are candidate domains
#   READER    function that prints a domain's configuration given its path
#   RESTORED  value recorded in doms_restore for every accepted entry
# Appends one element per accepted entry to doms_conf, doms_restore and
# doms_source. Entries that are unreadable, or for which READER fails, are
# logged and skipped.
collect_domains() {
    local dir=$1 reader=$2 restored=$3
    local dom conf

    for dom in "$dir"/*; do
        log "$debug" "Trying $dom..."
        if ! [ -r "$dom" ]; then
            log "$debug" "Not readable $dom..."
            continue
        fi
        log "$debug" "Reading conf from $dom..."
        if ! conf=$("$reader" "$dom"); then
            log "$err" "Cannot read conf from $dom"
            continue
        fi
        log "$debug" "Adding $dom to the list"
        doms_conf+=("$conf")
        doms_restore+=("$restored")
        doms_source+=("$dom")
    done
}

if [ "$XENDOMAINS_RESTORE" = "true" ] && [ -d "$XENDOMAINS_SAVE" ]; then
    collect_domains "$XENDOMAINS_SAVE" read_conf_from_image true
fi

log "$debug" "Reading auto domains..."
if [ -d "$XENDOMAINS_AUTO" ]; then
    collect_domains "$XENDOMAINS_AUTO" read_conf_from_file false
fi

log "$debug" "We have ${#doms_conf[*]} to check"

# Disk paths that already have a background waiter. Keyed by the exact path so
# that two different paths can never be mistaken for each other.
declare -A disks_seen=()

for i in "${!doms_conf[@]}"; do
    log "$debug" "Doing dom $i..."
    dom_conf=${doms_conf[i]}
    # The two substitutions cover the "(name ...)" and the JSON '"name": "...",'
    # spellings of the domain name.
    dom_name=$(sed -n 's/^.*(name \(.*\))$/\1/p;s/^.*"name": "\(.*\)",$/\1/p' <<<"$dom_conf")
    # NOTE(review): the address patterns depend on the indentation of the JSON
    # configuration -- confirm against real "xl create --dryrun" output.
    readarray -t required_disks <<<"$(sed -n -e '/^ "disks": \[/,/ \],/{ /"pdev_path":/ { s/.*"pdev_path": "//;s/".*//p } }' <<<"$dom_conf")"
    log "$debug" "dom $i is named $dom_name..."

    for disk in "${required_disks[@]}"; do
        # A here-string always delivers at least one (possibly empty) line, and
        # an empty "pdev_path" also produces one. Waiting for "" would never
        # finish, so skip such entries.
        if [ -z "$disk" ]; then
            continue
        fi
        if [ -n "${disks_seen[$disk]:-}" ]; then
            log "$debug" "$disk for $dom_name is already being checked"
            continue
        fi
        disks_seen[$disk]=1

        log "$debug" "waiting for $disk for $dom_name"
        (
            j=0
            found_loglevel=$debug
            while true; do
                if [ -e "$disk" ]; then
                    log "$found_loglevel" "disk $disk found (after $j seconds)"
                    exit 0
                fi
                # Report every fifth second; once a warning was issued, report
                # the eventual success at the same level.
                if [ "$(( j++ % 5 ))" -eq 0 ]; then
                    log "$warning" "still waiting for $disk for $dom_name..."
                    found_loglevel=$warning
                fi
                sleep 1
            done
        ) &
    done
done

# Block until every background waiter has found its disk.
wait
log "$debug" "Exiting normally"