Script 'mail_helper' called by obssrc

Hello community,

here is the log from the commit of package xen for openSUSE:Factory checked in at 2025-02-03 21:41:44

++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Comparing /work/SRC/openSUSE:Factory/xen (Old)
 and      /work/SRC/openSUSE:Factory/.xen.new.2316 (New)
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Package is "xen" Mon Feb 3 21:41:44 2025 rev:357 rq:1241755 version:4.20.0_06 Changes: -------- --- /work/SRC/openSUSE:Factory/xen/xen.changes 2025-01-22 16:31:32.746012058 +0100 +++ /work/SRC/openSUSE:Factory/.xen.new.2316/xen.changes 2025-02-03 21:42:14.471324932 +0100 @@ -1,0 +2,22 @@ +Fri Jan 31 09:59:45 MST 2025 - carn...@suse.com + +- Update to Xen 4.20.0 RC3 release + * x86/HVM: correct MMIO emulation cache bounds check + * x86/HVM: allocate emulation cache entries dynamically + * x86/HVM: correct read/write split at page boundaries + * x86/iommu: check for CMPXCHG16B when enabling IOMMU + * iommu/vtd: remove non-CX16 logic from interrupt remapping + * x86/iommu: remove non-CX16 logic from DMA remapping + * iommu/amd: atomically update IRTE + * x86emul: further correct 64-bit mode zero count repeated string + insn handling + * x86/PV: further harden guest memory accesses against speculative + abuse + * x86/intel: Fix PERF_GLOBAL fixup when virtualised + +------------------------------------------------------------------- +Fri Jan 31 08:49:14 UTC 2025 - Markéta Machová <mmach...@suse.com> + +- Add explicit build dependency on python3-setuptools, needed by python313 + +------------------------------------------------------------------- ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Other differences: ------------------ ++++++ xen.spec ++++++ --- /var/tmp/diff_new_pack.p9nuox/_old 2025-02-03 21:42:15.923385116 +0100 +++ /var/tmp/diff_new_pack.p9nuox/_new 2025-02-03 21:42:15.927385282 +0100 @@ -103,6 +103,7 @@ BuildRequires: ncurses-devel BuildRequires: openssl-devel BuildRequires: python3-devel +BuildRequires: python3-setuptools BuildRequires: xz-devel BuildRequires: pkgconfig(systemd) %ifarch x86_64 @@ -911,6 +912,10 @@ -name "s390*" -o \ -name "slof*" -o \ -name "spapr*" -o \ + -name "PKG-INFO" -o \ + -name "SOURCES.txt" -o \ + -name "dependency_links.txt" -o \ + -name "top_level.txt" -o \ -name "*.egg-info" \) \ -print -delete # Wipe empty directories ++++++ xen-4.20.0-testing-src.tar.bz2 ++++++ diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/xen-4.20.0-testing/ChangeLog new/xen-4.20.0-testing/ChangeLog --- old/xen-4.20.0-testing/ChangeLog 2025-01-20 13:45:27.000000000 +0100 +++ new/xen-4.20.0-testing/ChangeLog 2025-01-31 17:59:06.000000000 +0100 @@ -1,18 +1,22 @@ -commit c3f5d1bb40b57d467cb4051eafa86f5933ec9003 -Author: Roger Pau Monne <roger....@citrix.com> -Date: Thu Jan 16 09:06:26 2025 +0100 +commit 45c65669bf34bfad9ff6de0dabae2cb201239e34 +Author: Michal Orzel <michal.or...@amd.com> +Date: Tue Jan 28 10:40:02 2025 +0100 - automation/cirrus-ci: introduce FreeBSD randconfig builds + xen/arm: Fix build issue when CONFIG_PHYS_ADDR_T_32=y - Add a new randconfig job for each FreeBSD version. This requires some - rework of the template so common parts can be shared between the full and - the randconfig builds. Such randconfig builds are relevant because FreeBSD - is the only tested system that has a full non-GNU toolchain. 
+ On Arm32, when CONFIG_PHYS_ADDR_T_32 is set, a build failure is observed: + arch/arm/platforms/vexpress.c: In function 'vexpress_smp_init': + arch/arm/platforms/vexpress.c:102:12: error: format '%lx' expects argument of type 'long unsigned int', but argument 2 has type 'long long unsigned int' [-Werror=format=] + 102 | printk("Set SYS_FLAGS to %"PRIpaddr" (%p)\n", - While there replace the usage of the python311 package with python3, which is - already using 3.11, and remove the install of the plain python package for full - builds. + When CONFIG_PHYS_ADDR_T_32 is set, paddr_t is defined as unsigned long. + Commit 96f35de69e59 dropped __virt_to_maddr() which used paddr_t as a + return type. Without a cast, the expression type is unsigned long long + which causes the issue. Fix it. - Signed-off-by: Roger Pau Monné <roger....@citrix.com> - Reviewed-by: Andrew Cooper <andrew.coop...@citrix.com> - Release-Acked-by: Oleksii Kurochko<oleksii.kuroc...@gmail.com> + Fixes: 96f35de69e59 ("x86+Arm: drop (rename) __virt_to_maddr() / __maddr_to_virt()") + Signed-off-by: Michal Orzel <michal.or...@amd.com> + Release-Acked-by: Oleksii Kurochko <oleksii.kuroc...@gmail.com> + Reviewed-by: Luca Fancellu <luca.fance...@arm.com> + Tested-by: Luca Fancellu <luca.fance...@arm.com> + Reviewed-by: Stefano Stabellini <sstabell...@kernel.org> diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/xen-4.20.0-testing/docs/fusa/reqs/design-reqs/arm64/generic-timer.rst new/xen-4.20.0-testing/docs/fusa/reqs/design-reqs/arm64/generic-timer.rst --- old/xen-4.20.0-testing/docs/fusa/reqs/design-reqs/arm64/generic-timer.rst 2025-01-20 13:45:27.000000000 +0100 +++ new/xen-4.20.0-testing/docs/fusa/reqs/design-reqs/arm64/generic-timer.rst 2025-01-31 17:59:06.000000000 +0100 @@ -21,7 +21,7 @@ Domains can detect the presence of the Generic Timer device tree node. Covers: - - `XenProd~emulated_timer~1` + - `XenProd~arm64_emulated_timer~1` Read system counter frequency ----------------------------- @@ -37,7 +37,7 @@ Comments: Covers: - - `XenProd~emulated_timer~1` + - `XenProd~arm64_emulated_timer~1` Access CNTKCTL_EL1 system register from a domain ------------------------------------------------ @@ -53,7 +53,7 @@ Comments: Covers: - - `XenProd~emulated_timer~1` + - `XenProd~arm64_emulated_timer~1` Access virtual timer from a domain ---------------------------------- @@ -69,7 +69,7 @@ Comments: Covers: - - `XenProd~emulated_timer~1` + - `XenProd~arm64_emulated_timer~1` Access physical timer from a domain ----------------------------------- @@ -85,7 +85,7 @@ Comments: Covers: - - `XenProd~emulated_timer~1` + - `XenProd~arm64_emulated_timer~1` Trigger the virtual timer interrupt from a domain ------------------------------------------------- @@ -101,7 +101,7 @@ Comments: Covers: - - `XenProd~emulated_timer~1` + - `XenProd~arm64_emulated_timer~1` Trigger the physical timer interrupt from a domain -------------------------------------------------- @@ -117,7 +117,7 @@ Comments: Covers: - - `XenProd~emulated_timer~1` + - `XenProd~arm64_emulated_timer~1` Assumption of Use on the Platform ================================= @@ -139,7 +139,7 @@ dt property [2], the use of this property is strongly discouraged. 
Covers: - - `XenProd~emulated_timer~1` + - `XenProd~arm64_emulated_timer~1` [1] Arm Architecture Reference Manual for A-profile architecture, Chapter 11 [2] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/Documentation/devicetree/bindings/timer/arm,arch_timer.yaml diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/xen-4.20.0-testing/docs/fusa/reqs/design-reqs/arm64/sbsa-uart.rst new/xen-4.20.0-testing/docs/fusa/reqs/design-reqs/arm64/sbsa-uart.rst --- old/xen-4.20.0-testing/docs/fusa/reqs/design-reqs/arm64/sbsa-uart.rst 2025-01-20 13:45:27.000000000 +0100 +++ new/xen-4.20.0-testing/docs/fusa/reqs/design-reqs/arm64/sbsa-uart.rst 2025-01-31 17:59:06.000000000 +0100 @@ -21,7 +21,7 @@ Domains can detect the presence of the SBSA UART device tree node. Covers: - - `XenProd~emulated_uart~1` + - `XenProd~arm64_emulated_uart~1` Transmit data in software polling mode -------------------------------------- @@ -36,7 +36,7 @@ Comments: Covers: - - `XenProd~emulated_uart~1` + - `XenProd~arm64_emulated_uart~1` Transmit data in interrupt driven mode -------------------------------------- @@ -51,7 +51,7 @@ Comments: Covers: - - `XenProd~emulated_uart~1` + - `XenProd~arm64_emulated_uart~1` Receive data in software polling mode ------------------------------------- @@ -66,7 +66,7 @@ Comments: Covers: - - `XenProd~emulated_uart~1` + - `XenProd~arm64_emulated_uart~1` Receive data in interrupt driven mode ------------------------------------- @@ -81,7 +81,7 @@ Comments: Covers: - - `XenProd~emulated_uart~1` + - `XenProd~arm64_emulated_uart~1` Access UART data register ------------------------- @@ -96,7 +96,7 @@ Comments: Covers: - - `XenProd~emulated_uart~1` + - `XenProd~arm64_emulated_uart~1` Access UART receive status register ----------------------------------- @@ -111,7 +111,7 @@ Comments: Covers: - - `XenProd~emulated_uart~1` + - `XenProd~arm64_emulated_uart~1` Access UART flag register ------------------------- @@ -126,7 +126,7 @@ Comments: Covers: - - `XenProd~emulated_uart~1` + - `XenProd~arm64_emulated_uart~1` Access UART mask set/clear register ----------------------------------- @@ -141,7 +141,7 @@ Comments: Covers: - - `XenProd~emulated_uart~1` + - `XenProd~arm64_emulated_uart~1` Access UART raw interrupt status register ----------------------------------------- @@ -156,7 +156,7 @@ Comments: Covers: - - `XenProd~emulated_uart~1` + - `XenProd~arm64_emulated_uart~1` Access UART masked interrupt status register -------------------------------------------- @@ -171,7 +171,7 @@ Comments: Covers: - - `XenProd~emulated_uart~1` + - `XenProd~arm64_emulated_uart~1` Access UART interrupt clear register ------------------------------------ @@ -186,7 +186,7 @@ Comments: Covers: - - `XenProd~emulated_uart~1` + - `XenProd~arm64_emulated_uart~1` Receive UART TX interrupt ------------------------- @@ -202,7 +202,7 @@ Comments: Covers: - - `XenProd~emulated_uart~1` + - `XenProd~arm64_emulated_uart~1` Receive UART RX interrupt reception ----------------------------------- @@ -218,7 +218,7 @@ Comments: Covers: - - `XenProd~emulated_uart~1` + - `XenProd~arm64_emulated_uart~1` [1] Arm Base System Architecture, chapter B -[2] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/Documentation/devicetree/bindings/serial/arm_sbsa_uart.txt \ No newline at end of file +[2] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/Documentation/devicetree/bindings/serial/arm_sbsa_uart.txt diff -urN '--exclude=CVS' 
'--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/xen-4.20.0-testing/xen/arch/arm/include/asm/mm.h new/xen-4.20.0-testing/xen/arch/arm/include/asm/mm.h --- old/xen-4.20.0-testing/xen/arch/arm/include/asm/mm.h 2025-01-20 13:45:27.000000000 +0100 +++ new/xen-4.20.0-testing/xen/arch/arm/include/asm/mm.h 2025-01-31 17:59:06.000000000 +0100 @@ -263,7 +263,7 @@ #define virt_to_maddr(va) ({ \ vaddr_t va_ = (vaddr_t)(va); \ - (va_to_par(va_) & PADDR_MASK & PAGE_MASK) | (va_ & ~PAGE_MASK); \ + (paddr_t)((va_to_par(va_) & PADDR_MASK & PAGE_MASK) | (va_ & ~PAGE_MASK)); \ }) #ifdef CONFIG_ARM_32 diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/xen-4.20.0-testing/xen/arch/x86/cpu/intel.c new/xen-4.20.0-testing/xen/arch/x86/cpu/intel.c --- old/xen-4.20.0-testing/xen/arch/x86/cpu/intel.c 2025-01-20 13:45:27.000000000 +0100 +++ new/xen-4.20.0-testing/xen/arch/x86/cpu/intel.c 2025-01-31 17:59:06.000000000 +0100 @@ -535,39 +535,49 @@ printk("%u MHz\n", (factor * max_ratio + 50) / 100); } +static void init_intel_perf(struct cpuinfo_x86 *c) +{ + uint64_t val; + unsigned int eax, ver, nr_cnt; + + if ( c->cpuid_level <= 9 || + ({ rdmsrl(MSR_IA32_MISC_ENABLE, val); + !(val & MSR_IA32_MISC_ENABLE_PERF_AVAIL); }) ) + return; + + eax = cpuid_eax(10); + ver = eax & 0xff; + nr_cnt = (eax >> 8) & 0xff; + + if ( ver && nr_cnt > 1 && nr_cnt <= 32 ) + { + unsigned int cnt_mask = (1UL << nr_cnt) - 1; + + /* + * On (some?) Sapphire/Emerald Rapids platforms each package-BSP + * starts with all the enable bits for the general-purpose PMCs + * cleared. Adjust so counters can be enabled from EVNTSEL. + */ + rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, val); + + if ( (val & cnt_mask) != cnt_mask ) + { + printk("FIRMWARE BUG: CPU%u invalid PERF_GLOBAL_CTRL: %#"PRIx64" adjusting to %#"PRIx64"\n", + smp_processor_id(), val, val | cnt_mask); + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, val | cnt_mask); + } + + __set_bit(X86_FEATURE_ARCH_PERFMON, c->x86_capability); + } +} + static void cf_check init_intel(struct cpuinfo_x86 *c) { /* Detect the extended topology information if available */ detect_extended_topology(c); init_intel_cacheinfo(c); - if (c->cpuid_level > 9) { - unsigned eax = cpuid_eax(10); - unsigned int cnt = (eax >> 8) & 0xff; - - /* Check for version and the number of counters */ - if ((eax & 0xff) && (cnt > 1) && (cnt <= 32)) { - uint64_t global_ctrl; - unsigned int cnt_mask = (1UL << cnt) - 1; - - /* - * On (some?) Sapphire/Emerald Rapids platforms each - * package-BSP starts with all the enable bits for the - * general-purpose PMCs cleared. Adjust so counters - * can be enabled from EVNTSEL. 
- */ - rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, global_ctrl); - if ((global_ctrl & cnt_mask) != cnt_mask) { - printk("CPU%u: invalid PERF_GLOBAL_CTRL: %#" - PRIx64 " adjusting to %#" PRIx64 "\n", - smp_processor_id(), global_ctrl, - global_ctrl | cnt_mask); - wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, - global_ctrl | cnt_mask); - } - __set_bit(X86_FEATURE_ARCH_PERFMON, c->x86_capability); - } - } + init_intel_perf(c); if ( !cpu_has(c, X86_FEATURE_XTOPOLOGY) ) { diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/xen-4.20.0-testing/xen/arch/x86/hvm/emulate.c new/xen-4.20.0-testing/xen/arch/x86/hvm/emulate.c --- old/xen-4.20.0-testing/xen/arch/x86/hvm/emulate.c 2025-01-20 13:45:27.000000000 +0100 +++ new/xen-4.20.0-testing/xen/arch/x86/hvm/emulate.c 2025-01-31 17:59:06.000000000 +0100 @@ -26,6 +26,19 @@ #include <asm/iocap.h> #include <asm/vm_event.h> +/* + * We may read or write up to m512 or up to a tile row as a number of + * device-model transactions. + */ +struct hvm_mmio_cache { + unsigned long gla; /* Start of original access (e.g. insn operand). */ + unsigned int skip; /* Offset to start of MMIO */ + unsigned int size; /* Amount of buffer[] actually used, incl @skip. */ + unsigned int space:31; /* Allocated size of buffer[]. */ + unsigned int dir:1; + uint8_t buffer[] __aligned(sizeof(long)); +}; + struct hvmemul_cache { /* The cache is disabled as long as num_ents > max_ents. */ @@ -935,7 +948,14 @@ } /* Accesses must not overflow the cache's buffer. */ - if ( size > sizeof(cache->buffer) ) + if ( offset + size > cache->space ) + { + ASSERT_UNREACHABLE(); + return X86EMUL_UNHANDLEABLE; + } + + /* Accesses must not be to the unused leading space. */ + if ( offset < cache->skip ) { ASSERT_UNREACHABLE(); return X86EMUL_UNHANDLEABLE; @@ -998,27 +1018,33 @@ /* * Multi-cycle MMIO handling is based upon the assumption that emulation - * of the same instruction will not access the same MMIO region more - * than once. Hence we can deal with re-emulation (for secondary or - * subsequent cycles) by looking up the result or previous I/O in a - * cache indexed by linear MMIO address. + * of the same instruction will not access the exact same MMIO region + * more than once in exactly the same way (if it does, the accesses will + * be "folded"). Hence we can deal with re-emulation (for secondary or + * subsequent cycles) by looking up the result of previous I/O in a cache + * indexed by linear address and access type. */ static struct hvm_mmio_cache *hvmemul_find_mmio_cache( - struct hvm_vcpu_io *hvio, unsigned long gla, uint8_t dir, bool create) + struct hvm_vcpu_io *hvio, unsigned long gla, uint8_t dir, + unsigned int skip) { unsigned int i; struct hvm_mmio_cache *cache; for ( i = 0; i < hvio->mmio_cache_count; i ++ ) { - cache = &hvio->mmio_cache[i]; + cache = hvio->mmio_cache[i]; if ( gla == cache->gla && dir == cache->dir ) return cache; } - if ( !create ) + /* + * Bail if a new entry shouldn't be allocated, relying on ->space having + * the same value for all entries. 
+ */ + if ( skip >= hvio->mmio_cache[0]->space ) return NULL; i = hvio->mmio_cache_count; @@ -1027,10 +1053,12 @@ ++hvio->mmio_cache_count; - cache = &hvio->mmio_cache[i]; - memset(cache, 0, sizeof (*cache)); + cache = hvio->mmio_cache[i]; + memset(cache->buffer, 0, cache->space); cache->gla = gla; + cache->skip = skip; + cache->size = skip; cache->dir = dir; return cache; @@ -1051,12 +1079,14 @@ static int hvmemul_linear_mmio_access( unsigned long gla, unsigned int size, uint8_t dir, void *buffer, - uint32_t pfec, struct hvm_emulate_ctxt *hvmemul_ctxt, bool known_gpfn) + uint32_t pfec, struct hvm_emulate_ctxt *hvmemul_ctxt, + unsigned long start_gla, bool known_gpfn) { struct hvm_vcpu_io *hvio = ¤t->arch.hvm.hvm_io; unsigned long offset = gla & ~PAGE_MASK; - struct hvm_mmio_cache *cache = hvmemul_find_mmio_cache(hvio, gla, dir, true); - unsigned int chunk, buffer_offset = 0; + unsigned int chunk, buffer_offset = gla - start_gla; + struct hvm_mmio_cache *cache = hvmemul_find_mmio_cache(hvio, start_gla, + dir, buffer_offset); paddr_t gpa; unsigned long one_rep = 1; int rc; @@ -1104,19 +1134,19 @@ static inline int hvmemul_linear_mmio_read( unsigned long gla, unsigned int size, void *buffer, uint32_t pfec, struct hvm_emulate_ctxt *hvmemul_ctxt, - bool translate) + unsigned long start_gla, bool translate) { - return hvmemul_linear_mmio_access(gla, size, IOREQ_READ, buffer, - pfec, hvmemul_ctxt, translate); + return hvmemul_linear_mmio_access(gla, size, IOREQ_READ, buffer, pfec, + hvmemul_ctxt, start_gla, translate); } static inline int hvmemul_linear_mmio_write( unsigned long gla, unsigned int size, void *buffer, uint32_t pfec, struct hvm_emulate_ctxt *hvmemul_ctxt, - bool translate) + unsigned long start_gla, bool translate) { - return hvmemul_linear_mmio_access(gla, size, IOREQ_WRITE, buffer, - pfec, hvmemul_ctxt, translate); + return hvmemul_linear_mmio_access(gla, size, IOREQ_WRITE, buffer, pfec, + hvmemul_ctxt, start_gla, translate); } static bool known_gla(unsigned long addr, unsigned int bytes, uint32_t pfec) @@ -1145,7 +1175,10 @@ { pagefault_info_t pfinfo; struct hvm_vcpu_io *hvio = ¤t->arch.hvm.hvm_io; + void *buffer = p_data; + unsigned long start = addr; unsigned int offset = addr & ~PAGE_MASK; + const struct hvm_mmio_cache *cache; int rc; if ( offset + bytes > PAGE_SIZE ) @@ -1169,8 +1202,17 @@ * an access that was previously handled as MMIO. Thus it is imperative that * we handle this access in the same way to guarantee completion and hence * clean up any interim state. + * + * Care must be taken, however, to correctly deal with crossing RAM/MMIO or + * MMIO/RAM boundaries. While we want to use a single cache entry (tagged + * by the starting linear address), we need to continue issuing (i.e. also + * upon replay) the RAM access for anything that's ahead of or past MMIO, + * i.e. in RAM. 
*/ - if ( !hvmemul_find_mmio_cache(hvio, addr, IOREQ_READ, false) ) + cache = hvmemul_find_mmio_cache(hvio, start, IOREQ_READ, ~0); + if ( !cache || + addr + bytes <= start + cache->skip || + addr >= start + cache->size ) rc = hvm_copy_from_guest_linear(p_data, addr, bytes, pfec, &pfinfo); switch ( rc ) @@ -1186,8 +1228,8 @@ if ( pfec & PFEC_insn_fetch ) return X86EMUL_UNHANDLEABLE; - return hvmemul_linear_mmio_read(addr, bytes, p_data, pfec, - hvmemul_ctxt, + return hvmemul_linear_mmio_read(addr, bytes, buffer, pfec, + hvmemul_ctxt, start, known_gla(addr, bytes, pfec)); case HVMTRANS_gfn_paged_out: @@ -1204,7 +1246,10 @@ { pagefault_info_t pfinfo; struct hvm_vcpu_io *hvio = ¤t->arch.hvm.hvm_io; + void *buffer = p_data; + unsigned long start = addr; unsigned int offset = addr & ~PAGE_MASK; + const struct hvm_mmio_cache *cache; int rc; if ( offset + bytes > PAGE_SIZE ) @@ -1223,13 +1268,11 @@ rc = HVMTRANS_bad_gfn_to_mfn; - /* - * If there is an MMIO cache entry for the access then we must be re-issuing - * an access that was previously handled as MMIO. Thus it is imperative that - * we handle this access in the same way to guarantee completion and hence - * clean up any interim state. - */ - if ( !hvmemul_find_mmio_cache(hvio, addr, IOREQ_WRITE, false) ) + /* See commentary in linear_read(). */ + cache = hvmemul_find_mmio_cache(hvio, start, IOREQ_WRITE, ~0); + if ( !cache || + addr + bytes <= start + cache->skip || + addr >= start + cache->size ) rc = hvm_copy_to_guest_linear(addr, p_data, bytes, pfec, &pfinfo); switch ( rc ) @@ -1242,8 +1285,8 @@ return X86EMUL_EXCEPTION; case HVMTRANS_bad_gfn_to_mfn: - return hvmemul_linear_mmio_write(addr, bytes, p_data, pfec, - hvmemul_ctxt, + return hvmemul_linear_mmio_write(addr, bytes, buffer, pfec, + hvmemul_ctxt, start, known_gla(addr, bytes, pfec)); case HVMTRANS_gfn_paged_out: @@ -1630,7 +1673,7 @@ { /* Fix this in case the guest is really relying on r-m-w atomicity. */ return hvmemul_linear_mmio_write(addr, bytes, p_new, pfec, - hvmemul_ctxt, + hvmemul_ctxt, addr, hvio->mmio_access.write_access && hvio->mmio_gla == (addr & PAGE_MASK)); } @@ -2980,16 +3023,21 @@ int hvmemul_cache_init(struct vcpu *v) { /* - * No insn can access more than 16 independent linear addresses (AVX512F - * scatters/gathers being the worst). Each such linear range can span a - * page boundary, i.e. may require two page walks. Account for each insn - * byte individually, for simplicity. + * AVX512F scatter/gather insns can access up to 16 independent linear + * addresses, up to 8 bytes size. Each such linear range can span a page + * boundary, i.e. may require two page walks. */ - const unsigned int nents = (CONFIG_PAGING_LEVELS + 1) * - (MAX_INST_LEN + 16 * 2); - struct hvmemul_cache *cache = xmalloc_flex_struct(struct hvmemul_cache, - ents, nents); + unsigned int nents = 16 * 2 * (CONFIG_PAGING_LEVELS + 1); + unsigned int i, max_bytes = 64; + struct hvmemul_cache *cache; + /* + * Account for each insn byte individually, both for simplicity and to + * leave some slack space. 
+ */ + nents += MAX_INST_LEN * (CONFIG_PAGING_LEVELS + 1); + + cache = xvmalloc_flex_struct(struct hvmemul_cache, ents, nents); if ( !cache ) return -ENOMEM; @@ -2999,6 +3047,15 @@ v->arch.hvm.hvm_io.cache = cache; + for ( i = 0; i < ARRAY_SIZE(v->arch.hvm.hvm_io.mmio_cache); ++i ) + { + v->arch.hvm.hvm_io.mmio_cache[i] = + xvmalloc_flex_struct(struct hvm_mmio_cache, buffer, max_bytes); + if ( !v->arch.hvm.hvm_io.mmio_cache[i] ) + return -ENOMEM; + v->arch.hvm.hvm_io.mmio_cache[i]->space = max_bytes; + } + return 0; } diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/xen-4.20.0-testing/xen/arch/x86/include/asm/asm-defns.h new/xen-4.20.0-testing/xen/arch/x86/include/asm/asm-defns.h --- old/xen-4.20.0-testing/xen/arch/x86/include/asm/asm-defns.h 2025-01-20 13:45:27.000000000 +0100 +++ new/xen-4.20.0-testing/xen/arch/x86/include/asm/asm-defns.h 2025-01-31 17:59:06.000000000 +0100 @@ -1,3 +1,5 @@ +#include <asm/page-bits.h> + #ifndef HAVE_AS_CLAC_STAC .macro clac .byte 0x0f, 0x01, 0xca @@ -65,17 +67,36 @@ .macro guest_access_mask_ptr ptr:req, scratch1:req, scratch2:req #if defined(CONFIG_SPECULATIVE_HARDEN_GUEST_ACCESS) /* - * Here we want - * - * ptr &= ~0ull >> (ptr < HYPERVISOR_VIRT_END); - * + * Here we want to adjust \ptr such that + * - if it's within Xen range, it becomes non-canonical, + * - otherwise if it's (non-)canonical on input, it retains that property, + * - if the result is non-canonical, bit 47 is clear (to avoid + * potentially populating the cache with Xen data on AMD-like hardware), * but guaranteed without any conditional branches (hence in assembly). + * + * To achieve this we determine which bit to forcibly clear: Either bit 47 + * (in case the address is below HYPERVISOR_VIRT_END) or bit 63. Further + * we determine whether for forcably set bit 63: In case we first cleared + * it, we'll merely restore the original address. In case we ended up + * clearing bit 47 (i.e. the address was either non-canonical or within Xen + * range), setting the bit will yield a guaranteed non-canonical address. + * If we didn't clear a bit, we also won't set one: The address was in the + * low half of address space in that case with bit 47 already clear. The + * address can thus be left unchanged, whether canonical or not. */ mov $(HYPERVISOR_VIRT_END - 1), \scratch1 - mov $~0, \scratch2 + mov $(VADDR_BITS - 1), \scratch2 cmp \ptr, \scratch1 + /* + * Not needed: The value we have in \scratch1 will be truncated to 6 bits, + * thus yielding the value we need. 
+ mov $63, \scratch1 + */ + cmovnb \scratch2, \scratch1 + xor \scratch2, \scratch2 + btr \scratch1, \ptr rcr $1, \scratch2 - and \scratch2, \ptr + or \scratch2, \ptr #elif defined(CONFIG_DEBUG) && defined(CONFIG_PV) xor $~\@, \scratch1 xor $~\@, \scratch2 diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/xen-4.20.0-testing/xen/arch/x86/include/asm/hvm/emulate.h new/xen-4.20.0-testing/xen/arch/x86/include/asm/hvm/emulate.h --- old/xen-4.20.0-testing/xen/arch/x86/include/asm/hvm/emulate.h 2025-01-20 13:45:27.000000000 +0100 +++ new/xen-4.20.0-testing/xen/arch/x86/include/asm/hvm/emulate.h 2025-01-31 17:59:06.000000000 +0100 @@ -15,6 +15,7 @@ #include <xen/err.h> #include <xen/mm.h> #include <xen/sched.h> +#include <xen/xvmalloc.h> #include <asm/hvm/hvm.h> #include <asm/x86_emulate.h> @@ -119,7 +120,11 @@ int __must_check hvmemul_cache_init(struct vcpu *v); static inline void hvmemul_cache_destroy(struct vcpu *v) { - XFREE(v->arch.hvm.hvm_io.cache); + unsigned int i; + + for ( i = 0; i < ARRAY_SIZE(v->arch.hvm.hvm_io.mmio_cache); ++i ) + XFREE(v->arch.hvm.hvm_io.mmio_cache[i]); + XVFREE(v->arch.hvm.hvm_io.cache); } bool hvmemul_read_cache(const struct vcpu *v, paddr_t gpa, void *buffer, unsigned int size); diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/xen-4.20.0-testing/xen/arch/x86/include/asm/hvm/vcpu.h new/xen-4.20.0-testing/xen/arch/x86/include/asm/hvm/vcpu.h --- old/xen-4.20.0-testing/xen/arch/x86/include/asm/hvm/vcpu.h 2025-01-20 13:45:27.000000000 +0100 +++ new/xen-4.20.0-testing/xen/arch/x86/include/asm/hvm/vcpu.h 2025-01-31 17:59:06.000000000 +0100 @@ -22,17 +22,6 @@ uint32_t asid; }; -/* - * We may read or write up to m512 as a number of device-model - * transactions. - */ -struct hvm_mmio_cache { - unsigned long gla; - unsigned int size; - uint8_t dir; - uint8_t buffer[64] __aligned(sizeof(long)); -}; - struct hvm_vcpu_io { /* * HVM emulation: @@ -48,7 +37,7 @@ * We may need to handle up to 3 distinct memory accesses per * instruction. */ - struct hvm_mmio_cache mmio_cache[3]; + struct hvm_mmio_cache *mmio_cache[3]; unsigned int mmio_cache_count; /* For retries we shouldn't re-fetch the instruction. */ diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/xen-4.20.0-testing/xen/arch/x86/x86_emulate/x86_emulate.c new/xen-4.20.0-testing/xen/arch/x86/x86_emulate/x86_emulate.c --- old/xen-4.20.0-testing/xen/arch/x86/x86_emulate/x86_emulate.c 2025-01-20 13:45:27.000000000 +0100 +++ new/xen-4.20.0-testing/xen/arch/x86/x86_emulate/x86_emulate.c 2025-01-31 17:59:06.000000000 +0100 @@ -513,7 +513,7 @@ regs->r(cx) = ad_bytes == 4 ? (uint32_t)count : count; } -#define get_rep_prefix(using_si, using_di) ({ \ +#define get_rep_prefix(extend_si, extend_di) ({ \ unsigned long max_reps = 1; \ if ( rep_prefix() ) \ max_reps = get_loop_count(&_regs, ad_bytes); \ @@ -521,14 +521,14 @@ { \ /* \ * Skip the instruction if no repetitions are required, but \ - * zero extend involved registers first when using 32-bit \ + * zero extend relevant registers first when using 32-bit \ * addressing in 64-bit mode. 
\ */ \ - if ( mode_64bit() && ad_bytes == 4 ) \ + if ( !amd_like(ctxt) && mode_64bit() && ad_bytes == 4 ) \ { \ _regs.r(cx) = 0; \ - if ( using_si ) _regs.r(si) = (uint32_t)_regs.r(si); \ - if ( using_di ) _regs.r(di) = (uint32_t)_regs.r(di); \ + if ( extend_si ) _regs.r(si) = _regs.esi; \ + if ( extend_di ) _regs.r(di) = _regs.edi; \ } \ goto complete_insn; \ } \ @@ -1818,7 +1818,7 @@ dst.bytes = !(b & 1) ? 1 : (op_bytes == 8) ? 4 : op_bytes; if ( (rc = ioport_access_check(port, dst.bytes, ctxt, ops)) != 0 ) goto done; - nr_reps = get_rep_prefix(false, true); + nr_reps = get_rep_prefix(false, false /* don't extend RSI/RDI */); dst.mem.off = truncate_ea_and_reps(_regs.r(di), nr_reps, dst.bytes); dst.mem.seg = x86_seg_es; /* Try the presumably most efficient approach first. */ @@ -1860,7 +1860,7 @@ dst.bytes = !(b & 1) ? 1 : (op_bytes == 8) ? 4 : op_bytes; if ( (rc = ioport_access_check(port, dst.bytes, ctxt, ops)) != 0 ) goto done; - nr_reps = get_rep_prefix(true, false); + nr_reps = get_rep_prefix(false, false /* don't extend RSI/RDI */); ea.mem.off = truncate_ea_and_reps(_regs.r(si), nr_reps, dst.bytes); /* Try the presumably most efficient approach first. */ if ( !ops->rep_outs ) @@ -2198,7 +2198,7 @@ case 0xa6 ... 0xa7: /* cmps */ { unsigned long next_eip = _regs.r(ip); - get_rep_prefix(true, true); + get_rep_prefix(false, false /* don't extend RSI/RDI */); src.bytes = dst.bytes = (d & ByteOp) ? 1 : op_bytes; if ( (rc = read_ulong(ea.mem.seg, truncate_ea(_regs.r(si)), &dst.val, dst.bytes, ctxt, ops)) || @@ -2240,7 +2240,7 @@ } case 0xac ... 0xad: /* lods */ - get_rep_prefix(true, false); + get_rep_prefix(false, false /* don't extend RSI/RDI */); if ( (rc = read_ulong(ea.mem.seg, truncate_ea(_regs.r(si)), &dst.val, dst.bytes, ctxt, ops)) != 0 ) goto done; @@ -2251,7 +2251,7 @@ case 0xae ... 
0xaf: /* scas */ { unsigned long next_eip = _regs.r(ip); - get_rep_prefix(false, true); + get_rep_prefix(false, false /* don't extend RSI/RDI */); if ( (rc = read_ulong(x86_seg_es, truncate_ea(_regs.r(di)), &dst.val, src.bytes, ctxt, ops)) != 0 ) goto done; diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/xen-4.20.0-testing/xen/common/device-tree/bootfdt.c new/xen-4.20.0-testing/xen/common/device-tree/bootfdt.c --- old/xen-4.20.0-testing/xen/common/device-tree/bootfdt.c 2025-01-20 13:45:27.000000000 +0100 +++ new/xen-4.20.0-testing/xen/common/device-tree/bootfdt.c 2025-01-31 17:59:06.000000000 +0100 @@ -27,8 +27,8 @@ */ BUILD_BUG_ON((offsetof(struct membanks, bank) != offsetof(struct meminfo, bank))); - /* Ensure "struct membanks" is 8-byte aligned */ - BUILD_BUG_ON(alignof(struct membanks) != 8); + /* Ensure "struct membanks" and "struct membank" are equally aligned */ + BUILD_BUG_ON(alignof(struct membanks) != alignof(struct membank)); } static bool __init device_tree_node_is_available(const void *fdt, int node) diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/xen-4.20.0-testing/xen/drivers/passthrough/amd/iommu_intr.c new/xen-4.20.0-testing/xen/drivers/passthrough/amd/iommu_intr.c --- old/xen-4.20.0-testing/xen/drivers/passthrough/amd/iommu_intr.c 2025-01-20 13:45:27.000000000 +0100 +++ new/xen-4.20.0-testing/xen/drivers/passthrough/amd/iommu_intr.c 2025-01-31 17:59:06.000000000 +0100 @@ -39,7 +39,8 @@ }; union irte128 { - uint64_t raw[2]; + uint64_t raw64[2]; + __uint128_t raw128; struct { bool remap_en:1; bool sup_io_pf:1; @@ -187,7 +188,7 @@ if ( iommu->ctrl.ga_en ) { - ACCESS_ONCE(entry.ptr128->raw[0]) = 0; + ACCESS_ONCE(entry.ptr128->raw64[0]) = 0; /* * Low half (containing RemapEn) needs to be cleared first. Note that * strictly speaking smp_wmb() isn't enough, as conceptually it expands @@ -197,7 +198,7 @@ * variant will do. */ smp_wmb(); - entry.ptr128->raw[1] = 0; + entry.ptr128->raw64[1] = 0; } else ACCESS_ONCE(entry.ptr32->raw) = 0; @@ -212,7 +213,7 @@ { if ( iommu->ctrl.ga_en ) { - union irte128 irte = { + const union irte128 irte = { .full = { .remap_en = true, .int_type = int_type, @@ -222,19 +223,26 @@ .vector = vector, }, }; + __uint128_t old = entry.ptr128->raw128; + __uint128_t res = cmpxchg16b(&entry.ptr128->raw128, &old, + &irte.raw128); - ASSERT(!entry.ptr128->full.remap_en); - entry.ptr128->raw[1] = irte.raw[1]; /* - * High half needs to be set before low one (containing RemapEn). See - * comment in free_intremap_entry() regarding the choice of barrier. + * Hardware does not update the IRTE behind our backs, so the return + * value should match "old". */ - smp_wmb(); - ACCESS_ONCE(entry.ptr128->raw[0]) = irte.raw[0]; + if ( res != old ) + { + printk(XENLOG_ERR + "unexpected IRTE %016lx_%016lx (expected %016lx_%016lx)\n", + (uint64_t)(res >> 64), (uint64_t)res, + (uint64_t)(old >> 64), (uint64_t)old); + ASSERT_UNREACHABLE(); + } } else { - union irte32 irte = { + const union irte32 irte = { .flds = { .remap_en = true, .int_type = int_type, @@ -299,21 +307,13 @@ entry = get_intremap_entry(iommu, req_id, offset); - /* The RemapEn fields match for all formats. 
*/ - while ( iommu->enabled && entry.ptr32->flds.remap_en ) - { - entry.ptr32->flds.remap_en = false; - spin_unlock(lock); - - amd_iommu_flush_intremap(iommu, req_id); - - spin_lock(lock); - } - update_intremap_entry(iommu, entry, vector, delivery_mode, dest_mode, dest); spin_unlock_irqrestore(lock, flags); + if ( !fresh ) + amd_iommu_flush_intremap(iommu, req_id); + set_rte_index(rte, offset); return 0; @@ -322,7 +322,7 @@ void cf_check amd_iommu_ioapic_update_ire( unsigned int apic, unsigned int pin, uint64_t rte) { - struct IO_APIC_route_entry old_rte, new_rte; + struct IO_APIC_route_entry new_rte; int seg, bdf, rc; struct amd_iommu *iommu; unsigned int idx; @@ -346,14 +346,6 @@ return; } - old_rte = __ioapic_read_entry(apic, pin, true); - /* mask the interrupt while we change the intremap table */ - if ( !old_rte.mask ) - { - old_rte.mask = 1; - __ioapic_write_entry(apic, pin, true, old_rte); - } - /* Update interrupt remapping entry */ rc = update_intremap_entry_from_ioapic( bdf, iommu, &new_rte, @@ -425,6 +417,7 @@ uint8_t delivery_mode, vector, dest_mode; spinlock_t *lock; unsigned int dest, offset, i; + bool fresh = false; req_id = get_dma_requestor_id(iommu->seg, bdf); alias_id = get_intremap_requestor_id(iommu->seg, bdf); @@ -468,26 +461,21 @@ return -ENOSPC; } *remap_index = offset; + fresh = true; } entry = get_intremap_entry(iommu, req_id, offset); - /* The RemapEn fields match for all formats. */ - while ( iommu->enabled && entry.ptr32->flds.remap_en ) - { - entry.ptr32->flds.remap_en = false; - spin_unlock(lock); + update_intremap_entry(iommu, entry, vector, delivery_mode, dest_mode, dest); + spin_unlock_irqrestore(lock, flags); + if ( !fresh ) + { amd_iommu_flush_intremap(iommu, req_id); if ( alias_id != req_id ) amd_iommu_flush_intremap(iommu, alias_id); - - spin_lock(lock); } - update_intremap_entry(iommu, entry, vector, delivery_mode, dest_mode, dest); - spin_unlock_irqrestore(lock, flags); - *data = (msg->data & ~(INTREMAP_MAX_ENTRIES - 1)) | offset; /* @@ -649,6 +637,19 @@ if ( !iommu_enable || !iommu_intremap ) return false; + if ( unlikely(!cpu_has_cx16) ) + { + AMD_IOMMU_ERROR("no CMPXCHG16B support, disabling IOMMU\n"); + /* + * Disable IOMMU support at once: there's no reason to check for CX16 + * yet again when attempting to initialize IOMMU DMA remapping + * functionality or interrupt remapping without x2APIC support. + */ + iommu_enable = false; + iommu_intremap = iommu_intremap_off; + return false; + } + if ( amd_iommu_prepare(true) ) return false; @@ -722,7 +723,7 @@ for ( count = 0; count < nr; count++ ) { if ( iommu->ctrl.ga_en - ? !tbl.ptr128[count].raw[0] && !tbl.ptr128[count].raw[1] + ? 
!tbl.ptr128[count].raw64[0] && !tbl.ptr128[count].raw64[1] : !tbl.ptr32[count].raw ) continue; @@ -735,7 +736,8 @@ if ( iommu->ctrl.ga_en ) printk(" IRTE[%03x] %016lx_%016lx\n", - count, tbl.ptr128[count].raw[1], tbl.ptr128[count].raw[0]); + count, tbl.ptr128[count].raw64[1], + tbl.ptr128[count].raw64[0]); else printk(" IRTE[%03x] %08x\n", count, tbl.ptr32[count].raw); } diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/xen-4.20.0-testing/xen/drivers/passthrough/amd/iommu_map.c new/xen-4.20.0-testing/xen/drivers/passthrough/amd/iommu_map.c --- old/xen-4.20.0-testing/xen/drivers/passthrough/amd/iommu_map.c 2025-01-20 13:45:27.000000000 +0100 +++ new/xen-4.20.0-testing/xen/drivers/passthrough/amd/iommu_map.c 2025-01-31 17:59:06.000000000 +0100 @@ -167,15 +167,14 @@ { bool valid = flags & SET_ROOT_VALID; - if ( dte->v && dte->tv && - (cpu_has_cx16 || (flags & SET_ROOT_WITH_UNITY_MAP)) ) + if ( dte->v && dte->tv ) { union { struct amd_iommu_dte dte; uint64_t raw64[4]; __uint128_t raw128[2]; } ldte = { .dte = *dte }; - __uint128_t old = ldte.raw128[0]; + __uint128_t res, old = ldte.raw128[0]; int ret = 0; ldte.dte.domain_id = domain_id; @@ -185,33 +184,20 @@ ldte.dte.paging_mode = paging_mode; ldte.dte.v = valid; - if ( cpu_has_cx16 ) - { - __uint128_t res = cmpxchg16b(dte, &old, &ldte.raw128[0]); + res = cmpxchg16b(dte, &old, &ldte.raw128[0]); - /* - * Hardware does not update the DTE behind our backs, so the - * return value should match "old". - */ - if ( res != old ) - { - printk(XENLOG_ERR - "Dom%d: unexpected DTE %016lx_%016lx (expected %016lx_%016lx)\n", - domain_id, - (uint64_t)(res >> 64), (uint64_t)res, - (uint64_t)(old >> 64), (uint64_t)old); - ret = -EILSEQ; - } - } - else /* Best effort, updating domain_id last. */ + /* + * Hardware does not update the DTE behind our backs, so the + * return value should match "old". + */ + if ( res != old ) { - uint64_t *ptr = (void *)dte; - - write_atomic(ptr + 0, ldte.raw64[0]); - /* No barrier should be needed between these two. */ - write_atomic(ptr + 1, ldte.raw64[1]); - - ret = 1; + printk(XENLOG_ERR + "Dom%d: unexpected DTE %016lx_%016lx (expected %016lx_%016lx)\n", + domain_id, + (uint64_t)(res >> 64), (uint64_t)res, + (uint64_t)(old >> 64), (uint64_t)old); + ret = -EILSEQ; } return ret; diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/xen-4.20.0-testing/xen/drivers/passthrough/amd/pci_amd_iommu.c new/xen-4.20.0-testing/xen/drivers/passthrough/amd/pci_amd_iommu.c --- old/xen-4.20.0-testing/xen/drivers/passthrough/amd/pci_amd_iommu.c 2025-01-20 13:45:27.000000000 +0100 +++ new/xen-4.20.0-testing/xen/drivers/passthrough/amd/pci_amd_iommu.c 2025-01-31 17:59:06.000000000 +0100 @@ -309,6 +309,12 @@ if ( !iommu_enable && !iommu_intremap ) return 0; + if ( unlikely(!cpu_has_cx16) ) + { + AMD_IOMMU_ERROR("no CMPXCHG16B support, disabling IOMMU\n"); + return -ENODEV; + } + if ( (init_done ? 
amd_iommu_init_late() : amd_iommu_init(false)) != 0 ) { diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/xen-4.20.0-testing/xen/drivers/passthrough/vtd/intremap.c new/xen-4.20.0-testing/xen/drivers/passthrough/vtd/intremap.c --- old/xen-4.20.0-testing/xen/drivers/passthrough/vtd/intremap.c 2025-01-20 13:45:27.000000000 +0100 +++ new/xen-4.20.0-testing/xen/drivers/passthrough/vtd/intremap.c 2025-01-31 17:59:06.000000000 +0100 @@ -150,6 +150,19 @@ if ( !iommu_qinval || !iommu_intremap || list_empty(&acpi_drhd_units) ) return false; + if ( unlikely(!cpu_has_cx16) ) + { + printk(XENLOG_ERR VTDPREFIX "no CMPXCHG16B support, disabling IOMMU\n"); + /* + * Disable IOMMU support at once: there's no reason to check for CX16 + * yet again when attempting to initialize IOMMU DMA remapping + * functionality or interrupt remapping without x2APIC support. + */ + iommu_enable = false; + iommu_intremap = iommu_intremap_off; + return false; + } + /* We MUST have a DRHD unit for each IOAPIC. */ for ( apic = 0; apic < nr_ioapics; apic++ ) if ( !ioapic_to_drhd(IO_APIC_ID(apic)) ) @@ -171,49 +184,26 @@ /* * Assume iremap_lock has been acquired. It is to make sure software will not - * change the same IRTE behind us. With this assumption, if only high qword or - * low qword in IRTE is to be updated, this function's atomic variant can - * present an atomic update to VT-d hardware even when cmpxchg16b - * instruction is not supported. + * change the same IRTE behind us. */ static void update_irte(struct vtd_iommu *iommu, struct iremap_entry *entry, const struct iremap_entry *new_ire, bool atomic) { - ASSERT(spin_is_locked(&iommu->intremap.lock)); + __uint128_t ret; + struct iremap_entry old_ire; - if ( cpu_has_cx16 ) - { - __uint128_t ret; - struct iremap_entry old_ire; + ASSERT(spin_is_locked(&iommu->intremap.lock)); - old_ire = *entry; - ret = cmpxchg16b(entry, &old_ire, new_ire); + old_ire = *entry; + ret = cmpxchg16b(entry, &old_ire, new_ire); - /* - * In the above, we use cmpxchg16 to atomically update the 128-bit - * IRTE, and the hardware cannot update the IRTE behind us, so - * the return value of cmpxchg16 should be the same as old_ire. - * This ASSERT validate it. - */ - ASSERT(ret == old_ire.val); - } - else - { - /* - * VT-d hardware doesn't update IRTEs behind us, nor the software - * since we hold iremap_lock. If the caller wants VT-d hardware to - * always see a consistent entry, but we can't meet it, a bug will - * be raised. - */ - if ( entry->lo == new_ire->lo ) - write_atomic(&entry->hi, new_ire->hi); - else if ( entry->hi == new_ire->hi ) - write_atomic(&entry->lo, new_ire->lo); - else if ( !atomic ) - *entry = *new_ire; - else - BUG(); - } + /* + * In the above, we use cmpxchg16 to atomically update the 128-bit + * IRTE, and the hardware cannot update the IRTE behind us, so + * the return value of cmpxchg16 should be the same as old_ire. + * This ASSERT validate it. + */ + ASSERT(ret == old_ire.val); } /* Mark specified intr remap entry as free */ @@ -395,7 +385,6 @@ /* Indicate remap format. */ remap_rte->format = 1; - /* If cmpxchg16b is not available the caller must mask the IO-APIC pin. 
*/ update_irte(iommu, iremap_entry, &new_ire, !init && !masked); iommu_sync_cache(iremap_entry, sizeof(*iremap_entry)); iommu_flush_iec_index(iommu, 0, index); @@ -434,38 +423,15 @@ { struct IO_xAPIC_route_entry old_rte = {}, new_rte; struct vtd_iommu *iommu = ioapic_to_iommu(IO_APIC_ID(apic)); - bool masked = true; int rc; - if ( !cpu_has_cx16 ) - { - /* - * Cannot atomically update the IRTE entry: mask the IO-APIC pin to - * avoid interrupts seeing an inconsistent IRTE entry. - */ - old_rte = __ioapic_read_entry(apic, pin, true); - if ( !old_rte.mask ) - { - masked = false; - old_rte.mask = 1; - __ioapic_write_entry(apic, pin, true, old_rte); - } - } - /* Not the initializer, for old gcc to cope. */ new_rte.raw = rte; rc = ioapic_rte_to_remap_entry(iommu, apic, pin, &old_rte, new_rte); if ( rc ) - { - if ( !masked ) - { - /* Recover the original value of 'mask' bit */ - old_rte.mask = 0; - __ioapic_write_entry(apic, pin, true, old_rte); - } return; - } + /* old_rte will contain the updated IO-APIC RTE on success. */ __ioapic_write_entry(apic, pin, true, old_rte); } diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/xen-4.20.0-testing/xen/drivers/passthrough/vtd/iommu.c new/xen-4.20.0-testing/xen/drivers/passthrough/vtd/iommu.c --- old/xen-4.20.0-testing/xen/drivers/passthrough/vtd/iommu.c 2025-01-20 13:45:27.000000000 +0100 +++ new/xen-4.20.0-testing/xen/drivers/passthrough/vtd/iommu.c 2025-01-31 17:59:06.000000000 +0100 @@ -1485,7 +1485,7 @@ { struct domain_iommu *hd = dom_iommu(domain); struct context_entry *context, *context_entries, lctxt; - __uint128_t old; + __uint128_t res, old; uint64_t maddr; uint16_t seg = iommu->drhd->segment, prev_did = 0; struct domain *prev_dom = NULL; @@ -1583,55 +1583,23 @@ ASSERT(!context_fault_disable(lctxt)); } - if ( cpu_has_cx16 ) - { - __uint128_t res = cmpxchg16b(context, &old, &lctxt.full); + res = cmpxchg16b(context, &old, &lctxt.full); - /* - * Hardware does not update the context entry behind our backs, - * so the return value should match "old". - */ - if ( res != old ) - { - if ( pdev ) - check_cleanup_domid_map(domain, pdev, iommu); - printk(XENLOG_ERR - "%pp: unexpected context entry %016lx_%016lx (expected %016lx_%016lx)\n", - &PCI_SBDF(seg, bus, devfn), - (uint64_t)(res >> 64), (uint64_t)res, - (uint64_t)(old >> 64), (uint64_t)old); - rc = -EILSEQ; - goto unlock; - } - } - else if ( !prev_dom || !(mode & MAP_WITH_RMRR) ) + /* + * Hardware does not update the context entry behind our backs, + * so the return value should match "old". + */ + if ( res != old ) { - context_clear_present(*context); - iommu_sync_cache(context, sizeof(*context)); - - write_atomic(&context->hi, lctxt.hi); - /* No barrier should be needed between these two. */ - write_atomic(&context->lo, lctxt.lo); - } - else /* Best effort, updating DID last. */ - { - /* - * By non-atomically updating the context entry's DID field last, - * during a short window in time TLB entries with the old domain ID - * but the new page tables may be inserted. This could affect I/O - * of other devices using this same (old) domain ID. Such updating - * therefore is not a problem if this was the only device associated - * with the old domain ID. Diverting I/O of any of a dying domain's - * devices to the quarantine page tables is intended anyway. 
- */ - if ( !(mode & (MAP_OWNER_DYING | MAP_SINGLE_DEVICE)) ) - printk(XENLOG_WARNING VTDPREFIX - " %pp: reassignment may cause %pd data corruption\n", - &PCI_SBDF(seg, bus, devfn), prev_dom); - - write_atomic(&context->lo, lctxt.lo); - /* No barrier should be needed between these two. */ - write_atomic(&context->hi, lctxt.hi); + if ( pdev ) + check_cleanup_domid_map(domain, pdev, iommu); + printk(XENLOG_ERR + "%pp: unexpected context entry %016lx_%016lx (expected %016lx_%016lx)\n", + &PCI_SBDF(seg, bus, devfn), + (uint64_t)(res >> 64), (uint64_t)res, + (uint64_t)(old >> 64), (uint64_t)old); + rc = -EILSEQ; + goto unlock; } iommu_sync_cache(context, sizeof(struct context_entry)); @@ -1727,15 +1695,9 @@ break; } - if ( domain != pdev->domain && pdev->domain != dom_io ) - { - if ( pdev->domain->is_dying ) - mode |= MAP_OWNER_DYING; - else if ( drhd && - !any_pdev_behind_iommu(pdev->domain, pdev, drhd->iommu) && - !pdev->phantom_stride ) - mode |= MAP_SINGLE_DEVICE; - } + if ( domain != pdev->domain && pdev->domain != dom_io && + pdev->domain->is_dying ) + mode |= MAP_OWNER_DYING; switch ( pdev->type ) { @@ -2633,6 +2595,13 @@ int ret; bool reg_inval_supported = true; + if ( unlikely(!cpu_has_cx16) ) + { + printk(XENLOG_ERR VTDPREFIX "no CMPXCHG16B support, disabling IOMMU\n"); + ret = -ENODEV; + goto error; + } + if ( list_empty(&acpi_drhd_units) ) { ret = -ENODEV; @@ -2695,12 +2664,7 @@ iommu_intremap = iommu_intremap_off; #ifndef iommu_intpost - /* - * We cannot use posted interrupt if X86_FEATURE_CX16 is - * not supported, since we count on this feature to - * atomically update 16-byte IRTE in posted format. - */ - if ( !cap_intr_post(iommu->cap) || !iommu_intremap || !cpu_has_cx16 ) + if ( !cap_intr_post(iommu->cap) || !iommu_intremap ) iommu_intpost = false; #endif diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/xen-4.20.0-testing/xen/drivers/passthrough/vtd/vtd.h new/xen-4.20.0-testing/xen/drivers/passthrough/vtd/vtd.h --- old/xen-4.20.0-testing/xen/drivers/passthrough/vtd/vtd.h 2025-01-20 13:45:27.000000000 +0100 +++ new/xen-4.20.0-testing/xen/drivers/passthrough/vtd/vtd.h 2025-01-31 17:59:06.000000000 +0100 @@ -28,9 +28,8 @@ */ #define MAP_WITH_RMRR (1u << 0) #define MAP_OWNER_DYING (1u << 1) -#define MAP_SINGLE_DEVICE (1u << 2) -#define MAP_ERROR_RECOVERY (1u << 3) -#define UNMAP_ME_PHANTOM_FUNC (1u << 4) +#define MAP_ERROR_RECOVERY (1u << 2) +#define UNMAP_ME_PHANTOM_FUNC (1u << 3) /* Allow for both IOAPIC and IOSAPIC. */ #define IO_xAPIC_route_entry IO_APIC_route_entry
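
The interrupt/DMA remapping hunks above (AMD IRTE and DTE, VT-d IRTE and context entries) share one pattern: read the current 128-bit entry under the table lock, install the new value with a single 16-byte compare-and-exchange, and treat any mismatch as a bug, since the hardware never rewrites these entries behind the driver's back. The sketch below is not Xen code; it is a minimal standalone illustration of that pattern, assuming a hypothetical entry128 type and update_entry_atomic() helper, and using the GCC/Clang __atomic_compare_exchange_n builtin (build with -mcx16 on x86-64) in place of Xen's cmpxchg16b() wrapper.

#include <stdbool.h>
#include <stdio.h>

typedef unsigned __int128 u128;

/* Hypothetical stand-in for a 16-byte remapping table entry (cf. union irte128). */
struct entry128 {
    u128 raw;
} __attribute__((aligned(16)));

static bool update_entry_atomic(struct entry128 *e, u128 new_val)
{
    /* Snapshot taken while holding the table lock, as in the Xen hunks above. */
    u128 old = e->raw;

    /*
     * One 16-byte compare-and-exchange: an interrupt can never observe a
     * half-updated entry, unlike with a pair of 64-bit writes.
     */
    if ( !__atomic_compare_exchange_n(&e->raw, &old, new_val,
                                      false, __ATOMIC_SEQ_CST,
                                      __ATOMIC_SEQ_CST) )
    {
        /* On failure, "old" now holds the value actually found in the table. */
        fprintf(stderr, "unexpected entry %016llx_%016llx\n",
                (unsigned long long)(old >> 64), (unsigned long long)old);
        return false;   /* Xen reports this via printk() plus ASSERT/-EILSEQ. */
    }

    return true;
}

A caller would invoke something like update_entry_atomic(&table[idx], new_raw) with the per-table lock held. Making CX16 a hard requirement (the cpu_has_cx16 checks added in vtd/iommu.c, vtd/intremap.c and pci_amd_iommu.c) is what allows the removed fallback paths, which wrote the two 64-bit halves separately and relied on IO-APIC pin masking or ordering tricks, to be dropped.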